/[suikacvs]/markup/html/whatpm/Whatpm/XML/Parser.pm.src
Suika

Contents of /markup/html/whatpm/Whatpm/XML/Parser.pm.src

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.15 - (hide annotations) (download) (as text)
Fri Oct 17 07:14:29 2008 UTC (16 years, 9 months ago) by wakaba
Branch: MAIN
Changes since 1.14: +95 -0 lines
File MIME type: application/x-wais-source
++ whatpm/t/ChangeLog	17 Oct 2008 07:14:01 -0000
2008-10-17  Wakaba  <wakaba@suika.fam.cx>

	* XML-Parser.t: "xml/attlists-1.dat" added.

++ whatpm/t/xml/ChangeLog	17 Oct 2008 07:14:24 -0000
2008-10-17  Wakaba  <wakaba@suika.fam.cx>

	* attlists-1.dat: New test data file.

	* doctypes-2.dat: New tests added.

++ whatpm/Whatpm/ChangeLog	17 Oct 2008 07:11:25 -0000
2008-10-17  Wakaba  <wakaba@suika.fam.cx>

	* NanoDOM.pm (node_name): New attribute.
	(ELEMENT_TYPE_DEFINITION_NODE, ATTRIBUTE_DEFINITION_NODE): New
	constants.
	(create_element_type_definition_node, create_attribute_definition,
	create_notation, create_general_entity,
	get_element_type_definition_node,
	set_element_type_definition_node, get_general_entity_node,
	set_general_entity_node, get_notation_node, set_notation_node,
	get_attribute_definition_node, set_attribute_definition_node): New
	methods.
	(element_types, entities, notations, attribute_definitions): New
	attributes.
	(DocumentType): Support for child nodes, entities, notations, and
	element types.
	(Entity, Notation, ElementTypeDefinition, AttributeDefinition):
	New classes.

	* Dumper.pm: Support for general entities, notations, element type
	definitions, and attribute definitions.

++ whatpm/Whatpm/HTML/ChangeLog	17 Oct 2008 07:12:26 -0000
2008-10-17  Wakaba  <wakaba@suika.fam.cx>

	* Tokenizer.pm.src: New token types AtTLIST_TOKEN, ELEMENT_TOKEN,
	GENERAL_ENTITY_TOKEN, PARAMETER_ENTITY_TOKEN, and NOTATION_TOKEN
	are added.  New intertion modes for markup declarations are added.

++ whatpm/Whatpm/XML/ChangeLog	17 Oct 2008 07:13:47 -0000
2008-10-17  Wakaba  <wakaba@suika.fam.cx>

	* Parser.pm.src (_tree_in_subset): Support for ELEMENT_TOKEN,
	ATTLIST_TOKEN, GENERAL_ENTITY_TOKEN, PARAMETER_ENTITY_TOKEN, and
	NOTATION_TOKEN.

1 wakaba 1.1 package Whatpm::XML::Parser;
2     use strict;
3    
4     push our @ISA, 'Whatpm::HTML';
5     use Whatpm::HTML::Tokenizer qw/:token/;
6    
7     sub parse_char_string ($$$;$$) {
8     #my ($self, $s, $doc, $onerror, $get_wrapper) = @_;
9     my $self = shift;
10     my $s = ref $_[0] ? $_[0] : \($_[0]);
11     require Whatpm::Charset::DecodeHandle;
12     my $input = Whatpm::Charset::DecodeHandle::CharString->new ($s);
13     return $self->parse_char_stream ($input, @_[1..$#_]);
14     } # parse_char_string
15    
16     sub parse_char_stream ($$$;$$) {
17     my $self = ref $_[0] ? shift : shift->new;
18     my $input = $_[0];
19     $self->{document} = $_[1];
20     @{$self->{document}->child_nodes} = ();
21    
22     ## NOTE: |set_inner_html| copies most of this method's code
23    
24     $self->{confident} = 1 unless exists $self->{confident};
25     $self->{document}->input_encoding ($self->{input_encoding})
26     if defined $self->{input_encoding};
27     ## TODO: |{input_encoding}| is needless?
28    
29     $self->{line_prev} = $self->{line} = 1;
30     $self->{column_prev} = -1;
31     $self->{column} = 0;
32     $self->{set_nc} = sub {
33     my $self = shift;
34    
35     my $char = '';
36     if (defined $self->{next_nc}) {
37     $char = $self->{next_nc};
38     delete $self->{next_nc};
39     $self->{nc} = ord $char;
40     } else {
41     $self->{char_buffer} = '';
42     $self->{char_buffer_pos} = 0;
43    
44     my $count = $input->manakai_read_until
45     ($self->{char_buffer}, qr/[^\x00\x0A\x0D]/, $self->{char_buffer_pos});
46     if ($count) {
47     $self->{line_prev} = $self->{line};
48     $self->{column_prev} = $self->{column};
49     $self->{column}++;
50     $self->{nc}
51     = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
52     return;
53     }
54    
55     if ($input->read ($char, 1)) {
56     $self->{nc} = ord $char;
57     } else {
58     $self->{nc} = -1;
59     return;
60     }
61     }
62    
63     ($self->{line_prev}, $self->{column_prev})
64     = ($self->{line}, $self->{column});
65     $self->{column}++;
66    
67     if ($self->{nc} == 0x000A) { # LF
68     !!!cp ('j1');
69     $self->{line}++;
70     $self->{column} = 0;
71     } elsif ($self->{nc} == 0x000D) { # CR
72     !!!cp ('j2');
73     ## TODO: support for abort/streaming
74     my $next = '';
75     if ($input->read ($next, 1) and $next ne "\x0A") {
76     $self->{next_nc} = $next;
77     }
78     $self->{nc} = 0x000A; # LF # MUST
79     $self->{line}++;
80     $self->{column} = 0;
81     } elsif ($self->{nc} == 0x0000) { # NULL
82     !!!cp ('j4');
83     !!!parse-error (type => 'NULL');
84     $self->{nc} = 0xFFFD; # REPLACEMENT CHARACTER # MUST
85     }
86     };
87    
88     $self->{read_until} = sub {
89     #my ($scalar, $specials_range, $offset) = @_;
90     return 0 if defined $self->{next_nc};
91    
92     my $pattern = qr/[^$_[1]\x00\x0A\x0D]/;
93     my $offset = $_[2] || 0;
94    
95     if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
96     pos ($self->{char_buffer}) = $self->{char_buffer_pos};
97     if ($self->{char_buffer} =~ /\G(?>$pattern)+/) {
98     substr ($_[0], $offset)
99     = substr ($self->{char_buffer}, $-[0], $+[0] - $-[0]);
100     my $count = $+[0] - $-[0];
101     if ($count) {
102     $self->{column} += $count;
103     $self->{char_buffer_pos} += $count;
104     $self->{line_prev} = $self->{line};
105     $self->{column_prev} = $self->{column} - 1;
106     $self->{nc} = -1;
107     }
108     return $count;
109     } else {
110     return 0;
111     }
112     } else {
113     my $count = $input->manakai_read_until ($_[0], $pattern, $_[2]);
114     if ($count) {
115     $self->{column} += $count;
116     $self->{line_prev} = $self->{line};
117     $self->{column_prev} = $self->{column} - 1;
118     $self->{nc} = -1;
119     }
120     return $count;
121     }
122     }; # $self->{read_until}
123    
124     my $onerror = $_[2] || sub {
125     my (%opt) = @_;
126     my $line = $opt{token} ? $opt{token}->{line} : $opt{line};
127     my $column = $opt{token} ? $opt{token}->{column} : $opt{column};
128     warn "Parse error ($opt{type}) at line $line column $column\n";
129     };
130     $self->{parse_error} = sub {
131     $onerror->(line => $self->{line}, column => $self->{column}, @_);
132     };
133    
134     my $char_onerror = sub {
135     my (undef, $type, %opt) = @_;
136     !!!parse-error (layer => 'encode',
137     line => $self->{line}, column => $self->{column} + 1,
138     %opt, type => $type);
139     }; # $char_onerror
140    
141     if ($_[3]) {
142     $input = $_[3]->($input);
143     $input->onerror ($char_onerror);
144     } else {
145     $input->onerror ($char_onerror) unless defined $input->onerror;
146     }
147    
148     $self->_initialize_tokenizer;
149     $self->_initialize_tree_constructor;
150     $self->_construct_tree;
151     $self->_terminate_tree_constructor;
152    
153     delete $self->{parse_error}; # remove loop
154    
155     return $self->{document};
156     } # parse_char_stream
157    
158     sub new ($) {
159     my $class = shift;
160     my $self = bless {
161     level => {must => 'm',
162     should => 's',
163     warn => 'w',
164     info => 'i',
165     uncertain => 'u'},
166     }, $class;
167     $self->{set_nc} = sub {
168     $self->{nc} = -1;
169     };
170     $self->{parse_error} = sub {
171     #
172     };
173     $self->{change_encoding} = sub {
174     # if ($_[0] is a supported encoding) {
175     # run "change the encoding" algorithm;
176     # throw Whatpm::HTML::RestartParser (charset => $new_encoding);
177     # }
178     };
179     $self->{application_cache_selection} = sub {
180     #
181     };
182 wakaba 1.2
183     $self->{is_xml} = 1;
184    
185 wakaba 1.1 return $self;
186     } # new
187    
188     sub _initialize_tree_constructor ($) {
189     my $self = shift;
190     ## NOTE: $self->{document} MUST be specified before this method is called
191     $self->{document}->strict_error_checking (0);
192     ## TODO: Turn mutation events off # MUST
193     $self->{document}->dom_config
194     ->{'http://suika.fam.cx/www/2006/dom-config/strict-document-children'}
195     = 0;
196     $self->{document}->manakai_is_html (0);
197     $self->{document}->set_user_data (manakai_source_line => 1);
198     $self->{document}->set_user_data (manakai_source_column => 1);
199     } # _initialize_tree_constructor
200    
201     sub _terminate_tree_constructor ($) {
202     my $self = shift;
203     $self->{document}->strict_error_checking (1);
204     $self->{document}->dom_config
205     ->{'http://suika.fam.cx/www/2006/dom-config/strict-document-children'}
206     = 1;
207     ## TODO: Turn mutation events on
208     } # _terminate_tree_constructor
209    
210     ## Tree construction stage
211    
212    
213     ## NOTE: Differences from the XML5 draft are marked as "XML5:".
214    
215     ## XML5: No namespace support.
216    
217     ## XML5: Start, main, end phases. In this implementation, they are
218     ## represented by insertion modes.
219    
220     ## Insertion modes
221     sub INITIAL_IM () { 0 }
222     sub BEFORE_ROOT_ELEMENT_IM () { 1 }
223     sub IN_ELEMENT_IM () { 2 }
224     sub AFTER_ROOT_ELEMENT_IM () { 3 }
225 wakaba 1.14 sub IN_SUBSET_IM () { 4 }
226 wakaba 1.1
227     {
228     my $token; ## TODO: change to $self->{t}
229    
230     sub _construct_tree ($) {
231     my ($self) = @_;
232    
233     delete $self->{tainted};
234     $self->{open_elements} = [];
235     $self->{insertion_mode} = INITIAL_IM;
236 wakaba 1.8
237     !!!next-token;
238 wakaba 1.10
239     ## XML5: No support for the XML declaration
240     if ($token->{type} == PI_TOKEN and
241     $token->{target} eq 'xml' and
242     $token->{data} =~ /\Aversion[\x09\x0A\x20]*=[\x09\x0A\x20]*
243     (?>"([^"]*)"|'([^']*)')
244     (?:[\x09\x0A\x20]+
245     encoding[\x09\x0A\x20]*=[\x09\x0A\x20]*
246     (?>"([^"]*)"|'([^']*)')[\x09\x0A\x20]*)?
247     (?:[\x09\x0A\x20]+
248     standalone[\x09\x0A\x20]*=[\x09\x0A\x20]*
249     (?>"(yes|no)"|'(yes|no)'))?
250     [\x09\x0A\x20]*\z/x) {
251     $self->{document}->xml_version (defined $1 ? $1 : $2);
252     $self->{document}->xml_encoding (defined $3 ? $3 : $4); # possibly undef
253     $self->{document}->xml_standalone (($5 || $6 || 'no') ne 'no');
254    
255     !!!next-token;
256     } else {
257     $self->{document}->xml_version ('1.0');
258     $self->{document}->xml_encoding (undef);
259     $self->{document}->xml_standalone (0);
260     }
261 wakaba 1.1
262     while (1) {
263     if ($self->{insertion_mode} == IN_ELEMENT_IM) {
264     $self->_tree_in_element;
265 wakaba 1.14 } elsif ($self->{insertion_mode} == IN_SUBSET_IM) {
266     $self->_tree_in_subset;
267 wakaba 1.1 } elsif ($self->{insertion_mode} == AFTER_ROOT_ELEMENT_IM) {
268     $self->_tree_after_root_element;
269     } elsif ($self->{insertion_mode} == BEFORE_ROOT_ELEMENT_IM) {
270     $self->_tree_before_root_element;
271     } elsif ($self->{insertion_mode} == INITIAL_IM) {
272     $self->_tree_initial;
273     } else {
274     die "$0: Unknown XML insertion mode: $self->{insertion_mode}";
275     }
276    
277     last if $token->{type} == ABORT_TOKEN;
278     }
279     } # _construct_tree
280    
281     sub _tree_initial ($) {
282     my $self = shift;
283    
284     B: while (1) {
285     if ($token->{type} == DOCTYPE_TOKEN) {
286     ## XML5: No "DOCTYPE" token.
287    
288     my $doctype = $self->{document}->create_document_type_definition
289     (defined $token->{name} ? $token->{name} : '');
290    
291     ## NOTE: Default value for both |public_id| and |system_id| attributes
292     ## are empty strings, so that we don't set any value in missing cases.
293 wakaba 1.13 $doctype->public_id ($token->{pubid}) if defined $token->{pubid};
294     $doctype->system_id ($token->{sysid}) if defined $token->{sysid};
295 wakaba 1.1
296     ## TODO: internal_subset
297    
298     $self->{document}->append_child ($doctype);
299 wakaba 1.14
300     ## XML5: No "has internal subset" flag.
301     if ($token->{has_internal_subset}) {
302     $self->{doctype} = $doctype;
303     $self->{insertion_mode} = IN_SUBSET_IM;
304     } else {
305     $self->{insertion_mode} = BEFORE_ROOT_ELEMENT_IM;
306     }
307 wakaba 1.1 !!!next-token;
308     return;
309     } elsif ($token->{type} == START_TAG_TOKEN or
310     $token->{type} == END_OF_FILE_TOKEN) {
311     $self->{insertion_mode} = BEFORE_ROOT_ELEMENT_IM;
312     ## Reprocess.
313     return;
314     } elsif ($token->{type} == COMMENT_TOKEN) {
315     my $comment = $self->{document}->create_comment ($token->{data});
316     $self->{document}->append_child ($comment);
317    
318     ## Stay in the mode.
319     !!!next-token;
320     next B;
321     } elsif ($token->{type} == PI_TOKEN) {
322     my $pi = $self->{document}->create_processing_instruction
323     ($token->{target}, $token->{data});
324     $self->{document}->append_child ($pi);
325    
326     ## Stay in the mode.
327     !!!next-token;
328     next B;
329     } elsif ($token->{type} == CHARACTER_TOKEN) {
330     if (not $self->{tainted} and
331 wakaba 1.9 not $token->{has_reference} and
332 wakaba 1.1 $token->{data} =~ s/^([\x09\x0A\x0C\x20]+)//) {
333     #
334     }
335    
336     if (length $token->{data}) {
337     ## XML5: Ignore the token.
338    
339     unless ($self->{tainted}) {
340     !!!parse-error (type => 'text outside of root element',
341     token => $token);
342     $self->{tainted} = 1;
343     }
344    
345     $self->{document}->manakai_append_text ($token->{data});
346     }
347    
348     ## Stay in the mode.
349     !!!next-token;
350     next B;
351     } elsif ($token->{type} == END_TAG_TOKEN) {
352     !!!parse-error (type => 'unmatched end tag',
353     text => $token->{tag_name},
354     token => $token);
355     ## Ignore the token.
356    
357     ## Stay in the mode.
358     !!!next-token;
359     next B;
360     } elsif ($token->{type} == ABORT_TOKEN) {
361     return;
362     } else {
363     die "$0: XML parser initial: Unknown token type $token->{type}";
364     }
365     } # B
366     } # _tree_initial
367    
368     sub _tree_before_root_element ($) {
369     my $self = shift;
370    
371     B: while (1) {
372     if ($token->{type} == START_TAG_TOKEN) {
373 wakaba 1.6 my $nsmap = {
374     xml => q<http://www.w3.org/XML/1998/namespace>,
375     xmlns => q<http://www.w3.org/2000/xmlns/>,
376     };
377    
378     for (keys %{$token->{attributes}}) {
379     if (/^xmlns:./s) {
380     my $prefix = substr $_, 6;
381     my $value = $token->{attributes}->{$_}->{value};
382     if ($prefix eq 'xml' or $prefix eq 'xmlns' or
383     $value eq q<http://www.w3.org/XML/1998/namespace> or
384     $value eq q<http://www.w3.org/2000/xmlns/>) {
385     ## NOTE: Error should be detected at the DOM layer.
386     #
387     } elsif (length $value) {
388     $nsmap->{$prefix} = $value;
389     } else {
390     delete $nsmap->{$prefix};
391     }
392     } elsif ($_ eq 'xmlns') {
393     my $value = $token->{attributes}->{$_}->{value};
394     if ($value eq q<http://www.w3.org/XML/1998/namespace> or
395     $value eq q<http://www.w3.org/2000/xmlns/>) {
396     ## NOTE: Error should be detected at the DOM layer.
397     #
398     } elsif (length $value) {
399     $nsmap->{''} = $value;
400     } else {
401     delete $nsmap->{''};
402     }
403     }
404     }
405    
406     my $ns;
407 wakaba 1.3 my ($prefix, $ln) = split /:/, $token->{tag_name}, 2;
408 wakaba 1.6
409 wakaba 1.11 if (defined $ln and $prefix ne '' and $ln ne '') { # prefixed
410 wakaba 1.6 if (defined $nsmap->{$prefix}) {
411     $ns = $nsmap->{$prefix};
412     } else {
413     ($prefix, $ln) = (undef, $token->{tag_name});
414     }
415     } else {
416 wakaba 1.11 $ns = $nsmap->{''} if $prefix ne '' and not defined $ln;
417     ($prefix, $ln) = (undef, $token->{tag_name});
418 wakaba 1.6 }
419    
420 wakaba 1.3 my $el = $self->{document}->create_element_ns ($ns, [$prefix, $ln]);
421 wakaba 1.4 $el->set_user_data (manakai_source_line => $token->{line});
422     $el->set_user_data (manakai_source_column => $token->{column});
423    
424 wakaba 1.6 my $has_attr;
425 wakaba 1.12 for my $attr_name (sort {$token->{attributes}->{$a}->{index} <=>
426     $token->{attributes}->{$b}->{index}}
427     keys %{$token->{attributes}}) {
428 wakaba 1.6 my $ns;
429 wakaba 1.4 my ($p, $l) = split /:/, $attr_name, 2;
430 wakaba 1.6
431     if ($attr_name eq 'xmlns:xmlns') {
432     ($p, $l) = (undef, $attr_name);
433 wakaba 1.11 } elsif (defined $l and $p ne '' and $l ne '') { # prefixed
434 wakaba 1.6 if (defined $nsmap->{$p}) {
435     $ns = $nsmap->{$p};
436     } else {
437     ## NOTE: Error should be detected at the DOM-layer.
438     ($p, $l) = (undef, $attr_name);
439     }
440     } else {
441 wakaba 1.11 if ($attr_name eq 'xmlns') {
442 wakaba 1.6 $ns = $nsmap->{xmlns};
443     }
444 wakaba 1.11 ($p, $l) = (undef, $attr_name);
445 wakaba 1.6 }
446    
447     if ($has_attr->{defined $ns ? $ns : ''}->{$l}) {
448 wakaba 1.12 $ns = undef;
449     ($p, $l) = (undef, $attr_name);
450 wakaba 1.6 } else {
451     $has_attr->{defined $ns ? $ns : ''}->{$l} = 1;
452     }
453    
454 wakaba 1.4 my $attr_t = $token->{attributes}->{$attr_name};
455     my $attr = $self->{document}->create_attribute_ns ($ns, [$p, $l]);
456     $attr->value ($attr_t->{value});
457     $attr->set_user_data (manakai_source_line => $attr_t->{line});
458     $attr->set_user_data (manakai_source_column => $attr_t->{column});
459     $el->set_attribute_node_ns ($attr);
460     }
461    
462 wakaba 1.1 $self->{document}->append_child ($el);
463    
464 wakaba 1.3 if ($self->{self_closing}) {
465 wakaba 1.1 !!!ack ('ack');
466     $self->{insertion_mode} = AFTER_ROOT_ELEMENT_IM;
467     } else {
468 wakaba 1.6 push @{$self->{open_elements}}, [$el, $token->{tag_name}, $nsmap];
469 wakaba 1.1 $self->{insertion_mode} = IN_ELEMENT_IM;
470     }
471    
472     #delete $self->{tainted};
473    
474     !!!next-token;
475     return;
476     } elsif ($token->{type} == COMMENT_TOKEN) {
477     my $comment = $self->{document}->create_comment ($token->{data});
478     $self->{document}->append_child ($comment);
479    
480     ## Stay in the mode.
481     !!!next-token;
482     next B;
483     } elsif ($token->{type} == PI_TOKEN) {
484     my $pi = $self->{document}->create_processing_instruction
485     ($token->{target}, $token->{data});
486     $self->{document}->append_child ($pi);
487    
488     ## Stay in the mode.
489     !!!next-token;
490     next B;
491     } elsif ($token->{type} == CHARACTER_TOKEN) {
492     if (not $self->{tainted} and
493 wakaba 1.9 not $token->{has_reference} and
494 wakaba 1.1 $token->{data} =~ s/^([\x09\x0A\x0C\x20]+)//) {
495     #
496     }
497    
498     if (length $token->{data}) {
499     ## XML5: Ignore the token.
500    
501     unless ($self->{tainted}) {
502     !!!parse-error (type => 'text outside of root element',
503     token => $token);
504     $self->{tainted} = 1;
505     }
506    
507     $self->{document}->manakai_append_text ($token->{data});
508     }
509    
510     ## Stay in the mode.
511     !!!next-token;
512     next B;
513     } elsif ($token->{type} == END_OF_FILE_TOKEN) {
514     !!!parse-error (type => 'no root element',
515     token => $token);
516    
517     $self->{insertion_mode} = AFTER_ROOT_ELEMENT_IM;
518     ## Reprocess.
519     return;
520     } elsif ($token->{type} == END_TAG_TOKEN) {
521     !!!parse-error (type => 'unmatched end tag',
522     text => $token->{tag_name},
523     token => $token);
524     ## Ignore the token.
525    
526     ## Stay in the mode.
527     !!!next-token;
528     next B;
529     } elsif ($token->{type} == DOCTYPE_TOKEN) {
530     !!!parse-error (type => 'in html:#doctype',
531     token => $token);
532     ## Ignore the token.
533    
534     ## Stay in the mode.
535     !!!next-token;
536     next B;
537     } elsif ($token->{type} == ABORT_TOKEN) {
538     return;
539     } else {
540     die "$0: XML parser initial: Unknown token type $token->{type}";
541     }
542     } # B
543     } # _tree_before_root_element
544    
545     sub _tree_in_element ($) {
546     my $self = shift;
547    
548     B: while (1) {
549     if ($token->{type} == CHARACTER_TOKEN) {
550     $self->{open_elements}->[-1]->[0]->manakai_append_text ($token->{data});
551    
552     ## Stay in the mode.
553     !!!next-token;
554     next B;
555     } elsif ($token->{type} == START_TAG_TOKEN) {
556 wakaba 1.7 my $nsmap = {%{$self->{open_elements}->[-1]->[2]}};
557    
558     for (keys %{$token->{attributes}}) {
559     if (/^xmlns:./s) {
560     my $prefix = substr $_, 6;
561     my $value = $token->{attributes}->{$_}->{value};
562     if ($prefix eq 'xml' or $prefix eq 'xmlns' or
563     $value eq q<http://www.w3.org/XML/1998/namespace> or
564     $value eq q<http://www.w3.org/2000/xmlns/>) {
565     ## NOTE: Error should be detected at the DOM layer.
566     #
567     } elsif (length $value) {
568     $nsmap->{$prefix} = $value;
569     } else {
570     delete $nsmap->{$prefix};
571     }
572     } elsif ($_ eq 'xmlns') {
573     my $value = $token->{attributes}->{$_}->{value};
574     if ($value eq q<http://www.w3.org/XML/1998/namespace> or
575     $value eq q<http://www.w3.org/2000/xmlns/>) {
576     ## NOTE: Error should be detected at the DOM layer.
577     #
578     } elsif (length $value) {
579     $nsmap->{''} = $value;
580     } else {
581     delete $nsmap->{''};
582     }
583     }
584     }
585    
586     my $ns;
587 wakaba 1.3 my ($prefix, $ln) = split /:/, $token->{tag_name}, 2;
588 wakaba 1.7
589 wakaba 1.11 if (defined $ln and $prefix ne '' and $ln ne '') { # prefixed
590 wakaba 1.7 if (defined $nsmap->{$prefix}) {
591     $ns = $nsmap->{$prefix};
592     } else {
593     ## NOTE: Error should be detected at the DOM layer.
594     ($prefix, $ln) = (undef, $token->{tag_name});
595     }
596     } else {
597 wakaba 1.11 $ns = $nsmap->{''} if $prefix ne '' and not defined $ln;
598     ($prefix, $ln) = (undef, $token->{tag_name});
599 wakaba 1.7 }
600    
601 wakaba 1.3 my $el = $self->{document}->create_element_ns ($ns, [$prefix, $ln]);
602 wakaba 1.4 $el->set_user_data (manakai_source_line => $token->{line});
603     $el->set_user_data (manakai_source_column => $token->{column});
604    
605 wakaba 1.7 my $has_attr;
606 wakaba 1.12 for my $attr_name (sort {$token->{attributes}->{$a}->{index} <=>
607     $token->{attributes}->{$b}->{index}}
608     keys %{$token->{attributes}}) {
609 wakaba 1.7 my $ns;
610 wakaba 1.4 my ($p, $l) = split /:/, $attr_name, 2;
611 wakaba 1.7
612     if ($attr_name eq 'xmlns:xmlns') {
613     ($p, $l) = (undef, $attr_name);
614 wakaba 1.11 } elsif (defined $l and $p ne '' and $l ne '') { # prefixed
615 wakaba 1.7 if (defined $nsmap->{$p}) {
616     $ns = $nsmap->{$p};
617     } else {
618     ## NOTE: Error should be detected at the DOM-layer.
619     ($p, $l) = (undef, $attr_name);
620     }
621     } else {
622 wakaba 1.11 if ($attr_name eq 'xmlns') {
623 wakaba 1.7 $ns = $nsmap->{xmlns};
624     }
625 wakaba 1.11 ($p, $l) = (undef, $attr_name);
626 wakaba 1.7 }
627    
628     if ($has_attr->{defined $ns ? $ns : ''}->{$l}) {
629 wakaba 1.12 $ns = undef;
630     ($p, $l) = (undef, $attr_name);
631 wakaba 1.7 } else {
632     $has_attr->{defined $ns ? $ns : ''}->{$l} = 1;
633     }
634    
635 wakaba 1.4 my $attr_t = $token->{attributes}->{$attr_name};
636     my $attr = $self->{document}->create_attribute_ns ($ns, [$p, $l]);
637     $attr->value ($attr_t->{value});
638     $attr->set_user_data (manakai_source_line => $attr_t->{line});
639     $attr->set_user_data (manakai_source_column => $attr_t->{column});
640     $el->set_attribute_node_ns ($attr);
641     }
642    
643 wakaba 1.1 $self->{open_elements}->[-1]->[0]->append_child ($el);
644    
645 wakaba 1.3 if ($self->{self_closing}) {
646 wakaba 1.1 !!!ack ('ack');
647     } else {
648 wakaba 1.7 push @{$self->{open_elements}}, [$el, $token->{tag_name}, $nsmap];
649 wakaba 1.1 }
650    
651     ## Stay in the mode.
652     !!!next-token;
653     next B;
654     } elsif ($token->{type} == END_TAG_TOKEN) {
655 wakaba 1.2 if ($token->{tag_name} eq '') {
656 wakaba 1.1 ## Short end tag token.
657     pop @{$self->{open_elements}};
658     } elsif ($self->{open_elements}->[-1]->[1] eq $token->{tag_name}) {
659     pop @{$self->{open_elements}};
660     } else {
661     !!!parse-error (type => 'unmatched end tag',
662     text => $token->{tag_name},
663     token => $token);
664    
665     ## Has an element in scope
666 wakaba 1.2 INSCOPE: for my $i (reverse 0..$#{$self->{open_elements}}) {
667 wakaba 1.1 if ($self->{open_elements}->[$i]->[1] eq $token->{tag_name}) {
668     splice @{$self->{open_elements}}, $i;
669     last INSCOPE;
670     }
671     } # INSCOPE
672     }
673    
674     unless (@{$self->{open_elements}}) {
675     $self->{insertion_mode} = AFTER_ROOT_ELEMENT_IM;
676     !!!next-token;
677     return;
678     } else {
679     ## Stay in the state.
680     !!!next-token;
681     redo B;
682     }
683     } elsif ($token->{type} == COMMENT_TOKEN) {
684     my $comment = $self->{document}->create_comment ($token->{data});
685     $self->{open_elements}->[-1]->[0]->append_child ($comment);
686    
687     ## Stay in the mode.
688     !!!next-token;
689     next B;
690     } elsif ($token->{type} == PI_TOKEN) {
691     my $pi = $self->{document}->create_processing_instruction
692     ($token->{target}, $token->{data});
693     $self->{open_elements}->[-1]->[0]->append_child ($pi);
694    
695     ## Stay in the mode.
696     !!!next-token;
697     next B;
698     } elsif ($token->{type} == END_OF_FILE_TOKEN) {
699     !!!parse-error (type => 'in body:#eof',
700     token => $token);
701    
702     $self->{insertion_mode} = AFTER_ROOT_ELEMENT_IM;
703     !!!next-token;
704     return;
705     } elsif ($token->{type} == DOCTYPE_TOKEN) {
706     !!!parse-error (type => 'in html:#doctype',
707     token => $token);
708     ## Ignore the token.
709    
710     ## Stay in the mode.
711     !!!next-token;
712     next B;
713     } elsif ($token->{type} == ABORT_TOKEN) {
714     return;
715     } else {
716     die "$0: XML parser initial: Unknown token type $token->{type}";
717     }
718     } # B
719     } # _tree_in_element
720    
721     sub _tree_after_root_element ($) {
722     my $self = shift;
723    
724     B: while (1) {
725     if ($token->{type} == START_TAG_TOKEN) {
726     !!!parse-error (type => 'second root element',
727     token => $token);
728    
729     ## XML5: Ignore the token.
730 wakaba 1.4
731 wakaba 1.5 $self->{insertion_mode} = BEFORE_ROOT_ELEMENT_IM;
732     ## Reprocess.
733 wakaba 1.1 return;
734     } elsif ($token->{type} == COMMENT_TOKEN) {
735     my $comment = $self->{document}->create_comment ($token->{data});
736     $self->{document}->append_child ($comment);
737    
738     ## Stay in the mode.
739     !!!next-token;
740     next B;
741     } elsif ($token->{type} == PI_TOKEN) {
742     my $pi = $self->{document}->create_processing_instruction
743     ($token->{target}, $token->{data});
744     $self->{document}->append_child ($pi);
745    
746     ## Stay in the mode.
747     !!!next-token;
748     next B;
749     } elsif ($token->{type} == CHARACTER_TOKEN) {
750     if (not $self->{tainted} and
751 wakaba 1.9 not $token->{has_reference} and
752 wakaba 1.1 $token->{data} =~ s/^([\x09\x0A\x0C\x20]+)//) {
753     #
754     }
755    
756     if (length $token->{data}) {
757     ## XML5: Ignore the token.
758    
759     unless ($self->{tainted}) {
760     !!!parse-error (type => 'text outside of root element',
761     token => $token);
762     $self->{tainted} = 1;
763     }
764    
765     $self->{document}->manakai_append_text ($token->{data});
766     }
767    
768     ## Stay in the mode.
769     !!!next-token;
770     next B;
771     } elsif ($token->{type} == END_OF_FILE_TOKEN) {
772     ## Stop parsing.
773    
774     ## TODO: implement "stop parsing".
775    
776     $token = {type => ABORT_TOKEN};
777     return;
778     } elsif ($token->{type} == END_TAG_TOKEN) {
779     !!!parse-error (type => 'unmatched end tag',
780     text => $token->{tag_name},
781     token => $token);
782     ## Ignore the token.
783    
784     ## Stay in the mode.
785     !!!next-token;
786     next B;
787     } elsif ($token->{type} == DOCTYPE_TOKEN) {
788     !!!parse-error (type => 'in html:#doctype',
789     token => $token);
790     ## Ignore the token.
791    
792     ## Stay in the mode.
793     !!!next-token;
794     next B;
795     } elsif ($token->{type} == ABORT_TOKEN) {
796     return;
797     } else {
798     die "$0: XML parser initial: Unknown token type $token->{type}";
799     }
800     } # B
801     } # _tree_after_root_element
802 wakaba 1.14
803     sub _tree_in_subset ($) {
804     my $self = shift;
805    
806     B: while (1) {
807     if ($token->{type} == COMMENT_TOKEN) {
808     ## Ignore the token.
809    
810     ## Stay in the state.
811     !!!next-token;
812     next B;
813 wakaba 1.15 } elsif ($token->{type} == ELEMENT_TOKEN) {
814     unless ($self->{doctype}->get_element_type_definition_node
815     ($token->{name})) {
816     my $node = $self->{document}->create_element_type_definition
817     ($token->{name});
818     $node->set_user_data (manakai_source_line => $token->{line});
819     $node->set_user_data (manakai_source_column => $token->{column});
820    
821     ## TODO: ...
822    
823     $self->{doctype}->set_element_type_definition_node ($node);
824     } else {
825     ## TODO: ...
826    
827     }
828    
829     ## Stay in the mode.
830     !!!next-token;
831     next B;
832     } elsif ($token->{type} == ATTLIST_TOKEN) {
833     my $ed = $self->{doctype}->get_element_type_definition_node
834     ($token->{name});
835     unless ($ed) {
836     $ed = $self->{document}->create_element_type_definition
837     ($token->{name});
838     $ed->set_user_data (manakai_source_line => $token->{line});
839     $ed->set_user_data (manakai_source_column => $token->{column});
840     $self->{doctype}->set_element_type_definition_node ($ed);
841     }
842    
843     =pod
844    
845     unless ($ed->get_attribute_definition_node ($token->{name})) {
846     my $node = $self->{document}->create_attribute_definition
847     ($token->{name});
848     $node->set_user_data (manakai_source_line => $token->{line});
849     $node->set_user_data (manakai_source_column => $token->{column});
850    
851     ## TODO: ...
852    
853     $ed->set_attribute_definition_node ($node);
854     } else {
855     ## TODO: ...
856    
857     }
858    
859     =cut
860    
861     ## Stay in the mode.
862     !!!next-token;
863     next B;
864     } elsif ($token->{type} == GENERAL_ENTITY_TOKEN) {
865     ## TODO: Creates a node only if the token is an external entity.
866    
867     unless ($self->{doctype}->get_general_entity_node
868     ($token->{name})) {
869     my $node = $self->{document}->create_general_entity ($token->{name});
870     $node->set_user_data (manakai_source_line => $token->{line});
871     $node->set_user_data (manakai_source_column => $token->{column});
872    
873     ## TODO: ...
874    
875     $self->{doctype}->set_general_entity_node ($node);
876     } else {
877     ## TODO: ...
878    
879     }
880    
881     ## Stay in the mode.
882     !!!next-token;
883     next B;
884     } elsif ($token->{type} == PARAMETER_ENTITY_TOKEN) {
885     ## TODO: ...
886    
887     ## Stay in the mode.
888     !!!next-token;
889     next B;
890     } elsif ($token->{type} == NOTATION_TOKEN) {
891     unless ($self->{doctype}->get_notation_node
892     ($token->{name})) {
893     my $node = $self->{document}->create_notation ($token->{name});
894     $node->set_user_data (manakai_source_line => $token->{line});
895     $node->set_user_data (manakai_source_column => $token->{column});
896    
897     ## TODO: ...
898    
899     $self->{doctype}->set_notation_node ($node);
900     } else {
901     ## TODO: ...
902    
903     }
904    
905     ## Stay in the mode.
906     !!!next-token;
907     next B;
908 wakaba 1.14 } elsif ($token->{type} == PI_TOKEN) {
909     my $pi = $self->{document}->create_processing_instruction
910     ($token->{target}, $token->{data});
911     $self->{doctype}->append_child ($pi);
912     ## TODO: line/col
913    
914     ## Stay in the mode.
915     !!!next-token;
916     next B;
917     } elsif ($token->{type} == END_OF_DOCTYPE_TOKEN) {
918     $self->{insertion_mode} = BEFORE_ROOT_ELEMENT_IM;
919     !!!next-token;
920     return;
921     } elsif ($token->{type} == ABORT_TOKEN) {
922     return;
923     } else {
924     die "$0: XML parser subset im: Unknown token type $token->{type}";
925     }
926     } # B
927    
928     } # _tree_in_subset
929 wakaba 1.1
930     }
931    
932     1;

admin@suikawiki.org
ViewVC Help
Powered by ViewVC 1.1.24