/[suikacvs]/markup/html/whatpm/Whatpm/XML/Parser.pm
Suika

Contents of /markup/html/whatpm/Whatpm/XML/Parser.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.15 - (show annotations) (download)
Fri Oct 17 07:14:29 2008 UTC (17 years, 8 months ago) by wakaba
Branch: MAIN
Changes since 1.14: +95 -0 lines
++ whatpm/t/ChangeLog	17 Oct 2008 07:14:01 -0000
2008-10-17  Wakaba  <wakaba@suika.fam.cx>

	* XML-Parser.t: "xml/attlists-1.dat" added.

++ whatpm/t/xml/ChangeLog	17 Oct 2008 07:14:24 -0000
2008-10-17  Wakaba  <wakaba@suika.fam.cx>

	* attlists-1.dat: New test data file.

	* doctypes-2.dat: New tests added.

++ whatpm/Whatpm/ChangeLog	17 Oct 2008 07:11:25 -0000
2008-10-17  Wakaba  <wakaba@suika.fam.cx>

	* NanoDOM.pm (node_name): New attribute.
	(ELEMENT_TYPE_DEFINITION_NODE, ATTRIBUTE_DEFINITION_NODE): New
	constants.
	(create_element_type_definition_node, create_attribute_definition,
	create_notation, create_general_entity,
	get_element_type_definition_node,
	set_element_type_definition_node, get_general_entity_node,
	set_general_entity_node, get_notation_node, set_notation_node,
	get_attribute_definition_node, set_attribute_definition_node): New
	methods.
	(element_types, entities, notations, attribute_definitions): New
	attributes.
	(DocumentType): Support for child nodes, entities, notations, and
	element types.
	(Entity, Notation, ElementTypeDefinition, AttributeDefinition):
	New classes.

	* Dumper.pm: Support for general entities, notations, element type
	definitions, and attribute definitions.

++ whatpm/Whatpm/HTML/ChangeLog	17 Oct 2008 07:12:26 -0000
2008-10-17  Wakaba  <wakaba@suika.fam.cx>

	* Tokenizer.pm.src: New token types AtTLIST_TOKEN, ELEMENT_TOKEN,
	GENERAL_ENTITY_TOKEN, PARAMETER_ENTITY_TOKEN, and NOTATION_TOKEN
	are added.  New intertion modes for markup declarations are added.

++ whatpm/Whatpm/XML/ChangeLog	17 Oct 2008 07:13:47 -0000
2008-10-17  Wakaba  <wakaba@suika.fam.cx>

	* Parser.pm.src (_tree_in_subset): Support for ELEMENT_TOKEN,
	ATTLIST_TOKEN, GENERAL_ENTITY_TOKEN, PARAMETER_ENTITY_TOKEN, and
	NOTATION_TOKEN.

1 package Whatpm::XML::Parser;
2 use strict;
3
4 push our @ISA, 'Whatpm::HTML';
5 use Whatpm::HTML::Tokenizer qw/:token/;
6
7 sub parse_char_string ($$$;$$) {
8 #my ($self, $s, $doc, $onerror, $get_wrapper) = @_;
9 my $self = shift;
10 my $s = ref $_[0] ? $_[0] : \($_[0]);
11 require Whatpm::Charset::DecodeHandle;
12 my $input = Whatpm::Charset::DecodeHandle::CharString->new ($s);
13 return $self->parse_char_stream ($input, @_[1..$#_]);
14 } # parse_char_string
15
16 sub parse_char_stream ($$$;$$) {
17 my $self = ref $_[0] ? shift : shift->new;
18 my $input = $_[0];
19 $self->{document} = $_[1];
20 @{$self->{document}->child_nodes} = ();
21
22 ## NOTE: |set_inner_html| copies most of this method's code
23
24 $self->{confident} = 1 unless exists $self->{confident};
25 $self->{document}->input_encoding ($self->{input_encoding})
26 if defined $self->{input_encoding};
27 ## TODO: |{input_encoding}| is needless?
28
29 $self->{line_prev} = $self->{line} = 1;
30 $self->{column_prev} = -1;
31 $self->{column} = 0;
32 $self->{set_nc} = sub {
33 my $self = shift;
34
35 my $char = '';
36 if (defined $self->{next_nc}) {
37 $char = $self->{next_nc};
38 delete $self->{next_nc};
39 $self->{nc} = ord $char;
40 } else {
41 $self->{char_buffer} = '';
42 $self->{char_buffer_pos} = 0;
43
44 my $count = $input->manakai_read_until
45 ($self->{char_buffer}, qr/[^\x00\x0A\x0D]/, $self->{char_buffer_pos});
46 if ($count) {
47 $self->{line_prev} = $self->{line};
48 $self->{column_prev} = $self->{column};
49 $self->{column}++;
50 $self->{nc}
51 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
52 return;
53 }
54
55 if ($input->read ($char, 1)) {
56 $self->{nc} = ord $char;
57 } else {
58 $self->{nc} = -1;
59 return;
60 }
61 }
62
63 ($self->{line_prev}, $self->{column_prev})
64 = ($self->{line}, $self->{column});
65 $self->{column}++;
66
67 if ($self->{nc} == 0x000A) { # LF
68
69 $self->{line}++;
70 $self->{column} = 0;
71 } elsif ($self->{nc} == 0x000D) { # CR
72
73 ## TODO: support for abort/streaming
74 my $next = '';
75 if ($input->read ($next, 1) and $next ne "\x0A") {
76 $self->{next_nc} = $next;
77 }
78 $self->{nc} = 0x000A; # LF # MUST
79 $self->{line}++;
80 $self->{column} = 0;
81 } elsif ($self->{nc} == 0x0000) { # NULL
82
83 $self->{parse_error}->(level => $self->{level}->{must}, type => 'NULL');
84 $self->{nc} = 0xFFFD; # REPLACEMENT CHARACTER # MUST
85 }
86 };
87
88 $self->{read_until} = sub {
89 #my ($scalar, $specials_range, $offset) = @_;
90 return 0 if defined $self->{next_nc};
91
92 my $pattern = qr/[^$_[1]\x00\x0A\x0D]/;
93 my $offset = $_[2] || 0;
94
95 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
96 pos ($self->{char_buffer}) = $self->{char_buffer_pos};
97 if ($self->{char_buffer} =~ /\G(?>$pattern)+/) {
98 substr ($_[0], $offset)
99 = substr ($self->{char_buffer}, $-[0], $+[0] - $-[0]);
100 my $count = $+[0] - $-[0];
101 if ($count) {
102 $self->{column} += $count;
103 $self->{char_buffer_pos} += $count;
104 $self->{line_prev} = $self->{line};
105 $self->{column_prev} = $self->{column} - 1;
106 $self->{nc} = -1;
107 }
108 return $count;
109 } else {
110 return 0;
111 }
112 } else {
113 my $count = $input->manakai_read_until ($_[0], $pattern, $_[2]);
114 if ($count) {
115 $self->{column} += $count;
116 $self->{line_prev} = $self->{line};
117 $self->{column_prev} = $self->{column} - 1;
118 $self->{nc} = -1;
119 }
120 return $count;
121 }
122 }; # $self->{read_until}
123
124 my $onerror = $_[2] || sub {
125 my (%opt) = @_;
126 my $line = $opt{token} ? $opt{token}->{line} : $opt{line};
127 my $column = $opt{token} ? $opt{token}->{column} : $opt{column};
128 warn "Parse error ($opt{type}) at line $line column $column\n";
129 };
130 $self->{parse_error} = sub {
131 $onerror->(line => $self->{line}, column => $self->{column}, @_);
132 };
133
134 my $char_onerror = sub {
135 my (undef, $type, %opt) = @_;
136 $self->{parse_error}->(level => $self->{level}->{must}, layer => 'encode',
137 line => $self->{line}, column => $self->{column} + 1,
138 %opt, type => $type);
139 }; # $char_onerror
140
141 if ($_[3]) {
142 $input = $_[3]->($input);
143 $input->onerror ($char_onerror);
144 } else {
145 $input->onerror ($char_onerror) unless defined $input->onerror;
146 }
147
148 $self->_initialize_tokenizer;
149 $self->_initialize_tree_constructor;
150 $self->_construct_tree;
151 $self->_terminate_tree_constructor;
152
153 delete $self->{parse_error}; # remove loop
154
155 return $self->{document};
156 } # parse_char_stream
157
158 sub new ($) {
159 my $class = shift;
160 my $self = bless {
161 level => {must => 'm',
162 should => 's',
163 warn => 'w',
164 info => 'i',
165 uncertain => 'u'},
166 }, $class;
167 $self->{set_nc} = sub {
168 $self->{nc} = -1;
169 };
170 $self->{parse_error} = sub {
171 #
172 };
173 $self->{change_encoding} = sub {
174 # if ($_[0] is a supported encoding) {
175 # run "change the encoding" algorithm;
176 # throw Whatpm::HTML::RestartParser (charset => $new_encoding);
177 # }
178 };
179 $self->{application_cache_selection} = sub {
180 #
181 };
182
183 $self->{is_xml} = 1;
184
185 return $self;
186 } # new
187
188 sub _initialize_tree_constructor ($) {
189 my $self = shift;
190 ## NOTE: $self->{document} MUST be specified before this method is called
191 $self->{document}->strict_error_checking (0);
192 ## TODO: Turn mutation events off # MUST
193 $self->{document}->dom_config
194 ->{'http://suika.fam.cx/www/2006/dom-config/strict-document-children'}
195 = 0;
196 $self->{document}->manakai_is_html (0);
197 $self->{document}->set_user_data (manakai_source_line => 1);
198 $self->{document}->set_user_data (manakai_source_column => 1);
199 } # _initialize_tree_constructor
200
201 sub _terminate_tree_constructor ($) {
202 my $self = shift;
203 $self->{document}->strict_error_checking (1);
204 $self->{document}->dom_config
205 ->{'http://suika.fam.cx/www/2006/dom-config/strict-document-children'}
206 = 1;
207 ## TODO: Turn mutation events on
208 } # _terminate_tree_constructor
209
210 ## Tree construction stage
211
212
213 ## NOTE: Differences from the XML5 draft are marked as "XML5:".
214
215 ## XML5: No namespace support.
216
217 ## XML5: Start, main, end phases. In this implementation, they are
218 ## represented by insertion modes.
219
220 ## Insertion modes
221 sub INITIAL_IM () { 0 }
222 sub BEFORE_ROOT_ELEMENT_IM () { 1 }
223 sub IN_ELEMENT_IM () { 2 }
224 sub AFTER_ROOT_ELEMENT_IM () { 3 }
225 sub IN_SUBSET_IM () { 4 }
226
227 {
228 my $token; ## TODO: change to $self->{t}
229
230 sub _construct_tree ($) {
231 my ($self) = @_;
232
233 delete $self->{tainted};
234 $self->{open_elements} = [];
235 $self->{insertion_mode} = INITIAL_IM;
236
237 $token = $self->_get_next_token;
238
239 ## XML5: No support for the XML declaration
240 if ($token->{type} == PI_TOKEN and
241 $token->{target} eq 'xml' and
242 $token->{data} =~ /\Aversion[\x09\x0A\x20]*=[\x09\x0A\x20]*
243 (?>"([^"]*)"|'([^']*)')
244 (?:[\x09\x0A\x20]+
245 encoding[\x09\x0A\x20]*=[\x09\x0A\x20]*
246 (?>"([^"]*)"|'([^']*)')[\x09\x0A\x20]*)?
247 (?:[\x09\x0A\x20]+
248 standalone[\x09\x0A\x20]*=[\x09\x0A\x20]*
249 (?>"(yes|no)"|'(yes|no)'))?
250 [\x09\x0A\x20]*\z/x) {
251 $self->{document}->xml_version (defined $1 ? $1 : $2);
252 $self->{document}->xml_encoding (defined $3 ? $3 : $4); # possibly undef
253 $self->{document}->xml_standalone (($5 || $6 || 'no') ne 'no');
254
255 $token = $self->_get_next_token;
256 } else {
257 $self->{document}->xml_version ('1.0');
258 $self->{document}->xml_encoding (undef);
259 $self->{document}->xml_standalone (0);
260 }
261
262 while (1) {
263 if ($self->{insertion_mode} == IN_ELEMENT_IM) {
264 $self->_tree_in_element;
265 } elsif ($self->{insertion_mode} == IN_SUBSET_IM) {
266 $self->_tree_in_subset;
267 } elsif ($self->{insertion_mode} == AFTER_ROOT_ELEMENT_IM) {
268 $self->_tree_after_root_element;
269 } elsif ($self->{insertion_mode} == BEFORE_ROOT_ELEMENT_IM) {
270 $self->_tree_before_root_element;
271 } elsif ($self->{insertion_mode} == INITIAL_IM) {
272 $self->_tree_initial;
273 } else {
274 die "$0: Unknown XML insertion mode: $self->{insertion_mode}";
275 }
276
277 last if $token->{type} == ABORT_TOKEN;
278 }
279 } # _construct_tree
280
281 sub _tree_initial ($) {
282 my $self = shift;
283
284 B: while (1) {
285 if ($token->{type} == DOCTYPE_TOKEN) {
286 ## XML5: No "DOCTYPE" token.
287
288 my $doctype = $self->{document}->create_document_type_definition
289 (defined $token->{name} ? $token->{name} : '');
290
291 ## NOTE: Default value for both |public_id| and |system_id| attributes
292 ## are empty strings, so that we don't set any value in missing cases.
293 $doctype->public_id ($token->{pubid}) if defined $token->{pubid};
294 $doctype->system_id ($token->{sysid}) if defined $token->{sysid};
295
296 ## TODO: internal_subset
297
298 $self->{document}->append_child ($doctype);
299
300 ## XML5: No "has internal subset" flag.
301 if ($token->{has_internal_subset}) {
302 $self->{doctype} = $doctype;
303 $self->{insertion_mode} = IN_SUBSET_IM;
304 } else {
305 $self->{insertion_mode} = BEFORE_ROOT_ELEMENT_IM;
306 }
307 $token = $self->_get_next_token;
308 return;
309 } elsif ($token->{type} == START_TAG_TOKEN or
310 $token->{type} == END_OF_FILE_TOKEN) {
311 $self->{insertion_mode} = BEFORE_ROOT_ELEMENT_IM;
312 ## Reprocess.
313 return;
314 } elsif ($token->{type} == COMMENT_TOKEN) {
315 my $comment = $self->{document}->create_comment ($token->{data});
316 $self->{document}->append_child ($comment);
317
318 ## Stay in the mode.
319 $token = $self->_get_next_token;
320 next B;
321 } elsif ($token->{type} == PI_TOKEN) {
322 my $pi = $self->{document}->create_processing_instruction
323 ($token->{target}, $token->{data});
324 $self->{document}->append_child ($pi);
325
326 ## Stay in the mode.
327 $token = $self->_get_next_token;
328 next B;
329 } elsif ($token->{type} == CHARACTER_TOKEN) {
330 if (not $self->{tainted} and
331 not $token->{has_reference} and
332 $token->{data} =~ s/^([\x09\x0A\x0C\x20]+)//) {
333 #
334 }
335
336 if (length $token->{data}) {
337 ## XML5: Ignore the token.
338
339 unless ($self->{tainted}) {
340 $self->{parse_error}->(level => $self->{level}->{must}, type => 'text outside of root element',
341 token => $token);
342 $self->{tainted} = 1;
343 }
344
345 $self->{document}->manakai_append_text ($token->{data});
346 }
347
348 ## Stay in the mode.
349 $token = $self->_get_next_token;
350 next B;
351 } elsif ($token->{type} == END_TAG_TOKEN) {
352 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unmatched end tag',
353 text => $token->{tag_name},
354 token => $token);
355 ## Ignore the token.
356
357 ## Stay in the mode.
358 $token = $self->_get_next_token;
359 next B;
360 } elsif ($token->{type} == ABORT_TOKEN) {
361 return;
362 } else {
363 die "$0: XML parser initial: Unknown token type $token->{type}";
364 }
365 } # B
366 } # _tree_initial
367
368 sub _tree_before_root_element ($) {
369 my $self = shift;
370
371 B: while (1) {
372 if ($token->{type} == START_TAG_TOKEN) {
373 my $nsmap = {
374 xml => q<http://www.w3.org/XML/1998/namespace>,
375 xmlns => q<http://www.w3.org/2000/xmlns/>,
376 };
377
378 for (keys %{$token->{attributes}}) {
379 if (/^xmlns:./s) {
380 my $prefix = substr $_, 6;
381 my $value = $token->{attributes}->{$_}->{value};
382 if ($prefix eq 'xml' or $prefix eq 'xmlns' or
383 $value eq q<http://www.w3.org/XML/1998/namespace> or
384 $value eq q<http://www.w3.org/2000/xmlns/>) {
385 ## NOTE: Error should be detected at the DOM layer.
386 #
387 } elsif (length $value) {
388 $nsmap->{$prefix} = $value;
389 } else {
390 delete $nsmap->{$prefix};
391 }
392 } elsif ($_ eq 'xmlns') {
393 my $value = $token->{attributes}->{$_}->{value};
394 if ($value eq q<http://www.w3.org/XML/1998/namespace> or
395 $value eq q<http://www.w3.org/2000/xmlns/>) {
396 ## NOTE: Error should be detected at the DOM layer.
397 #
398 } elsif (length $value) {
399 $nsmap->{''} = $value;
400 } else {
401 delete $nsmap->{''};
402 }
403 }
404 }
405
406 my $ns;
407 my ($prefix, $ln) = split /:/, $token->{tag_name}, 2;
408
409 if (defined $ln and $prefix ne '' and $ln ne '') { # prefixed
410 if (defined $nsmap->{$prefix}) {
411 $ns = $nsmap->{$prefix};
412 } else {
413 ($prefix, $ln) = (undef, $token->{tag_name});
414 }
415 } else {
416 $ns = $nsmap->{''} if $prefix ne '' and not defined $ln;
417 ($prefix, $ln) = (undef, $token->{tag_name});
418 }
419
420 my $el = $self->{document}->create_element_ns ($ns, [$prefix, $ln]);
421 $el->set_user_data (manakai_source_line => $token->{line});
422 $el->set_user_data (manakai_source_column => $token->{column});
423
424 my $has_attr;
425 for my $attr_name (sort {$token->{attributes}->{$a}->{index} <=>
426 $token->{attributes}->{$b}->{index}}
427 keys %{$token->{attributes}}) {
428 my $ns;
429 my ($p, $l) = split /:/, $attr_name, 2;
430
431 if ($attr_name eq 'xmlns:xmlns') {
432 ($p, $l) = (undef, $attr_name);
433 } elsif (defined $l and $p ne '' and $l ne '') { # prefixed
434 if (defined $nsmap->{$p}) {
435 $ns = $nsmap->{$p};
436 } else {
437 ## NOTE: Error should be detected at the DOM-layer.
438 ($p, $l) = (undef, $attr_name);
439 }
440 } else {
441 if ($attr_name eq 'xmlns') {
442 $ns = $nsmap->{xmlns};
443 }
444 ($p, $l) = (undef, $attr_name);
445 }
446
447 if ($has_attr->{defined $ns ? $ns : ''}->{$l}) {
448 $ns = undef;
449 ($p, $l) = (undef, $attr_name);
450 } else {
451 $has_attr->{defined $ns ? $ns : ''}->{$l} = 1;
452 }
453
454 my $attr_t = $token->{attributes}->{$attr_name};
455 my $attr = $self->{document}->create_attribute_ns ($ns, [$p, $l]);
456 $attr->value ($attr_t->{value});
457 $attr->set_user_data (manakai_source_line => $attr_t->{line});
458 $attr->set_user_data (manakai_source_column => $attr_t->{column});
459 $el->set_attribute_node_ns ($attr);
460 }
461
462 $self->{document}->append_child ($el);
463
464 if ($self->{self_closing}) {
465 delete $self->{self_closing};
466 $self->{insertion_mode} = AFTER_ROOT_ELEMENT_IM;
467 } else {
468 push @{$self->{open_elements}}, [$el, $token->{tag_name}, $nsmap];
469 $self->{insertion_mode} = IN_ELEMENT_IM;
470 }
471
472 #delete $self->{tainted};
473
474 $token = $self->_get_next_token;
475 return;
476 } elsif ($token->{type} == COMMENT_TOKEN) {
477 my $comment = $self->{document}->create_comment ($token->{data});
478 $self->{document}->append_child ($comment);
479
480 ## Stay in the mode.
481 $token = $self->_get_next_token;
482 next B;
483 } elsif ($token->{type} == PI_TOKEN) {
484 my $pi = $self->{document}->create_processing_instruction
485 ($token->{target}, $token->{data});
486 $self->{document}->append_child ($pi);
487
488 ## Stay in the mode.
489 $token = $self->_get_next_token;
490 next B;
491 } elsif ($token->{type} == CHARACTER_TOKEN) {
492 if (not $self->{tainted} and
493 not $token->{has_reference} and
494 $token->{data} =~ s/^([\x09\x0A\x0C\x20]+)//) {
495 #
496 }
497
498 if (length $token->{data}) {
499 ## XML5: Ignore the token.
500
501 unless ($self->{tainted}) {
502 $self->{parse_error}->(level => $self->{level}->{must}, type => 'text outside of root element',
503 token => $token);
504 $self->{tainted} = 1;
505 }
506
507 $self->{document}->manakai_append_text ($token->{data});
508 }
509
510 ## Stay in the mode.
511 $token = $self->_get_next_token;
512 next B;
513 } elsif ($token->{type} == END_OF_FILE_TOKEN) {
514 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no root element',
515 token => $token);
516
517 $self->{insertion_mode} = AFTER_ROOT_ELEMENT_IM;
518 ## Reprocess.
519 return;
520 } elsif ($token->{type} == END_TAG_TOKEN) {
521 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unmatched end tag',
522 text => $token->{tag_name},
523 token => $token);
524 ## Ignore the token.
525
526 ## Stay in the mode.
527 $token = $self->_get_next_token;
528 next B;
529 } elsif ($token->{type} == DOCTYPE_TOKEN) {
530 $self->{parse_error}->(level => $self->{level}->{must}, type => 'in html:#doctype',
531 token => $token);
532 ## Ignore the token.
533
534 ## Stay in the mode.
535 $token = $self->_get_next_token;
536 next B;
537 } elsif ($token->{type} == ABORT_TOKEN) {
538 return;
539 } else {
540 die "$0: XML parser initial: Unknown token type $token->{type}";
541 }
542 } # B
543 } # _tree_before_root_element
544
545 sub _tree_in_element ($) {
546 my $self = shift;
547
548 B: while (1) {
549 if ($token->{type} == CHARACTER_TOKEN) {
550 $self->{open_elements}->[-1]->[0]->manakai_append_text ($token->{data});
551
552 ## Stay in the mode.
553 $token = $self->_get_next_token;
554 next B;
555 } elsif ($token->{type} == START_TAG_TOKEN) {
556 my $nsmap = {%{$self->{open_elements}->[-1]->[2]}};
557
558 for (keys %{$token->{attributes}}) {
559 if (/^xmlns:./s) {
560 my $prefix = substr $_, 6;
561 my $value = $token->{attributes}->{$_}->{value};
562 if ($prefix eq 'xml' or $prefix eq 'xmlns' or
563 $value eq q<http://www.w3.org/XML/1998/namespace> or
564 $value eq q<http://www.w3.org/2000/xmlns/>) {
565 ## NOTE: Error should be detected at the DOM layer.
566 #
567 } elsif (length $value) {
568 $nsmap->{$prefix} = $value;
569 } else {
570 delete $nsmap->{$prefix};
571 }
572 } elsif ($_ eq 'xmlns') {
573 my $value = $token->{attributes}->{$_}->{value};
574 if ($value eq q<http://www.w3.org/XML/1998/namespace> or
575 $value eq q<http://www.w3.org/2000/xmlns/>) {
576 ## NOTE: Error should be detected at the DOM layer.
577 #
578 } elsif (length $value) {
579 $nsmap->{''} = $value;
580 } else {
581 delete $nsmap->{''};
582 }
583 }
584 }
585
586 my $ns;
587 my ($prefix, $ln) = split /:/, $token->{tag_name}, 2;
588
589 if (defined $ln and $prefix ne '' and $ln ne '') { # prefixed
590 if (defined $nsmap->{$prefix}) {
591 $ns = $nsmap->{$prefix};
592 } else {
593 ## NOTE: Error should be detected at the DOM layer.
594 ($prefix, $ln) = (undef, $token->{tag_name});
595 }
596 } else {
597 $ns = $nsmap->{''} if $prefix ne '' and not defined $ln;
598 ($prefix, $ln) = (undef, $token->{tag_name});
599 }
600
601 my $el = $self->{document}->create_element_ns ($ns, [$prefix, $ln]);
602 $el->set_user_data (manakai_source_line => $token->{line});
603 $el->set_user_data (manakai_source_column => $token->{column});
604
605 my $has_attr;
606 for my $attr_name (sort {$token->{attributes}->{$a}->{index} <=>
607 $token->{attributes}->{$b}->{index}}
608 keys %{$token->{attributes}}) {
609 my $ns;
610 my ($p, $l) = split /:/, $attr_name, 2;
611
612 if ($attr_name eq 'xmlns:xmlns') {
613 ($p, $l) = (undef, $attr_name);
614 } elsif (defined $l and $p ne '' and $l ne '') { # prefixed
615 if (defined $nsmap->{$p}) {
616 $ns = $nsmap->{$p};
617 } else {
618 ## NOTE: Error should be detected at the DOM-layer.
619 ($p, $l) = (undef, $attr_name);
620 }
621 } else {
622 if ($attr_name eq 'xmlns') {
623 $ns = $nsmap->{xmlns};
624 }
625 ($p, $l) = (undef, $attr_name);
626 }
627
628 if ($has_attr->{defined $ns ? $ns : ''}->{$l}) {
629 $ns = undef;
630 ($p, $l) = (undef, $attr_name);
631 } else {
632 $has_attr->{defined $ns ? $ns : ''}->{$l} = 1;
633 }
634
635 my $attr_t = $token->{attributes}->{$attr_name};
636 my $attr = $self->{document}->create_attribute_ns ($ns, [$p, $l]);
637 $attr->value ($attr_t->{value});
638 $attr->set_user_data (manakai_source_line => $attr_t->{line});
639 $attr->set_user_data (manakai_source_column => $attr_t->{column});
640 $el->set_attribute_node_ns ($attr);
641 }
642
643 $self->{open_elements}->[-1]->[0]->append_child ($el);
644
645 if ($self->{self_closing}) {
646 delete $self->{self_closing};
647 } else {
648 push @{$self->{open_elements}}, [$el, $token->{tag_name}, $nsmap];
649 }
650
651 ## Stay in the mode.
652 $token = $self->_get_next_token;
653 next B;
654 } elsif ($token->{type} == END_TAG_TOKEN) {
655 if ($token->{tag_name} eq '') {
656 ## Short end tag token.
657 pop @{$self->{open_elements}};
658 } elsif ($self->{open_elements}->[-1]->[1] eq $token->{tag_name}) {
659 pop @{$self->{open_elements}};
660 } else {
661 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unmatched end tag',
662 text => $token->{tag_name},
663 token => $token);
664
665 ## Has an element in scope
666 INSCOPE: for my $i (reverse 0..$#{$self->{open_elements}}) {
667 if ($self->{open_elements}->[$i]->[1] eq $token->{tag_name}) {
668 splice @{$self->{open_elements}}, $i;
669 last INSCOPE;
670 }
671 } # INSCOPE
672 }
673
674 unless (@{$self->{open_elements}}) {
675 $self->{insertion_mode} = AFTER_ROOT_ELEMENT_IM;
676 $token = $self->_get_next_token;
677 return;
678 } else {
679 ## Stay in the state.
680 $token = $self->_get_next_token;
681 redo B;
682 }
683 } elsif ($token->{type} == COMMENT_TOKEN) {
684 my $comment = $self->{document}->create_comment ($token->{data});
685 $self->{open_elements}->[-1]->[0]->append_child ($comment);
686
687 ## Stay in the mode.
688 $token = $self->_get_next_token;
689 next B;
690 } elsif ($token->{type} == PI_TOKEN) {
691 my $pi = $self->{document}->create_processing_instruction
692 ($token->{target}, $token->{data});
693 $self->{open_elements}->[-1]->[0]->append_child ($pi);
694
695 ## Stay in the mode.
696 $token = $self->_get_next_token;
697 next B;
698 } elsif ($token->{type} == END_OF_FILE_TOKEN) {
699 $self->{parse_error}->(level => $self->{level}->{must}, type => 'in body:#eof',
700 token => $token);
701
702 $self->{insertion_mode} = AFTER_ROOT_ELEMENT_IM;
703 $token = $self->_get_next_token;
704 return;
705 } elsif ($token->{type} == DOCTYPE_TOKEN) {
706 $self->{parse_error}->(level => $self->{level}->{must}, type => 'in html:#doctype',
707 token => $token);
708 ## Ignore the token.
709
710 ## Stay in the mode.
711 $token = $self->_get_next_token;
712 next B;
713 } elsif ($token->{type} == ABORT_TOKEN) {
714 return;
715 } else {
716 die "$0: XML parser initial: Unknown token type $token->{type}";
717 }
718 } # B
719 } # _tree_in_element
720
721 sub _tree_after_root_element ($) {
722 my $self = shift;
723
724 B: while (1) {
725 if ($token->{type} == START_TAG_TOKEN) {
726 $self->{parse_error}->(level => $self->{level}->{must}, type => 'second root element',
727 token => $token);
728
729 ## XML5: Ignore the token.
730
731 $self->{insertion_mode} = BEFORE_ROOT_ELEMENT_IM;
732 ## Reprocess.
733 return;
734 } elsif ($token->{type} == COMMENT_TOKEN) {
735 my $comment = $self->{document}->create_comment ($token->{data});
736 $self->{document}->append_child ($comment);
737
738 ## Stay in the mode.
739 $token = $self->_get_next_token;
740 next B;
741 } elsif ($token->{type} == PI_TOKEN) {
742 my $pi = $self->{document}->create_processing_instruction
743 ($token->{target}, $token->{data});
744 $self->{document}->append_child ($pi);
745
746 ## Stay in the mode.
747 $token = $self->_get_next_token;
748 next B;
749 } elsif ($token->{type} == CHARACTER_TOKEN) {
750 if (not $self->{tainted} and
751 not $token->{has_reference} and
752 $token->{data} =~ s/^([\x09\x0A\x0C\x20]+)//) {
753 #
754 }
755
756 if (length $token->{data}) {
757 ## XML5: Ignore the token.
758
759 unless ($self->{tainted}) {
760 $self->{parse_error}->(level => $self->{level}->{must}, type => 'text outside of root element',
761 token => $token);
762 $self->{tainted} = 1;
763 }
764
765 $self->{document}->manakai_append_text ($token->{data});
766 }
767
768 ## Stay in the mode.
769 $token = $self->_get_next_token;
770 next B;
771 } elsif ($token->{type} == END_OF_FILE_TOKEN) {
772 ## Stop parsing.
773
774 ## TODO: implement "stop parsing".
775
776 $token = {type => ABORT_TOKEN};
777 return;
778 } elsif ($token->{type} == END_TAG_TOKEN) {
779 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unmatched end tag',
780 text => $token->{tag_name},
781 token => $token);
782 ## Ignore the token.
783
784 ## Stay in the mode.
785 $token = $self->_get_next_token;
786 next B;
787 } elsif ($token->{type} == DOCTYPE_TOKEN) {
788 $self->{parse_error}->(level => $self->{level}->{must}, type => 'in html:#doctype',
789 token => $token);
790 ## Ignore the token.
791
792 ## Stay in the mode.
793 $token = $self->_get_next_token;
794 next B;
795 } elsif ($token->{type} == ABORT_TOKEN) {
796 return;
797 } else {
798 die "$0: XML parser initial: Unknown token type $token->{type}";
799 }
800 } # B
801 } # _tree_after_root_element
802
803 sub _tree_in_subset ($) {
804 my $self = shift;
805
806 B: while (1) {
807 if ($token->{type} == COMMENT_TOKEN) {
808 ## Ignore the token.
809
810 ## Stay in the state.
811 $token = $self->_get_next_token;
812 next B;
813 } elsif ($token->{type} == ELEMENT_TOKEN) {
814 unless ($self->{doctype}->get_element_type_definition_node
815 ($token->{name})) {
816 my $node = $self->{document}->create_element_type_definition
817 ($token->{name});
818 $node->set_user_data (manakai_source_line => $token->{line});
819 $node->set_user_data (manakai_source_column => $token->{column});
820
821 ## TODO: ...
822
823 $self->{doctype}->set_element_type_definition_node ($node);
824 } else {
825 ## TODO: ...
826
827 }
828
829 ## Stay in the mode.
830 $token = $self->_get_next_token;
831 next B;
832 } elsif ($token->{type} == ATTLIST_TOKEN) {
833 my $ed = $self->{doctype}->get_element_type_definition_node
834 ($token->{name});
835 unless ($ed) {
836 $ed = $self->{document}->create_element_type_definition
837 ($token->{name});
838 $ed->set_user_data (manakai_source_line => $token->{line});
839 $ed->set_user_data (manakai_source_column => $token->{column});
840 $self->{doctype}->set_element_type_definition_node ($ed);
841 }
842
843 =pod
844
845 unless ($ed->get_attribute_definition_node ($token->{name})) {
846 my $node = $self->{document}->create_attribute_definition
847 ($token->{name});
848 $node->set_user_data (manakai_source_line => $token->{line});
849 $node->set_user_data (manakai_source_column => $token->{column});
850
851 ## TODO: ...
852
853 $ed->set_attribute_definition_node ($node);
854 } else {
855 ## TODO: ...
856
857 }
858
859 =cut
860
861 ## Stay in the mode.
862 $token = $self->_get_next_token;
863 next B;
864 } elsif ($token->{type} == GENERAL_ENTITY_TOKEN) {
865 ## TODO: Creates a node only if the token is an external entity.
866
867 unless ($self->{doctype}->get_general_entity_node
868 ($token->{name})) {
869 my $node = $self->{document}->create_general_entity ($token->{name});
870 $node->set_user_data (manakai_source_line => $token->{line});
871 $node->set_user_data (manakai_source_column => $token->{column});
872
873 ## TODO: ...
874
875 $self->{doctype}->set_general_entity_node ($node);
876 } else {
877 ## TODO: ...
878
879 }
880
881 ## Stay in the mode.
882 $token = $self->_get_next_token;
883 next B;
884 } elsif ($token->{type} == PARAMETER_ENTITY_TOKEN) {
885 ## TODO: ...
886
887 ## Stay in the mode.
888 $token = $self->_get_next_token;
889 next B;
890 } elsif ($token->{type} == NOTATION_TOKEN) {
891 unless ($self->{doctype}->get_notation_node
892 ($token->{name})) {
893 my $node = $self->{document}->create_notation ($token->{name});
894 $node->set_user_data (manakai_source_line => $token->{line});
895 $node->set_user_data (manakai_source_column => $token->{column});
896
897 ## TODO: ...
898
899 $self->{doctype}->set_notation_node ($node);
900 } else {
901 ## TODO: ...
902
903 }
904
905 ## Stay in the mode.
906 $token = $self->_get_next_token;
907 next B;
908 } elsif ($token->{type} == PI_TOKEN) {
909 my $pi = $self->{document}->create_processing_instruction
910 ($token->{target}, $token->{data});
911 $self->{doctype}->append_child ($pi);
912 ## TODO: line/col
913
914 ## Stay in the mode.
915 $token = $self->_get_next_token;
916 next B;
917 } elsif ($token->{type} == END_OF_DOCTYPE_TOKEN) {
918 $self->{insertion_mode} = BEFORE_ROOT_ELEMENT_IM;
919 $token = $self->_get_next_token;
920 return;
921 } elsif ($token->{type} == ABORT_TOKEN) {
922 return;
923 } else {
924 die "$0: XML parser subset im: Unknown token type $token->{type}";
925 }
926 } # B
927
928 } # _tree_in_subset
929
930 }
931
932 1;

admin@suikawiki.org
ViewVC Help
Powered by ViewVC 1.1.24