/[suikacvs]/markup/html/whatpm/Whatpm/XML/Parser.pm
Suika

Contents of /markup/html/whatpm/Whatpm/XML/Parser.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.13 - (show annotations) (download)
Wed Oct 15 12:49:49 2008 UTC (17 years, 8 months ago) by wakaba
Branch: MAIN
Changes since 1.12: +2 -12 lines
++ whatpm/t/ChangeLog	15 Oct 2008 12:49:07 -0000
	* XML-Parser.t: "xml/doctypes-2.dat" added.

	* tokenizer-test-1.test: Keyword case-sensitivility tests added.

2008-10-15  Wakaba  <wakaba@suika.fam.cx>

++ whatpm/t/xml/ChangeLog	15 Oct 2008 12:49:41 -0000
	* doctypes-1.dat: A keyword case-sensitivility test added.

	* doctypes-2.dat: New test data file.

2008-10-15  Wakaba  <wakaba@suika.fam.cx>

++ whatpm/Whatpm/HTML/ChangeLog	15 Oct 2008 12:46:53 -0000
	* Tokenizer.pm.src: $self->{s_kwd} for non-DATA_STATE states are
	renamed as $self->{kwd} to avoid confliction.  Don't raise
	case-sensitivity error for the keyword "DOCTYPE" in HTML mode.
	Support for internal subsets (internal subset itself only; no
	declaration in them is supported yet).  Raise a parse error for
	non-uppercase keywords "PUBLIC" and "SYSTEM" in XML mode.  Raise a
	parse error if no system identifier is specified for a DOCTYPE
	declaration with a public identifier.  Don't close the DOCTYPE
	declaration by a ">" character in the system declaration in XML
	mode.

2008-10-15  Wakaba  <wakaba@suika.fam.cx>

++ whatpm/Whatpm/XML/ChangeLog	15 Oct 2008 12:48:30 -0000
	* Parser.pm.src: Typo fixed.

2008-10-15  Wakaba  <wakaba@suika.fam.cx>

1 package Whatpm::XML::Parser;
2 use strict;
3
4 push our @ISA, 'Whatpm::HTML';
5 use Whatpm::HTML::Tokenizer qw/:token/;
6
7 sub parse_char_string ($$$;$$) {
8 #my ($self, $s, $doc, $onerror, $get_wrapper) = @_;
9 my $self = shift;
10 my $s = ref $_[0] ? $_[0] : \($_[0]);
11 require Whatpm::Charset::DecodeHandle;
12 my $input = Whatpm::Charset::DecodeHandle::CharString->new ($s);
13 return $self->parse_char_stream ($input, @_[1..$#_]);
14 } # parse_char_string
15
16 sub parse_char_stream ($$$;$$) {
17 my $self = ref $_[0] ? shift : shift->new;
18 my $input = $_[0];
19 $self->{document} = $_[1];
20 @{$self->{document}->child_nodes} = ();
21
22 ## NOTE: |set_inner_html| copies most of this method's code
23
24 $self->{confident} = 1 unless exists $self->{confident};
25 $self->{document}->input_encoding ($self->{input_encoding})
26 if defined $self->{input_encoding};
27 ## TODO: |{input_encoding}| is needless?
28
29 $self->{line_prev} = $self->{line} = 1;
30 $self->{column_prev} = -1;
31 $self->{column} = 0;
32 $self->{set_nc} = sub {
33 my $self = shift;
34
35 my $char = '';
36 if (defined $self->{next_nc}) {
37 $char = $self->{next_nc};
38 delete $self->{next_nc};
39 $self->{nc} = ord $char;
40 } else {
41 $self->{char_buffer} = '';
42 $self->{char_buffer_pos} = 0;
43
44 my $count = $input->manakai_read_until
45 ($self->{char_buffer}, qr/[^\x00\x0A\x0D]/, $self->{char_buffer_pos});
46 if ($count) {
47 $self->{line_prev} = $self->{line};
48 $self->{column_prev} = $self->{column};
49 $self->{column}++;
50 $self->{nc}
51 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
52 return;
53 }
54
55 if ($input->read ($char, 1)) {
56 $self->{nc} = ord $char;
57 } else {
58 $self->{nc} = -1;
59 return;
60 }
61 }
62
63 ($self->{line_prev}, $self->{column_prev})
64 = ($self->{line}, $self->{column});
65 $self->{column}++;
66
67 if ($self->{nc} == 0x000A) { # LF
68
69 $self->{line}++;
70 $self->{column} = 0;
71 } elsif ($self->{nc} == 0x000D) { # CR
72
73 ## TODO: support for abort/streaming
74 my $next = '';
75 if ($input->read ($next, 1) and $next ne "\x0A") {
76 $self->{next_nc} = $next;
77 }
78 $self->{nc} = 0x000A; # LF # MUST
79 $self->{line}++;
80 $self->{column} = 0;
81 } elsif ($self->{nc} == 0x0000) { # NULL
82
83 $self->{parse_error}->(level => $self->{level}->{must}, type => 'NULL');
84 $self->{nc} = 0xFFFD; # REPLACEMENT CHARACTER # MUST
85 }
86 };
87
88 $self->{read_until} = sub {
89 #my ($scalar, $specials_range, $offset) = @_;
90 return 0 if defined $self->{next_nc};
91
92 my $pattern = qr/[^$_[1]\x00\x0A\x0D]/;
93 my $offset = $_[2] || 0;
94
95 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
96 pos ($self->{char_buffer}) = $self->{char_buffer_pos};
97 if ($self->{char_buffer} =~ /\G(?>$pattern)+/) {
98 substr ($_[0], $offset)
99 = substr ($self->{char_buffer}, $-[0], $+[0] - $-[0]);
100 my $count = $+[0] - $-[0];
101 if ($count) {
102 $self->{column} += $count;
103 $self->{char_buffer_pos} += $count;
104 $self->{line_prev} = $self->{line};
105 $self->{column_prev} = $self->{column} - 1;
106 $self->{nc} = -1;
107 }
108 return $count;
109 } else {
110 return 0;
111 }
112 } else {
113 my $count = $input->manakai_read_until ($_[0], $pattern, $_[2]);
114 if ($count) {
115 $self->{column} += $count;
116 $self->{line_prev} = $self->{line};
117 $self->{column_prev} = $self->{column} - 1;
118 $self->{nc} = -1;
119 }
120 return $count;
121 }
122 }; # $self->{read_until}
123
124 my $onerror = $_[2] || sub {
125 my (%opt) = @_;
126 my $line = $opt{token} ? $opt{token}->{line} : $opt{line};
127 my $column = $opt{token} ? $opt{token}->{column} : $opt{column};
128 warn "Parse error ($opt{type}) at line $line column $column\n";
129 };
130 $self->{parse_error} = sub {
131 $onerror->(line => $self->{line}, column => $self->{column}, @_);
132 };
133
134 my $char_onerror = sub {
135 my (undef, $type, %opt) = @_;
136 $self->{parse_error}->(level => $self->{level}->{must}, layer => 'encode',
137 line => $self->{line}, column => $self->{column} + 1,
138 %opt, type => $type);
139 }; # $char_onerror
140
141 if ($_[3]) {
142 $input = $_[3]->($input);
143 $input->onerror ($char_onerror);
144 } else {
145 $input->onerror ($char_onerror) unless defined $input->onerror;
146 }
147
148 $self->_initialize_tokenizer;
149 $self->_initialize_tree_constructor;
150 $self->_construct_tree;
151 $self->_terminate_tree_constructor;
152
153 delete $self->{parse_error}; # remove loop
154
155 return $self->{document};
156 } # parse_char_stream
157
158 sub new ($) {
159 my $class = shift;
160 my $self = bless {
161 level => {must => 'm',
162 should => 's',
163 warn => 'w',
164 info => 'i',
165 uncertain => 'u'},
166 }, $class;
167 $self->{set_nc} = sub {
168 $self->{nc} = -1;
169 };
170 $self->{parse_error} = sub {
171 #
172 };
173 $self->{change_encoding} = sub {
174 # if ($_[0] is a supported encoding) {
175 # run "change the encoding" algorithm;
176 # throw Whatpm::HTML::RestartParser (charset => $new_encoding);
177 # }
178 };
179 $self->{application_cache_selection} = sub {
180 #
181 };
182
183 $self->{is_xml} = 1;
184
185 return $self;
186 } # new
187
188 sub _initialize_tree_constructor ($) {
189 my $self = shift;
190 ## NOTE: $self->{document} MUST be specified before this method is called
191 $self->{document}->strict_error_checking (0);
192 ## TODO: Turn mutation events off # MUST
193 $self->{document}->dom_config
194 ->{'http://suika.fam.cx/www/2006/dom-config/strict-document-children'}
195 = 0;
196 $self->{document}->manakai_is_html (0);
197 $self->{document}->set_user_data (manakai_source_line => 1);
198 $self->{document}->set_user_data (manakai_source_column => 1);
199 } # _initialize_tree_constructor
200
201 sub _terminate_tree_constructor ($) {
202 my $self = shift;
203 $self->{document}->strict_error_checking (1);
204 $self->{document}->dom_config
205 ->{'http://suika.fam.cx/www/2006/dom-config/strict-document-children'}
206 = 1;
207 ## TODO: Turn mutation events on
208 } # _terminate_tree_constructor
209
210 ## Tree construction stage
211
212
213 ## NOTE: Differences from the XML5 draft are marked as "XML5:".
214
215 ## XML5: No namespace support.
216
217 ## XML5: Start, main, end phases. In this implementation, they are
218 ## represented by insertion modes.
219
220 ## Insertion modes
221 sub INITIAL_IM () { 0 }
222 sub BEFORE_ROOT_ELEMENT_IM () { 1 }
223 sub IN_ELEMENT_IM () { 2 }
224 sub AFTER_ROOT_ELEMENT_IM () { 3 }
225
226 {
227 my $token; ## TODO: change to $self->{t}
228
229 sub _construct_tree ($) {
230 my ($self) = @_;
231
232 delete $self->{tainted};
233 $self->{open_elements} = [];
234 $self->{insertion_mode} = INITIAL_IM;
235
236 $token = $self->_get_next_token;
237
238 ## XML5: No support for the XML declaration
239 if ($token->{type} == PI_TOKEN and
240 $token->{target} eq 'xml' and
241 $token->{data} =~ /\Aversion[\x09\x0A\x20]*=[\x09\x0A\x20]*
242 (?>"([^"]*)"|'([^']*)')
243 (?:[\x09\x0A\x20]+
244 encoding[\x09\x0A\x20]*=[\x09\x0A\x20]*
245 (?>"([^"]*)"|'([^']*)')[\x09\x0A\x20]*)?
246 (?:[\x09\x0A\x20]+
247 standalone[\x09\x0A\x20]*=[\x09\x0A\x20]*
248 (?>"(yes|no)"|'(yes|no)'))?
249 [\x09\x0A\x20]*\z/x) {
250 $self->{document}->xml_version (defined $1 ? $1 : $2);
251 $self->{document}->xml_encoding (defined $3 ? $3 : $4); # possibly undef
252 $self->{document}->xml_standalone (($5 || $6 || 'no') ne 'no');
253
254 $token = $self->_get_next_token;
255 } else {
256 $self->{document}->xml_version ('1.0');
257 $self->{document}->xml_encoding (undef);
258 $self->{document}->xml_standalone (0);
259 }
260
261 while (1) {
262 if ($self->{insertion_mode} == IN_ELEMENT_IM) {
263 $self->_tree_in_element;
264 } elsif ($self->{insertion_mode} == AFTER_ROOT_ELEMENT_IM) {
265 $self->_tree_after_root_element;
266 } elsif ($self->{insertion_mode} == BEFORE_ROOT_ELEMENT_IM) {
267 $self->_tree_before_root_element;
268 } elsif ($self->{insertion_mode} == INITIAL_IM) {
269 $self->_tree_initial;
270 } else {
271 die "$0: Unknown XML insertion mode: $self->{insertion_mode}";
272 }
273
274 last if $token->{type} == ABORT_TOKEN;
275 }
276 } # _construct_tree
277
278 sub _tree_initial ($) {
279 my $self = shift;
280
281 B: while (1) {
282 if ($token->{type} == DOCTYPE_TOKEN) {
283 ## XML5: No "DOCTYPE" token.
284
285 my $doctype = $self->{document}->create_document_type_definition
286 (defined $token->{name} ? $token->{name} : '');
287
288 ## NOTE: Default value for both |public_id| and |system_id| attributes
289 ## are empty strings, so that we don't set any value in missing cases.
290 $doctype->public_id ($token->{pubid}) if defined $token->{pubid};
291 $doctype->system_id ($token->{sysid}) if defined $token->{sysid};
292
293 ## TODO: internal_subset
294
295 $self->{document}->append_child ($doctype);
296
297 $self->{insertion_mode} = BEFORE_ROOT_ELEMENT_IM;
298 $token = $self->_get_next_token;
299 return;
300 } elsif ($token->{type} == START_TAG_TOKEN or
301 $token->{type} == END_OF_FILE_TOKEN) {
302 $self->{insertion_mode} = BEFORE_ROOT_ELEMENT_IM;
303 ## Reprocess.
304 return;
305 } elsif ($token->{type} == COMMENT_TOKEN) {
306 my $comment = $self->{document}->create_comment ($token->{data});
307 $self->{document}->append_child ($comment);
308
309 ## Stay in the mode.
310 $token = $self->_get_next_token;
311 next B;
312 } elsif ($token->{type} == PI_TOKEN) {
313 my $pi = $self->{document}->create_processing_instruction
314 ($token->{target}, $token->{data});
315 $self->{document}->append_child ($pi);
316
317 ## Stay in the mode.
318 $token = $self->_get_next_token;
319 next B;
320 } elsif ($token->{type} == CHARACTER_TOKEN) {
321 if (not $self->{tainted} and
322 not $token->{has_reference} and
323 $token->{data} =~ s/^([\x09\x0A\x0C\x20]+)//) {
324 #
325 }
326
327 if (length $token->{data}) {
328 ## XML5: Ignore the token.
329
330 unless ($self->{tainted}) {
331 $self->{parse_error}->(level => $self->{level}->{must}, type => 'text outside of root element',
332 token => $token);
333 $self->{tainted} = 1;
334 }
335
336 $self->{document}->manakai_append_text ($token->{data});
337 }
338
339 ## Stay in the mode.
340 $token = $self->_get_next_token;
341 next B;
342 } elsif ($token->{type} == END_TAG_TOKEN) {
343 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unmatched end tag',
344 text => $token->{tag_name},
345 token => $token);
346 ## Ignore the token.
347
348 ## Stay in the mode.
349 $token = $self->_get_next_token;
350 next B;
351 } elsif ($token->{type} == ABORT_TOKEN) {
352 return;
353 } else {
354 die "$0: XML parser initial: Unknown token type $token->{type}";
355 }
356 } # B
357 } # _tree_initial
358
359 sub _tree_before_root_element ($) {
360 my $self = shift;
361
362 B: while (1) {
363 if ($token->{type} == START_TAG_TOKEN) {
364 my $nsmap = {
365 xml => q<http://www.w3.org/XML/1998/namespace>,
366 xmlns => q<http://www.w3.org/2000/xmlns/>,
367 };
368
369 for (keys %{$token->{attributes}}) {
370 if (/^xmlns:./s) {
371 my $prefix = substr $_, 6;
372 my $value = $token->{attributes}->{$_}->{value};
373 if ($prefix eq 'xml' or $prefix eq 'xmlns' or
374 $value eq q<http://www.w3.org/XML/1998/namespace> or
375 $value eq q<http://www.w3.org/2000/xmlns/>) {
376 ## NOTE: Error should be detected at the DOM layer.
377 #
378 } elsif (length $value) {
379 $nsmap->{$prefix} = $value;
380 } else {
381 delete $nsmap->{$prefix};
382 }
383 } elsif ($_ eq 'xmlns') {
384 my $value = $token->{attributes}->{$_}->{value};
385 if ($value eq q<http://www.w3.org/XML/1998/namespace> or
386 $value eq q<http://www.w3.org/2000/xmlns/>) {
387 ## NOTE: Error should be detected at the DOM layer.
388 #
389 } elsif (length $value) {
390 $nsmap->{''} = $value;
391 } else {
392 delete $nsmap->{''};
393 }
394 }
395 }
396
397 my $ns;
398 my ($prefix, $ln) = split /:/, $token->{tag_name}, 2;
399
400 if (defined $ln and $prefix ne '' and $ln ne '') { # prefixed
401 if (defined $nsmap->{$prefix}) {
402 $ns = $nsmap->{$prefix};
403 } else {
404 ($prefix, $ln) = (undef, $token->{tag_name});
405 }
406 } else {
407 $ns = $nsmap->{''} if $prefix ne '' and not defined $ln;
408 ($prefix, $ln) = (undef, $token->{tag_name});
409 }
410
411 my $el = $self->{document}->create_element_ns ($ns, [$prefix, $ln]);
412 $el->set_user_data (manakai_source_line => $token->{line});
413 $el->set_user_data (manakai_source_column => $token->{column});
414
415 my $has_attr;
416 for my $attr_name (sort {$token->{attributes}->{$a}->{index} <=>
417 $token->{attributes}->{$b}->{index}}
418 keys %{$token->{attributes}}) {
419 my $ns;
420 my ($p, $l) = split /:/, $attr_name, 2;
421
422 if ($attr_name eq 'xmlns:xmlns') {
423 ($p, $l) = (undef, $attr_name);
424 } elsif (defined $l and $p ne '' and $l ne '') { # prefixed
425 if (defined $nsmap->{$p}) {
426 $ns = $nsmap->{$p};
427 } else {
428 ## NOTE: Error should be detected at the DOM-layer.
429 ($p, $l) = (undef, $attr_name);
430 }
431 } else {
432 if ($attr_name eq 'xmlns') {
433 $ns = $nsmap->{xmlns};
434 }
435 ($p, $l) = (undef, $attr_name);
436 }
437
438 if ($has_attr->{defined $ns ? $ns : ''}->{$l}) {
439 $ns = undef;
440 ($p, $l) = (undef, $attr_name);
441 } else {
442 $has_attr->{defined $ns ? $ns : ''}->{$l} = 1;
443 }
444
445 my $attr_t = $token->{attributes}->{$attr_name};
446 my $attr = $self->{document}->create_attribute_ns ($ns, [$p, $l]);
447 $attr->value ($attr_t->{value});
448 $attr->set_user_data (manakai_source_line => $attr_t->{line});
449 $attr->set_user_data (manakai_source_column => $attr_t->{column});
450 $el->set_attribute_node_ns ($attr);
451 }
452
453 $self->{document}->append_child ($el);
454
455 if ($self->{self_closing}) {
456 delete $self->{self_closing};
457 $self->{insertion_mode} = AFTER_ROOT_ELEMENT_IM;
458 } else {
459 push @{$self->{open_elements}}, [$el, $token->{tag_name}, $nsmap];
460 $self->{insertion_mode} = IN_ELEMENT_IM;
461 }
462
463 #delete $self->{tainted};
464
465 $token = $self->_get_next_token;
466 return;
467 } elsif ($token->{type} == COMMENT_TOKEN) {
468 my $comment = $self->{document}->create_comment ($token->{data});
469 $self->{document}->append_child ($comment);
470
471 ## Stay in the mode.
472 $token = $self->_get_next_token;
473 next B;
474 } elsif ($token->{type} == PI_TOKEN) {
475 my $pi = $self->{document}->create_processing_instruction
476 ($token->{target}, $token->{data});
477 $self->{document}->append_child ($pi);
478
479 ## Stay in the mode.
480 $token = $self->_get_next_token;
481 next B;
482 } elsif ($token->{type} == CHARACTER_TOKEN) {
483 if (not $self->{tainted} and
484 not $token->{has_reference} and
485 $token->{data} =~ s/^([\x09\x0A\x0C\x20]+)//) {
486 #
487 }
488
489 if (length $token->{data}) {
490 ## XML5: Ignore the token.
491
492 unless ($self->{tainted}) {
493 $self->{parse_error}->(level => $self->{level}->{must}, type => 'text outside of root element',
494 token => $token);
495 $self->{tainted} = 1;
496 }
497
498 $self->{document}->manakai_append_text ($token->{data});
499 }
500
501 ## Stay in the mode.
502 $token = $self->_get_next_token;
503 next B;
504 } elsif ($token->{type} == END_OF_FILE_TOKEN) {
505 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no root element',
506 token => $token);
507
508 $self->{insertion_mode} = AFTER_ROOT_ELEMENT_IM;
509 ## Reprocess.
510 return;
511 } elsif ($token->{type} == END_TAG_TOKEN) {
512 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unmatched end tag',
513 text => $token->{tag_name},
514 token => $token);
515 ## Ignore the token.
516
517 ## Stay in the mode.
518 $token = $self->_get_next_token;
519 next B;
520 } elsif ($token->{type} == DOCTYPE_TOKEN) {
521 $self->{parse_error}->(level => $self->{level}->{must}, type => 'in html:#doctype',
522 token => $token);
523 ## Ignore the token.
524
525 ## Stay in the mode.
526 $token = $self->_get_next_token;
527 next B;
528 } elsif ($token->{type} == ABORT_TOKEN) {
529 return;
530 } else {
531 die "$0: XML parser initial: Unknown token type $token->{type}";
532 }
533 } # B
534 } # _tree_before_root_element
535
536 sub _tree_in_element ($) {
537 my $self = shift;
538
539 B: while (1) {
540 if ($token->{type} == CHARACTER_TOKEN) {
541 $self->{open_elements}->[-1]->[0]->manakai_append_text ($token->{data});
542
543 ## Stay in the mode.
544 $token = $self->_get_next_token;
545 next B;
546 } elsif ($token->{type} == START_TAG_TOKEN) {
547 my $nsmap = {%{$self->{open_elements}->[-1]->[2]}};
548
549 for (keys %{$token->{attributes}}) {
550 if (/^xmlns:./s) {
551 my $prefix = substr $_, 6;
552 my $value = $token->{attributes}->{$_}->{value};
553 if ($prefix eq 'xml' or $prefix eq 'xmlns' or
554 $value eq q<http://www.w3.org/XML/1998/namespace> or
555 $value eq q<http://www.w3.org/2000/xmlns/>) {
556 ## NOTE: Error should be detected at the DOM layer.
557 #
558 } elsif (length $value) {
559 $nsmap->{$prefix} = $value;
560 } else {
561 delete $nsmap->{$prefix};
562 }
563 } elsif ($_ eq 'xmlns') {
564 my $value = $token->{attributes}->{$_}->{value};
565 if ($value eq q<http://www.w3.org/XML/1998/namespace> or
566 $value eq q<http://www.w3.org/2000/xmlns/>) {
567 ## NOTE: Error should be detected at the DOM layer.
568 #
569 } elsif (length $value) {
570 $nsmap->{''} = $value;
571 } else {
572 delete $nsmap->{''};
573 }
574 }
575 }
576
577 my $ns;
578 my ($prefix, $ln) = split /:/, $token->{tag_name}, 2;
579
580 if (defined $ln and $prefix ne '' and $ln ne '') { # prefixed
581 if (defined $nsmap->{$prefix}) {
582 $ns = $nsmap->{$prefix};
583 } else {
584 ## NOTE: Error should be detected at the DOM layer.
585 ($prefix, $ln) = (undef, $token->{tag_name});
586 }
587 } else {
588 $ns = $nsmap->{''} if $prefix ne '' and not defined $ln;
589 ($prefix, $ln) = (undef, $token->{tag_name});
590 }
591
592 my $el = $self->{document}->create_element_ns ($ns, [$prefix, $ln]);
593 $el->set_user_data (manakai_source_line => $token->{line});
594 $el->set_user_data (manakai_source_column => $token->{column});
595
596 my $has_attr;
597 for my $attr_name (sort {$token->{attributes}->{$a}->{index} <=>
598 $token->{attributes}->{$b}->{index}}
599 keys %{$token->{attributes}}) {
600 my $ns;
601 my ($p, $l) = split /:/, $attr_name, 2;
602
603 if ($attr_name eq 'xmlns:xmlns') {
604 ($p, $l) = (undef, $attr_name);
605 } elsif (defined $l and $p ne '' and $l ne '') { # prefixed
606 if (defined $nsmap->{$p}) {
607 $ns = $nsmap->{$p};
608 } else {
609 ## NOTE: Error should be detected at the DOM-layer.
610 ($p, $l) = (undef, $attr_name);
611 }
612 } else {
613 if ($attr_name eq 'xmlns') {
614 $ns = $nsmap->{xmlns};
615 }
616 ($p, $l) = (undef, $attr_name);
617 }
618
619 if ($has_attr->{defined $ns ? $ns : ''}->{$l}) {
620 $ns = undef;
621 ($p, $l) = (undef, $attr_name);
622 } else {
623 $has_attr->{defined $ns ? $ns : ''}->{$l} = 1;
624 }
625
626 my $attr_t = $token->{attributes}->{$attr_name};
627 my $attr = $self->{document}->create_attribute_ns ($ns, [$p, $l]);
628 $attr->value ($attr_t->{value});
629 $attr->set_user_data (manakai_source_line => $attr_t->{line});
630 $attr->set_user_data (manakai_source_column => $attr_t->{column});
631 $el->set_attribute_node_ns ($attr);
632 }
633
634 $self->{open_elements}->[-1]->[0]->append_child ($el);
635
636 if ($self->{self_closing}) {
637 delete $self->{self_closing};
638 } else {
639 push @{$self->{open_elements}}, [$el, $token->{tag_name}, $nsmap];
640 }
641
642 ## Stay in the mode.
643 $token = $self->_get_next_token;
644 next B;
645 } elsif ($token->{type} == END_TAG_TOKEN) {
646 if ($token->{tag_name} eq '') {
647 ## Short end tag token.
648 pop @{$self->{open_elements}};
649 } elsif ($self->{open_elements}->[-1]->[1] eq $token->{tag_name}) {
650 pop @{$self->{open_elements}};
651 } else {
652 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unmatched end tag',
653 text => $token->{tag_name},
654 token => $token);
655
656 ## Has an element in scope
657 INSCOPE: for my $i (reverse 0..$#{$self->{open_elements}}) {
658 if ($self->{open_elements}->[$i]->[1] eq $token->{tag_name}) {
659 splice @{$self->{open_elements}}, $i;
660 last INSCOPE;
661 }
662 } # INSCOPE
663 }
664
665 unless (@{$self->{open_elements}}) {
666 $self->{insertion_mode} = AFTER_ROOT_ELEMENT_IM;
667 $token = $self->_get_next_token;
668 return;
669 } else {
670 ## Stay in the state.
671 $token = $self->_get_next_token;
672 redo B;
673 }
674 } elsif ($token->{type} == COMMENT_TOKEN) {
675 my $comment = $self->{document}->create_comment ($token->{data});
676 $self->{open_elements}->[-1]->[0]->append_child ($comment);
677
678 ## Stay in the mode.
679 $token = $self->_get_next_token;
680 next B;
681 } elsif ($token->{type} == PI_TOKEN) {
682 my $pi = $self->{document}->create_processing_instruction
683 ($token->{target}, $token->{data});
684 $self->{open_elements}->[-1]->[0]->append_child ($pi);
685
686 ## Stay in the mode.
687 $token = $self->_get_next_token;
688 next B;
689 } elsif ($token->{type} == END_OF_FILE_TOKEN) {
690 $self->{parse_error}->(level => $self->{level}->{must}, type => 'in body:#eof',
691 token => $token);
692
693 $self->{insertion_mode} = AFTER_ROOT_ELEMENT_IM;
694 $token = $self->_get_next_token;
695 return;
696 } elsif ($token->{type} == DOCTYPE_TOKEN) {
697 $self->{parse_error}->(level => $self->{level}->{must}, type => 'in html:#doctype',
698 token => $token);
699 ## Ignore the token.
700
701 ## Stay in the mode.
702 $token = $self->_get_next_token;
703 next B;
704 } elsif ($token->{type} == ABORT_TOKEN) {
705 return;
706 } else {
707 die "$0: XML parser initial: Unknown token type $token->{type}";
708 }
709 } # B
710 } # _tree_in_element
711
712 sub _tree_after_root_element ($) {
713 my $self = shift;
714
715 B: while (1) {
716 if ($token->{type} == START_TAG_TOKEN) {
717 $self->{parse_error}->(level => $self->{level}->{must}, type => 'second root element',
718 token => $token);
719
720 ## XML5: Ignore the token.
721
722 $self->{insertion_mode} = BEFORE_ROOT_ELEMENT_IM;
723 ## Reprocess.
724 return;
725 } elsif ($token->{type} == COMMENT_TOKEN) {
726 my $comment = $self->{document}->create_comment ($token->{data});
727 $self->{document}->append_child ($comment);
728
729 ## Stay in the mode.
730 $token = $self->_get_next_token;
731 next B;
732 } elsif ($token->{type} == PI_TOKEN) {
733 my $pi = $self->{document}->create_processing_instruction
734 ($token->{target}, $token->{data});
735 $self->{document}->append_child ($pi);
736
737 ## Stay in the mode.
738 $token = $self->_get_next_token;
739 next B;
740 } elsif ($token->{type} == CHARACTER_TOKEN) {
741 if (not $self->{tainted} and
742 not $token->{has_reference} and
743 $token->{data} =~ s/^([\x09\x0A\x0C\x20]+)//) {
744 #
745 }
746
747 if (length $token->{data}) {
748 ## XML5: Ignore the token.
749
750 unless ($self->{tainted}) {
751 $self->{parse_error}->(level => $self->{level}->{must}, type => 'text outside of root element',
752 token => $token);
753 $self->{tainted} = 1;
754 }
755
756 $self->{document}->manakai_append_text ($token->{data});
757 }
758
759 ## Stay in the mode.
760 $token = $self->_get_next_token;
761 next B;
762 } elsif ($token->{type} == END_OF_FILE_TOKEN) {
763 ## Stop parsing.
764
765 ## TODO: implement "stop parsing".
766
767 $token = {type => ABORT_TOKEN};
768 return;
769 } elsif ($token->{type} == END_TAG_TOKEN) {
770 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unmatched end tag',
771 text => $token->{tag_name},
772 token => $token);
773 ## Ignore the token.
774
775 ## Stay in the mode.
776 $token = $self->_get_next_token;
777 next B;
778 } elsif ($token->{type} == DOCTYPE_TOKEN) {
779 $self->{parse_error}->(level => $self->{level}->{must}, type => 'in html:#doctype',
780 token => $token);
781 ## Ignore the token.
782
783 ## Stay in the mode.
784 $token = $self->_get_next_token;
785 next B;
786 } elsif ($token->{type} == ABORT_TOKEN) {
787 return;
788 } else {
789 die "$0: XML parser initial: Unknown token type $token->{type}";
790 }
791 } # B
792 } # _tree_after_root_element
793
794 }
795
796 1;

admin@suikawiki.org
ViewVC Help
Powered by ViewVC 1.1.24