/[suikacvs]/markup/html/whatpm/Whatpm/XML/Parser.pm.src
Suika

Contents of /markup/html/whatpm/Whatpm/XML/Parser.pm.src

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.21 - (show annotations) (download) (as text)
Sun Oct 19 13:43:56 2008 UTC (17 years, 8 months ago) by wakaba
Branch: MAIN
Changes since 1.20: +79 -14 lines
File MIME type: application/x-wais-source
++ whatpm/t/xml/ChangeLog	19 Oct 2008 13:43:45 -0000
	* attlists-1.dat: Test results updated.  New tests on empty
	attlist declaration and duplications are added.

	* doctypes-2.dat: Test results updated.

	* eldecls-1.dat, entities-2.dat, notations-1.dat: New tests on
	duplications are added.

	* entities-1.dat: New tests on duplications and predefined
	entities are added.

2008-10-19  Wakaba  <wakaba@suika.fam.cx>

++ whatpm/Whatpm/HTML/ChangeLog	19 Oct 2008 13:40:35 -0000
	* Tokenizer.pm.src: Column number counting fixed.

2008-10-19  Wakaba  <wakaba@suika.fam.cx>

++ whatpm/Whatpm/XML/ChangeLog	19 Oct 2008 13:41:50 -0000
	* Parser.pm.src: Raise a parse error or warning for
	declaration/definition duplications.  Raise a warning for an empty
	attlist declaration.  Raise a error for an ill-declared predefined
	entity.

2008-10-19  Wakaba  <wakaba@suika.fam.cx>

1 package Whatpm::XML::Parser;
2 use strict;
3
4 push our @ISA, 'Whatpm::HTML';
5 use Whatpm::HTML::Tokenizer qw/:token/;
6
7 sub parse_char_string ($$$;$$) {
8 #my ($self, $s, $doc, $onerror, $get_wrapper) = @_;
9 my $self = shift;
10 my $s = ref $_[0] ? $_[0] : \($_[0]);
11 require Whatpm::Charset::DecodeHandle;
12 my $input = Whatpm::Charset::DecodeHandle::CharString->new ($s);
13 return $self->parse_char_stream ($input, @_[1..$#_]);
14 } # parse_char_string
15
16 sub parse_char_stream ($$$;$$) {
17 my $self = ref $_[0] ? shift : shift->new;
18 my $input = $_[0];
19 $self->{document} = $_[1];
20 @{$self->{document}->child_nodes} = ();
21
22 ## NOTE: |set_inner_html| copies most of this method's code
23
24 $self->{confident} = 1 unless exists $self->{confident};
25 $self->{document}->input_encoding ($self->{input_encoding})
26 if defined $self->{input_encoding};
27 ## TODO: |{input_encoding}| is needless?
28
29 $self->{line_prev} = $self->{line} = 1;
30 $self->{column_prev} = -1;
31 $self->{column} = 0;
32 $self->{set_nc} = sub {
33 my $self = shift;
34
35 my $char = '';
36 if (defined $self->{next_nc}) {
37 $char = $self->{next_nc};
38 delete $self->{next_nc};
39 $self->{nc} = ord $char;
40 } else {
41 $self->{char_buffer} = '';
42 $self->{char_buffer_pos} = 0;
43
44 my $count = $input->manakai_read_until
45 ($self->{char_buffer}, qr/[^\x00\x0A\x0D]/, $self->{char_buffer_pos});
46 if ($count) {
47 $self->{line_prev} = $self->{line};
48 $self->{column_prev} = $self->{column};
49 $self->{column}++;
50 $self->{nc}
51 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
52 return;
53 }
54
55 if ($input->read ($char, 1)) {
56 $self->{nc} = ord $char;
57 } else {
58 $self->{nc} = -1;
59 return;
60 }
61 }
62
63 ($self->{line_prev}, $self->{column_prev})
64 = ($self->{line}, $self->{column});
65 $self->{column}++;
66
67 if ($self->{nc} == 0x000A) { # LF
68 !!!cp ('j1');
69 $self->{line}++;
70 $self->{column} = 0;
71 } elsif ($self->{nc} == 0x000D) { # CR
72 !!!cp ('j2');
73 ## TODO: support for abort/streaming
74 my $next = '';
75 if ($input->read ($next, 1) and $next ne "\x0A") {
76 $self->{next_nc} = $next;
77 }
78 $self->{nc} = 0x000A; # LF # MUST
79 $self->{line}++;
80 $self->{column} = 0;
81 } elsif ($self->{nc} == 0x0000) { # NULL
82 !!!cp ('j4');
83 !!!parse-error (type => 'NULL');
84 $self->{nc} = 0xFFFD; # REPLACEMENT CHARACTER # MUST
85 }
86 };
87
88 $self->{read_until} = sub {
89 #my ($scalar, $specials_range, $offset) = @_;
90 return 0 if defined $self->{next_nc};
91
92 my $pattern = qr/[^$_[1]\x00\x0A\x0D]/;
93 my $offset = $_[2] || 0;
94
95 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
96 pos ($self->{char_buffer}) = $self->{char_buffer_pos};
97 if ($self->{char_buffer} =~ /\G(?>$pattern)+/) {
98 substr ($_[0], $offset)
99 = substr ($self->{char_buffer}, $-[0], $+[0] - $-[0]);
100 my $count = $+[0] - $-[0];
101 if ($count) {
102 $self->{column} += $count;
103 $self->{char_buffer_pos} += $count;
104 $self->{line_prev} = $self->{line};
105 $self->{column_prev} = $self->{column} - 1;
106 $self->{nc} = -1;
107 }
108 return $count;
109 } else {
110 return 0;
111 }
112 } else {
113 my $count = $input->manakai_read_until ($_[0], $pattern, $_[2]);
114 if ($count) {
115 $self->{column} += $count;
116 $self->{line_prev} = $self->{line};
117 $self->{column_prev} = $self->{column} - 1;
118 $self->{nc} = -1;
119 }
120 return $count;
121 }
122 }; # $self->{read_until}
123
124 my $onerror = $_[2] || sub {
125 my (%opt) = @_;
126 my $line = $opt{token} ? $opt{token}->{line} : $opt{line};
127 my $column = $opt{token} ? $opt{token}->{column} : $opt{column};
128 warn "Parse error ($opt{type}) at line $line column $column\n";
129 };
130 $self->{parse_error} = sub {
131 $onerror->(line => $self->{line}, column => $self->{column}, @_);
132 };
133
134 my $char_onerror = sub {
135 my (undef, $type, %opt) = @_;
136 !!!parse-error (layer => 'encode',
137 line => $self->{line}, column => $self->{column} + 1,
138 %opt, type => $type);
139 }; # $char_onerror
140
141 if ($_[3]) {
142 $input = $_[3]->($input);
143 $input->onerror ($char_onerror);
144 } else {
145 $input->onerror ($char_onerror) unless defined $input->onerror;
146 }
147
148 $self->_initialize_tokenizer;
149 $self->_initialize_tree_constructor;
150 $self->_construct_tree;
151 $self->_terminate_tree_constructor;
152
153 delete $self->{parse_error}; # remove loop
154
155 return $self->{document};
156 } # parse_char_stream
157
158 sub new ($) {
159 my $class = shift;
160 my $self = bless {
161 level => {must => 'm',
162 should => 's',
163 warn => 'w',
164 info => 'i',
165 uncertain => 'u'},
166 }, $class;
167 $self->{set_nc} = sub {
168 $self->{nc} = -1;
169 };
170 $self->{parse_error} = sub {
171 #
172 };
173 $self->{change_encoding} = sub {
174 # if ($_[0] is a supported encoding) {
175 # run "change the encoding" algorithm;
176 # throw Whatpm::HTML::RestartParser (charset => $new_encoding);
177 # }
178 };
179 $self->{application_cache_selection} = sub {
180 #
181 };
182
183 $self->{is_xml} = 1;
184
185 return $self;
186 } # new
187
188 sub _initialize_tree_constructor ($) {
189 my $self = shift;
190 ## NOTE: $self->{document} MUST be specified before this method is called
191 $self->{document}->strict_error_checking (0);
192 ## TODO: Turn mutation events off # MUST
193 $self->{document}->dom_config
194 ->{'http://suika.fam.cx/www/2006/dom-config/strict-document-children'}
195 = 0;
196 $self->{document}->manakai_is_html (0);
197 $self->{document}->set_user_data (manakai_source_line => 1);
198 $self->{document}->set_user_data (manakai_source_column => 1);
199
200 $self->{ge}->{'amp;'} = {value => '&', only_text => 1};
201 $self->{ge}->{'apos;'} = {value => "'", only_text => 1};
202 $self->{ge}->{'gt;'} = {value => '>', only_text => 1};
203 $self->{ge}->{'lt;'} = {value => '<', only_text => 1};
204 $self->{ge}->{'quot;'} = {value => '"', only_text => 1};
205 } # _initialize_tree_constructor
206
207 sub _terminate_tree_constructor ($) {
208 my $self = shift;
209 $self->{document}->strict_error_checking (1);
210 $self->{document}->dom_config
211 ->{'http://suika.fam.cx/www/2006/dom-config/strict-document-children'}
212 = 1;
213 ## TODO: Turn mutation events on
214 } # _terminate_tree_constructor
215
216 ## Tree construction stage
217
218
219 ## NOTE: Differences from the XML5 draft are marked as "XML5:".
220
221 ## XML5: No namespace support.
222
223 ## XML5: Start, main, end phases. In this implementation, they are
224 ## represented by insertion modes.
225
226 ## Insertion modes
227 sub INITIAL_IM () { 0 }
228 sub BEFORE_ROOT_ELEMENT_IM () { 1 }
229 sub IN_ELEMENT_IM () { 2 }
230 sub AFTER_ROOT_ELEMENT_IM () { 3 }
231 sub IN_SUBSET_IM () { 4 }
232
233 {
234 my $token; ## TODO: change to $self->{t}
235
236 sub _construct_tree ($) {
237 my ($self) = @_;
238
239 delete $self->{tainted};
240 $self->{open_elements} = [];
241 $self->{insertion_mode} = INITIAL_IM;
242
243 !!!next-token;
244
245 ## XML5: No support for the XML declaration
246 if ($token->{type} == PI_TOKEN and
247 $token->{target} eq 'xml' and
248 $token->{data} =~ /\Aversion[\x09\x0A\x20]*=[\x09\x0A\x20]*
249 (?>"([^"]*)"|'([^']*)')
250 (?:[\x09\x0A\x20]+
251 encoding[\x09\x0A\x20]*=[\x09\x0A\x20]*
252 (?>"([^"]*)"|'([^']*)')[\x09\x0A\x20]*)?
253 (?:[\x09\x0A\x20]+
254 standalone[\x09\x0A\x20]*=[\x09\x0A\x20]*
255 (?>"(yes|no)"|'(yes|no)'))?
256 [\x09\x0A\x20]*\z/x) {
257 $self->{document}->xml_version (defined $1 ? $1 : $2);
258 $self->{document}->xml_encoding (defined $3 ? $3 : $4); # possibly undef
259 $self->{document}->xml_standalone (($5 || $6 || 'no') ne 'no');
260
261 !!!next-token;
262 } else {
263 $self->{document}->xml_version ('1.0');
264 $self->{document}->xml_encoding (undef);
265 $self->{document}->xml_standalone (0);
266 }
267
268 while (1) {
269 if ($self->{insertion_mode} == IN_ELEMENT_IM) {
270 $self->_tree_in_element;
271 } elsif ($self->{insertion_mode} == IN_SUBSET_IM) {
272 $self->_tree_in_subset;
273 } elsif ($self->{insertion_mode} == AFTER_ROOT_ELEMENT_IM) {
274 $self->_tree_after_root_element;
275 } elsif ($self->{insertion_mode} == BEFORE_ROOT_ELEMENT_IM) {
276 $self->_tree_before_root_element;
277 } elsif ($self->{insertion_mode} == INITIAL_IM) {
278 $self->_tree_initial;
279 } else {
280 die "$0: Unknown XML insertion mode: $self->{insertion_mode}";
281 }
282
283 last if $token->{type} == ABORT_TOKEN;
284 }
285 } # _construct_tree
286
287 sub _tree_initial ($) {
288 my $self = shift;
289
290 B: while (1) {
291 if ($token->{type} == DOCTYPE_TOKEN) {
292 ## XML5: No "DOCTYPE" token.
293
294 my $doctype = $self->{document}->create_document_type_definition
295 (defined $token->{name} ? $token->{name} : '');
296
297 ## NOTE: Default value for both |public_id| and |system_id| attributes
298 ## are empty strings, so that we don't set any value in missing cases.
299 $doctype->public_id ($token->{pubid}) if defined $token->{pubid};
300 $doctype->system_id ($token->{sysid}) if defined $token->{sysid};
301
302 ## TODO: internal_subset
303
304 $self->{document}->append_child ($doctype);
305
306 $self->{ge} = {};
307
308 ## XML5: No "has internal subset" flag.
309 if ($token->{has_internal_subset}) {
310 $self->{doctype} = $doctype;
311 $self->{insertion_mode} = IN_SUBSET_IM;
312 } else {
313 $self->{insertion_mode} = BEFORE_ROOT_ELEMENT_IM;
314 }
315 !!!next-token;
316 return;
317 } elsif ($token->{type} == START_TAG_TOKEN or
318 $token->{type} == END_OF_FILE_TOKEN) {
319 $self->{insertion_mode} = BEFORE_ROOT_ELEMENT_IM;
320 ## Reprocess.
321 return;
322 } elsif ($token->{type} == COMMENT_TOKEN) {
323 my $comment = $self->{document}->create_comment ($token->{data});
324 $self->{document}->append_child ($comment);
325
326 ## Stay in the mode.
327 !!!next-token;
328 next B;
329 } elsif ($token->{type} == PI_TOKEN) {
330 my $pi = $self->{document}->create_processing_instruction
331 ($token->{target}, $token->{data});
332 $self->{document}->append_child ($pi);
333
334 ## Stay in the mode.
335 !!!next-token;
336 next B;
337 } elsif ($token->{type} == CHARACTER_TOKEN) {
338 if (not $self->{tainted} and
339 not $token->{has_reference} and
340 $token->{data} =~ s/^([\x09\x0A\x0C\x20]+)//) {
341 #
342 }
343
344 if (length $token->{data}) {
345 ## XML5: Ignore the token.
346
347 unless ($self->{tainted}) {
348 !!!parse-error (type => 'text outside of root element',
349 token => $token);
350 $self->{tainted} = 1;
351 }
352
353 $self->{document}->manakai_append_text ($token->{data});
354 }
355
356 ## Stay in the mode.
357 !!!next-token;
358 next B;
359 } elsif ($token->{type} == END_TAG_TOKEN) {
360 !!!parse-error (type => 'unmatched end tag',
361 text => $token->{tag_name},
362 token => $token);
363 ## Ignore the token.
364
365 ## Stay in the mode.
366 !!!next-token;
367 next B;
368 } elsif ($token->{type} == ABORT_TOKEN) {
369 return;
370 } else {
371 die "$0: XML parser initial: Unknown token type $token->{type}";
372 }
373 } # B
374 } # _tree_initial
375
376 sub _tree_before_root_element ($) {
377 my $self = shift;
378
379 B: while (1) {
380 if ($token->{type} == START_TAG_TOKEN) {
381 my $nsmap = {
382 xml => q<http://www.w3.org/XML/1998/namespace>,
383 xmlns => q<http://www.w3.org/2000/xmlns/>,
384 };
385
386 for (keys %{$token->{attributes}}) {
387 if (/^xmlns:./s) {
388 my $prefix = substr $_, 6;
389 my $value = $token->{attributes}->{$_}->{value};
390 if ($prefix eq 'xml' or $prefix eq 'xmlns' or
391 $value eq q<http://www.w3.org/XML/1998/namespace> or
392 $value eq q<http://www.w3.org/2000/xmlns/>) {
393 ## NOTE: Error should be detected at the DOM layer.
394 #
395 } elsif (length $value) {
396 $nsmap->{$prefix} = $value;
397 } else {
398 delete $nsmap->{$prefix};
399 }
400 } elsif ($_ eq 'xmlns') {
401 my $value = $token->{attributes}->{$_}->{value};
402 if ($value eq q<http://www.w3.org/XML/1998/namespace> or
403 $value eq q<http://www.w3.org/2000/xmlns/>) {
404 ## NOTE: Error should be detected at the DOM layer.
405 #
406 } elsif (length $value) {
407 $nsmap->{''} = $value;
408 } else {
409 delete $nsmap->{''};
410 }
411 }
412 }
413
414 my $ns;
415 my ($prefix, $ln) = split /:/, $token->{tag_name}, 2;
416
417 if (defined $ln and $prefix ne '' and $ln ne '') { # prefixed
418 if (defined $nsmap->{$prefix}) {
419 $ns = $nsmap->{$prefix};
420 } else {
421 ($prefix, $ln) = (undef, $token->{tag_name});
422 }
423 } else {
424 $ns = $nsmap->{''} if $prefix ne '' and not defined $ln;
425 ($prefix, $ln) = (undef, $token->{tag_name});
426 }
427
428 my $el = $self->{document}->create_element_ns ($ns, [$prefix, $ln]);
429 $el->set_user_data (manakai_source_line => $token->{line});
430 $el->set_user_data (manakai_source_column => $token->{column});
431
432 my $has_attr;
433 for my $attr_name (sort {$token->{attributes}->{$a}->{index} <=>
434 $token->{attributes}->{$b}->{index}}
435 keys %{$token->{attributes}}) {
436 my $ns;
437 my ($p, $l) = split /:/, $attr_name, 2;
438
439 if ($attr_name eq 'xmlns:xmlns') {
440 ($p, $l) = (undef, $attr_name);
441 } elsif (defined $l and $p ne '' and $l ne '') { # prefixed
442 if (defined $nsmap->{$p}) {
443 $ns = $nsmap->{$p};
444 } else {
445 ## NOTE: Error should be detected at the DOM-layer.
446 ($p, $l) = (undef, $attr_name);
447 }
448 } else {
449 if ($attr_name eq 'xmlns') {
450 $ns = $nsmap->{xmlns};
451 }
452 ($p, $l) = (undef, $attr_name);
453 }
454
455 if ($has_attr->{defined $ns ? $ns : ''}->{$l}) {
456 $ns = undef;
457 ($p, $l) = (undef, $attr_name);
458 } else {
459 $has_attr->{defined $ns ? $ns : ''}->{$l} = 1;
460 }
461
462 my $attr_t = $token->{attributes}->{$attr_name};
463 my $attr = $self->{document}->create_attribute_ns ($ns, [$p, $l]);
464 $attr->value ($attr_t->{value});
465 $attr->set_user_data (manakai_source_line => $attr_t->{line});
466 $attr->set_user_data (manakai_source_column => $attr_t->{column});
467 $el->set_attribute_node_ns ($attr);
468 }
469
470 $self->{document}->append_child ($el);
471
472 if ($self->{self_closing}) {
473 !!!ack ('ack');
474 $self->{insertion_mode} = AFTER_ROOT_ELEMENT_IM;
475 } else {
476 push @{$self->{open_elements}}, [$el, $token->{tag_name}, $nsmap];
477 $self->{insertion_mode} = IN_ELEMENT_IM;
478 }
479
480 #delete $self->{tainted};
481
482 !!!next-token;
483 return;
484 } elsif ($token->{type} == COMMENT_TOKEN) {
485 my $comment = $self->{document}->create_comment ($token->{data});
486 $self->{document}->append_child ($comment);
487
488 ## Stay in the mode.
489 !!!next-token;
490 next B;
491 } elsif ($token->{type} == PI_TOKEN) {
492 my $pi = $self->{document}->create_processing_instruction
493 ($token->{target}, $token->{data});
494 $self->{document}->append_child ($pi);
495
496 ## Stay in the mode.
497 !!!next-token;
498 next B;
499 } elsif ($token->{type} == CHARACTER_TOKEN) {
500 if (not $self->{tainted} and
501 not $token->{has_reference} and
502 $token->{data} =~ s/^([\x09\x0A\x0C\x20]+)//) {
503 #
504 }
505
506 if (length $token->{data}) {
507 ## XML5: Ignore the token.
508
509 unless ($self->{tainted}) {
510 !!!parse-error (type => 'text outside of root element',
511 token => $token);
512 $self->{tainted} = 1;
513 }
514
515 $self->{document}->manakai_append_text ($token->{data});
516 }
517
518 ## Stay in the mode.
519 !!!next-token;
520 next B;
521 } elsif ($token->{type} == END_OF_FILE_TOKEN) {
522 !!!parse-error (type => 'no root element',
523 token => $token);
524
525 $self->{insertion_mode} = AFTER_ROOT_ELEMENT_IM;
526 ## Reprocess.
527 return;
528 } elsif ($token->{type} == END_TAG_TOKEN) {
529 !!!parse-error (type => 'unmatched end tag',
530 text => $token->{tag_name},
531 token => $token);
532 ## Ignore the token.
533
534 ## Stay in the mode.
535 !!!next-token;
536 next B;
537 } elsif ($token->{type} == DOCTYPE_TOKEN) {
538 !!!parse-error (type => 'in html:#doctype',
539 token => $token);
540 ## Ignore the token.
541
542 ## Stay in the mode.
543 !!!next-token;
544 next B;
545 } elsif ($token->{type} == ABORT_TOKEN) {
546 return;
547 } else {
548 die "$0: XML parser initial: Unknown token type $token->{type}";
549 }
550 } # B
551 } # _tree_before_root_element
552
553 sub _tree_in_element ($) {
554 my $self = shift;
555
556 B: while (1) {
557 if ($token->{type} == CHARACTER_TOKEN) {
558 $self->{open_elements}->[-1]->[0]->manakai_append_text ($token->{data});
559
560 ## Stay in the mode.
561 !!!next-token;
562 next B;
563 } elsif ($token->{type} == START_TAG_TOKEN) {
564 my $nsmap = {%{$self->{open_elements}->[-1]->[2]}};
565
566 for (keys %{$token->{attributes}}) {
567 if (/^xmlns:./s) {
568 my $prefix = substr $_, 6;
569 my $value = $token->{attributes}->{$_}->{value};
570 if ($prefix eq 'xml' or $prefix eq 'xmlns' or
571 $value eq q<http://www.w3.org/XML/1998/namespace> or
572 $value eq q<http://www.w3.org/2000/xmlns/>) {
573 ## NOTE: Error should be detected at the DOM layer.
574 #
575 } elsif (length $value) {
576 $nsmap->{$prefix} = $value;
577 } else {
578 delete $nsmap->{$prefix};
579 }
580 } elsif ($_ eq 'xmlns') {
581 my $value = $token->{attributes}->{$_}->{value};
582 if ($value eq q<http://www.w3.org/XML/1998/namespace> or
583 $value eq q<http://www.w3.org/2000/xmlns/>) {
584 ## NOTE: Error should be detected at the DOM layer.
585 #
586 } elsif (length $value) {
587 $nsmap->{''} = $value;
588 } else {
589 delete $nsmap->{''};
590 }
591 }
592 }
593
594 my $ns;
595 my ($prefix, $ln) = split /:/, $token->{tag_name}, 2;
596
597 if (defined $ln and $prefix ne '' and $ln ne '') { # prefixed
598 if (defined $nsmap->{$prefix}) {
599 $ns = $nsmap->{$prefix};
600 } else {
601 ## NOTE: Error should be detected at the DOM layer.
602 ($prefix, $ln) = (undef, $token->{tag_name});
603 }
604 } else {
605 $ns = $nsmap->{''} if $prefix ne '' and not defined $ln;
606 ($prefix, $ln) = (undef, $token->{tag_name});
607 }
608
609 my $el = $self->{document}->create_element_ns ($ns, [$prefix, $ln]);
610 $el->set_user_data (manakai_source_line => $token->{line});
611 $el->set_user_data (manakai_source_column => $token->{column});
612
613 my $has_attr;
614 for my $attr_name (sort {$token->{attributes}->{$a}->{index} <=>
615 $token->{attributes}->{$b}->{index}}
616 keys %{$token->{attributes}}) {
617 my $ns;
618 my ($p, $l) = split /:/, $attr_name, 2;
619
620 if ($attr_name eq 'xmlns:xmlns') {
621 ($p, $l) = (undef, $attr_name);
622 } elsif (defined $l and $p ne '' and $l ne '') { # prefixed
623 if (defined $nsmap->{$p}) {
624 $ns = $nsmap->{$p};
625 } else {
626 ## NOTE: Error should be detected at the DOM-layer.
627 ($p, $l) = (undef, $attr_name);
628 }
629 } else {
630 if ($attr_name eq 'xmlns') {
631 $ns = $nsmap->{xmlns};
632 }
633 ($p, $l) = (undef, $attr_name);
634 }
635
636 if ($has_attr->{defined $ns ? $ns : ''}->{$l}) {
637 $ns = undef;
638 ($p, $l) = (undef, $attr_name);
639 } else {
640 $has_attr->{defined $ns ? $ns : ''}->{$l} = 1;
641 }
642
643 my $attr_t = $token->{attributes}->{$attr_name};
644 my $attr = $self->{document}->create_attribute_ns ($ns, [$p, $l]);
645 $attr->value ($attr_t->{value});
646 $attr->set_user_data (manakai_source_line => $attr_t->{line});
647 $attr->set_user_data (manakai_source_column => $attr_t->{column});
648 $el->set_attribute_node_ns ($attr);
649 }
650
651 $self->{open_elements}->[-1]->[0]->append_child ($el);
652
653 if ($self->{self_closing}) {
654 !!!ack ('ack');
655 } else {
656 push @{$self->{open_elements}}, [$el, $token->{tag_name}, $nsmap];
657 }
658
659 ## Stay in the mode.
660 !!!next-token;
661 next B;
662 } elsif ($token->{type} == END_TAG_TOKEN) {
663 if ($token->{tag_name} eq '') {
664 ## Short end tag token.
665 pop @{$self->{open_elements}};
666 } elsif ($self->{open_elements}->[-1]->[1] eq $token->{tag_name}) {
667 pop @{$self->{open_elements}};
668 } else {
669 !!!parse-error (type => 'unmatched end tag',
670 text => $token->{tag_name},
671 token => $token);
672
673 ## Has an element in scope
674 INSCOPE: for my $i (reverse 0..$#{$self->{open_elements}}) {
675 if ($self->{open_elements}->[$i]->[1] eq $token->{tag_name}) {
676 splice @{$self->{open_elements}}, $i;
677 last INSCOPE;
678 }
679 } # INSCOPE
680 }
681
682 unless (@{$self->{open_elements}}) {
683 $self->{insertion_mode} = AFTER_ROOT_ELEMENT_IM;
684 !!!next-token;
685 return;
686 } else {
687 ## Stay in the state.
688 !!!next-token;
689 redo B;
690 }
691 } elsif ($token->{type} == COMMENT_TOKEN) {
692 my $comment = $self->{document}->create_comment ($token->{data});
693 $self->{open_elements}->[-1]->[0]->append_child ($comment);
694
695 ## Stay in the mode.
696 !!!next-token;
697 next B;
698 } elsif ($token->{type} == PI_TOKEN) {
699 my $pi = $self->{document}->create_processing_instruction
700 ($token->{target}, $token->{data});
701 $self->{open_elements}->[-1]->[0]->append_child ($pi);
702
703 ## Stay in the mode.
704 !!!next-token;
705 next B;
706 } elsif ($token->{type} == END_OF_FILE_TOKEN) {
707 !!!parse-error (type => 'in body:#eof',
708 token => $token);
709
710 $self->{insertion_mode} = AFTER_ROOT_ELEMENT_IM;
711 !!!next-token;
712 return;
713 } elsif ($token->{type} == DOCTYPE_TOKEN) {
714 !!!parse-error (type => 'in html:#doctype',
715 token => $token);
716 ## Ignore the token.
717
718 ## Stay in the mode.
719 !!!next-token;
720 next B;
721 } elsif ($token->{type} == ABORT_TOKEN) {
722 return;
723 } else {
724 die "$0: XML parser initial: Unknown token type $token->{type}";
725 }
726 } # B
727 } # _tree_in_element
728
729 sub _tree_after_root_element ($) {
730 my $self = shift;
731
732 B: while (1) {
733 if ($token->{type} == START_TAG_TOKEN) {
734 !!!parse-error (type => 'second root element',
735 token => $token);
736
737 ## XML5: Ignore the token.
738
739 $self->{insertion_mode} = BEFORE_ROOT_ELEMENT_IM;
740 ## Reprocess.
741 return;
742 } elsif ($token->{type} == COMMENT_TOKEN) {
743 my $comment = $self->{document}->create_comment ($token->{data});
744 $self->{document}->append_child ($comment);
745
746 ## Stay in the mode.
747 !!!next-token;
748 next B;
749 } elsif ($token->{type} == PI_TOKEN) {
750 my $pi = $self->{document}->create_processing_instruction
751 ($token->{target}, $token->{data});
752 $self->{document}->append_child ($pi);
753
754 ## Stay in the mode.
755 !!!next-token;
756 next B;
757 } elsif ($token->{type} == CHARACTER_TOKEN) {
758 if (not $self->{tainted} and
759 not $token->{has_reference} and
760 $token->{data} =~ s/^([\x09\x0A\x0C\x20]+)//) {
761 #
762 }
763
764 if (length $token->{data}) {
765 ## XML5: Ignore the token.
766
767 unless ($self->{tainted}) {
768 !!!parse-error (type => 'text outside of root element',
769 token => $token);
770 $self->{tainted} = 1;
771 }
772
773 $self->{document}->manakai_append_text ($token->{data});
774 }
775
776 ## Stay in the mode.
777 !!!next-token;
778 next B;
779 } elsif ($token->{type} == END_OF_FILE_TOKEN) {
780 ## Stop parsing.
781
782 ## TODO: implement "stop parsing".
783
784 $token = {type => ABORT_TOKEN};
785 return;
786 } elsif ($token->{type} == END_TAG_TOKEN) {
787 !!!parse-error (type => 'unmatched end tag',
788 text => $token->{tag_name},
789 token => $token);
790 ## Ignore the token.
791
792 ## Stay in the mode.
793 !!!next-token;
794 next B;
795 } elsif ($token->{type} == DOCTYPE_TOKEN) {
796 !!!parse-error (type => 'in html:#doctype',
797 token => $token);
798 ## Ignore the token.
799
800 ## Stay in the mode.
801 !!!next-token;
802 next B;
803 } elsif ($token->{type} == ABORT_TOKEN) {
804 return;
805 } else {
806 die "$0: XML parser initial: Unknown token type $token->{type}";
807 }
808 } # B
809 } # _tree_after_root_element
810
811 sub _tree_in_subset ($) {
812 my $self = shift;
813
814 B: while (1) {
815 if ($token->{type} == COMMENT_TOKEN) {
816 ## Ignore the token.
817
818 ## Stay in the state.
819 !!!next-token;
820 next B;
821 } elsif ($token->{type} == ELEMENT_TOKEN) {
822 unless ($self->{has_element_decl}->{$token->{name}}) {
823 my $node = $self->{doctype}->get_element_type_definition_node
824 ($token->{name});
825 unless ($node) {
826 $node = $self->{document}->create_element_type_definition
827 ($token->{name});
828 $self->{doctype}->set_element_type_definition_node ($node);
829 }
830
831 $node->set_user_data (manakai_source_line => $token->{line});
832 $node->set_user_data (manakai_source_column => $token->{column});
833
834 $node->content_model_text (join '', @{$token->{content}})
835 if $token->{content};
836 } else {
837 !!!parse-error (type => 'duplicate element decl', ## TODO: type
838 value => $token->{name},
839 token => $token);
840
841 ## TODO: $token->{content} syntax check.
842 }
843 $self->{has_element_decl}->{$token->{name}} = 1;
844
845 ## Stay in the mode.
846 !!!next-token;
847 next B;
848 } elsif ($token->{type} == ATTLIST_TOKEN) {
849 my $ed = $self->{doctype}->get_element_type_definition_node
850 ($token->{name});
851 unless ($ed) {
852 $ed = $self->{document}->create_element_type_definition
853 ($token->{name});
854 $ed->set_user_data (manakai_source_line => $token->{line});
855 $ed->set_user_data (manakai_source_column => $token->{column});
856 $self->{doctype}->set_element_type_definition_node ($ed);
857 } elsif ($self->{has_attlist}->{$token->{name}}) {
858 !!!parse-error (type => 'duplicate attlist decl', ## TODO: type
859 value => $token->{name},
860 token => $token,
861 level => $self->{level}->{warn});
862 }
863 $self->{has_attlist}->{$token->{name}} = 1;
864
865 unless (@{$token->{attrdefs}}) {
866 !!!parse-error (type => 'empty attlist decl', ## TODO: type
867 value => $token->{name},
868 token => $token,
869 level => $self->{level}->{warn});
870 }
871
872 for my $at (@{$token->{attrdefs}}) {
873 unless ($ed->get_attribute_definition_node ($at->{name})) {
874 my $node = $self->{document}->create_attribute_definition
875 ($at->{name});
876 $node->set_user_data (manakai_source_line => $at->{line});
877 $node->set_user_data (manakai_source_column => $at->{column});
878
879 my $type = defined $at->{type} ? {
880 CDATA => 1, ID => 2, IDREF => 3, IDREFS => 4, ENTITY => 5,
881 ENTITIES => 6, NMTOKEN => 7, NMTOKENS => 8, NOTATION => 9,
882 }->{$at->{type}} : 10;
883 if (defined $type) {
884 $node->declared_type ($type);
885 } else {
886 !!!parse-error (type => 'unknown declared type', ## TODO: type
887 value => $at->{type},
888 token => $at);
889 }
890
891 push @{$node->allowed_tokens}, @{$at->{tokens}};
892
893 my $default = defined $at->{default} ? {
894 FIXED => 1, REQUIRED => 2, IMPLIED => 3,
895 }->{$at->{default}} : 4;
896 if (defined $default) {
897 $node->default_type ($default);
898 if (defined $at->{value}) {
899 if ($default == 1 or $default == 4) {
900 #
901 } elsif (length $at->{value}) {
902 !!!parse-error (type => 'default value not allowed', ## TODO: type
903 token => $at);
904 }
905 } else {
906 if ($default == 1 or $default == 4) {
907 !!!parse-error (type => 'default value not provided', ## TODO: type
908 token => $at);
909 }
910 }
911 } else {
912 !!!parse-error (type => 'unknown default type', ## TODO: type
913 value => $at->{default},
914 token => $at);
915 }
916
917 $node->text_content ($at->{value}) if defined $at->{value};
918
919 $ed->set_attribute_definition_node ($node);
920 } else {
921 !!!parse-error (type => 'duplicate attrdef', ## TODO: type
922 value => $at->{name},
923 token => $at,
924 level => $self->{level}->{warn});
925
926 ## TODO: syntax validation
927 }
928 } # $at
929
930 ## Stay in the mode.
931 !!!next-token;
932 next B;
933 } elsif ($token->{type} == GENERAL_ENTITY_TOKEN) {
934 if ({
935 amp => 1, apos => 1, quot => 1, lt => 1, gt => 1,
936 }->{$token->{name}}) {
937 if (not defined $token->{value} or
938 $token->{value} !~
939 {
940 amp => qr/\A&#(?:x0*26|0*38);\z/,
941 lt => qr/\A&#(?:x0*3[Cc]|0*60);\z/,
942 gt => qr/\A(?>&#(?:x0*3[Ee]|0*62);|>)\z/,
943 quot => qr/\A(?>&#(?:x0*22|0*34);|")\z/,
944 apos => qr/\A(?>&#(?:x0*27|0*39);|')\z/,
945 }->{$token->{name}}) {
946 !!!parse-error (type => 'bad predefined entity decl', ## TODO: type
947 value => $token->{name},
948 token => $token);
949 }
950
951 $self->{ge}->{$token->{name}.';'} = {name => $token->{name},
952 value => {
953 amp => '&',
954 lt => '<',
955 gt => '>',
956 quot => '"',
957 apos => "'",
958 }->{$token->{name}},
959 only_text => 1};
960 } elsif (not $self->{ge}->{$token->{name}.';'}) {
961 ## For parser.
962 $self->{ge}->{$token->{name}.';'} = $token;
963 if (defined $token->{value} and
964 $token->{value} !~ /[&<]/) {
965 $token->{only_text} = 1;
966 }
967
968 ## For DOM.
969 if (defined $token->{notation}) {
970 my $node = $self->{document}->create_general_entity ($token->{name});
971 $node->set_user_data (manakai_source_line => $token->{line});
972 $node->set_user_data (manakai_source_column => $token->{column});
973
974 $node->public_id ($token->{pubid}); # may be undef
975 $node->system_id ($token->{sysid}); # may be undef
976 $node->notation_name ($token->{notation});
977
978 $self->{doctype}->set_general_entity_node ($node);
979 } else {
980 ## TODO: syntax validation
981 }
982 } else {
983 !!!parse-error (type => 'duplicate general entity decl', ## TODO: type
984 value => $token->{name},
985 token => $token,
986 level => $self->{level}->{warn});
987
988 ## TODO: syntax validation
989 }
990
991 ## Stay in the mode.
992 !!!next-token;
993 next B;
994 } elsif ($token->{type} == PARAMETER_ENTITY_TOKEN) {
995 unless ($self->{pe}->{$token->{name}}) {
996 ## For parser.
997 $self->{pe}->{$token->{name}} = $token;
998
999 ## TODO: syntax validation
1000 } else {
1001 !!!parse-error (type => 'duplicate para entity decl', ## TODO: type
1002 value => $token->{name},
1003 token => $token,
1004 level => $self->{level}->{warn});
1005
1006 ## TODO: syntax validation
1007 }
1008
1009 ## Stay in the mode.
1010 !!!next-token;
1011 next B;
1012 } elsif ($token->{type} == NOTATION_TOKEN) {
1013 unless ($self->{doctype}->get_notation_node
1014 ($token->{name})) {
1015 my $node = $self->{document}->create_notation ($token->{name});
1016 $node->set_user_data (manakai_source_line => $token->{line});
1017 $node->set_user_data (manakai_source_column => $token->{column});
1018
1019 $node->public_id ($token->{pubid}); # may be undef
1020 $node->system_id ($token->{sysid}); # may be undef
1021
1022 $self->{doctype}->set_notation_node ($node);
1023 } else {
1024 !!!parse-error (type => 'duplicate notation decl', ## TODO: type
1025 value => $token->{name},
1026 token => $token);
1027
1028 ## TODO: syntax validation
1029 }
1030
1031 ## Stay in the mode.
1032 !!!next-token;
1033 next B;
1034 } elsif ($token->{type} == PI_TOKEN) {
1035 my $pi = $self->{document}->create_processing_instruction
1036 ($token->{target}, $token->{data});
1037 $self->{doctype}->append_child ($pi);
1038 ## TODO: line/col
1039
1040 ## Stay in the mode.
1041 !!!next-token;
1042 next B;
1043 } elsif ($token->{type} == END_OF_DOCTYPE_TOKEN) {
1044 $self->{insertion_mode} = BEFORE_ROOT_ELEMENT_IM;
1045 !!!next-token;
1046 return;
1047 } elsif ($token->{type} == ABORT_TOKEN) {
1048 return;
1049 } else {
1050 die "$0: XML parser subset im: Unknown token type $token->{type}";
1051 }
1052 } # B
1053
1054 } # _tree_in_subset
1055
1056 }
1057
1058 1;

admin@suikawiki.org
ViewVC Help
Powered by ViewVC 1.1.24