/[suikacvs]/markup/html/whatpm/Whatpm/XML/Parser.pm
Suika

Contents of /markup/html/whatpm/Whatpm/XML/Parser.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.22 - (show annotations) (download)
Sun Oct 19 14:05:20 2008 UTC (17 years, 8 months ago) by wakaba
Branch: MAIN
Changes since 1.21: +86 -78 lines
++ whatpm/t/xml/ChangeLog	19 Oct 2008 14:05:17 -0000
	* attlist-1.dat, eldecls-1.dat, entities-1.dat, entities-2.dat,
	notations-1.dat, pis-2.dat: Unexpanded parameter entity tests are
	added.

2008-10-19  Wakaba  <wakaba@suika.fam.cx>

++ whatpm/Whatpm/HTML/ChangeLog	19 Oct 2008 14:03:50 -0000
	* Tokenizer.pm.src: Set the "stop_processing" flag true when a
	parameter entity occurs in a standalone="no" document.

2008-10-19  Wakaba  <wakaba@suika.fam.cx>

++ whatpm/Whatpm/XML/ChangeLog	19 Oct 2008 14:04:25 -0000
	* Parser.pm.src: Don't process ATTLIST_TOKEN and ENTITY_TOKEN if
	the "stop_processing" flag is set.

2008-10-19  Wakaba  <wakaba@suika.fam.cx>

1 package Whatpm::XML::Parser;
2 use strict;
3
4 push our @ISA, 'Whatpm::HTML';
5 use Whatpm::HTML::Tokenizer qw/:token/;
6
7 sub parse_char_string ($$$;$$) {
8 #my ($self, $s, $doc, $onerror, $get_wrapper) = @_;
9 my $self = shift;
10 my $s = ref $_[0] ? $_[0] : \($_[0]);
11 require Whatpm::Charset::DecodeHandle;
12 my $input = Whatpm::Charset::DecodeHandle::CharString->new ($s);
13 return $self->parse_char_stream ($input, @_[1..$#_]);
14 } # parse_char_string
15
16 sub parse_char_stream ($$$;$$) {
17 my $self = ref $_[0] ? shift : shift->new;
18 my $input = $_[0];
19 $self->{document} = $_[1];
20 @{$self->{document}->child_nodes} = ();
21
22 ## NOTE: |set_inner_html| copies most of this method's code
23
24 $self->{confident} = 1 unless exists $self->{confident};
25 $self->{document}->input_encoding ($self->{input_encoding})
26 if defined $self->{input_encoding};
27 ## TODO: |{input_encoding}| is needless?
28
29 $self->{line_prev} = $self->{line} = 1;
30 $self->{column_prev} = -1;
31 $self->{column} = 0;
32 $self->{set_nc} = sub {
33 my $self = shift;
34
35 my $char = '';
36 if (defined $self->{next_nc}) {
37 $char = $self->{next_nc};
38 delete $self->{next_nc};
39 $self->{nc} = ord $char;
40 } else {
41 $self->{char_buffer} = '';
42 $self->{char_buffer_pos} = 0;
43
44 my $count = $input->manakai_read_until
45 ($self->{char_buffer}, qr/[^\x00\x0A\x0D]/, $self->{char_buffer_pos});
46 if ($count) {
47 $self->{line_prev} = $self->{line};
48 $self->{column_prev} = $self->{column};
49 $self->{column}++;
50 $self->{nc}
51 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
52 return;
53 }
54
55 if ($input->read ($char, 1)) {
56 $self->{nc} = ord $char;
57 } else {
58 $self->{nc} = -1;
59 return;
60 }
61 }
62
63 ($self->{line_prev}, $self->{column_prev})
64 = ($self->{line}, $self->{column});
65 $self->{column}++;
66
67 if ($self->{nc} == 0x000A) { # LF
68
69 $self->{line}++;
70 $self->{column} = 0;
71 } elsif ($self->{nc} == 0x000D) { # CR
72
73 ## TODO: support for abort/streaming
74 my $next = '';
75 if ($input->read ($next, 1) and $next ne "\x0A") {
76 $self->{next_nc} = $next;
77 }
78 $self->{nc} = 0x000A; # LF # MUST
79 $self->{line}++;
80 $self->{column} = 0;
81 } elsif ($self->{nc} == 0x0000) { # NULL
82
83 $self->{parse_error}->(level => $self->{level}->{must}, type => 'NULL');
84 $self->{nc} = 0xFFFD; # REPLACEMENT CHARACTER # MUST
85 }
86 };
87
88 $self->{read_until} = sub {
89 #my ($scalar, $specials_range, $offset) = @_;
90 return 0 if defined $self->{next_nc};
91
92 my $pattern = qr/[^$_[1]\x00\x0A\x0D]/;
93 my $offset = $_[2] || 0;
94
95 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
96 pos ($self->{char_buffer}) = $self->{char_buffer_pos};
97 if ($self->{char_buffer} =~ /\G(?>$pattern)+/) {
98 substr ($_[0], $offset)
99 = substr ($self->{char_buffer}, $-[0], $+[0] - $-[0]);
100 my $count = $+[0] - $-[0];
101 if ($count) {
102 $self->{column} += $count;
103 $self->{char_buffer_pos} += $count;
104 $self->{line_prev} = $self->{line};
105 $self->{column_prev} = $self->{column} - 1;
106 $self->{nc} = -1;
107 }
108 return $count;
109 } else {
110 return 0;
111 }
112 } else {
113 my $count = $input->manakai_read_until ($_[0], $pattern, $_[2]);
114 if ($count) {
115 $self->{column} += $count;
116 $self->{line_prev} = $self->{line};
117 $self->{column_prev} = $self->{column} - 1;
118 $self->{nc} = -1;
119 }
120 return $count;
121 }
122 }; # $self->{read_until}
123
124 my $onerror = $_[2] || sub {
125 my (%opt) = @_;
126 my $line = $opt{token} ? $opt{token}->{line} : $opt{line};
127 my $column = $opt{token} ? $opt{token}->{column} : $opt{column};
128 warn "Parse error ($opt{type}) at line $line column $column\n";
129 };
130 $self->{parse_error} = sub {
131 $onerror->(line => $self->{line}, column => $self->{column}, @_);
132 };
133
134 my $char_onerror = sub {
135 my (undef, $type, %opt) = @_;
136 $self->{parse_error}->(level => $self->{level}->{must}, layer => 'encode',
137 line => $self->{line}, column => $self->{column} + 1,
138 %opt, type => $type);
139 }; # $char_onerror
140
141 if ($_[3]) {
142 $input = $_[3]->($input);
143 $input->onerror ($char_onerror);
144 } else {
145 $input->onerror ($char_onerror) unless defined $input->onerror;
146 }
147
148 $self->_initialize_tokenizer;
149 $self->_initialize_tree_constructor;
150 $self->_construct_tree;
151 $self->_terminate_tree_constructor;
152
153 delete $self->{parse_error}; # remove loop
154
155 return $self->{document};
156 } # parse_char_stream
157
158 sub new ($) {
159 my $class = shift;
160 my $self = bless {
161 level => {must => 'm',
162 should => 's',
163 warn => 'w',
164 info => 'i',
165 uncertain => 'u'},
166 }, $class;
167 $self->{set_nc} = sub {
168 $self->{nc} = -1;
169 };
170 $self->{parse_error} = sub {
171 #
172 };
173 $self->{change_encoding} = sub {
174 # if ($_[0] is a supported encoding) {
175 # run "change the encoding" algorithm;
176 # throw Whatpm::HTML::RestartParser (charset => $new_encoding);
177 # }
178 };
179 $self->{application_cache_selection} = sub {
180 #
181 };
182
183 $self->{is_xml} = 1;
184
185 return $self;
186 } # new
187
188 sub _initialize_tree_constructor ($) {
189 my $self = shift;
190 ## NOTE: $self->{document} MUST be specified before this method is called
191 $self->{document}->strict_error_checking (0);
192 ## TODO: Turn mutation events off # MUST
193 $self->{document}->dom_config
194 ->{'http://suika.fam.cx/www/2006/dom-config/strict-document-children'}
195 = 0;
196 $self->{document}->manakai_is_html (0);
197 $self->{document}->set_user_data (manakai_source_line => 1);
198 $self->{document}->set_user_data (manakai_source_column => 1);
199
200 $self->{ge}->{'amp;'} = {value => '&', only_text => 1};
201 $self->{ge}->{'apos;'} = {value => "'", only_text => 1};
202 $self->{ge}->{'gt;'} = {value => '>', only_text => 1};
203 $self->{ge}->{'lt;'} = {value => '<', only_text => 1};
204 $self->{ge}->{'quot;'} = {value => '"', only_text => 1};
205 } # _initialize_tree_constructor
206
207 sub _terminate_tree_constructor ($) {
208 my $self = shift;
209 $self->{document}->strict_error_checking (1);
210 $self->{document}->dom_config
211 ->{'http://suika.fam.cx/www/2006/dom-config/strict-document-children'}
212 = 1;
213 ## TODO: Turn mutation events on
214 } # _terminate_tree_constructor
215
216 ## Tree construction stage
217
218
219 ## NOTE: Differences from the XML5 draft are marked as "XML5:".
220
221 ## XML5: No namespace support.
222
223 ## XML5: Start, main, end phases. In this implementation, they are
224 ## represented by insertion modes.
225
226 ## Insertion modes
227 sub INITIAL_IM () { 0 }
228 sub BEFORE_ROOT_ELEMENT_IM () { 1 }
229 sub IN_ELEMENT_IM () { 2 }
230 sub AFTER_ROOT_ELEMENT_IM () { 3 }
231 sub IN_SUBSET_IM () { 4 }
232
233 {
234 my $token; ## TODO: change to $self->{t}
235
236 sub _construct_tree ($) {
237 my ($self) = @_;
238
239 delete $self->{tainted};
240 $self->{open_elements} = [];
241 $self->{insertion_mode} = INITIAL_IM;
242
243 $token = $self->_get_next_token;
244
245 ## XML5: No support for the XML declaration
246 if ($token->{type} == PI_TOKEN and
247 $token->{target} eq 'xml' and
248 $token->{data} =~ /\Aversion[\x09\x0A\x20]*=[\x09\x0A\x20]*
249 (?>"([^"]*)"|'([^']*)')
250 (?:[\x09\x0A\x20]+
251 encoding[\x09\x0A\x20]*=[\x09\x0A\x20]*
252 (?>"([^"]*)"|'([^']*)')[\x09\x0A\x20]*)?
253 (?:[\x09\x0A\x20]+
254 standalone[\x09\x0A\x20]*=[\x09\x0A\x20]*
255 (?>"(yes|no)"|'(yes|no)'))?
256 [\x09\x0A\x20]*\z/x) {
257 $self->{document}->xml_version (defined $1 ? $1 : $2);
258 $self->{document}->xml_encoding (defined $3 ? $3 : $4); # possibly undef
259 $self->{document}->xml_standalone (($5 || $6 || 'no') ne 'no');
260
261 $token = $self->_get_next_token;
262 } else {
263 $self->{document}->xml_version ('1.0');
264 $self->{document}->xml_encoding (undef);
265 $self->{document}->xml_standalone (0);
266 }
267
268 while (1) {
269 if ($self->{insertion_mode} == IN_ELEMENT_IM) {
270 $self->_tree_in_element;
271 } elsif ($self->{insertion_mode} == IN_SUBSET_IM) {
272 $self->_tree_in_subset;
273 } elsif ($self->{insertion_mode} == AFTER_ROOT_ELEMENT_IM) {
274 $self->_tree_after_root_element;
275 } elsif ($self->{insertion_mode} == BEFORE_ROOT_ELEMENT_IM) {
276 $self->_tree_before_root_element;
277 } elsif ($self->{insertion_mode} == INITIAL_IM) {
278 $self->_tree_initial;
279 } else {
280 die "$0: Unknown XML insertion mode: $self->{insertion_mode}";
281 }
282
283 last if $token->{type} == ABORT_TOKEN;
284 }
285 } # _construct_tree
286
287 sub _tree_initial ($) {
288 my $self = shift;
289
290 B: while (1) {
291 if ($token->{type} == DOCTYPE_TOKEN) {
292 ## XML5: No "DOCTYPE" token.
293
294 my $doctype = $self->{document}->create_document_type_definition
295 (defined $token->{name} ? $token->{name} : '');
296
297 ## NOTE: Default value for both |public_id| and |system_id| attributes
298 ## are empty strings, so that we don't set any value in missing cases.
299 $doctype->public_id ($token->{pubid}) if defined $token->{pubid};
300 $doctype->system_id ($token->{sysid}) if defined $token->{sysid};
301
302 ## TODO: internal_subset
303
304 $self->{document}->append_child ($doctype);
305
306 $self->{ge} = {};
307
308 ## XML5: No "has internal subset" flag.
309 if ($token->{has_internal_subset}) {
310 $self->{doctype} = $doctype;
311 $self->{insertion_mode} = IN_SUBSET_IM;
312 } else {
313 $self->{insertion_mode} = BEFORE_ROOT_ELEMENT_IM;
314 }
315 $token = $self->_get_next_token;
316 return;
317 } elsif ($token->{type} == START_TAG_TOKEN or
318 $token->{type} == END_OF_FILE_TOKEN) {
319 $self->{insertion_mode} = BEFORE_ROOT_ELEMENT_IM;
320 ## Reprocess.
321 return;
322 } elsif ($token->{type} == COMMENT_TOKEN) {
323 my $comment = $self->{document}->create_comment ($token->{data});
324 $self->{document}->append_child ($comment);
325
326 ## Stay in the mode.
327 $token = $self->_get_next_token;
328 next B;
329 } elsif ($token->{type} == PI_TOKEN) {
330 my $pi = $self->{document}->create_processing_instruction
331 ($token->{target}, $token->{data});
332 $self->{document}->append_child ($pi);
333
334 ## Stay in the mode.
335 $token = $self->_get_next_token;
336 next B;
337 } elsif ($token->{type} == CHARACTER_TOKEN) {
338 if (not $self->{tainted} and
339 not $token->{has_reference} and
340 $token->{data} =~ s/^([\x09\x0A\x0C\x20]+)//) {
341 #
342 }
343
344 if (length $token->{data}) {
345 ## XML5: Ignore the token.
346
347 unless ($self->{tainted}) {
348 $self->{parse_error}->(level => $self->{level}->{must}, type => 'text outside of root element',
349 token => $token);
350 $self->{tainted} = 1;
351 }
352
353 $self->{document}->manakai_append_text ($token->{data});
354 }
355
356 ## Stay in the mode.
357 $token = $self->_get_next_token;
358 next B;
359 } elsif ($token->{type} == END_TAG_TOKEN) {
360 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unmatched end tag',
361 text => $token->{tag_name},
362 token => $token);
363 ## Ignore the token.
364
365 ## Stay in the mode.
366 $token = $self->_get_next_token;
367 next B;
368 } elsif ($token->{type} == ABORT_TOKEN) {
369 return;
370 } else {
371 die "$0: XML parser initial: Unknown token type $token->{type}";
372 }
373 } # B
374 } # _tree_initial
375
376 sub _tree_before_root_element ($) {
377 my $self = shift;
378
379 B: while (1) {
380 if ($token->{type} == START_TAG_TOKEN) {
381 my $nsmap = {
382 xml => q<http://www.w3.org/XML/1998/namespace>,
383 xmlns => q<http://www.w3.org/2000/xmlns/>,
384 };
385
386 for (keys %{$token->{attributes}}) {
387 if (/^xmlns:./s) {
388 my $prefix = substr $_, 6;
389 my $value = $token->{attributes}->{$_}->{value};
390 if ($prefix eq 'xml' or $prefix eq 'xmlns' or
391 $value eq q<http://www.w3.org/XML/1998/namespace> or
392 $value eq q<http://www.w3.org/2000/xmlns/>) {
393 ## NOTE: Error should be detected at the DOM layer.
394 #
395 } elsif (length $value) {
396 $nsmap->{$prefix} = $value;
397 } else {
398 delete $nsmap->{$prefix};
399 }
400 } elsif ($_ eq 'xmlns') {
401 my $value = $token->{attributes}->{$_}->{value};
402 if ($value eq q<http://www.w3.org/XML/1998/namespace> or
403 $value eq q<http://www.w3.org/2000/xmlns/>) {
404 ## NOTE: Error should be detected at the DOM layer.
405 #
406 } elsif (length $value) {
407 $nsmap->{''} = $value;
408 } else {
409 delete $nsmap->{''};
410 }
411 }
412 }
413
414 my $ns;
415 my ($prefix, $ln) = split /:/, $token->{tag_name}, 2;
416
417 if (defined $ln and $prefix ne '' and $ln ne '') { # prefixed
418 if (defined $nsmap->{$prefix}) {
419 $ns = $nsmap->{$prefix};
420 } else {
421 ($prefix, $ln) = (undef, $token->{tag_name});
422 }
423 } else {
424 $ns = $nsmap->{''} if $prefix ne '' and not defined $ln;
425 ($prefix, $ln) = (undef, $token->{tag_name});
426 }
427
428 my $el = $self->{document}->create_element_ns ($ns, [$prefix, $ln]);
429 $el->set_user_data (manakai_source_line => $token->{line});
430 $el->set_user_data (manakai_source_column => $token->{column});
431
432 my $has_attr;
433 for my $attr_name (sort {$token->{attributes}->{$a}->{index} <=>
434 $token->{attributes}->{$b}->{index}}
435 keys %{$token->{attributes}}) {
436 my $ns;
437 my ($p, $l) = split /:/, $attr_name, 2;
438
439 if ($attr_name eq 'xmlns:xmlns') {
440 ($p, $l) = (undef, $attr_name);
441 } elsif (defined $l and $p ne '' and $l ne '') { # prefixed
442 if (defined $nsmap->{$p}) {
443 $ns = $nsmap->{$p};
444 } else {
445 ## NOTE: Error should be detected at the DOM-layer.
446 ($p, $l) = (undef, $attr_name);
447 }
448 } else {
449 if ($attr_name eq 'xmlns') {
450 $ns = $nsmap->{xmlns};
451 }
452 ($p, $l) = (undef, $attr_name);
453 }
454
455 if ($has_attr->{defined $ns ? $ns : ''}->{$l}) {
456 $ns = undef;
457 ($p, $l) = (undef, $attr_name);
458 } else {
459 $has_attr->{defined $ns ? $ns : ''}->{$l} = 1;
460 }
461
462 my $attr_t = $token->{attributes}->{$attr_name};
463 my $attr = $self->{document}->create_attribute_ns ($ns, [$p, $l]);
464 $attr->value ($attr_t->{value});
465 $attr->set_user_data (manakai_source_line => $attr_t->{line});
466 $attr->set_user_data (manakai_source_column => $attr_t->{column});
467 $el->set_attribute_node_ns ($attr);
468 }
469
470 $self->{document}->append_child ($el);
471
472 if ($self->{self_closing}) {
473 delete $self->{self_closing};
474 $self->{insertion_mode} = AFTER_ROOT_ELEMENT_IM;
475 } else {
476 push @{$self->{open_elements}}, [$el, $token->{tag_name}, $nsmap];
477 $self->{insertion_mode} = IN_ELEMENT_IM;
478 }
479
480 #delete $self->{tainted};
481
482 $token = $self->_get_next_token;
483 return;
484 } elsif ($token->{type} == COMMENT_TOKEN) {
485 my $comment = $self->{document}->create_comment ($token->{data});
486 $self->{document}->append_child ($comment);
487
488 ## Stay in the mode.
489 $token = $self->_get_next_token;
490 next B;
491 } elsif ($token->{type} == PI_TOKEN) {
492 my $pi = $self->{document}->create_processing_instruction
493 ($token->{target}, $token->{data});
494 $self->{document}->append_child ($pi);
495
496 ## Stay in the mode.
497 $token = $self->_get_next_token;
498 next B;
499 } elsif ($token->{type} == CHARACTER_TOKEN) {
500 if (not $self->{tainted} and
501 not $token->{has_reference} and
502 $token->{data} =~ s/^([\x09\x0A\x0C\x20]+)//) {
503 #
504 }
505
506 if (length $token->{data}) {
507 ## XML5: Ignore the token.
508
509 unless ($self->{tainted}) {
510 $self->{parse_error}->(level => $self->{level}->{must}, type => 'text outside of root element',
511 token => $token);
512 $self->{tainted} = 1;
513 }
514
515 $self->{document}->manakai_append_text ($token->{data});
516 }
517
518 ## Stay in the mode.
519 $token = $self->_get_next_token;
520 next B;
521 } elsif ($token->{type} == END_OF_FILE_TOKEN) {
522 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no root element',
523 token => $token);
524
525 $self->{insertion_mode} = AFTER_ROOT_ELEMENT_IM;
526 ## Reprocess.
527 return;
528 } elsif ($token->{type} == END_TAG_TOKEN) {
529 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unmatched end tag',
530 text => $token->{tag_name},
531 token => $token);
532 ## Ignore the token.
533
534 ## Stay in the mode.
535 $token = $self->_get_next_token;
536 next B;
537 } elsif ($token->{type} == DOCTYPE_TOKEN) {
538 $self->{parse_error}->(level => $self->{level}->{must}, type => 'in html:#doctype',
539 token => $token);
540 ## Ignore the token.
541
542 ## Stay in the mode.
543 $token = $self->_get_next_token;
544 next B;
545 } elsif ($token->{type} == ABORT_TOKEN) {
546 return;
547 } else {
548 die "$0: XML parser initial: Unknown token type $token->{type}";
549 }
550 } # B
551 } # _tree_before_root_element
552
553 sub _tree_in_element ($) {
554 my $self = shift;
555
556 B: while (1) {
557 if ($token->{type} == CHARACTER_TOKEN) {
558 $self->{open_elements}->[-1]->[0]->manakai_append_text ($token->{data});
559
560 ## Stay in the mode.
561 $token = $self->_get_next_token;
562 next B;
563 } elsif ($token->{type} == START_TAG_TOKEN) {
564 my $nsmap = {%{$self->{open_elements}->[-1]->[2]}};
565
566 for (keys %{$token->{attributes}}) {
567 if (/^xmlns:./s) {
568 my $prefix = substr $_, 6;
569 my $value = $token->{attributes}->{$_}->{value};
570 if ($prefix eq 'xml' or $prefix eq 'xmlns' or
571 $value eq q<http://www.w3.org/XML/1998/namespace> or
572 $value eq q<http://www.w3.org/2000/xmlns/>) {
573 ## NOTE: Error should be detected at the DOM layer.
574 #
575 } elsif (length $value) {
576 $nsmap->{$prefix} = $value;
577 } else {
578 delete $nsmap->{$prefix};
579 }
580 } elsif ($_ eq 'xmlns') {
581 my $value = $token->{attributes}->{$_}->{value};
582 if ($value eq q<http://www.w3.org/XML/1998/namespace> or
583 $value eq q<http://www.w3.org/2000/xmlns/>) {
584 ## NOTE: Error should be detected at the DOM layer.
585 #
586 } elsif (length $value) {
587 $nsmap->{''} = $value;
588 } else {
589 delete $nsmap->{''};
590 }
591 }
592 }
593
594 my $ns;
595 my ($prefix, $ln) = split /:/, $token->{tag_name}, 2;
596
597 if (defined $ln and $prefix ne '' and $ln ne '') { # prefixed
598 if (defined $nsmap->{$prefix}) {
599 $ns = $nsmap->{$prefix};
600 } else {
601 ## NOTE: Error should be detected at the DOM layer.
602 ($prefix, $ln) = (undef, $token->{tag_name});
603 }
604 } else {
605 $ns = $nsmap->{''} if $prefix ne '' and not defined $ln;
606 ($prefix, $ln) = (undef, $token->{tag_name});
607 }
608
609 my $el = $self->{document}->create_element_ns ($ns, [$prefix, $ln]);
610 $el->set_user_data (manakai_source_line => $token->{line});
611 $el->set_user_data (manakai_source_column => $token->{column});
612
613 my $has_attr;
614 for my $attr_name (sort {$token->{attributes}->{$a}->{index} <=>
615 $token->{attributes}->{$b}->{index}}
616 keys %{$token->{attributes}}) {
617 my $ns;
618 my ($p, $l) = split /:/, $attr_name, 2;
619
620 if ($attr_name eq 'xmlns:xmlns') {
621 ($p, $l) = (undef, $attr_name);
622 } elsif (defined $l and $p ne '' and $l ne '') { # prefixed
623 if (defined $nsmap->{$p}) {
624 $ns = $nsmap->{$p};
625 } else {
626 ## NOTE: Error should be detected at the DOM-layer.
627 ($p, $l) = (undef, $attr_name);
628 }
629 } else {
630 if ($attr_name eq 'xmlns') {
631 $ns = $nsmap->{xmlns};
632 }
633 ($p, $l) = (undef, $attr_name);
634 }
635
636 if ($has_attr->{defined $ns ? $ns : ''}->{$l}) {
637 $ns = undef;
638 ($p, $l) = (undef, $attr_name);
639 } else {
640 $has_attr->{defined $ns ? $ns : ''}->{$l} = 1;
641 }
642
643 my $attr_t = $token->{attributes}->{$attr_name};
644 my $attr = $self->{document}->create_attribute_ns ($ns, [$p, $l]);
645 $attr->value ($attr_t->{value});
646 $attr->set_user_data (manakai_source_line => $attr_t->{line});
647 $attr->set_user_data (manakai_source_column => $attr_t->{column});
648 $el->set_attribute_node_ns ($attr);
649 }
650
651 $self->{open_elements}->[-1]->[0]->append_child ($el);
652
653 if ($self->{self_closing}) {
654 delete $self->{self_closing};
655 } else {
656 push @{$self->{open_elements}}, [$el, $token->{tag_name}, $nsmap];
657 }
658
659 ## Stay in the mode.
660 $token = $self->_get_next_token;
661 next B;
662 } elsif ($token->{type} == END_TAG_TOKEN) {
663 if ($token->{tag_name} eq '') {
664 ## Short end tag token.
665 pop @{$self->{open_elements}};
666 } elsif ($self->{open_elements}->[-1]->[1] eq $token->{tag_name}) {
667 pop @{$self->{open_elements}};
668 } else {
669 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unmatched end tag',
670 text => $token->{tag_name},
671 token => $token);
672
673 ## Has an element in scope
674 INSCOPE: for my $i (reverse 0..$#{$self->{open_elements}}) {
675 if ($self->{open_elements}->[$i]->[1] eq $token->{tag_name}) {
676 splice @{$self->{open_elements}}, $i;
677 last INSCOPE;
678 }
679 } # INSCOPE
680 }
681
682 unless (@{$self->{open_elements}}) {
683 $self->{insertion_mode} = AFTER_ROOT_ELEMENT_IM;
684 $token = $self->_get_next_token;
685 return;
686 } else {
687 ## Stay in the state.
688 $token = $self->_get_next_token;
689 redo B;
690 }
691 } elsif ($token->{type} == COMMENT_TOKEN) {
692 my $comment = $self->{document}->create_comment ($token->{data});
693 $self->{open_elements}->[-1]->[0]->append_child ($comment);
694
695 ## Stay in the mode.
696 $token = $self->_get_next_token;
697 next B;
698 } elsif ($token->{type} == PI_TOKEN) {
699 my $pi = $self->{document}->create_processing_instruction
700 ($token->{target}, $token->{data});
701 $self->{open_elements}->[-1]->[0]->append_child ($pi);
702
703 ## Stay in the mode.
704 $token = $self->_get_next_token;
705 next B;
706 } elsif ($token->{type} == END_OF_FILE_TOKEN) {
707 $self->{parse_error}->(level => $self->{level}->{must}, type => 'in body:#eof',
708 token => $token);
709
710 $self->{insertion_mode} = AFTER_ROOT_ELEMENT_IM;
711 $token = $self->_get_next_token;
712 return;
713 } elsif ($token->{type} == DOCTYPE_TOKEN) {
714 $self->{parse_error}->(level => $self->{level}->{must}, type => 'in html:#doctype',
715 token => $token);
716 ## Ignore the token.
717
718 ## Stay in the mode.
719 $token = $self->_get_next_token;
720 next B;
721 } elsif ($token->{type} == ABORT_TOKEN) {
722 return;
723 } else {
724 die "$0: XML parser initial: Unknown token type $token->{type}";
725 }
726 } # B
727 } # _tree_in_element
728
729 sub _tree_after_root_element ($) {
730 my $self = shift;
731
732 B: while (1) {
733 if ($token->{type} == START_TAG_TOKEN) {
734 $self->{parse_error}->(level => $self->{level}->{must}, type => 'second root element',
735 token => $token);
736
737 ## XML5: Ignore the token.
738
739 $self->{insertion_mode} = BEFORE_ROOT_ELEMENT_IM;
740 ## Reprocess.
741 return;
742 } elsif ($token->{type} == COMMENT_TOKEN) {
743 my $comment = $self->{document}->create_comment ($token->{data});
744 $self->{document}->append_child ($comment);
745
746 ## Stay in the mode.
747 $token = $self->_get_next_token;
748 next B;
749 } elsif ($token->{type} == PI_TOKEN) {
750 my $pi = $self->{document}->create_processing_instruction
751 ($token->{target}, $token->{data});
752 $self->{document}->append_child ($pi);
753
754 ## Stay in the mode.
755 $token = $self->_get_next_token;
756 next B;
757 } elsif ($token->{type} == CHARACTER_TOKEN) {
758 if (not $self->{tainted} and
759 not $token->{has_reference} and
760 $token->{data} =~ s/^([\x09\x0A\x0C\x20]+)//) {
761 #
762 }
763
764 if (length $token->{data}) {
765 ## XML5: Ignore the token.
766
767 unless ($self->{tainted}) {
768 $self->{parse_error}->(level => $self->{level}->{must}, type => 'text outside of root element',
769 token => $token);
770 $self->{tainted} = 1;
771 }
772
773 $self->{document}->manakai_append_text ($token->{data});
774 }
775
776 ## Stay in the mode.
777 $token = $self->_get_next_token;
778 next B;
779 } elsif ($token->{type} == END_OF_FILE_TOKEN) {
780 ## Stop parsing.
781
782 ## TODO: implement "stop parsing".
783
784 $token = {type => ABORT_TOKEN};
785 return;
786 } elsif ($token->{type} == END_TAG_TOKEN) {
787 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unmatched end tag',
788 text => $token->{tag_name},
789 token => $token);
790 ## Ignore the token.
791
792 ## Stay in the mode.
793 $token = $self->_get_next_token;
794 next B;
795 } elsif ($token->{type} == DOCTYPE_TOKEN) {
796 $self->{parse_error}->(level => $self->{level}->{must}, type => 'in html:#doctype',
797 token => $token);
798 ## Ignore the token.
799
800 ## Stay in the mode.
801 $token = $self->_get_next_token;
802 next B;
803 } elsif ($token->{type} == ABORT_TOKEN) {
804 return;
805 } else {
806 die "$0: XML parser initial: Unknown token type $token->{type}";
807 }
808 } # B
809 } # _tree_after_root_element
810
811 sub _tree_in_subset ($) {
812 my $self = shift;
813
814 B: while (1) {
815 if ($token->{type} == COMMENT_TOKEN) {
816 ## Ignore the token.
817
818 ## Stay in the state.
819 $token = $self->_get_next_token;
820 next B;
821 } elsif ($token->{type} == ELEMENT_TOKEN) {
822 unless ($self->{has_element_decl}->{$token->{name}}) {
823 my $node = $self->{doctype}->get_element_type_definition_node
824 ($token->{name});
825 unless ($node) {
826 $node = $self->{document}->create_element_type_definition
827 ($token->{name});
828 $self->{doctype}->set_element_type_definition_node ($node);
829 }
830
831 $node->set_user_data (manakai_source_line => $token->{line});
832 $node->set_user_data (manakai_source_column => $token->{column});
833
834 $node->content_model_text (join '', @{$token->{content}})
835 if $token->{content};
836 } else {
837 $self->{parse_error}->(level => $self->{level}->{must}, type => 'duplicate element decl', ## TODO: type
838 value => $token->{name},
839 token => $token);
840
841 ## TODO: $token->{content} syntax check.
842 }
843 $self->{has_element_decl}->{$token->{name}} = 1;
844
845 ## Stay in the mode.
846 $token = $self->_get_next_token;
847 next B;
848 } elsif ($token->{type} == ATTLIST_TOKEN) {
849 if ($self->{stop_processing}) {
850 ## TODO: syntax validation
851 } else {
852 my $ed = $self->{doctype}->get_element_type_definition_node
853 ($token->{name});
854 unless ($ed) {
855 $ed = $self->{document}->create_element_type_definition
856 ($token->{name});
857 $ed->set_user_data (manakai_source_line => $token->{line});
858 $ed->set_user_data (manakai_source_column => $token->{column});
859 $self->{doctype}->set_element_type_definition_node ($ed);
860 } elsif ($self->{has_attlist}->{$token->{name}}) {
861 $self->{parse_error}->(level => $self->{level}->{must}, type => 'duplicate attlist decl', ## TODO: type
862 value => $token->{name},
863 token => $token,
864 level => $self->{level}->{warn});
865 }
866 $self->{has_attlist}->{$token->{name}} = 1;
867
868 unless (@{$token->{attrdefs}}) {
869 $self->{parse_error}->(level => $self->{level}->{must}, type => 'empty attlist decl', ## TODO: type
870 value => $token->{name},
871 token => $token,
872 level => $self->{level}->{warn});
873 }
874
875 for my $at (@{$token->{attrdefs}}) {
876 unless ($ed->get_attribute_definition_node ($at->{name})) {
877 my $node = $self->{document}->create_attribute_definition
878 ($at->{name});
879 $node->set_user_data (manakai_source_line => $at->{line});
880 $node->set_user_data (manakai_source_column => $at->{column});
881
882 my $type = defined $at->{type} ? {
883 CDATA => 1, ID => 2, IDREF => 3, IDREFS => 4, ENTITY => 5,
884 ENTITIES => 6, NMTOKEN => 7, NMTOKENS => 8, NOTATION => 9,
885 }->{$at->{type}} : 10;
886 if (defined $type) {
887 $node->declared_type ($type);
888 } else {
889 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unknown declared type', ## TODO: type
890 value => $at->{type},
891 token => $at);
892 }
893
894 push @{$node->allowed_tokens}, @{$at->{tokens}};
895
896 my $default = defined $at->{default} ? {
897 FIXED => 1, REQUIRED => 2, IMPLIED => 3,
898 }->{$at->{default}} : 4;
899 if (defined $default) {
900 $node->default_type ($default);
901 if (defined $at->{value}) {
902 if ($default == 1 or $default == 4) {
903 #
904 } elsif (length $at->{value}) {
905 $self->{parse_error}->(level => $self->{level}->{must}, type => 'default value not allowed', ## TODO: type
906 token => $at);
907 }
908 } else {
909 if ($default == 1 or $default == 4) {
910 $self->{parse_error}->(level => $self->{level}->{must}, type => 'default value not provided', ## TODO: type
911 token => $at);
912 }
913 }
914 } else {
915 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unknown default type', ## TODO: type
916 value => $at->{default},
917 token => $at);
918 }
919
920 $node->text_content ($at->{value}) if defined $at->{value};
921
922 $ed->set_attribute_definition_node ($node);
923 } else {
924 $self->{parse_error}->(level => $self->{level}->{must}, type => 'duplicate attrdef', ## TODO: type
925 value => $at->{name},
926 token => $at,
927 level => $self->{level}->{warn});
928
929 ## TODO: syntax validation
930 }
931 } # $at
932 }
933
934 ## Stay in the mode.
935 $token = $self->_get_next_token;
936 next B;
937 } elsif ($token->{type} == GENERAL_ENTITY_TOKEN) {
938 if ($self->{stop_processing}) {
939 ## TODO: syntax validation
940 } elsif ({
941 amp => 1, apos => 1, quot => 1, lt => 1, gt => 1,
942 }->{$token->{name}}) {
943 if (not defined $token->{value} or
944 $token->{value} !~
945 {
946 amp => qr/\A&#(?:x0*26|0*38);\z/,
947 lt => qr/\A&#(?:x0*3[Cc]|0*60);\z/,
948 gt => qr/\A(?>&#(?:x0*3[Ee]|0*62);|>)\z/,
949 quot => qr/\A(?>&#(?:x0*22|0*34);|")\z/,
950 apos => qr/\A(?>&#(?:x0*27|0*39);|')\z/,
951 }->{$token->{name}}) {
952 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad predefined entity decl', ## TODO: type
953 value => $token->{name},
954 token => $token);
955 }
956
957 $self->{ge}->{$token->{name}.';'} = {name => $token->{name},
958 value => {
959 amp => '&',
960 lt => '<',
961 gt => '>',
962 quot => '"',
963 apos => "'",
964 }->{$token->{name}},
965 only_text => 1};
966 } elsif (not $self->{ge}->{$token->{name}.';'}) {
967 ## For parser.
968 $self->{ge}->{$token->{name}.';'} = $token;
969 if (defined $token->{value} and
970 $token->{value} !~ /[&<]/) {
971 $token->{only_text} = 1;
972 }
973
974 ## For DOM.
975 if (defined $token->{notation}) {
976 my $node = $self->{document}->create_general_entity ($token->{name});
977 $node->set_user_data (manakai_source_line => $token->{line});
978 $node->set_user_data (manakai_source_column => $token->{column});
979
980 $node->public_id ($token->{pubid}); # may be undef
981 $node->system_id ($token->{sysid}); # may be undef
982 $node->notation_name ($token->{notation});
983
984 $self->{doctype}->set_general_entity_node ($node);
985 } else {
986 ## TODO: syntax validation
987 }
988 } else {
989 $self->{parse_error}->(level => $self->{level}->{must}, type => 'duplicate general entity decl', ## TODO: type
990 value => $token->{name},
991 token => $token,
992 level => $self->{level}->{warn});
993
994 ## TODO: syntax validation
995 }
996
997 ## Stay in the mode.
998 $token = $self->_get_next_token;
999 next B;
1000 } elsif ($token->{type} == PARAMETER_ENTITY_TOKEN) {
1001 if ($self->{stop_processing}) {
1002 ## TODO: syntax validation
1003 } elsif (not $self->{pe}->{$token->{name}}) {
1004 ## For parser.
1005 $self->{pe}->{$token->{name}} = $token;
1006
1007 ## TODO: syntax validation
1008 } else {
1009 $self->{parse_error}->(level => $self->{level}->{must}, type => 'duplicate para entity decl', ## TODO: type
1010 value => $token->{name},
1011 token => $token,
1012 level => $self->{level}->{warn});
1013
1014 ## TODO: syntax validation
1015 }
1016
1017 ## Stay in the mode.
1018 $token = $self->_get_next_token;
1019 next B;
1020 } elsif ($token->{type} == NOTATION_TOKEN) {
1021 unless ($self->{doctype}->get_notation_node
1022 ($token->{name})) {
1023 my $node = $self->{document}->create_notation ($token->{name});
1024 $node->set_user_data (manakai_source_line => $token->{line});
1025 $node->set_user_data (manakai_source_column => $token->{column});
1026
1027 $node->public_id ($token->{pubid}); # may be undef
1028 $node->system_id ($token->{sysid}); # may be undef
1029
1030 $self->{doctype}->set_notation_node ($node);
1031 } else {
1032 $self->{parse_error}->(level => $self->{level}->{must}, type => 'duplicate notation decl', ## TODO: type
1033 value => $token->{name},
1034 token => $token);
1035
1036 ## TODO: syntax validation
1037 }
1038
1039 ## Stay in the mode.
1040 $token = $self->_get_next_token;
1041 next B;
1042 } elsif ($token->{type} == PI_TOKEN) {
1043 my $pi = $self->{document}->create_processing_instruction
1044 ($token->{target}, $token->{data});
1045 $self->{doctype}->append_child ($pi);
1046 ## TODO: line/col
1047
1048 ## Stay in the mode.
1049 $token = $self->_get_next_token;
1050 next B;
1051 } elsif ($token->{type} == END_OF_DOCTYPE_TOKEN) {
1052 $self->{insertion_mode} = BEFORE_ROOT_ELEMENT_IM;
1053 $token = $self->_get_next_token;
1054 return;
1055 } elsif ($token->{type} == ABORT_TOKEN) {
1056 return;
1057 } else {
1058 die "$0: XML parser subset im: Unknown token type $token->{type}";
1059 }
1060 } # B
1061
1062 } # _tree_in_subset
1063
1064 }
1065
1066 1;

admin@suikawiki.org
ViewVC Help
Powered by ViewVC 1.1.24