/[suikacvs]/markup/html/whatpm/Whatpm/XML/Parser.pm.src
Suika

Contents of /markup/html/whatpm/Whatpm/XML/Parser.pm.src

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.23 - (show annotations) (download) (as text)
Mon Oct 20 04:21:18 2008 UTC (16 years, 9 months ago) by wakaba
Branch: MAIN
CVS Tags: HEAD
Changes since 1.22: +94 -16 lines
File MIME type: application/x-wais-source
++ whatpm/t/ChangeLog	20 Oct 2008 04:21:10 -0000
2008-10-20  Wakaba  <wakaba@suika.fam.cx>

	* XML-Parser.t: "xml/attrs-2.dat" added.

++ whatpm/t/xml/ChangeLog	20 Oct 2008 04:17:22 -0000
	* attrs-2.dat: New test data file.

2008-10-20  Wakaba  <wakaba@suika.fam.cx>

++ whatpm/Whatpm/ChangeLog	20 Oct 2008 04:19:50 -0000
2008-10-20  Wakaba  <wakaba@suika.fam.cx>

	* NanoDOM.pm (specified, all_declarations_processed,
	manakai_attribute_type): New attributes.

++ whatpm/Whatpm/XML/ChangeLog	20 Oct 2008 04:20:35 -0000
2008-10-20  Wakaba  <wakaba@suika.fam.cx>

	* Parser.pm.src: Support for attribute type assignments, attribute
	value tokenization, and default value assignments.

1 package Whatpm::XML::Parser;
2 use strict;
3
4 push our @ISA, 'Whatpm::HTML';
5 use Whatpm::HTML::Tokenizer qw/:token/;
6
7 sub parse_char_string ($$$;$$) {
8 #my ($self, $s, $doc, $onerror, $get_wrapper) = @_;
9 my $self = shift;
10 my $s = ref $_[0] ? $_[0] : \($_[0]);
11 require Whatpm::Charset::DecodeHandle;
12 my $input = Whatpm::Charset::DecodeHandle::CharString->new ($s);
13 return $self->parse_char_stream ($input, @_[1..$#_]);
14 } # parse_char_string
15
16 sub parse_char_stream ($$$;$$) {
17 my $self = ref $_[0] ? shift : shift->new;
18 my $input = $_[0];
19 $self->{document} = $_[1];
20 @{$self->{document}->child_nodes} = ();
21
22 ## NOTE: |set_inner_html| copies most of this method's code
23
24 $self->{confident} = 1 unless exists $self->{confident};
25 $self->{document}->input_encoding ($self->{input_encoding})
26 if defined $self->{input_encoding};
27 ## TODO: |{input_encoding}| is needless?
28
29 $self->{line_prev} = $self->{line} = 1;
30 $self->{column_prev} = -1;
31 $self->{column} = 0;
32 $self->{set_nc} = sub {
33 my $self = shift;
34
35 my $char = '';
36 if (defined $self->{next_nc}) {
37 $char = $self->{next_nc};
38 delete $self->{next_nc};
39 $self->{nc} = ord $char;
40 } else {
41 $self->{char_buffer} = '';
42 $self->{char_buffer_pos} = 0;
43
44 my $count = $input->manakai_read_until
45 ($self->{char_buffer}, qr/[^\x00\x0A\x0D]/, $self->{char_buffer_pos});
46 if ($count) {
47 $self->{line_prev} = $self->{line};
48 $self->{column_prev} = $self->{column};
49 $self->{column}++;
50 $self->{nc}
51 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1);
52 return;
53 }
54
55 if ($input->read ($char, 1)) {
56 $self->{nc} = ord $char;
57 } else {
58 $self->{nc} = -1;
59 return;
60 }
61 }
62
63 ($self->{line_prev}, $self->{column_prev})
64 = ($self->{line}, $self->{column});
65 $self->{column}++;
66
67 if ($self->{nc} == 0x000A) { # LF
68 !!!cp ('j1');
69 $self->{line}++;
70 $self->{column} = 0;
71 } elsif ($self->{nc} == 0x000D) { # CR
72 !!!cp ('j2');
73 ## TODO: support for abort/streaming
74 my $next = '';
75 if ($input->read ($next, 1) and $next ne "\x0A") {
76 $self->{next_nc} = $next;
77 }
78 $self->{nc} = 0x000A; # LF # MUST
79 $self->{line}++;
80 $self->{column} = 0;
81 } elsif ($self->{nc} == 0x0000) { # NULL
82 !!!cp ('j4');
83 !!!parse-error (type => 'NULL');
84 $self->{nc} = 0xFFFD; # REPLACEMENT CHARACTER # MUST
85 }
86 };
87
88 $self->{read_until} = sub {
89 #my ($scalar, $specials_range, $offset) = @_;
90 return 0 if defined $self->{next_nc};
91
92 my $pattern = qr/[^$_[1]\x00\x0A\x0D]/;
93 my $offset = $_[2] || 0;
94
95 if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
96 pos ($self->{char_buffer}) = $self->{char_buffer_pos};
97 if ($self->{char_buffer} =~ /\G(?>$pattern)+/) {
98 substr ($_[0], $offset)
99 = substr ($self->{char_buffer}, $-[0], $+[0] - $-[0]);
100 my $count = $+[0] - $-[0];
101 if ($count) {
102 $self->{column} += $count;
103 $self->{char_buffer_pos} += $count;
104 $self->{line_prev} = $self->{line};
105 $self->{column_prev} = $self->{column} - 1;
106 $self->{nc} = -1;
107 }
108 return $count;
109 } else {
110 return 0;
111 }
112 } else {
113 my $count = $input->manakai_read_until ($_[0], $pattern, $_[2]);
114 if ($count) {
115 $self->{column} += $count;
116 $self->{line_prev} = $self->{line};
117 $self->{column_prev} = $self->{column} - 1;
118 $self->{nc} = -1;
119 }
120 return $count;
121 }
122 }; # $self->{read_until}
123
124 my $onerror = $_[2] || sub {
125 my (%opt) = @_;
126 my $line = $opt{token} ? $opt{token}->{line} : $opt{line};
127 my $column = $opt{token} ? $opt{token}->{column} : $opt{column};
128 warn "Parse error ($opt{type}) at line $line column $column\n";
129 };
130 $self->{parse_error} = sub {
131 $onerror->(line => $self->{line}, column => $self->{column}, @_);
132 };
133
134 my $char_onerror = sub {
135 my (undef, $type, %opt) = @_;
136 !!!parse-error (layer => 'encode',
137 line => $self->{line}, column => $self->{column} + 1,
138 %opt, type => $type);
139 }; # $char_onerror
140
141 if ($_[3]) {
142 $input = $_[3]->($input);
143 $input->onerror ($char_onerror);
144 } else {
145 $input->onerror ($char_onerror) unless defined $input->onerror;
146 }
147
148 $self->_initialize_tokenizer;
149 $self->_initialize_tree_constructor;
150 $self->_construct_tree;
151 $self->_terminate_tree_constructor;
152
153 delete $self->{parse_error}; # remove loop
154
155 return $self->{document};
156 } # parse_char_stream
157
158 sub new ($) {
159 my $class = shift;
160 my $self = bless {
161 level => {must => 'm',
162 should => 's',
163 warn => 'w',
164 info => 'i',
165 uncertain => 'u'},
166 }, $class;
167 $self->{set_nc} = sub {
168 $self->{nc} = -1;
169 };
170 $self->{parse_error} = sub {
171 #
172 };
173 $self->{change_encoding} = sub {
174 # if ($_[0] is a supported encoding) {
175 # run "change the encoding" algorithm;
176 # throw Whatpm::HTML::RestartParser (charset => $new_encoding);
177 # }
178 };
179 $self->{application_cache_selection} = sub {
180 #
181 };
182
183 $self->{is_xml} = 1;
184
185 return $self;
186 } # new
187
188 sub _initialize_tree_constructor ($) {
189 my $self = shift;
190 ## NOTE: $self->{document} MUST be specified before this method is called
191 $self->{document}->strict_error_checking (0);
192 ## TODO: Turn mutation events off # MUST
193 $self->{document}->dom_config
194 ->{'http://suika.fam.cx/www/2006/dom-config/strict-document-children'}
195 = 0;
196 $self->{document}->manakai_is_html (0);
197 $self->{document}->set_user_data (manakai_source_line => 1);
198 $self->{document}->set_user_data (manakai_source_column => 1);
199
200 $self->{ge}->{'amp;'} = {value => '&', only_text => 1};
201 $self->{ge}->{'apos;'} = {value => "'", only_text => 1};
202 $self->{ge}->{'gt;'} = {value => '>', only_text => 1};
203 $self->{ge}->{'lt;'} = {value => '<', only_text => 1};
204 $self->{ge}->{'quot;'} = {value => '"', only_text => 1};
205 } # _initialize_tree_constructor
206
207 sub _terminate_tree_constructor ($) {
208 my $self = shift;
209 $self->{document}->strict_error_checking (1);
210 $self->{document}->dom_config
211 ->{'http://suika.fam.cx/www/2006/dom-config/strict-document-children'}
212 = 1;
213 ## TODO: Turn mutation events on
214 } # _terminate_tree_constructor
215
216 ## Tree construction stage
217
218
219 ## NOTE: Differences from the XML5 draft are marked as "XML5:".
220
221 ## XML5: No namespace support.
222
223 ## XML5: Start, main, end phases. In this implementation, they are
224 ## represented by insertion modes.
225
226 ## Insertion modes
227 sub INITIAL_IM () { 0 }
228 sub BEFORE_ROOT_ELEMENT_IM () { 1 }
229 sub IN_ELEMENT_IM () { 2 }
230 sub AFTER_ROOT_ELEMENT_IM () { 3 }
231 sub IN_SUBSET_IM () { 4 }
232
233 {
234 my $token; ## TODO: change to $self->{t}
235
236 sub _construct_tree ($) {
237 my ($self) = @_;
238
239 delete $self->{tainted};
240 $self->{open_elements} = [];
241 $self->{insertion_mode} = INITIAL_IM;
242
243 !!!next-token;
244
245 ## XML5: No support for the XML declaration
246 if ($token->{type} == PI_TOKEN and
247 $token->{target} eq 'xml' and
248 $token->{data} =~ /\Aversion[\x09\x0A\x20]*=[\x09\x0A\x20]*
249 (?>"([^"]*)"|'([^']*)')
250 (?:[\x09\x0A\x20]+
251 encoding[\x09\x0A\x20]*=[\x09\x0A\x20]*
252 (?>"([^"]*)"|'([^']*)')[\x09\x0A\x20]*)?
253 (?:[\x09\x0A\x20]+
254 standalone[\x09\x0A\x20]*=[\x09\x0A\x20]*
255 (?>"(yes|no)"|'(yes|no)'))?
256 [\x09\x0A\x20]*\z/x) {
257 $self->{document}->xml_version (defined $1 ? $1 : $2);
258 $self->{document}->xml_encoding (defined $3 ? $3 : $4); # possibly undef
259 $self->{document}->xml_standalone (($5 || $6 || 'no') ne 'no');
260
261 !!!next-token;
262 } else {
263 $self->{document}->xml_version ('1.0');
264 $self->{document}->xml_encoding (undef);
265 $self->{document}->xml_standalone (0);
266 }
267
268 while (1) {
269 if ($self->{insertion_mode} == IN_ELEMENT_IM) {
270 $self->_tree_in_element;
271 } elsif ($self->{insertion_mode} == IN_SUBSET_IM) {
272 $self->_tree_in_subset;
273 } elsif ($self->{insertion_mode} == AFTER_ROOT_ELEMENT_IM) {
274 $self->_tree_after_root_element;
275 } elsif ($self->{insertion_mode} == BEFORE_ROOT_ELEMENT_IM) {
276 $self->_tree_before_root_element;
277 } elsif ($self->{insertion_mode} == INITIAL_IM) {
278 $self->_tree_initial;
279 } else {
280 die "$0: Unknown XML insertion mode: $self->{insertion_mode}";
281 }
282
283 last if $token->{type} == ABORT_TOKEN;
284 }
285 } # _construct_tree
286
287 sub _tree_initial ($) {
288 my $self = shift;
289
290 B: while (1) {
291 if ($token->{type} == DOCTYPE_TOKEN) {
292 ## XML5: No "DOCTYPE" token.
293
294 my $doctype = $self->{document}->create_document_type_definition
295 (defined $token->{name} ? $token->{name} : '');
296
297 ## NOTE: Default value for both |public_id| and |system_id| attributes
298 ## are empty strings, so that we don't set any value in missing cases.
299 $doctype->public_id ($token->{pubid}) if defined $token->{pubid};
300 $doctype->system_id ($token->{sysid}) if defined $token->{sysid};
301
302 ## TODO: internal_subset
303
304 $self->{document}->append_child ($doctype);
305
306 $self->{ge} = {};
307
308 ## XML5: No "has internal subset" flag.
309 if ($token->{has_internal_subset}) {
310 $self->{doctype} = $doctype;
311 $self->{insertion_mode} = IN_SUBSET_IM;
312 } else {
313 $self->{insertion_mode} = BEFORE_ROOT_ELEMENT_IM;
314 }
315 !!!next-token;
316 return;
317 } elsif ($token->{type} == START_TAG_TOKEN or
318 $token->{type} == END_OF_FILE_TOKEN) {
319 $self->{insertion_mode} = BEFORE_ROOT_ELEMENT_IM;
320 ## Reprocess.
321 return;
322 } elsif ($token->{type} == COMMENT_TOKEN) {
323 my $comment = $self->{document}->create_comment ($token->{data});
324 $self->{document}->append_child ($comment);
325
326 ## Stay in the mode.
327 !!!next-token;
328 next B;
329 } elsif ($token->{type} == PI_TOKEN) {
330 my $pi = $self->{document}->create_processing_instruction
331 ($token->{target}, $token->{data});
332 $self->{document}->append_child ($pi);
333
334 ## Stay in the mode.
335 !!!next-token;
336 next B;
337 } elsif ($token->{type} == CHARACTER_TOKEN) {
338 if (not $self->{tainted} and
339 not $token->{has_reference} and
340 $token->{data} =~ s/^([\x09\x0A\x0C\x20]+)//) {
341 #
342 }
343
344 if (length $token->{data}) {
345 ## XML5: Ignore the token.
346
347 unless ($self->{tainted}) {
348 !!!parse-error (type => 'text outside of root element',
349 token => $token);
350 $self->{tainted} = 1;
351 }
352
353 $self->{document}->manakai_append_text ($token->{data});
354 }
355
356 ## Stay in the mode.
357 !!!next-token;
358 next B;
359 } elsif ($token->{type} == END_TAG_TOKEN) {
360 !!!parse-error (type => 'unmatched end tag',
361 text => $token->{tag_name},
362 token => $token);
363 ## Ignore the token.
364
365 ## Stay in the mode.
366 !!!next-token;
367 next B;
368 } elsif ($token->{type} == ABORT_TOKEN) {
369 return;
370 } else {
371 die "$0: XML parser initial: Unknown token type $token->{type}";
372 }
373 } # B
374 } # _tree_initial
375
376 sub _tree_before_root_element ($) {
377 my $self = shift;
378
379 B: while (1) {
380 if ($token->{type} == START_TAG_TOKEN) {
381 my $nsmap = {
382 xml => q<http://www.w3.org/XML/1998/namespace>,
383 xmlns => q<http://www.w3.org/2000/xmlns/>,
384 };
385
386 my $attrs = $token->{attributes};
387 my $attrdefs = $self->{attrdef}->{$token->{tag_name}};
388 for my $attr_name (keys %{$attrdefs}) {
389 if ($attrs->{$attr_name}) {
390 $attrs->{$attr_name}->{type} = $attrdefs->{$attr_name}->{type} || 0;
391 if ($attrdefs->{$attr_name}->{tokenize}) {
392 $attrs->{$attr_name}->{value} =~ s/ +/ /g;
393 $attrs->{$attr_name}->{value} =~ s/\A //;
394 $attrs->{$attr_name}->{value} =~ s/ \z//;
395 }
396 } elsif (defined $attrdefs->{$attr_name}->{default}) {
397 $attrs->{$attr_name} = {
398 value => $attrdefs->{$attr_name}->{default},
399 type => $attrdefs->{$attr_name}->{type} || 0,
400 not_specified => 1,
401 line => $attrdefs->{$attr_name}->{line},
402 column => $attrdefs->{$attr_name}->{column},
403 index => 1 + keys %{$attrs},
404 };
405 }
406 }
407
408 for (keys %{$attrs}) {
409 if (/^xmlns:./s) {
410 my $prefix = substr $_, 6;
411 my $value = $attrs->{$_}->{value};
412 if ($prefix eq 'xml' or $prefix eq 'xmlns' or
413 $value eq q<http://www.w3.org/XML/1998/namespace> or
414 $value eq q<http://www.w3.org/2000/xmlns/>) {
415 ## NOTE: Error should be detected at the DOM layer.
416 #
417 } elsif (length $value) {
418 $nsmap->{$prefix} = $value;
419 } else {
420 delete $nsmap->{$prefix};
421 }
422 } elsif ($_ eq 'xmlns') {
423 my $value = $attrs->{$_}->{value};
424 if ($value eq q<http://www.w3.org/XML/1998/namespace> or
425 $value eq q<http://www.w3.org/2000/xmlns/>) {
426 ## NOTE: Error should be detected at the DOM layer.
427 #
428 } elsif (length $value) {
429 $nsmap->{''} = $value;
430 } else {
431 delete $nsmap->{''};
432 }
433 }
434 }
435
436 my $ns;
437 my ($prefix, $ln) = split /:/, $token->{tag_name}, 2;
438
439 if (defined $ln and $prefix ne '' and $ln ne '') { # prefixed
440 if (defined $nsmap->{$prefix}) {
441 $ns = $nsmap->{$prefix};
442 } else {
443 ($prefix, $ln) = (undef, $token->{tag_name});
444 }
445 } else {
446 $ns = $nsmap->{''} if $prefix ne '' and not defined $ln;
447 ($prefix, $ln) = (undef, $token->{tag_name});
448 }
449
450 my $el = $self->{document}->create_element_ns ($ns, [$prefix, $ln]);
451 $el->set_user_data (manakai_source_line => $token->{line});
452 $el->set_user_data (manakai_source_column => $token->{column});
453
454 my $has_attr;
455 for my $attr_name (sort {$attrs->{$a}->{index} <=> $attrs->{$b}->{index}}
456 keys %{$attrs}) {
457 my $ns;
458 my ($p, $l) = split /:/, $attr_name, 2;
459
460 if ($attr_name eq 'xmlns:xmlns') {
461 ($p, $l) = (undef, $attr_name);
462 } elsif (defined $l and $p ne '' and $l ne '') { # prefixed
463 if (defined $nsmap->{$p}) {
464 $ns = $nsmap->{$p};
465 } else {
466 ## NOTE: Error should be detected at the DOM-layer.
467 ($p, $l) = (undef, $attr_name);
468 }
469 } else {
470 if ($attr_name eq 'xmlns') {
471 $ns = $nsmap->{xmlns};
472 }
473 ($p, $l) = (undef, $attr_name);
474 }
475
476 if ($has_attr->{defined $ns ? $ns : ''}->{$l}) {
477 $ns = undef;
478 ($p, $l) = (undef, $attr_name);
479 } else {
480 $has_attr->{defined $ns ? $ns : ''}->{$l} = 1;
481 }
482
483 my $attr_t = $attrs->{$attr_name};
484 my $attr = $self->{document}->create_attribute_ns ($ns, [$p, $l]);
485 $attr->value ($attr_t->{value});
486 if (defined $attr_t->{type}) {
487 $attr->manakai_attribute_type ($attr_t->{type});
488 } elsif ($self->{document}->all_declarations_processed) {
489 $attr->manakai_attribute_type (0); # no value
490 } else {
491 $attr->manakai_attribute_type (11); # unknown
492 }
493 $attr->set_user_data (manakai_source_line => $attr_t->{line});
494 $attr->set_user_data (manakai_source_column => $attr_t->{column});
495 $el->set_attribute_node_ns ($attr);
496 $attr->specified (0) if $attr_t->{not_specified};
497 }
498
499 $self->{document}->append_child ($el);
500
501 if ($self->{self_closing}) {
502 !!!ack ('ack');
503 $self->{insertion_mode} = AFTER_ROOT_ELEMENT_IM;
504 } else {
505 push @{$self->{open_elements}}, [$el, $token->{tag_name}, $nsmap];
506 $self->{insertion_mode} = IN_ELEMENT_IM;
507 }
508
509 #delete $self->{tainted};
510
511 !!!next-token;
512 return;
513 } elsif ($token->{type} == COMMENT_TOKEN) {
514 my $comment = $self->{document}->create_comment ($token->{data});
515 $self->{document}->append_child ($comment);
516
517 ## Stay in the mode.
518 !!!next-token;
519 next B;
520 } elsif ($token->{type} == PI_TOKEN) {
521 my $pi = $self->{document}->create_processing_instruction
522 ($token->{target}, $token->{data});
523 $self->{document}->append_child ($pi);
524
525 ## Stay in the mode.
526 !!!next-token;
527 next B;
528 } elsif ($token->{type} == CHARACTER_TOKEN) {
529 if (not $self->{tainted} and
530 not $token->{has_reference} and
531 $token->{data} =~ s/^([\x09\x0A\x0C\x20]+)//) {
532 #
533 }
534
535 if (length $token->{data}) {
536 ## XML5: Ignore the token.
537
538 unless ($self->{tainted}) {
539 !!!parse-error (type => 'text outside of root element',
540 token => $token);
541 $self->{tainted} = 1;
542 }
543
544 $self->{document}->manakai_append_text ($token->{data});
545 }
546
547 ## Stay in the mode.
548 !!!next-token;
549 next B;
550 } elsif ($token->{type} == END_OF_FILE_TOKEN) {
551 !!!parse-error (type => 'no root element',
552 token => $token);
553
554 $self->{insertion_mode} = AFTER_ROOT_ELEMENT_IM;
555 ## Reprocess.
556 return;
557 } elsif ($token->{type} == END_TAG_TOKEN) {
558 !!!parse-error (type => 'unmatched end tag',
559 text => $token->{tag_name},
560 token => $token);
561 ## Ignore the token.
562
563 ## Stay in the mode.
564 !!!next-token;
565 next B;
566 } elsif ($token->{type} == DOCTYPE_TOKEN) {
567 !!!parse-error (type => 'in html:#doctype',
568 token => $token);
569 ## Ignore the token.
570
571 ## Stay in the mode.
572 !!!next-token;
573 next B;
574 } elsif ($token->{type} == ABORT_TOKEN) {
575 return;
576 } else {
577 die "$0: XML parser initial: Unknown token type $token->{type}";
578 }
579 } # B
580 } # _tree_before_root_element
581
582 sub _tree_in_element ($) {
583 my $self = shift;
584
585 B: while (1) {
586 if ($token->{type} == CHARACTER_TOKEN) {
587 $self->{open_elements}->[-1]->[0]->manakai_append_text ($token->{data});
588
589 ## Stay in the mode.
590 !!!next-token;
591 next B;
592 } elsif ($token->{type} == START_TAG_TOKEN) {
593 my $nsmap = {%{$self->{open_elements}->[-1]->[2]}};
594
595 my $attrs = $token->{attributes};
596 my $attrdefs = $self->{attrdef}->{$token->{tag_name}};
597 for my $attr_name (keys %{$attrdefs}) {
598 if ($attrs->{$attr_name}) {
599 $attrs->{$attr_name}->{type} = $attrdefs->{$attr_name}->{type} || 0;
600 if ($attrdefs->{$attr_name}->{tokenize}) {
601 $attrs->{$attr_name}->{value} =~ s/ +/ /g;
602 $attrs->{$attr_name}->{value} =~ s/\A //;
603 $attrs->{$attr_name}->{value} =~ s/ \z//;
604 }
605 } elsif (defined $attrdefs->{$attr_name}->{default}) {
606 $attrs->{$attr_name} = {
607 value => $attrdefs->{$attr_name}->{default},
608 type => $attrdefs->{$attr_name}->{type} || 0,
609 not_specified => 1,
610 line => $attrdefs->{$attr_name}->{line},
611 column => $attrdefs->{$attr_name}->{column},
612 index => 1 + keys %{$attrs},
613 };
614 }
615 }
616
617 for (keys %{$attrs}) {
618 if (/^xmlns:./s) {
619 my $prefix = substr $_, 6;
620 my $value = $attrs->{$_}->{value};
621 if ($prefix eq 'xml' or $prefix eq 'xmlns' or
622 $value eq q<http://www.w3.org/XML/1998/namespace> or
623 $value eq q<http://www.w3.org/2000/xmlns/>) {
624 ## NOTE: Error should be detected at the DOM layer.
625 #
626 } elsif (length $value) {
627 $nsmap->{$prefix} = $value;
628 } else {
629 delete $nsmap->{$prefix};
630 }
631 } elsif ($_ eq 'xmlns') {
632 my $value = $attrs->{$_}->{value};
633 if ($value eq q<http://www.w3.org/XML/1998/namespace> or
634 $value eq q<http://www.w3.org/2000/xmlns/>) {
635 ## NOTE: Error should be detected at the DOM layer.
636 #
637 } elsif (length $value) {
638 $nsmap->{''} = $value;
639 } else {
640 delete $nsmap->{''};
641 }
642 }
643 }
644
645 my $ns;
646 my ($prefix, $ln) = split /:/, $token->{tag_name}, 2;
647
648 if (defined $ln and $prefix ne '' and $ln ne '') { # prefixed
649 if (defined $nsmap->{$prefix}) {
650 $ns = $nsmap->{$prefix};
651 } else {
652 ## NOTE: Error should be detected at the DOM layer.
653 ($prefix, $ln) = (undef, $token->{tag_name});
654 }
655 } else {
656 $ns = $nsmap->{''} if $prefix ne '' and not defined $ln;
657 ($prefix, $ln) = (undef, $token->{tag_name});
658 }
659
660 my $el = $self->{document}->create_element_ns ($ns, [$prefix, $ln]);
661 $el->set_user_data (manakai_source_line => $token->{line});
662 $el->set_user_data (manakai_source_column => $token->{column});
663
664 my $has_attr;
665 for my $attr_name (sort {$attrs->{$a}->{index} <=> $attrs->{$b}->{index}}
666 keys %{$attrs}) {
667 my $ns;
668 my ($p, $l) = split /:/, $attr_name, 2;
669
670 if ($attr_name eq 'xmlns:xmlns') {
671 ($p, $l) = (undef, $attr_name);
672 } elsif (defined $l and $p ne '' and $l ne '') { # prefixed
673 if (defined $nsmap->{$p}) {
674 $ns = $nsmap->{$p};
675 } else {
676 ## NOTE: Error should be detected at the DOM-layer.
677 ($p, $l) = (undef, $attr_name);
678 }
679 } else {
680 if ($attr_name eq 'xmlns') {
681 $ns = $nsmap->{xmlns};
682 }
683 ($p, $l) = (undef, $attr_name);
684 }
685
686 if ($has_attr->{defined $ns ? $ns : ''}->{$l}) {
687 $ns = undef;
688 ($p, $l) = (undef, $attr_name);
689 } else {
690 $has_attr->{defined $ns ? $ns : ''}->{$l} = 1;
691 }
692
693 my $attr_t = $attrs->{$attr_name};
694 my $attr = $self->{document}->create_attribute_ns ($ns, [$p, $l]);
695 $attr->value ($attr_t->{value});
696 if (defined $attr_t->{type}) {
697 $attr->manakai_attribute_type ($attr_t->{type});
698 } elsif ($self->{document}->all_declarations_processed) {
699 $attr->manakai_attribute_type (0); # no value
700 } else {
701 $attr->manakai_attribute_type (11); # unknown
702 }
703 $attr->set_user_data (manakai_source_line => $attr_t->{line});
704 $attr->set_user_data (manakai_source_column => $attr_t->{column});
705 $el->set_attribute_node_ns ($attr);
706 $attr->specified (0) if $attr_t->{not_specified};
707 }
708
709 $self->{open_elements}->[-1]->[0]->append_child ($el);
710
711 if ($self->{self_closing}) {
712 !!!ack ('ack');
713 } else {
714 push @{$self->{open_elements}}, [$el, $token->{tag_name}, $nsmap];
715 }
716
717 ## Stay in the mode.
718 !!!next-token;
719 next B;
720 } elsif ($token->{type} == END_TAG_TOKEN) {
721 if ($token->{tag_name} eq '') {
722 ## Short end tag token.
723 pop @{$self->{open_elements}};
724 } elsif ($self->{open_elements}->[-1]->[1] eq $token->{tag_name}) {
725 pop @{$self->{open_elements}};
726 } else {
727 !!!parse-error (type => 'unmatched end tag',
728 text => $token->{tag_name},
729 token => $token);
730
731 ## Has an element in scope
732 INSCOPE: for my $i (reverse 0..$#{$self->{open_elements}}) {
733 if ($self->{open_elements}->[$i]->[1] eq $token->{tag_name}) {
734 splice @{$self->{open_elements}}, $i;
735 last INSCOPE;
736 }
737 } # INSCOPE
738 }
739
740 unless (@{$self->{open_elements}}) {
741 $self->{insertion_mode} = AFTER_ROOT_ELEMENT_IM;
742 !!!next-token;
743 return;
744 } else {
745 ## Stay in the state.
746 !!!next-token;
747 redo B;
748 }
749 } elsif ($token->{type} == COMMENT_TOKEN) {
750 my $comment = $self->{document}->create_comment ($token->{data});
751 $self->{open_elements}->[-1]->[0]->append_child ($comment);
752
753 ## Stay in the mode.
754 !!!next-token;
755 next B;
756 } elsif ($token->{type} == PI_TOKEN) {
757 my $pi = $self->{document}->create_processing_instruction
758 ($token->{target}, $token->{data});
759 $self->{open_elements}->[-1]->[0]->append_child ($pi);
760
761 ## Stay in the mode.
762 !!!next-token;
763 next B;
764 } elsif ($token->{type} == END_OF_FILE_TOKEN) {
765 !!!parse-error (type => 'in body:#eof',
766 token => $token);
767
768 $self->{insertion_mode} = AFTER_ROOT_ELEMENT_IM;
769 !!!next-token;
770 return;
771 } elsif ($token->{type} == DOCTYPE_TOKEN) {
772 !!!parse-error (type => 'in html:#doctype',
773 token => $token);
774 ## Ignore the token.
775
776 ## Stay in the mode.
777 !!!next-token;
778 next B;
779 } elsif ($token->{type} == ABORT_TOKEN) {
780 return;
781 } else {
782 die "$0: XML parser initial: Unknown token type $token->{type}";
783 }
784 } # B
785 } # _tree_in_element
786
787 sub _tree_after_root_element ($) {
788 my $self = shift;
789
790 B: while (1) {
791 if ($token->{type} == START_TAG_TOKEN) {
792 !!!parse-error (type => 'second root element',
793 token => $token);
794
795 ## XML5: Ignore the token.
796
797 $self->{insertion_mode} = BEFORE_ROOT_ELEMENT_IM;
798 ## Reprocess.
799 return;
800 } elsif ($token->{type} == COMMENT_TOKEN) {
801 my $comment = $self->{document}->create_comment ($token->{data});
802 $self->{document}->append_child ($comment);
803
804 ## Stay in the mode.
805 !!!next-token;
806 next B;
807 } elsif ($token->{type} == PI_TOKEN) {
808 my $pi = $self->{document}->create_processing_instruction
809 ($token->{target}, $token->{data});
810 $self->{document}->append_child ($pi);
811
812 ## Stay in the mode.
813 !!!next-token;
814 next B;
815 } elsif ($token->{type} == CHARACTER_TOKEN) {
816 if (not $self->{tainted} and
817 not $token->{has_reference} and
818 $token->{data} =~ s/^([\x09\x0A\x0C\x20]+)//) {
819 #
820 }
821
822 if (length $token->{data}) {
823 ## XML5: Ignore the token.
824
825 unless ($self->{tainted}) {
826 !!!parse-error (type => 'text outside of root element',
827 token => $token);
828 $self->{tainted} = 1;
829 }
830
831 $self->{document}->manakai_append_text ($token->{data});
832 }
833
834 ## Stay in the mode.
835 !!!next-token;
836 next B;
837 } elsif ($token->{type} == END_OF_FILE_TOKEN) {
838 ## Stop parsing.
839
840 ## TODO: implement "stop parsing".
841
842 $token = {type => ABORT_TOKEN};
843 return;
844 } elsif ($token->{type} == END_TAG_TOKEN) {
845 !!!parse-error (type => 'unmatched end tag',
846 text => $token->{tag_name},
847 token => $token);
848 ## Ignore the token.
849
850 ## Stay in the mode.
851 !!!next-token;
852 next B;
853 } elsif ($token->{type} == DOCTYPE_TOKEN) {
854 !!!parse-error (type => 'in html:#doctype',
855 token => $token);
856 ## Ignore the token.
857
858 ## Stay in the mode.
859 !!!next-token;
860 next B;
861 } elsif ($token->{type} == ABORT_TOKEN) {
862 return;
863 } else {
864 die "$0: XML parser initial: Unknown token type $token->{type}";
865 }
866 } # B
867 } # _tree_after_root_element
868
869 sub _tree_in_subset ($) {
870 my $self = shift;
871
872 B: while (1) {
873 if ($token->{type} == COMMENT_TOKEN) {
874 ## Ignore the token.
875
876 ## Stay in the state.
877 !!!next-token;
878 next B;
879 } elsif ($token->{type} == ELEMENT_TOKEN) {
880 unless ($self->{has_element_decl}->{$token->{name}}) {
881 my $node = $self->{doctype}->get_element_type_definition_node
882 ($token->{name});
883 unless ($node) {
884 $node = $self->{document}->create_element_type_definition
885 ($token->{name});
886 $self->{doctype}->set_element_type_definition_node ($node);
887 }
888
889 $node->set_user_data (manakai_source_line => $token->{line});
890 $node->set_user_data (manakai_source_column => $token->{column});
891
892 $node->content_model_text (join '', @{$token->{content}})
893 if $token->{content};
894 } else {
895 !!!parse-error (type => 'duplicate element decl', ## TODO: type
896 value => $token->{name},
897 token => $token);
898
899 ## TODO: $token->{content} syntax check.
900 }
901 $self->{has_element_decl}->{$token->{name}} = 1;
902
903 ## Stay in the mode.
904 !!!next-token;
905 next B;
906 } elsif ($token->{type} == ATTLIST_TOKEN) {
907 if ($self->{stop_processing}) {
908 ## TODO: syntax validation
909 } else {
910 my $ed = $self->{doctype}->get_element_type_definition_node
911 ($token->{name});
912 unless ($ed) {
913 $ed = $self->{document}->create_element_type_definition
914 ($token->{name});
915 $ed->set_user_data (manakai_source_line => $token->{line});
916 $ed->set_user_data (manakai_source_column => $token->{column});
917 $self->{doctype}->set_element_type_definition_node ($ed);
918 } elsif ($self->{has_attlist}->{$token->{name}}) {
919 !!!parse-error (type => 'duplicate attlist decl', ## TODO: type
920 value => $token->{name},
921 token => $token,
922 level => $self->{level}->{warn});
923 }
924 $self->{has_attlist}->{$token->{name}} = 1;
925
926 unless (@{$token->{attrdefs}}) {
927 !!!parse-error (type => 'empty attlist decl', ## TODO: type
928 value => $token->{name},
929 token => $token,
930 level => $self->{level}->{warn});
931 }
932
933 for my $at (@{$token->{attrdefs}}) {
934 unless ($ed->get_attribute_definition_node ($at->{name})) {
935 my $node = $self->{document}->create_attribute_definition
936 ($at->{name});
937 $node->set_user_data (manakai_source_line => $at->{line});
938 $node->set_user_data (manakai_source_column => $at->{column});
939
940 my $type = defined $at->{type} ? {
941 CDATA => 1, ID => 2, IDREF => 3, IDREFS => 4, ENTITY => 5,
942 ENTITIES => 6, NMTOKEN => 7, NMTOKENS => 8, NOTATION => 9,
943 }->{$at->{type}} : 10;
944 if (defined $type) {
945 $node->declared_type ($type);
946 } else {
947 !!!parse-error (type => 'unknown declared type', ## TODO: type
948 value => $at->{type},
949 token => $at);
950 }
951
952 push @{$node->allowed_tokens}, @{$at->{tokens}};
953
954 my $default = defined $at->{default} ? {
955 FIXED => 1, REQUIRED => 2, IMPLIED => 3,
956 }->{$at->{default}} : 4;
957 if (defined $default) {
958 $node->default_type ($default);
959 if (defined $at->{value}) {
960 if ($default == 1 or $default == 4) {
961 #
962 } elsif (length $at->{value}) {
963 !!!parse-error (type => 'default value not allowed', ## TODO: type
964 token => $at);
965 }
966 } else {
967 if ($default == 1 or $default == 4) {
968 !!!parse-error (type => 'default value not provided', ## TODO: type
969 token => $at);
970 }
971 }
972 } else {
973 !!!parse-error (type => 'unknown default type', ## TODO: type
974 value => $at->{default},
975 token => $at);
976 }
977
978 $type ||= 0;
979 my $tokenize = (2 <= $type and $type <= 10);
980
981 if (defined $at->{value}) {
982 if ($tokenize) {
983 $at->{value} =~ s/ +/ /g;
984 $at->{value} =~ s/\A //;
985 $at->{value} =~ s/ \z//;
986 }
987 $node->text_content ($at->{value});
988 }
989
990 $ed->set_attribute_definition_node ($node);
991
992 ## For tree construction
993 $self->{attrdef}->{$token->{name}}->{$at->{name}}
994 = {
995 type => $type,
996 tokenize => $tokenize,
997 default => (($default == 1 or $default == 4)
998 ? defined $at->{value} ? $at->{value} : ''
999 : undef),
1000 };
1001 } else {
1002 !!!parse-error (type => 'duplicate attrdef', ## TODO: type
1003 value => $at->{name},
1004 token => $at,
1005 level => $self->{level}->{warn});
1006
1007 ## TODO: syntax validation
1008 }
1009 } # $at
1010 }
1011
1012 ## Stay in the mode.
1013 !!!next-token;
1014 next B;
1015 } elsif ($token->{type} == GENERAL_ENTITY_TOKEN) {
1016 if ($self->{stop_processing}) {
1017 ## TODO: syntax validation
1018 } elsif ({
1019 amp => 1, apos => 1, quot => 1, lt => 1, gt => 1,
1020 }->{$token->{name}}) {
1021 if (not defined $token->{value} or
1022 $token->{value} !~
1023 {
1024 amp => qr/\A&#(?:x0*26|0*38);\z/,
1025 lt => qr/\A&#(?:x0*3[Cc]|0*60);\z/,
1026 gt => qr/\A(?>&#(?:x0*3[Ee]|0*62);|>)\z/,
1027 quot => qr/\A(?>&#(?:x0*22|0*34);|")\z/,
1028 apos => qr/\A(?>&#(?:x0*27|0*39);|')\z/,
1029 }->{$token->{name}}) {
1030 !!!parse-error (type => 'bad predefined entity decl', ## TODO: type
1031 value => $token->{name},
1032 token => $token);
1033 }
1034
1035 $self->{ge}->{$token->{name}.';'} = {name => $token->{name},
1036 value => {
1037 amp => '&',
1038 lt => '<',
1039 gt => '>',
1040 quot => '"',
1041 apos => "'",
1042 }->{$token->{name}},
1043 only_text => 1};
1044 } elsif (not $self->{ge}->{$token->{name}.';'}) {
1045 ## For parser.
1046 $self->{ge}->{$token->{name}.';'} = $token;
1047 if (defined $token->{value} and
1048 $token->{value} !~ /[&<]/) {
1049 $token->{only_text} = 1;
1050 }
1051
1052 ## For DOM.
1053 if (defined $token->{notation}) {
1054 my $node = $self->{document}->create_general_entity ($token->{name});
1055 $node->set_user_data (manakai_source_line => $token->{line});
1056 $node->set_user_data (manakai_source_column => $token->{column});
1057
1058 $node->public_id ($token->{pubid}); # may be undef
1059 $node->system_id ($token->{sysid}); # may be undef
1060 $node->notation_name ($token->{notation});
1061
1062 $self->{doctype}->set_general_entity_node ($node);
1063 } else {
1064 ## TODO: syntax validation
1065 }
1066 } else {
1067 !!!parse-error (type => 'duplicate general entity decl', ## TODO: type
1068 value => $token->{name},
1069 token => $token,
1070 level => $self->{level}->{warn});
1071
1072 ## TODO: syntax validation
1073 }
1074
1075 ## Stay in the mode.
1076 !!!next-token;
1077 next B;
1078 } elsif ($token->{type} == PARAMETER_ENTITY_TOKEN) {
1079 if ($self->{stop_processing}) {
1080 ## TODO: syntax validation
1081 } elsif (not $self->{pe}->{$token->{name}}) {
1082 ## For parser.
1083 $self->{pe}->{$token->{name}} = $token;
1084
1085 ## TODO: syntax validation
1086 } else {
1087 !!!parse-error (type => 'duplicate para entity decl', ## TODO: type
1088 value => $token->{name},
1089 token => $token,
1090 level => $self->{level}->{warn});
1091
1092 ## TODO: syntax validation
1093 }
1094
1095 ## Stay in the mode.
1096 !!!next-token;
1097 next B;
1098 } elsif ($token->{type} == NOTATION_TOKEN) {
1099 unless ($self->{doctype}->get_notation_node
1100 ($token->{name})) {
1101 my $node = $self->{document}->create_notation ($token->{name});
1102 $node->set_user_data (manakai_source_line => $token->{line});
1103 $node->set_user_data (manakai_source_column => $token->{column});
1104
1105 $node->public_id ($token->{pubid}); # may be undef
1106 $node->system_id ($token->{sysid}); # may be undef
1107
1108 $self->{doctype}->set_notation_node ($node);
1109 } else {
1110 !!!parse-error (type => 'duplicate notation decl', ## TODO: type
1111 value => $token->{name},
1112 token => $token);
1113
1114 ## TODO: syntax validation
1115 }
1116
1117 ## Stay in the mode.
1118 !!!next-token;
1119 next B;
1120 } elsif ($token->{type} == PI_TOKEN) {
1121 my $pi = $self->{document}->create_processing_instruction
1122 ($token->{target}, $token->{data});
1123 $self->{doctype}->append_child ($pi);
1124 ## TODO: line/col
1125
1126 ## Stay in the mode.
1127 !!!next-token;
1128 next B;
1129 } elsif ($token->{type} == END_OF_DOCTYPE_TOKEN) {
1130 $self->{insertion_mode} = BEFORE_ROOT_ELEMENT_IM;
1131 !!!next-token;
1132 return;
1133 } elsif ($token->{type} == ABORT_TOKEN) {
1134 return;
1135 } else {
1136 die "$0: XML parser subset im: Unknown token type $token->{type}";
1137 }
1138 } # B
1139
1140 } # _tree_in_subset
1141
1142 }
1143
1144 1;

admin@suikawiki.org
ViewVC Help
Powered by ViewVC 1.1.24