/[suikacvs]/markup/html/whatpm/Whatpm/RDFXML.pm
Suika

Contents of /markup/html/whatpm/Whatpm/RDFXML.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.4 - (show annotations) (download)
Fri Mar 21 09:44:57 2008 UTC (16 years, 7 months ago) by wakaba
Branch: MAIN
Changes since 1.3: +47 -23 lines
++ whatpm/Whatpm/ChangeLog	21 Mar 2008 09:44:45 -0000
	* RDFXML.pm: bnodeid implemented.  Relative references
	are now resolved.

2008-03-21  Wakaba  <wakaba@suika.fam.cx>

1 package Whatpm::RDFXML;
2 use strict;
3
4 ## NOTE: EntityReference nodes are not supported except that
5 ## unexpected entity reference are simply ignored. (In fact
6 ## all EntityRefernce nodes are ignored.)
7
8 ## TODO: Add a callback function invoked for every element
9 ## when XMLCC is implemented in WDCC.
10
11 ## ISSUE: <html:nest/> in RDF subtree?
12
13 ## ISSUE: PIs in RDF subtree should be validated?
14
15 my $RDF_URI = q<http://www.w3.org/1999/02/22-rdf-syntax-ns#>;
16
17 sub new ($) {
18 my $self = bless {fact_level => 'm', grammer_level => 'm'}, shift;
19 $self->{next_id} = 0;
20 $self->{onerror} = sub {
21 my %opt = @_;
22 warn $opt{type}, "\n";
23 };
24 $self->{ontriple} = sub {
25 my %opt = @_;
26 my $dump_resource = sub {
27 my $resource = shift;
28 if (defined $resource->{uri}) {
29 return '<' . $resource->{uri} . '>';
30 } elsif (defined $resource->{bnodeid}) {
31 return '_:' . $resource->{bnodeid};
32 } elsif ($resource->{nodes}) {
33 return '"' . join ('', map {$_->inner_html} @{$resource->{nodes}}) .
34 '"^^<' . $resource->{datatype} . '>';
35 } elsif (defined $resource->{value}) {
36 return '"' . $resource->{value} . '"' .
37 (defined $resource->{datatype}
38 ? '^^<' . $resource->{datatype} . '>'
39 : '@' . $resource->{language});
40 } else {
41 return '??';
42 }
43 };
44 print STDERR $dump_resource->($opt{subject}) . ' ';
45 print STDERR $dump_resource->($opt{predicate}) . ' ';
46 print STDERR $dump_resource->($opt{object}) . "\n";
47 if ($dump_resource->{id}) {
48 print STDERR $dump_resource->($dump_resource->{id}) . ' ';
49 print STDERR $dump_resource->({uri => $RDF_URI . 'subject'}) . ' ';
50 print STDERR $dump_resource->($opt{subject}) . "\n";
51 print STDERR $dump_resource->($dump_resource->{id}) . ' ';
52 print STDERR $dump_resource->({uri => $RDF_URI . 'predicate'}) . ' ';
53 print STDERR $dump_resource->($opt{predicate}) . "\n";
54 print STDERR $dump_resource->($dump_resource->{id}) . ' ';
55 print STDERR $dump_resource->({uri => $RDF_URI . 'object'}) . ' ';
56 print STDERR $dump_resource->($opt{object}) . "\n";
57 print STDERR $dump_resource->($dump_resource->{id}) . ' ';
58 print STDERR $dump_resource->({uri => $RDF_URI . 'type'}) . ' ';
59 print STDERR $dump_resource->({uri => $RDF_URI . 'Statement'}) . "\n";
60 }
61 };
62 return $self;
63 } # new
64
65 sub convert_document ($$) {
66 my $self = shift;
67 my $node = shift; # Document
68
69 ## ISSUE: An RDF/XML document, either |doc| or |nodeElement|
70 ## is allowed as a starting production. However, |nodeElement|
71 ## is not a Root Event.
72
73 my $has_element;
74
75 for my $cn (@{$node->child_nodes}) {
76 if ($cn->node_type == $cn->ELEMENT_NODE) {
77 unless ($has_element) {
78 if ($cn->manakai_expanded_uri eq $RDF_URI . q<RDF>) {
79 $self->convert_rdf_element ($cn);
80 } else {
81 $self->convert_rdf_node_element ($cn);
82 }
83 $has_element = 1;
84 } else {
85 $self->{onerror}->(type => 'second node element',
86 level => $self->{grammer_level},
87 node => $cn);
88 }
89 } elsif ($cn->node_type == $cn->TEXT_NODE or
90 $cn->node_type == $cn->CDATA_SECTION_NODE) {
91 $self->{onerror}->(type => 'character not allowed',
92 level => $self->{grammer_level},
93 node => $cn);
94 }
95 }
96 } # convert_document
97
98 my $check_rdf_namespace = sub {
99 my $self = shift;
100 my $node = shift;
101 my $node_nsuri = $node->namespace_uri;
102 return unless defined $node_nsuri;
103 if (substr ($node_nsuri, 0, length $RDF_URI) eq $RDF_URI and
104 length $RDF_URI < length $node_nsuri) {
105 $self->{onerror}->(type => 'bad rdf namespace',
106 level => $self->{fact_level}, # Section 5.1
107 node => $node);
108 }
109 }; # $check_rdf_namespace
110
111 sub convert_rdf_element ($$) {
112 my ($self, $node) = @_;
113
114 $check_rdf_namespace->($self, $node);
115
116 # |RDF|
117
118 for my $attr (@{$node->attributes}) {
119 my $prefix = $attr->prefix;
120 if (defined $prefix) {
121 next if $prefix =~ /^[Xx][Mm][Ll]/;
122 } else {
123 next if $attr->manakai_local_name =~ /^[Xx][Mm][Ll]/;
124 }
125
126 $check_rdf_namespace->($self, $attr);
127 $self->{onerror}->(type => 'attribute not allowed',
128 level => $self->{grammer_level},
129 node => $attr);
130 }
131
132 # |nodeElementList|
133 for my $cn (@{$node->child_nodes}) {
134 if ($cn->node_type == $cn->ELEMENT_NODE) {
135 $self->convert_node_element ($cn);
136 } elsif ($cn->node_type == $cn->TEXT_NODE or
137 $cn->node_type == $cn->CDATA_SECTION_NODE) {
138 if ($cn->data =~ /[^\x09\x0A\x0D\x20]/) {
139 $self->{onerror}->(type => 'character not allowed',
140 level => $self->{grammer_level},
141 node => $cn);
142 }
143 }
144 }
145 } # convert_rdf_element
146
147 my %coreSyntaxTerms = (
148 $RDF_URI . 'RDF' => 1,
149 $RDF_URI . 'ID' => 1,
150 $RDF_URI . 'about' => 1,
151 $RDF_URI . 'parseType' => 1,
152 $RDF_URI . 'resource' => 1,
153 $RDF_URI . 'nodeID' => 1,
154 $RDF_URI . 'datatype' => 1,
155 );
156
157 my %oldTerms = (
158 $RDF_URI . 'aboutEach' => 1,
159 $RDF_URI . 'aboutEachPrefix' => 1,
160 $RDF_URI . 'bagID' => 1,
161 );
162
163 require Message::DOM::DOMImplementation;
164 my $resolve = sub {
165 return Message::DOM::DOMImplementation->create_uri_reference ($_[0])
166 ->get_absolute_reference ($_[1]->base_uri)
167 ->uri_reference;
168
169 ## TODO: Check latest xml:base and IRI spec...
170 ## (non IRI/URI chars should be percent-encoded before resolve?)
171 }; # $resolve
172
173 my $generate_bnodeid = sub {
174 return 'g'.$_[0]->{next_id}++;
175 }; # $generate_bnodeid
176
177 my $get_bnodeid = sub {
178 return 'b'.$_[0];
179 }; # $get_bnodeid
180
181 sub convert_node_element ($$) {
182 my ($self, $node) = @_;
183
184 $check_rdf_namespace->($self, $node);
185
186 # |nodeElement|
187
188 my $xuri = $node->manakai_expanded_uri;
189
190 if ({
191 %coreSyntaxTerms,
192 $RDF_URI . 'li' => 1,
193 %oldTerms,
194 }->{$xuri}) {
195 $self->{onerror}->(type => 'element not allowed',
196 level => $self->{grammer_level},
197 node => $node);
198
199 ## TODO: W3C RDF Validator: Continue validation, but triples that would
200 ## be generated from the subtree are ignored.
201 }
202
203 my $subject;
204 my $type_attr;
205 my @prop_attr;
206
207 for my $attr (@{$node->attributes}) {
208 my $prefix = $attr->prefix;
209 if (defined $prefix) {
210 next if $prefix =~ /^[Xx][Mm][Ll]/;
211 } else {
212 next if $attr->manakai_local_name =~ /^[Xx][Mm][Ll]/;
213 }
214
215 $check_rdf_namespace->($self, $attr);
216
217 my $attr_xuri = $attr->manakai_expanded_uri;
218 if ($attr_xuri eq $RDF_URI . 'ID') {
219 unless (defined $subject) {
220 $subject = {uri => $resolve->('#' . $attr->value, $attr)};
221 } else {
222 ## TODO: Ignore triple as W3C RDF Validator does
223 }
224 } elsif ($attr_xuri eq $RDF_URI . 'nodeID') {
225 unless (defined $subject) {
226 $subject = {bnodeid => $get_bnodeid->($attr->value)};
227 } else {
228 ## TODO: Ignore triple as W3C RDF Validator does
229 }
230 } elsif ($attr_xuri eq $RDF_URI . 'about') {
231 unless (defined $subject) {
232 $subject = {uri => $resolve->($attr->value, $attr)};
233 } else {
234 ## TODO: Ignore triple as W3C RDF Validator does
235 }
236 } elsif ($attr_xuri eq $RDF_URI . 'type') {
237 $type_attr = $attr;
238 } elsif ({
239 %coreSyntaxTerms,
240 $RDF_URI . 'li' => 1,
241 $RDF_URI . 'Description' => 1,
242 %oldTerms,
243 }->{$attr_xuri}) {
244 $self->{onerror}->(type => 'attribute not allowed',
245 level => $self->{grammer_level},
246 node => $attr);
247
248 ## TODO: W3C RDF Validator: Ignore triples
249 } else {
250 push @prop_attr, $attr;
251 }
252 }
253
254 unless (defined $subject) {
255 $subject = {bnodeid => $generate_bnodeid->($self)};
256 }
257
258 if ($xuri ne $RDF_URI . 'Description') {
259 $self->{ontriple}->(subject => $subject,
260 predicate => {uri => $RDF_URI . 'type'},
261 object => {uri => $xuri},
262 node => $node);
263 }
264
265 if ($type_attr) {
266 $self->{ontriple}->(subject => $subject,
267 predicate => {uri => $RDF_URI . 'type'},
268 object => {uri => $resolve->($type_attr->value,
269 $type_attr)},
270 node => $type_attr);
271 }
272
273 for my $attr (@prop_attr) {
274 $self->{ontriple}->(subject => $subject,
275 predicate => {uri => $attr->manakai_expanded_uri},
276 object => {value => $attr->value}, ## TODO: language
277 node => $attr);
278 ## TODO: SHOULD in NFC
279 }
280
281 # |propertyEltList|
282
283 my $li_counter = 1;
284 for my $cn (@{$node->child_nodes}) {
285 my $cn_type = $cn->node_type;
286 if ($cn_type == $cn->ELEMENT_NODE) {
287 $self->convert_property_element ($cn, li_counter => \$li_counter,
288 subject => $subject);
289 } elsif ($cn_type == $cn->TEXT_NODE or
290 $cn_type == $cn->CDATA_SECTION_NODE) {
291 if ($cn->data =~ /[^\x09\x0A\x0D\x20]/) {
292 $self->{onerror}->(type => 'character not allowed',
293 level => $self->{grammer_level},
294 node => $cn);
295 }
296 }
297 }
298
299 return $subject;
300 } # convert_node_element
301
302 my $get_id_resource = sub {
303 return $_[0] ? {uri => $resolve->('#' . $_[0]->value, $_[0])} : undef;
304 }; # $get_id_resource
305
306 sub convert_property_element ($$%) {
307 my ($self, $node, %opt) = @_;
308
309 $check_rdf_namespace->($self, $node);
310
311 # |propertyElt|
312
313 my $xuri = $node->manakai_expanded_uri;
314 if ($xuri eq $RDF_URI . 'li') {
315 $xuri = $RDF_URI . '_' . ${$opt{li_counter}}++;
316 }
317
318 if ({
319 %coreSyntaxTerms,
320 $RDF_URI . 'Description' => 1,
321 %oldTerms,
322 }->{$xuri}) {
323 $self->{onerror}->(type => 'element not allowed',
324 level => $self->{grammer_level},
325 node => $node);
326 ## TODO: RDF Validator?
327 }
328
329 my $id_attr;
330 my $dt_attr;
331 my $parse_attr;
332 my $nodeid_attr;
333 my $resource_attr;
334 my @prop_attr;
335 for my $attr (@{$node->attributes}) {
336 my $prefix = $attr->prefix;
337 if (defined $prefix) {
338 next if $prefix =~ /^[Xx][Mm][Ll]/;
339 } else {
340 next if $attr->manakai_local_name =~ /^[Xx][Mm][Ll]/;
341 }
342
343 $check_rdf_namespace->($self, $attr);
344
345 my $attr_xuri = $attr->manakai_expanded_uri;
346 if ($attr_xuri eq $RDF_URI . 'ID') {
347 $id_attr = $attr;
348 } elsif ($attr_xuri eq $RDF_URI . 'datatype') {
349 $dt_attr = $attr;
350 } elsif ($attr_xuri eq $RDF_URI . 'parseType') {
351 $parse_attr = $attr;
352 } elsif ($attr_xuri eq $RDF_URI . 'resource') {
353 $resource_attr = $attr;
354 } elsif ($attr_xuri eq $RDF_URI . 'nodeID') {
355 $nodeid_attr = $attr;
356 } elsif ({
357 %coreSyntaxTerms,
358 $RDF_URI . 'li' => 1,
359 $RDF_URI . 'Description' => 1,
360 %oldTerms,
361 }->{$attr_xuri}) {
362 $self->{onerror}->(type => 'attribute not allowed',
363 level => $self->{grammer_level},
364 node => $attr);
365 ## TODO: RDF Validator?
366 } else {
367 push @prop_attr, $attr;
368 }
369 }
370
371 my $parse = $parse_attr ? $parse_attr->value : '';
372 if ($parse eq 'Resource') {
373 # |parseTypeResourcePropertyElt|
374
375 for my $attr ($resource_attr, $nodeid_attr, $dt_attr) {
376 next unless $attr;
377 $self->{onerror}->(type => 'attribute not allowed',
378 level => $self->{grammer_level},
379 node => $attr);
380 ## TODO: RDF Validator?
381 }
382
383 my $object = {bnodeid => $generate_bnodeid->($self)};
384 $self->{ontriple}->(subject => $opt{subject},
385 predicate => {uri => $xuri},
386 object => $object,
387 node => $node,
388 id => $get_id_resource->($id_attr));
389
390 ## As if nodeElement
391
392 # |propertyEltList|
393
394 my $li_counter = 1;
395 for my $cn (@{$node->child_nodes}) {
396 my $cn_type = $cn->node_type;
397 if ($cn_type == $cn->ELEMENT_NODE) {
398 $self->convert_property_element ($cn, li_counter => \$li_counter,
399 subject => $object);
400 } elsif ($cn_type == $cn->TEXT_NODE or
401 $cn_type == $cn->CDATA_SECTION_NODE) {
402 if ($cn->data =~ /[^\x09\x0A\x0D\x20]/) {
403 $self->{onerror}->(type => 'character not allowed',
404 level => $self->{grammer_level},
405 node => $cn);
406 }
407 }
408 }
409 } elsif ($parse eq 'Collection') {
410 # |parseTypeCollectionPropertyElt|
411
412 for my $attr ($resource_attr, $nodeid_attr, $dt_attr) {
413 next unless $attr;
414 $self->{onerror}->(type => 'attribute not allowed',
415 level => $self->{grammer_level},
416 node => $attr);
417 ## TODO: RDF Validator?
418 }
419
420 # |nodeElementList|
421 my @resource;
422 for my $cn (@{$node->child_nodes}) {
423 if ($cn->node_type == $cn->ELEMENT_NODE) {
424 push @resource, [$self->convert_node_element ($cn),
425 {bnodeid => $generate_bnodeid->($self)},
426 $cn];
427 } elsif ($cn->node_type == $cn->TEXT_NODE or
428 $cn->node_type == $cn->CDATA_SECTION_NODE) {
429 if ($cn->data =~ /[^\x09\x0A\x0D\x20]/) {
430 $self->{onerror}->(type => 'character not allowed',
431 level => $self->{grammer_level},
432 node => $cn);
433 }
434 }
435 }
436
437 if (@resource) {
438 $self->{ontriple}->(subject => $opt{subject},
439 predicate => {uri => $xuri},
440 object => $resource[0]->[1],
441 node => $node);
442 } else {
443 $self->{ontriple}->(subject => $opt{subject},
444 predicate => {uri => $xuri},
445 object => {uri => $RDF_URI . 'nil'},
446 node => $node,
447 id => $get_id_resource->($id_attr));
448 }
449
450 while (@resource) {
451 my $resource = shift @resource;
452 $self->{ontriple}->(subject => $resource->[1],
453 predicate => {uri => $RDF_URI . 'first'},
454 object => $resource->[0],
455 node => $resource->[2]);
456 if (@resource) {
457 $self->{ontriple}->(subject => $resource->[1],
458 predicate => {uri => $RDF_URI . 'rest'},
459 object => $resource[0]->[1],
460 node => $resource->[2]);
461 } else {
462 $self->{ontriple}->(subject => $resource->[1],
463 predicate => {uri => $RDF_URI . 'rest'},
464 object => {uri => $RDF_URI . 'nil'},
465 node => $resource->[2]);
466 }
467 }
468 } elsif ($parse_attr) {
469 # |parseTypeLiteralPropertyElt|
470 # |parseTypeOtherPropertyElt| ## TODO: What RDF Validator does?
471
472 for my $attr ($resource_attr, $nodeid_attr, $dt_attr) {
473 next unless $attr;
474 $self->{onerror}->(type => 'attribute not allowed',
475 level => $self->{grammer_level},
476 node => $attr);
477 ## TODO: RDF Validator?
478 }
479
480 my $value = [@{$node->child_nodes}];
481 ## TODO: Callback for validation
482 ## TODO: Serialized form SHOULD be in NFC.
483
484 $self->{ontriple}->(subject => $opt{subject},
485 predicate => {uri => $xuri},
486 object => {nodes => $value,
487 datatype => $RDF_URI . 'XMLLiteral'},
488 node => $node,
489 id => $get_id_resource->($id_attr));
490 } else {
491 my $mode = 'unknown';
492
493 if ($dt_attr) {
494 $mode = 'literal'; # |literalPropertyElt|
495 ## TODO: What RDF Validator does for |< rdf:datatype><el/></>|?
496 }
497 ## TODO: What RDF Validator does for |< prop-attr><non-empty/></>|?
498
499 my $node_element;
500 my $text = '';
501 for my $cn (@{$node->child_nodes}) {
502 my $cn_type = $cn->node_type;
503 if ($cn_type == $cn->ELEMENT_NODE) {
504 unless ($node_element) {
505 $node_element = $cn;
506 if ({
507 resource => 1, unknown => 1, 'literal-or-resource' => 1,
508 }->{$mode}) {
509 $mode = 'resource';
510 } else {
511 $self->{onerror}->(type => 'element not allowed',
512 level => $self->{grammer_level},
513 node => $cn);
514 ## TODO: RDF Validator?
515 }
516 } else {
517 ## TODO: What RDF Validator does?
518 $self->{onerror}->(type => 'second node element',
519 level => $self->{grammer_level},
520 node => $cn);
521 }
522 } elsif ($cn_type == $cn->TEXT_NODE or
523 $cn_type == $cn->CDATA_SECTION_NODE) {
524 my $data = $cn->data;
525 $text .= $data;
526 if ($data =~ /[^\x09\x0A\x0D\x20]/) {
527 if ({
528 literal => 1, unknown => 1, 'literal-or-resource' => 1,
529 }->{$mode}) {
530 $mode = 'literal';
531 } else {
532 $self->{onerror}->(type => 'character not allowed',
533 level => $self->{grammer_level},
534 node => $cn);
535 ## TODO: RDF Validator?
536 }
537 } else {
538 if ($mode eq 'unknown') {
539 $mode = 'literal-or-resource';
540 } else {
541 #
542 }
543 }
544 }
545 }
546
547 if ($mode eq 'resource') {
548 # |resourcePropertyElt|
549
550 for my $attr (@prop_attr, $resource_attr, $nodeid_attr, $dt_attr) {
551 next unless $attr;
552 $self->{onerror}->(type => 'attribute not allowed',
553 level => $self->{grammer_level},
554 node => $attr);
555 ## TODO: RDF Validator?
556 }
557
558 my $object = $self->convert_node_element ($node_element);
559
560 $self->{ontriple}->(subject => $opt{subject},
561 predicate => {uri => $xuri},
562 object => $object,
563 node => $node,
564 id => $get_id_resource->($id_attr));
565 } elsif ($mode eq 'literal' or $mode eq 'literal-or-resource') {
566 # |literalPropertyElt|
567
568 for my $attr (@prop_attr, $resource_attr, $nodeid_attr) {
569 next unless $attr;
570 $self->{onerror}->(type => 'attribute not allowed',
571 level => $self->{grammer_level},
572 node => $attr);
573 ## TODO: RDF Validator?
574 }
575
576 ## TODO: $text SHOULD be in NFC
577
578 if ($dt_attr) {
579 $self->{ontriple}->(subject => $opt{subject},
580 predicate => {uri => $xuri},
581 object => {value => $text,
582 datatype => $dt_attr->value},
583 node => $node,
584 id => $get_id_resource->($id_attr));
585 } else {
586 $self->{ontriple}->(subject => $opt{subject},
587 predicate => {uri => $xuri},
588 object => {value => $text,
589 ## TODO: language
590 },
591 node => $node,
592 id => $get_id_resource->($id_attr));
593 }
594 } else {
595 ## |emptyPropertyElt|
596
597 for my $attr ($dt_attr) {
598 next unless $attr;
599 $self->{onerror}->(type => 'attribute not allowed',
600 level => $self->{grammer_level},
601 node => $attr);
602 ## TODO: RDF Validator?
603 }
604
605 if (not $resource_attr and not $nodeid_attr and not @prop_attr) {
606 $self->{ontriple}->(subject => $opt{subject},
607 predicate => {uri => $xuri},
608 object => {value => '',
609 ## TODO: language
610 },
611 node => $node,
612 id => $get_id_resource->($id_attr));
613 } else {
614 my $object;
615 if ($resource_attr) {
616 $object = {uri => $resolve->($resource_attr->value, $resource_attr)};
617 } elsif ($nodeid_attr) {
618 $object = {bnodeid => $get_bnodeid->($nodeid_attr->value)};
619 } else {
620 $object = {bnodeid => $generate_bnodeid->($self)};
621 }
622
623 for my $attr (@prop_attr) {
624 my $attr_xuri = $attr->manakai_expanded_uri;
625 if ($attr_xuri eq $RDF_URI . 'type') {
626 $self->{ontriple}->(subject => $object,
627 predicate => {uri => $attr_xuri},
628 object => $resolve->($attr->value, $attr),
629 node => $attr);
630 } else {
631 ## TODO: SHOULD be in NFC
632 $self->{ontriple}->(subject => $object,
633 predicate => {uri => $attr_xuri},
634 object => {value => $attr->value,
635 ## TODO: lang
636 },
637 node => $attr);
638 }
639 }
640
641 $self->{ontriple}->(subject => $opt{subject},
642 predicate => {uri => $xuri},
643 object => $object,
644 node => $node,
645 id => $get_id_resource->($id_attr));
646 }
647 }
648 }
649 } # convert_property_element
650
651 1;
652

admin@suikawiki.org
ViewVC Help
Powered by ViewVC 1.1.24