/[suikacvs]/webroot/swe/lib/SWE/Object/Document.pm
Suika

Contents of /webroot/swe/lib/SWE/Object/Document.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.2 - (hide annotations) (download)
Sun Jul 12 10:37:45 2009 UTC (15 years, 9 months ago) by wakaba
Branch: MAIN
Changes since 1.1: +54 -0 lines
++ swe/lib/SWE/Object/ChangeLog	12 Jul 2009 10:37:22 -0000
	* Document.pm (update_tfidf): New method copied from main.pl.

2009-07-12  Wakaba  <wakaba@suika.fam.cx>

++ swe/lib/suikawiki/ChangeLog	12 Jul 2009 10:37:39 -0000
	* main.pl (update_tfidf): Moved to another module.

2009-07-12  Wakaba  <wakaba@suika.fam.cx>

1 wakaba 1.1 package SWE::Object::Document;
2     use strict;
3     use warnings;
4    
5     sub new ($%) {
6     my $class = shift;
7     my $self = bless {@_}, $class;
8    
9     return $self;
10     }
11    
12     sub db { $_[0]->{db} }
13    
14     sub id { $_[0]->{id} }
15    
16     sub associate_names ($$%) {
17     my ($self, $names, %args) = @_;
18    
19     ## NOTE: names_lock MUST be executed before the invocation.
20    
21     my $id = $self->id;
22     my $time = $args{time} || time;
23     my $sw3_pages = $self->{sw3_pages}; ## TODO: ...
24    
25     my $vc = $self->db->vc;
26    
27     my $name_prop_db = $self->{name_prop_db}; ## TODO: ...
28     local $name_prop_db->{version_control} = $vc;
29    
30     my $name_history_db = $self->db->name_history;
31     local $name_history_db->{version_control} = $vc;
32    
33     for my $name (keys %$names) {
34     my $name_props = $name_prop_db->get_data ($name);
35     unless (defined $name_props) {
36     my $sw3id = $sw3_pages->get_data ($name);
37     main::convert_sw3_page ($sw3id => $name); ## TODO: ...
38    
39     $name_props = $name_prop_db->get_data ($name);
40     unless (defined $name_props) {
41     $name_history_db->append_data ($name => [$time, 'c']);
42     }
43     }
44    
45     push @{$name_props->{id} ||= []}, $id;
46     $name_props->{name} = $name;
47     $name_prop_db->set_data ($name => $name_props);
48    
49     $name_history_db->append_data ($name => [$time, 'a', $id]);
50     }
51    
52     my $user = $args{user} || '(anon)';
53     $vc->commit_changes ("id=$id created by $user");
54     } # associate_names
55    
56 wakaba 1.2 sub update_tfidf ($$) {
57     my ($self, $doc) = @_; ## TODO: $doc should not be an argument
58    
59     ## It is REQUIRED to lock the $id before the invocation of this
60     ## method to keep the consistency of tfidf data for the $id.
61    
62     my $id = $self->id;
63    
64     my $tfidf_db = $self->db->id_tfidf;
65    
66     require SWE::Data::FeatureVector;
67    
68     my $deleted_terms = SWE::Data::FeatureVector->parse_stringref
69     ($tfidf_db->get_data ($id))->as_key_hashref;
70    
71     my $tc = $doc->document_element->text_content;
72    
73     ## TODO: use element semantics...
74    
75     my $orig_tfs = {};
76     my $all_terms = 0;
77     main::for_unique_words ($tc => sub {
78     $orig_tfs->{$_[0]} = $_[1];
79     $all_terms += $_[1];
80     }); ## TODO: XXX
81    
82     my $names_index_db = $self->db->name_inverted_index;
83     $names_index_db->lock;
84    
85     my $idgen = $self->db->id;
86     my $doc_number = $idgen->get_last_id;
87    
88     my $terms = SWE::Data::FeatureVector->new;
89     for my $term (keys %$orig_tfs) {
90     my $n_tf = $orig_tfs->{$term} / $all_terms;
91    
92     my $df = $names_index_db->get_count ($term);
93     my $idf = log ($doc_number / ($df + 1));
94    
95     my $tfidf = $n_tf * $idf;
96    
97     $terms->set_tfidf ($term, $tfidf);
98     $names_index_db->add_data ($term => $id => $tfidf);
99    
100     delete $deleted_terms->{$term};
101     }
102    
103     for my $term (keys %$deleted_terms) {
104     $names_index_db->delete_data ($term, $id);
105     }
106    
107     $tfidf_db->set_data ($id => \( $terms->stringify ));
108     } # update_tfidf
109    
110 wakaba 1.1 1;

admin@suikawiki.org
ViewVC Help
Powered by ViewVC 1.1.24