#!/usr/bin/perl
use strict;
use utf8;
use CGI::Carp qw/fatalsToBrowser/;
BEGIN { require 'common.pl' }
require Encode;
my $max_result = 100;
sub decode_url ($) {
my $s = shift;
$s =~ tr/+/ /;
$s =~ s/%([0-9A-Fa-f]{2})/pack 'C', hex $1/ge;
return Encode::decode ('utf-8', $s);
} # decode_url
sub htescape ($) {
my $s = shift;
$s =~ s/&/&/g;
$s =~ s/</g;
$s =~ s/>/>/g;
$s =~ s/"/"/g;
return $s;
} # htescape
my $param = {};
for (split /[&;]/, $ENV{QUERY_STRING} || '') {
my ($name, $value) = split /=/, $_, 2;
$param->{decode_url ($name)} = decode_url ($value);
}
my $eword = htescape $param->{word};
my $pattern = quotemeta normalize $param->{word};
unless ($param->{cs}) {
$pattern =~ s/([A-Za-z])/'[' . uc ($1) . lc ($1) . ']'/ge;
}
my $suffix_pattern = {
ku => qr/(?>[かこきいっくけ])/,
su => qr/(?>[さそしすせ])/,
tsu => qr/(?>[たとちっつて])/,
nu => qr/(?>[なのにんぬね])/,
mu => qr/(?>[まもみんむめ])/,
ru => qr/(?>[らろりっるれ])/,
u => qr/(?>[わおいっうえ])/,
gu => qr/(?>[がごぎいぐげ])/,
bu => qr/(?>[ばぼびんぶべ])/,
ichidan => qr/(?>[るれろよ])?/,
kuru => qr/(?>[るれい])?/,
suru => qr/(?>す[るれ]|しろ?|せよ?|さ)?/,
i => qr/(?>か[ろっ]|く|い|けれ|う)?/, ## BUG: ありがたい -> ありがとう
da => qr/(?>だ[ろっ]?|で|に|なら?)?/,
dasuru => qr/(?>だ[ろっ]?|で|に|なら?|す[るれ]|しろ?|せよ?|さ)?/,
}->{$param->{suffix}} || qr//;
$pattern =~ s/$suffix_pattern$//;
$pattern .= $suffix_pattern;
$pattern = '\b' . $pattern . '\b' if $param->{aw};
print qq[Content-Type: text/html ; charset=utf-8
Search result for "$eword"
Search result for "$eword" ];
print_input_form ();
unless ('' =~ /$pattern/) {
print q[];
my $has_match;
for_each_data_file (sub ($) {
my $data_file_name = shift;
load_data_file ($data_file_name, my $exact_data = {}, my $pattern_data = {});
$pattern_data = unescape_patterns ($pattern_data);
$has_match |= print_matches ($data_file_name, $exact_data, $pattern_data);
});
unless ($has_match) {
print q[No match found.
];
} else {
print q[];
print_input_form ();
}
}
sub unescape_patterns ($) {
my $pattern_data = shift;
my $new_data = {};
for (keys %$pattern_data) {
my $w = $_;
my $v = $_;
$v =~ s/\(\.\+\)/*/g;
$v =~ s/\\([\s\S])/$1/g;
$new_data->{$v} = $pattern_data->{$w};
}
return $new_data;
} # unescape_patterns
sub print_matches ($$$) {
my ($file_name, $exact_data, $pattern_data) = @_;
my $file_id = $file_name;
if ($file_name =~ /([0-9A-Za-z-]+)\.dat$/) {
$file_id = $1;
}
my $has_match;
my $r = qq[];
my $result = 0;
my $added = {};
my %en2ja = (%$exact_data, %$pattern_data);
for (sort {$a cmp $b} keys %en2ja) {
if (/$pattern/) {
$has_match = 1;
$added->{$_} = 1;
$r .= get_match_text ($_, $en2ja{$_}, $pattern);
$r .= q[...] and last if ++$result == $max_result;
}
}
$result = 0;
my %ja2en = reverse %en2ja;
for (sort {$a cmp $b} keys %ja2en) {
if (/$pattern/) {
next if $added->{$ja2en{$_}};
$has_match = 1;
$r .= get_match_text ($ja2en{$_}, $_, $pattern);
$r .= q[ ...] and last if ++$result == $max_result;
}
}
$r .= q[
];
print $r if $has_match;
return $has_match;
} # print_matches
sub get_match_text ($$) {
my ($en, $ja) = @_;
## NOTE: Marking will not work well if it contains &, <, >, or ", or
## the pattern matches with charrefs, e.g. "t" (part of < and ").
my $r = q[];
my $v = htescape ($en);
$v =~ s[($pattern)][$1 ]g;
$v =~ s[(<[\s\S]+?>)][$1 ]g;
$v =~ s[(&[#0-9A-Za-z]+;)][$1 ]g;
$v =~ s[\*][* ]g;
$r .= $v;
$r .= q[ ];
my $v = htescape ($ja);
$v =~ s[($pattern)][$1 ]g;
$v =~ s[(<[\s\S]+?>)][$1 ]g;
$v =~ s[(&[#0-9A-Za-z]+;)][$1 ]g;
$v =~ s{(\[\[[^\[\]]+\]\])}[$1 ]g;
$v =~ s[(\$[0-9]+)][$1 ]g;
$r .= $v;
return $r;
} # get_match_text
sub print_input_form () {
print qq[];
} # print_input_form