html5/spec-ja/find.cgi

#!/usr/bin/perl
use strict;
use utf8;
use CGI::Carp qw/fatalsToBrowser/;

BEGIN { require 'common.pl' }

require Encode;

my $max_result = 100;

sub decode_url ($) {
  my $s = shift;
  $s =~ tr/+/ /;
  $s =~ s/%([0-9A-Fa-f]{2})/pack 'C', hex $1/ge;
  return Encode::decode ('utf-8', $s);
} # decode_url

sub htescape ($) {
  my $s = shift;
  $s =~ s/&/&amp;/g;
  $s =~ s/</&lt;/g;
  $s =~ s/>/&gt;/g;
  $s =~ s/"/&quot;/g;
  return $s;
} # htescape

my $param = {};
for (split /[&;]/, $ENV{QUERY_STRING} || '') {
  my ($name, $value) = split /=/, $_, 2;
  $param->{decode_url ($name)} = decode_url ($value);
}

my $eword = htescape $param->{word};

my $pattern = quotemeta normalize $param->{word};
$pattern =~ s/\\-/[- ]/g;
unless ($param->{cs}) {
  $pattern =~ s/([A-Za-z])/'[' . uc ($1) . lc ($1) . ']'/ge;
}
my $suffix_pattern = {
  ku => qr/(?>[かこきいっくけ])/,
  su => qr/(?>[さそしすせ])/,
  tsu => qr/(?>[たとちっつて])/,
  nu => qr/(?>[なのにんぬね])/,
  mu => qr/(?>[まもみんむめ])/,
  ru => qr/(?>[らろりっるれ])/,
  u => qr/(?>[わおいっうえ])/,
  gu => qr/(?>[がごぎいぐげ])/,
  bu => qr/(?>[ばぼびんぶべ])/,
  ichidan => qr/(?>[るれろよ])?/,
  kuru => qr/(?>[るれい])?/,
  suru => qr/(?>す[るれ]|しろ?|せよ?|さ)?/,
  i => qr/(?>か[ろっ]|く|い|けれ|う)?/, ## BUG: ありがたい -> ありがとう
  da => qr/(?>だ[ろっ]?|で|に|なら?)?/,
  dasuru => qr/(?>だ[ろっ]?|で|に|なら?|す[るれ]|しろ?|せよ?|さ)?/,
}->{$param->{suffix}} || qr//;
$pattern =~ s/$suffix_pattern$//;
$pattern .= $suffix_pattern;
$pattern = '\b' . $pattern . '\b' if $param->{aw};

print qq[Content-Type: text/html ; charset=utf-8

<!DOCTYPE HTML>
<html lang=en>
<head>
<title>Search result for "$eword"</title>
<link rel=stylesheet href="/www/style/html/xhtml">
<style>
td {
  vertical-align: top;
}
mark {
  background-color: yellow;
}
.tag {
  color: maroon;
}
.ref {
  color: orange;
}
.rfc2119 {
  font-weight: bolder;
}
.pattern-star {
  font-style: normal;
  color: gray;
  text-decoration: none;
  border-style: none;
}
.pattern-var {
  font-style: italic;
  color: gray;
  text-decoration: none;
  border-style: none;
}
input[type=text] {
  width: 60%;
}
</style>
</head>
<body>
<h1>Search result for "$eword"</h1>];

print_input_form ();

unless ('' =~ /$pattern/) {
  print q[<article>];
  my $has_match;
  for_each_data_file (sub ($) {
    my $data_file_name = shift;
    load_data_file ($data_file_name, my $exact_data = {}, my $pattern_data = {});
    $pattern_data = unescape_patterns ($pattern_data);
    $has_match |= print_matches ($data_file_name, $exact_data, $pattern_data);
  });
  
  unless ($has_match) {
    print q[<p>No match found.</article>];
  } else {
    print q[</article>];
    print_input_form ();
  }
}

sub unescape_patterns ($) {
  my $pattern_data = shift;
  my $new_data = {};
  for (keys %$pattern_data) {
    my $w = $_;
    my $v = $_;
    $v =~ s/\(\.\+\)/*/g;
    $v =~ s/\\([\s\S])/$1/g;
    $new_data->{$v} = $pattern_data->{$w};
  }
  return $new_data;
} # unescape_patterns

sub print_matches ($$$) {
  my ($file_name, $exact_data, $pattern_data) = @_;

  my $file_id = $file_name;
  if ($file_name =~ /([0-9A-Za-z-]+)\.dat$/) {
    $file_id = $1;
  }
  
  my $has_match;
  my $r = qq[<h2>File "<a href="edit/@{[htescape $file_id]}"><code class=file>@{[htescape $file_id]}</code></a>"</h2><table>];
  my $result = 0;
  my $added = {};
  my %en2ja = (%$exact_data, %$pattern_data);
  for (sort {$a cmp $b} keys %en2ja) {
    if (/$pattern/) {
      $has_match = 1;
      $added->{$_} = 1;
      $r .= get_match_text ($_, $en2ja{$_}, $pattern);
      $r .= q[<tr><td colspan=2>...] and last if ++$result == $max_result;
    }
  }
  $result = 0;
  my %ja2en = reverse %en2ja;
  for (sort {$a cmp $b} keys %ja2en) {
    if (/$pattern/) {
      next if $added->{$ja2en{$_}};
      $has_match = 1;
      $r .= get_match_text ($ja2en{$_}, $_, $pattern);
      $r .= q[<tr><td colspan=2>...] and last if ++$result == $max_result;
    }
  }
  $r .= q[</table>];
  print $r if $has_match;
  return $has_match;
} # print_matches

sub get_match_text ($$) {
  my ($en, $ja) = @_;

  ## NOTE: Marking will not work well if it contains &, <, >, or ", or
  ## the pattern matches with charrefs, e.g. "t" (part of &lt; and &quot;).

  my $r = q[<tr><td lang=en>];
  my $v = htescape ($en);
  $v =~ s[($pattern)][<mark>$1</mark>]g;
  $v =~ s[(&lt;[\s\S]+?&gt;)][<span class=tag>$1</span>]g;
  $v =~ s[(&amp;[#0-9A-Za-z]+;)][<span class=ref>$1</span>]g;
  $v =~ s[\*][<var class=pattern-star>*</var>]g;
  $r .= $v;
  $r .= q[<td lang=ja>];
  my $v = htescape ($ja);
  $v =~ s[($pattern)][<mark>$1</mark>]g;
  $v =~ s[(&lt;[\s\S]+?&gt;)][<span class=tag>$1</span>]g;
  $v =~ s[(&amp;[#0-9A-Za-z]+;)][<span class=ref>$1</span>]g;
  $v =~ s{(\[\[[^\[\]]+\]\])}[<span class=rfc2119>$1</span>]g;
  $v =~ s[(\$[0-9]+)][<var class=pattern-var>$1</var>]g;
  $r .= $v;

  return $r;
} # get_match_text

sub print_input_form () {
  print qq[<section><form action=find accept-charset=utf-8 method=get>];
  print qq[<p><input type=text name=word value="@{[htescape $param->{word}]}">];
  print qq[<select name=suffix>];
  for (qw/none ku su tsu nu mu ru u gu bu ichidan suru kuru i da dasuru/) {
    print qq[<option value="@{[htescape $_]}"];
    print qq[ selected] if $param->{suffix} eq $_;
    print qq[>];
    print htescape ({
      none => 'No suffix',
      ku => 'Ka-gyou Godan (-ku)',
      su => 'Sa-gyou Godan (-su)',
      tsu => 'Ta-gyou Godan (-tsu)',
      nu => 'Na-gyou Godan (-nu)',
      mu => 'Ma-gyou Godan (-mu)',
      ru => 'Ra-gyou Godan (-ru)',
      u => 'Wa/a-gyou Godan (-u)',
      gu => 'Ga-gyou Godan (-gu)',
      bu => 'Ba-gyou Godan (-bu)',
      ichidan => 'Ichidan (-iru, -eru)',
      suru => 'Sahen (-suru)',
      kuru => 'Kahen (kuru)',
      i => 'Keiyoushi (-i)',
      da => 'Keiyou-doushi (-da)',
      dasuru => 'Kei-dou-sahen (-da, -suru)',
    }->{$_} || $_);
  }
  print qq[</select>];
  print qq[<input type=submit value=Submit>];
  print qq[<p><label><input type=checkbox name=cs @{[$param->{cs} ? 'checked' : '']}> Case-sensitive</label>
      <label><input type=checkbox name=aw @{[$param->{aw} ? 'checked' : '']}> As word(s)</label>];
  print qq[</form></section>];
} # print_input_form
1	wakaba	1.1	#!/usr/bin/perl
2			use strict;
3	wakaba	1.3	use utf8;
4			use CGI::Carp qw/fatalsToBrowser/;
5	wakaba	1.1
6			BEGIN { require 'common.pl' }
7
8			require Encode;
9
10	wakaba	1.3	my $max_result = 100;
11	wakaba	1.1
12			sub decode_url ($) {
13			my $s = shift;
14			$s =~ tr/+/ /;
15			$s =~ s/%([0-9A-Fa-f]{2})/pack 'C', hex $1/ge;
16			return Encode::decode ('utf-8', $s);
17			} # decode_url
18
19			sub htescape ($) {
20			my $s = shift;
21			$s =~ s/&/&/g;
22			$s =~ s/</</g;
23			$s =~ s/>/>/g;
24			$s =~ s/"/"/g;
25			return $s;
26			} # htescape
27
28			my $param = {};
29			for (split /[&;]/, $ENV{QUERY_STRING} \|\| '') {
30			my ($name, $value) = split /=/, $_, 2;
31			$param->{decode_url ($name)} = decode_url ($value);
32			}
33
34	wakaba	1.3	my $eword = htescape $param->{word};
35
36	wakaba	1.1	my $pattern = quotemeta normalize $param->{word};
37	wakaba	1.4	$pattern =~ s/\\-/[- ]/g;
38	wakaba	1.3	unless ($param->{cs}) {
39			$pattern =~ s/([A-Za-z])/'[' . uc ($1) . lc ($1) . ']'/ge;
40			}
41			my $suffix_pattern = {
42			ku => qr/(?>[かこきいっくけ])/,
43			su => qr/(?>[さそしすせ])/,
44			tsu => qr/(?>[たとちっつて])/,
45			nu => qr/(?>[なのにんぬね])/,
46			mu => qr/(?>[まもみんむめ])/,
47			ru => qr/(?>[らろりっるれ])/,
48			u => qr/(?>[わおいっうえ])/,
49			gu => qr/(?>[がごぎいぐげ])/,
50			bu => qr/(?>[ばぼびんぶべ])/,
51			ichidan => qr/(?>[るれろよ])?/,
52			kuru => qr/(?>[るれい])?/,
53			suru => qr/(?>す[るれ]\|しろ?\|せよ?\|さ)?/,
54			i => qr/(?>か[ろっ]\|く\|い\|けれ\|う)?/, ## BUG: ありがたい -> ありがとう
55			da => qr/(?>だ[ろっ]?\|で\|に\|なら?)?/,
56			dasuru => qr/(?>だ[ろっ]?\|で\|に\|なら?\|す[るれ]\|しろ?\|せよ?\|さ)?/,
57			}->{$param->{suffix}} \|\| qr//;
58			$pattern =~ s/$suffix_pattern$//;
59			$pattern .= $suffix_pattern;
60			$pattern = '\b' . $pattern . '\b' if $param->{aw};
61	wakaba	1.1
62			print qq[Content-Type: text/html ; charset=utf-8
63
64			<!DOCTYPE HTML>
65			<html lang=en>
66			<head>
67			<title>Search result for "$eword"</title>
68			<link rel=stylesheet href="/www/style/html/xhtml">
69			<style>
70	wakaba	1.3	td {
71			vertical-align: top;
72			}
73	wakaba	1.1	mark {
74			background-color: yellow;
75			}
76			.tag {
77			color: maroon;
78			}
79			.ref {
80			color: orange;
81			}
82			.rfc2119 {
83			font-weight: bolder;
84			}
85			.pattern-star {
86			font-style: normal;
87			color: gray;
88			text-decoration: none;
89			border-style: none;
90			}
91			.pattern-var {
92			font-style: italic;
93			color: gray;
94			text-decoration: none;
95			border-style: none;
96			}
97	wakaba	1.3	input[type=text] {
98			width: 60%;
99			}
100	wakaba	1.1	</style>
101			</head>
102			<body>
103			<h1>Search result for "$eword"</h1>];
104
105	wakaba	1.2	print_input_form ();
106
107	wakaba	1.3	unless ('' =~ /$pattern/) {
108			print q[<article>];
109			my $has_match;
110			for_each_data_file (sub ($) {
111			my $data_file_name = shift;
112	wakaba	1.6	load_data_file ($data_file_name, my $exact_data = {}, my $pattern_data = {});
113	wakaba	1.3	$pattern_data = unescape_patterns ($pattern_data);
114			$has_match \|= print_matches ($data_file_name, $exact_data, $pattern_data);
115			});
116
117			unless ($has_match) {
118			print q[<p>No match found.</article>];
119			} else {
120			print q[</article>];
121			print_input_form ();
122			}
123	wakaba	1.1	}
124
125			sub unescape_patterns ($) {
126			my $pattern_data = shift;
127			my $new_data = {};
128			for (keys %$pattern_data) {
129			my $w = $_;
130			my $v = $_;
131			$v =~ s/\(\.\+\)/*/g;
132			$v =~ s/\\([\s\S])/$1/g;
133			$new_data->{$v} = $pattern_data->{$w};
134			}
135			return $new_data;
136			} # unescape_patterns
137
138			sub print_matches ($$$) {
139			my ($file_name, $exact_data, $pattern_data) = @_;
140
141			my $file_id = $file_name;
142			if ($file_name =~ /([0-9A-Za-z-]+)\.dat$/) {
143			$file_id = $1;
144			}
145
146			my $has_match;
147			my $r = qq[<h2>File "<a href="edit/@{[htescape $file_id]}"><code class=file>@{[htescape $file_id]}</code></a>"</h2><table>];
148			my $result = 0;
149			my $added = {};
150			my %en2ja = (%$exact_data, %$pattern_data);
151			for (sort {$a cmp $b} keys %en2ja) {
152			if (/$pattern/) {
153			$has_match = 1;
154			$added->{$_} = 1;
155			$r .= get_match_text ($_, $en2ja{$_}, $pattern);
156			$r .= q[<tr><td colspan=2>...] and last if ++$result == $max_result;
157			}
158			}
159			$result = 0;
160			my %ja2en = reverse %en2ja;
161			for (sort {$a cmp $b} keys %ja2en) {
162			if (/$pattern/) {
163			next if $added->{$ja2en{$_}};
164			$has_match = 1;
165			$r .= get_match_text ($ja2en{$_}, $_, $pattern);
166			$r .= q[<tr><td colspan=2>...] and last if ++$result == $max_result;
167			}
168			}
169			$r .= q[</table>];
170			print $r if $has_match;
171			return $has_match;
172			} # print_matches
173
174			sub get_match_text ($$) {
175			my ($en, $ja) = @_;
176
177			## NOTE: Marking will not work well if it contains &, <, >, or ", or
178			## the pattern matches with charrefs, e.g. "t" (part of < and ").
179
180			my $r = q[<tr><td lang=en>];
181			my $v = htescape ($en);
182			$v =~ s[($pattern)][<mark>$1</mark>]g;
183			$v =~ s[(<[\s\S]+?>)][<span class=tag>$1</span>]g;
184			$v =~ s[(&[#0-9A-Za-z]+;)][<span class=ref>$1</span>]g;
185			$v =~ s[\][<var class=pattern-star></var>]g;
186			$r .= $v;
187			$r .= q[<td lang=ja>];
188			my $v = htescape ($ja);
189			$v =~ s[($pattern)][<mark>$1</mark>]g;
190			$v =~ s[(<[\s\S]+?>)][<span class=tag>$1</span>]g;
191			$v =~ s[(&[#0-9A-Za-z]+;)][<span class=ref>$1</span>]g;
192			$v =~ s{(\[\[[^\[\]]+\]\])}[<span class=rfc2119>$1</span>]g;
193			$v =~ s[(\$[0-9]+)][<var class=pattern-var>$1</var>]g;
194			$r .= $v;
195
196			return $r;
197			} # get_match_text
198	wakaba	1.2
199			sub print_input_form () {
200			print qq[<section><form action=find accept-charset=utf-8 method=get>];
201			print qq[<p><input type=text name=word value="@{[htescape $param->{word}]}">];
202	wakaba	1.3	print qq[<select name=suffix>];
203			for (qw/none ku su tsu nu mu ru u gu bu ichidan suru kuru i da dasuru/) {
204			print qq[<option value="@{[htescape $_]}"];
205			print qq[ selected] if $param->{suffix} eq $_;
206			print qq[>];
207			print htescape ({
208			none => 'No suffix',
209			ku => 'Ka-gyou Godan (-ku)',
210			su => 'Sa-gyou Godan (-su)',
211			tsu => 'Ta-gyou Godan (-tsu)',
212			nu => 'Na-gyou Godan (-nu)',
213			mu => 'Ma-gyou Godan (-mu)',
214			ru => 'Ra-gyou Godan (-ru)',
215			u => 'Wa/a-gyou Godan (-u)',
216			gu => 'Ga-gyou Godan (-gu)',
217			bu => 'Ba-gyou Godan (-bu)',
218			ichidan => 'Ichidan (-iru, -eru)',
219			suru => 'Sahen (-suru)',
220			kuru => 'Kahen (kuru)',
221			i => 'Keiyoushi (-i)',
222			da => 'Keiyou-doushi (-da)',
223			dasuru => 'Kei-dou-sahen (-da, -suru)',
224			}->{$_} \|\| $_);
225			}
226			print qq[</select>];
227			print qq[<input type=submit value=Submit>];
228	wakaba	1.2	print qq[<p><label><input type=checkbox name=cs @{[$param->{cs} ? 'checked' : '']}> Case-sensitive</label>
229			<label><input type=checkbox name=aw @{[$param->{aw} ? 'checked' : '']}> As word(s)</label>];
230			print qq[</form></section>];
231			} # print_input_form