aboutsummaryrefslogtreecommitdiff
path: root/src/backend/snowball/snowball_create.pl
blob: f4b58ada1cb14898ca7a4358598cb214e5938925 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
#!/usr/bin/perl

use strict;
use warnings;
use Getopt::Long;

# Command-line option values; filled in by GetOptions() below.
my $outdir_path = '';
# NOTE(review): not bound to any option and not read by the visible code.
my $makefile_path = '';
my $input_path = '';
my $depfile;

# Languages to generate output for.  GenerateTsearchFiles() walks this
# list in order and emits one expanded copy of the template per entry.
our @languages = qw(
	arabic
	armenian
	basque
	catalan
	danish
	dutch
	english
	finnish
	french
	german
	greek
	hindi
	hungarian
	indonesian
	irish
	italian
	lithuanian
	nepali
	norwegian
	portuguese
	romanian
	russian
	serbian
	spanish
	swedish
	tamil
	turkish
	yiddish
);

# Names of alternative dictionaries for all-ASCII words.  If not
# listed, the language itself is used.  Note order dependency: Use of
# some other language as ASCII dictionary must come after creation of
# that language, so the "backup" language must be listed earlier in
# @languages.

our %ascii_languages = (
	'hindi' => 'english',
	'russian' => 'english',
);

GetOptions(
	'depfile'    => \$depfile,
	'outdir:s'   => \$outdir_path,
	'input:s'    => \$input_path) || usage();

# Fall back to the documented default output directory.  Without this an
# empty value would make the output path "/snowball_create.sql".
$outdir_path = '.' if $outdir_path eq '';

# Make sure input_path ends in a slash if needed.
if ($input_path ne '' && substr($input_path, -1) ne '/')
{
	$input_path .= '/';
}

# Make sure outdir_path ends in a slash if needed.
if ($outdir_path ne '' && substr($outdir_path, -1) ne '/')
{
	$outdir_path .= '/';
}

GenerateTsearchFiles();

# Print the command-line synopsis and terminate the script via die().
# Called when GetOptions() reports an option-parsing failure.
sub usage
{
	die <<EOM;
Usage: snowball_create.pl --input/-i <path> --outdir/-o <path>
    --depfile       Write dependency file
    --outdir        Output directory (default '.')
    --input         Input directory

snowball_create.pl creates snowball_create.sql from snowball.sql.in
and snowball_func.sql.in
EOM
}

# Write snowball_create.sql into $outdir_path and, when $depfile is set,
# snowball_create.dep next to it.
#
# The SQL output is: a header comment, the contents of
# snowball_func.sql.in, and then one copy of snowball.sql.in per entry of
# @languages with the _LANGNAME_, _DICTNAME_, _CFGNAME_, _ASCDICTNAME_,
# _NONASCDICTNAME_ and _STOPWORDS_ placeholders replaced.
#
# The dependency file lists, one per line, every input that was used
# (both templates plus each non-empty stopword file) as a prerequisite
# of the SQL output file.
#
# Takes no arguments and returns nothing.  Dies if an output file cannot
# be opened or closed; read_file() dies if a template cannot be opened.
sub GenerateTsearchFiles
{
	my $outdir_file = "$outdir_path/snowball_create.sql";

	my $F;
	my $D;
	my $tmpl = read_file("$input_path/snowball.sql.in");

	if ($depfile)
	{
		open($D, '>', "$outdir_path/snowball_create.dep")
		  || die "Could not write snowball_create.dep: $!";

		print $D "$outdir_file: $input_path/snowball.sql.in\n";
		print $D "$outdir_file: $input_path/snowball_func.sql.in\n";
	}

	open($F, '>', $outdir_file)
	  || die "Could not write snowball_create.sql: $!";

	print $F "-- Language-specific snowball dictionaries\n";

	print $F read_file("$input_path/snowball_func.sql.in");

	foreach my $lang (@languages)
	{
		# Dictionary used for all-ASCII words: the mapped language if
		# one is listed in %ascii_languages, else the language itself.
		my $asclang = $ascii_languages{$lang} || $lang;
		my $txt     = $tmpl;
		my $stop    = '';
		my $stopword_path = "$input_path/stopwords/$lang.stop";

		# Only reference a stopword list when the file exists and is
		# non-empty (-s).
		if (-s "$stopword_path")
		{
			$stop = ", StopWords=$lang";

			print $D "$outdir_file: $stopword_path\n" if $depfile;
		}

		$txt =~ s#_LANGNAME_#${lang}#gs;
		$txt =~ s#_DICTNAME_#${lang}_stem#gs;
		$txt =~ s#_CFGNAME_#${lang}#gs;
		$txt =~ s#_ASCDICTNAME_#${asclang}_stem#gs;
		$txt =~ s#_NONASCDICTNAME_#${lang}_stem#gs;
		$txt =~ s#_STOPWORDS_#$stop#gs;
		print $F $txt;
	}

	# Buffered write errors (e.g. a full disk) are only reported at close
	# time, so a failed close means the output may be truncated.
	close($F) || die "Could not close snowball_create.sql: $!";
	if ($depfile)
	{
		close($D) || die "Could not close snowball_create.dep: $!";
	}
	return;
}


# Return the entire contents of the file named by the single argument as
# one string.  Dies with a message naming the file and the OS error if
# the file cannot be opened.
sub read_file
{
	my $filename = shift;
	my $F;

	# Undefine the input record separator for the rest of this sub so
	# that the single <$F> read below returns the whole file.
	local $/ = undef;
	open($F, '<', $filename)
	  || die "Could not open file $filename: $!\n";
	my $txt = <$F>;
	close($F);

	return $txt;
}