aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorBruce Momjian <bruce@momjian.us>2001-08-01 18:40:12 +0000
committerBruce Momjian <bruce@momjian.us>2001-08-01 18:40:12 +0000
commit938236a29716c754a9a9238e377c3cd15db11dde (patch)
tree4f11bf16438f5cba7371f0384f877ccb80c93398
parent8c6761acc7920f80ff871f92a4143735b77ef09b (diff)
downloadpostgresql-938236a29716c754a9a9238e377c3cd15db11dde.tar.gz
postgresql-938236a29716c754a9a9238e377c3cd15db11dde.zip
The fti.pl supplied with the fulltextindex module generates ALL possible
substrings of two characters or greater, and is case-sensitive. This patch makes it work correctly. It generates only the suffixes of each word, and lowercases them - as specified by the README file. This brings it into line with the fti.c function, makes it properly case-insensitive, removes the problem with duplicate rows being returned from an fti search, and greatly reduces the size of the generated index table. It was written by my co-worker, Brett Toolin. Christopher Kings-Lynne
-rw-r--r--contrib/fulltextindex/fti.pl25
1 files changed, 13 insertions, 12 deletions
diff --git a/contrib/fulltextindex/fti.pl b/contrib/fulltextindex/fti.pl
index 02bf057e94a..230ba927033 100644
--- a/contrib/fulltextindex/fti.pl
+++ b/contrib/fulltextindex/fti.pl
@@ -1,6 +1,6 @@
#!/usr/bin/perl
#
-# This script substracts all substrings out of a specific column in a table
+# This script extracts all suffixes of all words in a specific column in a table
# and generates output that can be loaded into a new table with the
# psql '\copy' command. The new table should have the following structure:
#
@@ -52,27 +52,28 @@ $PGRES_BAD_RESPONSE = 5 ;
$PGRES_NONFATAL_ERROR = 6 ;
$PGRES_FATAL_ERROR = 7 ;
+# the minimum length of word to include in the full text index
+$MIN_WORD_LENGTH = 2;
+
+# the minimum length of the substrings in the full text index
+$MIN_SUBSTRING_LENGTH = 2;
+
$[ = 0; # make sure string offsets start at 0
sub break_up {
my $string = pop @_;
+ # convert strings to lower case
+ $string = lc($string);
@strings = split(/\W+/, $string);
@subs = ();
foreach $s (@strings) {
$len = length($s);
- next if ($len < 4);
-
- $lpos = $len-1;
- while ($lpos >= 3) {
- $fpos = $lpos - 3;
- while ($fpos >= 0) {
- $sub = substr($s, $fpos, $lpos - $fpos + 1);
- push(@subs, $sub);
- $fpos = $fpos - 1;
- }
- $lpos = $lpos - 1;
+ next if ($len <= $MIN_WORD_LENGTH);
+ for ($i = 0; $i <= $len - $MIN_SUBSTRING_LENGTH; $i++) {
+ $tmp = substr($s, $i);
+ push(@subs, $tmp);
}
}