aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--contrib/fulltextindex/README.fti30
-rw-r--r--contrib/fulltextindex/fti.c292
-rw-r--r--contrib/fulltextindex/fti.pl10
3 files changed, 190 insertions, 142 deletions
diff --git a/contrib/fulltextindex/README.fti b/contrib/fulltextindex/README.fti
index 10838db6912..5c3d6362a6b 100644
--- a/contrib/fulltextindex/README.fti
+++ b/contrib/fulltextindex/README.fti
@@ -3,11 +3,13 @@ An attempt at some sort of Full Text Indexing for PostgreSQL.
The included software is an attempt to add some sort of Full Text Indexing
support to PostgreSQL. I mean by this that we can ask questions like:
- Give me all rows that have 'still' and 'nash' in the 'artist' field.
+ Give me all rows that have 'still' and 'nash' in the 'artist' or 'title'
+ fields.
Ofcourse we can write this as:
- select * from cds where artist ~* 'stills' and artist ~* 'nash';
+ select * from cds where (artist ~* 'stills' or title ~* 'stills') and
+ (artist ~* 'nash' or title ~* 'nash');
But this does not use any indices, and therefore, if your database
gets very large, it will not have very high performance (the above query
@@ -15,8 +17,8 @@ requires at least one sequential scan, it probably takes 2 due to the
self-join).
The approach used by this add-on is to define a trigger on the table and
-column you want to do this queries on. On every insert in the table, it
-takes the value in the specified column, breaks the text in this column
+columns you want to do this queries on. On every insert in the table, it
+takes the value in the specified columns, breaks the text in these columns
up into pieces, and stores all sub-strings into another table, together
with a reference to the row in the original table that contained this
sub-string (it uses the oid of that row).
@@ -24,8 +26,8 @@ sub-string (it uses the oid of that row).
By now creating an index over the 'fti-table', we can search for
substrings that occur in the original table. By making a join between
the fti-table and the orig-table, we can get the actual rows we want
-(this can also be done by using subselects, and maybe there're other
-ways too).
+(this can also be done by using subselects - but subselects are currently
+inefficient in Postgres, and maybe there're other ways too).
The trigger code also allows an array called StopWords, that prevents
certain words from being indexed.
@@ -62,20 +64,22 @@ The create the function that contains the trigger::
And finally define the trigger on the 'cds' table:
create trigger cds-fti-trigger after update or insert or delete on cds
- for each row execute procedure fti(cds-fti, artist);
+ for each row execute procedure fti(cds-fti, artist, title);
Here, the trigger will be defined on table 'cds', it will create
-sub-strings from the field 'artist', and it will place those sub-strings
-in the table 'cds-fti'.
+sub-strings from the fields 'artist' and 'title', and it will place
+those sub-strings in the table 'cds-fti'.
Now populate the table 'cds'. This will also populate the table 'cds-fti'.
-It's fastest to populate the table *before* you create the indices.
+It's fastest to populate the table *before* you create the indices. Use the
+supplied 'fti.pl' to assist you with this.
Before you start using the system, you should at least have the following
indices:
- create index cds-fti-idx on cds-fti (string, id);
- create index cds-oid-idx on cds (oid);
+ create index cds-fti-idx on cds-fti (string); -- String matching
+ create index cds-fti-idx on cds-fti (id); -- For deleting a cds row
+ create index cds-oid-idx on cds (oid); -- For joining cds to cds-fti
To get the most performance out of this, you should have 'cds-fti'
clustered on disk, ie. all rows with the same sub-strings should be
@@ -109,7 +113,7 @@ clustered : same as above, only clustered : 4.501.321 rows
A sequential scan of the artist_fti table (and thus also the clustered table)
takes around 6:16 minutes....
-Unfortunately I cannot probide anybody else with this test-date, since I
+Unfortunately I cannot provide anybody else with this test-data, since I
am not allowed to redistribute the data (it's a database being sold by
a couple of wholesale companies). Anyways, it's megabytes, so you probably
wouldn't want it in this distribution anyways.
diff --git a/contrib/fulltextindex/fti.c b/contrib/fulltextindex/fti.c
index a797505e701..10e40fd5dd2 100644
--- a/contrib/fulltextindex/fti.c
+++ b/contrib/fulltextindex/fti.c
@@ -6,61 +6,87 @@
#include "commands/trigger.h"
/*
- * Trigger function takes 2 arguments:
- 1. relation in which to store the substrings
- 2. field to extract substrings from
-
- The relation in which to insert *must* have the following layout:
-
- string varchar(#)
- id oid
-
- Example:
-
-create function fti() returns opaque as
-'/home/boekhold/src/postgresql-6.2/contrib/fti/fti.so' language 'C';
-
-create table title_fti (string varchar(25), id oid);
-create index title_fti_idx on title_fti (string);
-
-create trigger title_fti_trigger after update or insert or delete on product
-for each row execute procedure fti(title_fti, title);
- ^^^^^^^^^
- where to store index in
- ^^^^^
- which column to index
-
-ofcourse don't forget to create an index on title_idx, column string, else
-you won't notice much speedup :)
-
-After populating 'product', try something like:
-
-select p.* from product p, title_fti f1, title_fti f2 where
- f1.string='slippery' and f2.string='wet' and f1.id=f2.id and p.oid=f1.id;
-*/
-
-/*
- march 4 1998 Changed breakup() to return less substrings. Only breakup
- in word parts which are in turn shortened from the start
- of the word (ie. word, ord, rd)
- Did allocation of substring buffer outside of breakup()
- oct. 5 1997, fixed a bug in string breakup (where there are more nonalpha
- characters between words then 1).
-
- oct 4-5 1997 implemented the thing, at least the basic functionallity
- of it all....
-*/
-
-/* IMPROVEMENTS:
-
- save a plan for deletes
- create a function that will make the index *after* we have populated
- the main table (probably first delete all contents to be sure there's
- nothing in it, then re-populate the fti-table)
-
- can we do something with operator overloading or a seperate function
- that can build the final query automatigally?
- */
+ * Trigger function accepts variable number of arguments:
+ *
+ * 1. relation in which to store the substrings
+ * 2. fields to extract substrings from
+ *
+ * The relation in which to insert *must* have the following layout:
+ *
+ * string varchar(#)
+ * id oid
+ *
+ * where # is the largest size of the varchar columns being indexed
+ *
+ * Example:
+ *
+ * -- Create the SQL function based on the compiled shared object
+ * create function fti() returns opaque as
+ * '/usr/local/pgsql/lib/contrib/fti.so' language 'C';
+ *
+ * -- Create the FTI table
+ * create table product_fti (string varchar(255), id oid);
+ *
+ * -- Create an index to assist string matches
+ * create index product_fti_string_idx on product_fti (string);
+ *
+ * -- Create an index to assist trigger'd deletes
+ * create index product_fti_id_idx on product_fti (id);
+ *
+ * -- Create an index on the product oid column to assist joins
+ * -- between the fti table and the product table
+ * create index product_oid_idx on product (oid);
+ *
+ * -- Create the trigger to perform incremental changes to the full text index.
+ * create trigger product_fti_trig after update or insert or delete on product
+ * for each row execute procedure fti(product_fti, title, artist);
+ * ^^^^^^^^^^^
+ * table where full text index is stored
+ * ^^^^^^^^^^^^^
+ * columns to index in the base table
+ *
+ * After populating 'product', try something like:
+ *
+ * SELECT DISTINCT(p.*) FROM product p, product_fti f1, product_fti f2 WHERE
+ * f1.string ~ '^slippery' AND f2.string ~ '^wet' AND p.oid=f1.id AND p.oid=f2.id;
+ *
+ * To check that your indicies are being used correctly, make sure you
+ * EXPLAIN SELECT ... your test query above.
+ *
+ * CHANGELOG
+ * ---------
+ *
+ * august 3 2001
+ * Extended fti function to accept more than one column as a
+ * parameter and all specified columns are indexed. Changed
+ * all uses of sprintf to snprintf. Made error messages more
+ * consistent.
+ *
+ * march 4 1998 Changed breakup() to return less substrings. Only breakup
+ * in word parts which are in turn shortened from the start
+ * of the word (ie. word, ord, rd)
+ * Did allocation of substring buffer outside of breakup()
+ *
+ * oct. 5 1997, fixed a bug in string breakup (where there are more nonalpha
+ * characters between words then 1).
+ *
+ * oct 4-5 1997 implemented the thing, at least the basic functionallity
+ * of it all....
+ *
+ * TODO
+ * ----
+ *
+ * prevent generating duplicate words for an oid in the fti table
+ * save a plan for deletes
+ * create a function that will make the index *after* we have populated
+ * the main table (probably first delete all contents to be sure there's
+ * nothing in it, then re-populate the fti-table)
+ *
+ * can we do something with operator overloading or a seperate function
+ * that can build the final query automatigally?
+ */
+
+#define MAX_FTI_QUERY_LENGTH 8192
extern Datum fti(PG_FUNCTION_ARGS);
static char *breakup(char *, char *);
@@ -81,10 +107,10 @@ char *StopWords[] = { /* list of words to skip in indexing */
/* stuff for caching query-plans, stolen from contrib/spi/\*.c */
typedef struct
{
- char *ident;
- int nplans;
- void **splan;
-} EPlan;
+ char *ident;
+ int nplans;
+ void **splan;
+} EPlan;
static EPlan *InsertPlans = NULL;
static EPlan *DeletePlans = NULL;
@@ -99,7 +125,7 @@ PG_FUNCTION_INFO_V1(fti);
Datum
fti(PG_FUNCTION_ARGS)
{
- TriggerData *trigdata = (TriggerData *) fcinfo->context;
+ TriggerData *trigdata;
Trigger *trigger; /* to get trigger name */
int nargs; /* # of arguments */
char **args; /* arguments */
@@ -111,7 +137,7 @@ fti(PG_FUNCTION_ARGS)
bool isinsert = false;
bool isdelete = false;
int ret;
- char query[8192];
+ char query[MAX_FTI_QUERY_LENGTH];
Oid oid;
/*
@@ -124,11 +150,15 @@ fti(PG_FUNCTION_ARGS)
*/
if (!CALLED_AS_TRIGGER(fcinfo))
- elog(ERROR, "Full Text Indexing: not fired by trigger manager");
+ elog(ERROR, "Full Text Indexing: Not fired by trigger manager");
+
+ /* It's safe to cast now that we've checked */
+ trigdata = (TriggerData *) fcinfo->context;
+
if (TRIGGER_FIRED_FOR_STATEMENT(trigdata->tg_event))
- elog(ERROR, "Full Text Indexing: can't process STATEMENT events");
+ elog(ERROR, "Full Text Indexing: Can't process STATEMENT events");
if (TRIGGER_FIRED_BEFORE(trigdata->tg_event))
- elog(ERROR, "Full Text Indexing: must be fired AFTER event");
+ elog(ERROR, "Full Text Indexing: Must be fired AFTER event");
if (TRIGGER_FIRED_BY_INSERT(trigdata->tg_event))
isinsert = true;
@@ -148,11 +178,11 @@ fti(PG_FUNCTION_ARGS)
rettuple = trigdata->tg_newtuple;
if ((ret = SPI_connect()) < 0)
- elog(ERROR, "Full Text Indexing: SPI_connect failed, returned %d\n", ret);
+ elog(ERROR, "Full Text Indexing: SPI_connect: Failed, returned %d\n", ret);
nargs = trigger->tgnargs;
- if (nargs != 2)
- elog(ERROR, "Full Text Indexing: trigger can only have 2 arguments");
+ if (nargs < 2)
+ elog(ERROR, "Full Text Indexing: Trigger must have at least 2 arguments\n");
args = trigger->tgargs;
indexname = args[0];
@@ -161,7 +191,7 @@ fti(PG_FUNCTION_ARGS)
/* get oid of current tuple, needed by all, so place here */
oid = rettuple->t_data->t_oid;
if (!OidIsValid(oid))
- elog(ERROR, "Full Text Indexing: oid of current tuple is NULL");
+ elog(ERROR, "Full Text Indexing: Oid of current tuple is invalid");
if (isdelete)
{
@@ -169,8 +199,14 @@ fti(PG_FUNCTION_ARGS)
Oid *argtypes;
Datum values[1];
EPlan *plan;
+ int i;
+
+ snprintf(query, MAX_FTI_QUERY_LENGTH, "D%s", indexname);
+ for (i = 1; i < nargs; i++)
+ {
+ snprintf(query, MAX_FTI_QUERY_LENGTH, "%s$%s", query, args[i]);
+ }
- sprintf(query, "D%s$%s", args[0], args[1]);
plan = find_plan(query, &DeletePlans, &nDeletePlans);
if (plan->nplans <= 0)
{
@@ -178,15 +214,13 @@ fti(PG_FUNCTION_ARGS)
argtypes[0] = OIDOID;
- sprintf(query, "DELETE FROM %s WHERE id = $1", indexname);
+ snprintf(query, MAX_FTI_QUERY_LENGTH, "DELETE FROM %s WHERE id = $1", indexname);
pplan = SPI_prepare(query, 1, argtypes);
if (!pplan)
- elog(ERROR, "Full Text Indexing: SPI_prepare returned NULL "
- "in delete");
+ elog(ERROR, "Full Text Indexing: SPI_prepare: Returned NULL in delete");
pplan = SPI_saveplan(pplan);
if (pplan == NULL)
- elog(ERROR, "Full Text Indexing: SPI_saveplan returned NULL "
- "in delete");
+ elog(ERROR, "Full Text Indexing: SPI_saveplan: Returned NULL in delete");
plan->splan = (void **) malloc(sizeof(void *));
*(plan->splan) = pplan;
@@ -197,21 +231,29 @@ fti(PG_FUNCTION_ARGS)
ret = SPI_execp(*(plan->splan), values, NULL, 0);
if (ret != SPI_OK_DELETE)
- elog(ERROR, "Full Text Indexing: error executing plan in delete");
+ elog(ERROR, "Full Text Indexing: SPI_execp: Error executing plan in delete");
}
if (isinsert)
{
- char *substring,
- *column;
- void *pplan;
- Oid *argtypes;
+ char *substring;
+ char *column;
+ void *pplan;
+ Oid *argtypes;
Datum values[2];
int colnum;
- struct varlena *data;
- EPlan *plan;
+ struct varlena *data;
+ EPlan *plan;
+ int i;
+ char *buff;
+ char *string;
+
+ snprintf(query, MAX_FTI_QUERY_LENGTH, "I%s", indexname);
+ for (i = 1; i < nargs; i++)
+ {
+ snprintf(query, MAX_FTI_QUERY_LENGTH, "%s$%s", query, args[i]);
+ }
- sprintf(query, "I%s$%s", args[0], args[1]);
plan = find_plan(query, &InsertPlans, &nInsertPlans);
/* no plan yet, so allocate mem for argtypes */
@@ -224,67 +266,65 @@ fti(PG_FUNCTION_ARGS)
argtypes[1] = OIDOID; /* id oid); */
/* prepare plan to gain speed */
- sprintf(query, "INSERT INTO %s (string, id) VALUES ($1, $2)",
+ snprintf(query, MAX_FTI_QUERY_LENGTH, "INSERT INTO %s (string, id) VALUES ($1, $2)",
indexname);
pplan = SPI_prepare(query, 2, argtypes);
if (!pplan)
- elog(ERROR, "Full Text Indexing: SPI_prepare returned NULL "
- "in insert");
+ elog(ERROR, "Full Text Indexing: SPI_prepare: Returned NULL in insert");
pplan = SPI_saveplan(pplan);
if (pplan == NULL)
- elog(ERROR, "Full Text Indexing: SPI_saveplan returned NULL"
- " in insert");
+ elog(ERROR, "Full Text Indexing: SPI_saveplan: Returned NULL in insert");
plan->splan = (void **) malloc(sizeof(void *));
*(plan->splan) = pplan;
plan->nplans = 1;
}
-
/* prepare plan for query */
- colnum = SPI_fnumber(tupdesc, args[1]);
- if (colnum == SPI_ERROR_NOATTRIBUTE)
- elog(ERROR, "Full Text Indexing: column '%s' of '%s' not found",
- args[1], args[0]);
-
- /* Get the char* representation of the column with name args[1] */
- column = SPI_getvalue(rettuple, tupdesc, colnum);
-
- if (column)
- { /* make sure we don't try to index NULL's */
- char *buff;
- char *string = column;
-
- while (*string != '\0')
- {
- *string = tolower((unsigned char) *string);
- string++;
- }
+ for (i = 0; i < nargs - 1; i++)
+ {
+ colnum = SPI_fnumber(tupdesc, args[i + 1]);
+ if (colnum == SPI_ERROR_NOATTRIBUTE)
+ elog(ERROR, "Full Text Indexing: SPI_fnumber: Column '%s' of '%s' not found", args[i + 1], indexname);
- data = (struct varlena *) palloc(sizeof(int32) + strlen(column) +1);
- buff = palloc(strlen(column) + 1);
- /* saves lots of calls in while-loop and in breakup() */
+ /* Get the char* representation of the column */
+ column = SPI_getvalue(rettuple, tupdesc, colnum);
- new_tuple = true;
- while ((substring = breakup(column, buff)))
+ /* make sure we don't try to index NULL's */
+ if (column)
{
- int l;
-
- l = strlen(substring);
-
- data->vl_len = l + sizeof(int32);
- memcpy(VARDATA(data), substring, l);
- values[0] = PointerGetDatum(data);
- values[1] = oid;
-
- ret = SPI_execp(*(plan->splan), values, NULL, 0);
- if (ret != SPI_OK_INSERT)
- elog(ERROR, "Full Text Indexing: error executing plan "
- "in insert");
+ string = column;
+ while (*string != '\0')
+ {
+ *string = tolower((unsigned char) *string);
+ string++;
+ }
+
+ data = (struct varlena *) palloc(sizeof(int32) + strlen(column) + 1);
+ buff = palloc(strlen(column) + 1);
+ /* saves lots of calls in while-loop and in breakup() */
+
+ new_tuple = true;
+
+ while ((substring = breakup(column, buff)))
+ {
+ int l;
+
+ l = strlen(substring);
+
+ data->vl_len = l + sizeof(int32);
+ memcpy(VARDATA(data), substring, l);
+ values[0] = PointerGetDatum(data);
+ values[1] = oid;
+
+ ret = SPI_execp(*(plan->splan), values, NULL, 0);
+ if (ret != SPI_OK_INSERT)
+ elog(ERROR, "Full Text Indexing: SPI_execp: Error executing plan in insert");
+ }
+ pfree(buff);
+ pfree(data);
}
- pfree(buff);
- pfree(data);
}
}
diff --git a/contrib/fulltextindex/fti.pl b/contrib/fulltextindex/fti.pl
index 230ba927033..58080a5425c 100644
--- a/contrib/fulltextindex/fti.pl
+++ b/contrib/fulltextindex/fti.pl
@@ -17,7 +17,7 @@
#
# Example:
#
-# fti.pl -u -d mydb -t mytable -c mycolumn -f myfile
+# fti.pl -u -d mydb -t mytable -c mycolumn,mycolumn2 -f myfile
# sort -o myoutfile myfile
# uniq myoutfile sorted-file
#
@@ -140,11 +140,13 @@ sub main {
getopts('d:t:c:f:u');
if (!$opt_d || !$opt_t || !$opt_c || !$opt_f) {
- print STDERR "usage: $0 [-u] -d database -t table -c column ".
+ print STDERR "usage: $0 [-u] -d database -t table -c column[,column...] ".
"-f output-file\n";
return 1;
}
+ @cols = split(/,/, $opt_c);
+
if (defined($opt_u)) {
$uname = get_username();
$pwd = get_password();
@@ -166,7 +168,9 @@ sub main {
PQexec($PG_CONN, "begin");
- $query = "declare C cursor for select $opt_c, oid from $opt_t";
+ $query = "declare C cursor for select (\"";
+ $query .= join("\" || ' ' || \"", @cols);
+ $query .= "\") as string, oid from $opt_t";
$res = PQexec($PG_CONN, $query);
if (!$res || (PQresultStatus($res) != $PGRES_COMMAND_OK)) {
print STDERR "Error declaring cursor!\n";