diff options
Diffstat (limited to 'src/backend/utils')
-rw-r--r-- | src/backend/utils/adt/selfuncs.c | 923 | ||||
-rw-r--r-- | src/backend/utils/cache/lsyscache.c | 259 | ||||
-rw-r--r-- | src/backend/utils/cache/syscache.c | 4 | ||||
-rw-r--r-- | src/backend/utils/sort/tuplesort.c | 233 |
4 files changed, 898 insertions, 521 deletions
diff --git a/src/backend/utils/adt/selfuncs.c b/src/backend/utils/adt/selfuncs.c index 1fe0afb0a35..41ba82db7b5 100644 --- a/src/backend/utils/adt/selfuncs.c +++ b/src/backend/utils/adt/selfuncs.c @@ -15,7 +15,7 @@ * * * IDENTIFICATION - * $Header: /cvsroot/pgsql/src/backend/utils/adt/selfuncs.c,v 1.87 2001/03/23 04:49:54 momjian Exp $ + * $Header: /cvsroot/pgsql/src/backend/utils/adt/selfuncs.c,v 1.88 2001/05/07 00:43:23 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -57,9 +57,6 @@ /* default selectivity estimate for pattern-match operators such as LIKE */ #define DEFAULT_MATCH_SEL 0.01 -/* "fudge factor" for estimating frequency of not-most-common values */ -#define NOT_MOST_COMMON_RATIO 0.1 - static bool convert_to_scalar(Datum value, Oid valuetypid, double *scaledvalue, Datum lobound, Datum hibound, Oid boundstypid, double *scaledlobound, double *scaledhibound); @@ -75,17 +72,9 @@ static double convert_one_string_to_scalar(unsigned char *value, static unsigned char *convert_string_datum(Datum value, Oid typid); static double convert_timevalue_to_scalar(Datum value, Oid typid); static void getattproperties(Oid relid, AttrNumber attnum, - Oid *typid, - int *typlen, - bool *typbyval, - int32 *typmod); -static bool getattstatistics(Oid relid, AttrNumber attnum, - Oid typid, int32 typmod, - double *nullfrac, - double *commonfrac, - Datum *commonval, - Datum *loval, - Datum *hival); + Oid *typid, int32 *typmod); +static double get_att_numdistinct(Oid relid, AttrNumber attnum, Oid typid, + Form_pg_statistic stats); static Selectivity prefix_selectivity(char *prefix, Oid relid, AttrNumber attno, @@ -115,134 +104,173 @@ eqsel(PG_FUNCTION_ARGS) AttrNumber attno = PG_GETARG_INT16(2); Datum value = PG_GETARG_DATUM(3); int32 flag = PG_GETARG_INT32(4); - float8 result; - - if (NONVALUE(attno) || NONVALUE(relid)) - result = DEFAULT_EQ_SEL; - else + Oid typid; + int32 typmod; + HeapTuple statsTuple; + Datum *values; + int nvalues; + float4 *numbers; + int nnumbers; + double selec; + + if (NONVALUE(relid) || NONVALUE(attno)) + PG_RETURN_FLOAT8(DEFAULT_EQ_SEL); + + /* get info about the attribute */ + getattproperties(relid, attno, &typid, &typmod); + + /* get stats for the attribute, if available */ + statsTuple = SearchSysCache(STATRELATT, + ObjectIdGetDatum(relid), + Int16GetDatum(attno), + 0, 0); + if (HeapTupleIsValid(statsTuple)) { - Oid typid; - int typlen; - bool typbyval; - int32 typmod; - double nullfrac; - double commonfrac; - Datum commonval; - double selec; - - /* get info about the attribute */ - getattproperties(relid, attno, - &typid, &typlen, &typbyval, &typmod); - - /* get stats for the attribute, if available */ - if (getattstatistics(relid, attno, typid, typmod, - &nullfrac, &commonfrac, &commonval, - NULL, NULL)) - { - if (flag & SEL_CONSTANT) - { + Form_pg_statistic stats; - /* - * Is the constant "=" to the column's most common value? - * (Although the operator may not really be "=", we will - * assume that seeing whether it returns TRUE for the most - * common value is useful information. If you don't like - * it, maybe you shouldn't be using eqsel for your - * operator...) - */ - RegProcedure eqproc = get_opcode(opid); - bool mostcommon; + stats = (Form_pg_statistic) GETSTRUCT(statsTuple); - if (eqproc == (RegProcedure) NULL) - elog(ERROR, "eqsel: no procedure for operator %u", - opid); + if (flag & SEL_CONSTANT) + { + bool match = false; + int i; - /* be careful to apply operator right way 'round */ - if (flag & SEL_RIGHT) - mostcommon = DatumGetBool(OidFunctionCall2(eqproc, - commonval, - value)); - else - mostcommon = DatumGetBool(OidFunctionCall2(eqproc, - value, - commonval)); + /* + * Is the constant "=" to any of the column's most common + * values? (Although the given operator may not really be + * "=", we will assume that seeing whether it returns TRUE + * is an appropriate test. If you don't like this, maybe you + * shouldn't be using eqsel for your operator...) + */ + if (get_attstatsslot(statsTuple, typid, typmod, + STATISTIC_KIND_MCV, InvalidOid, + &values, &nvalues, + &numbers, &nnumbers)) + { + FmgrInfo eqproc; - if (mostcommon) - { + fmgr_info(get_opcode(opid), &eqproc); - /* - * Constant is "=" to the most common value. We know - * selectivity exactly (or as exactly as VACUUM could - * calculate it, anyway). - */ - selec = commonfrac; - } - else + for (i = 0; i < nvalues; i++) { - - /* - * Comparison is against a constant that is neither - * the most common value nor null. Its selectivity - * cannot be more than this: - */ - selec = 1.0 - commonfrac - nullfrac; - if (selec > commonfrac) - selec = commonfrac; - - /* - * and in fact it's probably less, so we should apply - * a fudge factor. The only case where we don't is - * for a boolean column, where indeed we have - * estimated the less-common value's frequency - * exactly! - */ - if (typid != BOOLOID) - selec *= NOT_MOST_COMMON_RATIO; + /* be careful to apply operator right way 'round */ + if (flag & SEL_RIGHT) + match = DatumGetBool(FunctionCall2(&eqproc, + values[i], + value)); + else + match = DatumGetBool(FunctionCall2(&eqproc, + value, + values[i])); + if (match) + break; } } else { + /* no most-common-value info available */ + values = NULL; + numbers = NULL; + i = nvalues = nnumbers = 0; + } + if (match) + { + /* + * Constant is "=" to this common value. We know + * selectivity exactly (or as exactly as VACUUM + * could calculate it, anyway). + */ + selec = numbers[i]; + } + else + { /* - * Search is for a value that we do not know a priori, but - * we will assume it is not NULL. Selectivity cannot be - * more than this: + * Comparison is against a constant that is neither + * NULL nor any of the common values. Its selectivity + * cannot be more than this: */ - selec = 1.0 - nullfrac; - if (selec > commonfrac) - selec = commonfrac; + double sumcommon = 0.0; + double otherdistinct; + for (i = 0; i < nnumbers; i++) + sumcommon += numbers[i]; + selec = 1.0 - sumcommon - stats->stanullfrac; + /* + * and in fact it's probably a good deal less. + * We approximate that all the not-common values + * share this remaining fraction equally, so we + * divide by the number of other distinct values. + */ + otherdistinct = get_att_numdistinct(relid, attno, + typid, stats) + - nnumbers; + if (otherdistinct > 1) + selec /= otherdistinct; /* - * and in fact it's probably less, so apply a fudge - * factor. + * Another cross-check: selectivity shouldn't be + * estimated as more than the least common + * "most common value". */ - selec *= NOT_MOST_COMMON_RATIO; + if (nnumbers > 0 && selec > numbers[nnumbers-1]) + selec = numbers[nnumbers-1]; } - /* result should be in range, but make sure... */ - if (selec < 0.0) - selec = 0.0; - else if (selec > 1.0) - selec = 1.0; - - if (!typbyval) - pfree(DatumGetPointer(commonval)); + free_attstatsslot(typid, values, nvalues, numbers, nnumbers); } else { + double ndistinct; /* - * No VACUUM ANALYZE stats available, so make a guess using - * the dispersion stat (if we have that, which is unlikely for - * a normal attribute; but for a system attribute we may be - * able to estimate it). + * Search is for a value that we do not know a priori, but + * we will assume it is not NULL. Estimate the selectivity + * as non-null fraction divided by number of distinct values, + * so that we get a result averaged over all possible values + * whether common or uncommon. (Essentially, we are assuming + * that the not-yet-known comparison value is equally likely + * to be any of the possible values, regardless of their + * frequency in the table. Is that a good idea?) + */ + selec = 1.0 - stats->stanullfrac; + ndistinct = get_att_numdistinct(relid, attno, typid, stats); + if (ndistinct > 1) + selec /= ndistinct; + /* + * Cross-check: selectivity should never be + * estimated as more than the most common value's. */ - selec = get_attdispersion(relid, attno, 0.01); + if (get_attstatsslot(statsTuple, typid, typmod, + STATISTIC_KIND_MCV, InvalidOid, + NULL, NULL, + &numbers, &nnumbers)) + { + if (nnumbers > 0 && selec > numbers[0]) + selec = numbers[0]; + free_attstatsslot(typid, NULL, 0, numbers, nnumbers); + } } - result = (float8) selec; + ReleaseSysCache(statsTuple); } - PG_RETURN_FLOAT8(result); + else + { + /* + * No VACUUM ANALYZE stats available, so make a guess using + * estimated number of distinct values and assuming they are + * equally common. (The guess is unlikely to be very good, + * but we do know a few special cases.) + */ + selec = 1.0 / get_att_numdistinct(relid, attno, typid, NULL); + } + + /* result should be in range, but make sure... */ + if (selec < 0.0) + selec = 0.0; + else if (selec > 1.0) + selec = 1.0; + + PG_RETURN_FLOAT8((float8) selec); } /* @@ -301,117 +329,263 @@ scalarltsel(PG_FUNCTION_ARGS) AttrNumber attno = PG_GETARG_INT16(2); Datum value = PG_GETARG_DATUM(3); int32 flag = PG_GETARG_INT32(4); - float8 result; + bool isgt; + HeapTuple oprTuple; + HeapTuple statsTuple; + Form_pg_statistic stats; + Oid contype; + FmgrInfo opproc; + Oid typid; + int32 typmod; + Datum *values; + int nvalues; + float4 *numbers; + int nnumbers; + double mcv_selec, + hist_selec, + sumcommon; + double selec; + int i; + + if (NONVALUE(relid) || NONVALUE(attno)) + PG_RETURN_FLOAT8(DEFAULT_INEQ_SEL); + + /* Can't do anything useful if no constant to compare against, either */ + if (!(flag & SEL_CONSTANT)) + PG_RETURN_FLOAT8(DEFAULT_INEQ_SEL); - if (!(flag & SEL_CONSTANT) || NONVALUE(attno) || NONVALUE(relid)) - result = DEFAULT_INEQ_SEL; + /* + * Force the constant to be on the right to simplify later logic. + * This means that we may be dealing with either "<" or ">" cases. + */ + if (flag & SEL_RIGHT) + { + /* we have x < const */ + isgt = false; + } else { - HeapTuple oprtuple; - Oid ltype, - rtype, - contype; - Oid typid; - int typlen; - bool typbyval; - int32 typmod; - Datum hival, - loval; - double val, - high, - low, - numerator, - denominator; - - /* - * Get left and right datatypes of the operator so we know what - * type the constant is. - */ - oprtuple = SearchSysCache(OPEROID, - ObjectIdGetDatum(opid), - 0, 0, 0); - if (!HeapTupleIsValid(oprtuple)) - elog(ERROR, "scalarltsel: no tuple for operator %u", opid); - ltype = ((Form_pg_operator) GETSTRUCT(oprtuple))->oprleft; - rtype = ((Form_pg_operator) GETSTRUCT(oprtuple))->oprright; - contype = (flag & SEL_RIGHT) ? rtype : ltype; - ReleaseSysCache(oprtuple); - - /* Now get info and stats about the attribute */ - getattproperties(relid, attno, - &typid, &typlen, &typbyval, &typmod); - - if (!getattstatistics(relid, attno, typid, typmod, - NULL, NULL, NULL, - &loval, &hival)) + /* we have const < x, commute to make x > const */ + opid = get_commutator(opid); + if (!opid) { - /* no stats available, so default result */ + /* Use default selectivity (should we raise an error instead?) */ PG_RETURN_FLOAT8(DEFAULT_INEQ_SEL); } + isgt = true; + } - /* Convert the values to a uniform comparison scale. */ - if (!convert_to_scalar(value, contype, &val, - loval, hival, typid, - &low, &high)) - { + /* + * The constant might not be the same datatype as the column; + * look at the operator's input types to find out what it is. + * Also set up to be able to call the operator's execution proc. + */ + oprTuple = SearchSysCache(OPEROID, + ObjectIdGetDatum(opid), + 0, 0, 0); + if (!HeapTupleIsValid(oprTuple)) + elog(ERROR, "scalarltsel: no tuple for operator %u", opid); + contype = ((Form_pg_operator) GETSTRUCT(oprTuple))->oprright; + fmgr_info(((Form_pg_operator) GETSTRUCT(oprTuple))->oprcode, &opproc); + ReleaseSysCache(oprTuple); + + /* Now get info and stats about the attribute */ + getattproperties(relid, attno, &typid, &typmod); + + statsTuple = SearchSysCache(STATRELATT, + ObjectIdGetDatum(relid), + Int16GetDatum(attno), + 0, 0); + if (!HeapTupleIsValid(statsTuple)) + { + /* no stats available, so default result */ + PG_RETURN_FLOAT8(DEFAULT_INEQ_SEL); + } + stats = (Form_pg_statistic) GETSTRUCT(statsTuple); - /* - * Ideally we'd produce an error here, on the grounds that the - * given operator shouldn't have scalarltsel registered as its - * selectivity func unless we can deal with its operand types. - * But currently, all manner of stuff is invoking scalarltsel, - * so give a default estimate until that can be fixed. - */ - if (!typbyval) - { - pfree(DatumGetPointer(hival)); - pfree(DatumGetPointer(loval)); - } - PG_RETURN_FLOAT8(DEFAULT_INEQ_SEL); - } + /* + * If we have most-common-values info, add up the fractions of the + * MCV entries that satisfy MCV OP CONST. These fractions contribute + * directly to the result selectivity. Also add up the total fraction + * represented by MCV entries. + */ + mcv_selec = 0.0; + sumcommon = 0.0; - /* release temp storage if needed */ - if (!typbyval) + if (get_attstatsslot(statsTuple, typid, typmod, + STATISTIC_KIND_MCV, InvalidOid, + &values, &nvalues, + &numbers, &nnumbers)) + { + for (i = 0; i < nvalues; i++) { - pfree(DatumGetPointer(hival)); - pfree(DatumGetPointer(loval)); + if (DatumGetBool(FunctionCall2(&opproc, + values[i], + value))) + mcv_selec += numbers[i]; + sumcommon += numbers[i]; } + free_attstatsslot(typid, values, nvalues, numbers, nnumbers); + } + + /* + * If there is a histogram, determine which bin the constant falls in, + * and compute the resulting contribution to selectivity. + * + * Someday, VACUUM might store more than one histogram per rel/att, + * corresponding to more than one possible sort ordering defined for + * the column type. However, to make that work we will need to figure + * out which staop to search for --- it's not necessarily the one we + * have at hand! (For example, we might have a '<=' operator rather + * than the '<' operator that will appear in staop.) For now, assume + * that whatever appears in pg_statistic is sorted the same way our + * operator sorts. + */ + hist_selec = 0.0; - if (high <= low) + if (get_attstatsslot(statsTuple, typid, typmod, + STATISTIC_KIND_HISTOGRAM, InvalidOid, + &values, &nvalues, + NULL, NULL)) + { + if (nvalues > 1) { + double histfrac; + bool ltcmp; + + ltcmp = DatumGetBool(FunctionCall2(&opproc, + values[0], + value)); + if (isgt) + ltcmp = !ltcmp; + if (!ltcmp) + { + /* Constant is below lower histogram boundary. */ + histfrac = 0.0; + } + else + { + /* + * Scan to find proper location. This could be made faster + * by using a binary-search method, but it's probably not + * worth the trouble for typical histogram sizes. + */ + for (i = 1; i < nvalues; i++) + { + ltcmp = DatumGetBool(FunctionCall2(&opproc, + values[i], + value)); + if (isgt) + ltcmp = !ltcmp; + if (!ltcmp) + break; + } + if (i >= nvalues) + { + /* Constant is above upper histogram boundary. */ + histfrac = 1.0; + } + else + { + double val, + high, + low; + double binfrac; + /* + * We have values[i-1] < constant < values[i]. + * + * Convert the constant and the two nearest bin boundary + * values to a uniform comparison scale, and do a linear + * interpolation within this bin. + */ + if (convert_to_scalar(value, contype, &val, + values[i-1], values[i], typid, + &low, &high)) + { + if (high <= low) + { + /* cope if bin boundaries appear identical */ + binfrac = 0.5; + } + else if (val <= low) + binfrac = 0.0; + else if (val >= high) + binfrac = 1.0; + else + binfrac = (val - low) / (high - low); + } + else + { + /* + * Ideally we'd produce an error here, on the grounds + * that the given operator shouldn't have scalarltsel + * registered as its selectivity func unless we can + * deal with its operand types. But currently, all + * manner of stuff is invoking scalarltsel, so give a + * default estimate until that can be fixed. + */ + binfrac = 0.5; + } + /* + * Now, compute the overall selectivity across the values + * represented by the histogram. We have i-1 full bins + * and binfrac partial bin below the constant. + */ + histfrac = (double) (i-1) + binfrac; + histfrac /= (double) (nvalues - 1); + } + } /* - * If we trusted the stats fully, we could return a small or - * large selec depending on which side of the single data - * point the constant is on. But it seems better to assume - * that the stats are wrong and return a default... + * Now histfrac = fraction of histogram entries below the constant. + * + * Account for "<" vs ">" */ - result = DEFAULT_INEQ_SEL; - } - else if (val < low || val > high) - { - + hist_selec = isgt ? (1.0 - histfrac) : histfrac; /* - * If given value is outside the statistical range, return a - * small or large value; but not 0.0/1.0 since there is a - * chance the stats are out of date. + * The histogram boundaries are only approximate to begin + * with, and may well be out of date anyway. Therefore, + * don't believe extremely small or large selectivity + * estimates. */ - if (flag & SEL_RIGHT) - result = (val < low) ? 0.001 : 0.999; - else - result = (val < low) ? 0.999 : 0.001; - } - else - { - denominator = high - low; - if (flag & SEL_RIGHT) - numerator = val - low; - else - numerator = high - val; - result = numerator / denominator; + if (hist_selec < 0.001) + hist_selec = 0.001; + else if (hist_selec > 0.999) + hist_selec = 0.999; } + + free_attstatsslot(typid, values, nvalues, NULL, 0); } - PG_RETURN_FLOAT8(result); + + /* + * Now merge the results from the MCV and histogram calculations, + * realizing that the histogram covers only the non-null values that + * are not listed in MCV. + */ + selec = 1.0 - stats->stanullfrac - sumcommon; + + if (hist_selec > 0.0) + selec *= hist_selec; + else + { + /* + * If no histogram but there are values not accounted for by MCV, + * arbitrarily assume half of them will match. + */ + selec *= 0.5; + } + + selec += mcv_selec; + + ReleaseSysCache(statsTuple); + + /* result should be in range, but make sure... */ + if (selec < 0.0) + selec = 0.0; + else if (selec > 1.0) + selec = 1.0; + + PG_RETURN_FLOAT8((float8) selec); } /* @@ -428,34 +602,25 @@ scalargtsel(PG_FUNCTION_ARGS) Datum value = PG_GETARG_DATUM(3); int32 flag = PG_GETARG_INT32(4); Oid ltopid; - float8 result; /* - * Compute selectivity of "<", then invert --- but only if we were - * able to produce a non-default estimate. Note that we get the - * negator which strictly speaking means we are looking at "<=" for - * ">" or "<" for ">=". We assume this won't matter. + * Commute so that we have a "<" or "<=" operator, then apply + * scalarltsel. */ - ltopid = get_negator(opid); - if (ltopid) - { - result = DatumGetFloat8(DirectFunctionCall5(scalarltsel, - ObjectIdGetDatum(ltopid), - ObjectIdGetDatum(relid), - Int16GetDatum(attno), - value, - Int32GetDatum(flag))); - } - else + ltopid = get_commutator(opid); + if (!ltopid) { /* Use default selectivity (should we raise an error instead?) */ - result = DEFAULT_INEQ_SEL; + PG_RETURN_FLOAT8(DEFAULT_INEQ_SEL); } - if (result != DEFAULT_INEQ_SEL) - result = 1.0 - result; - - PG_RETURN_FLOAT8(result); + flag ^= SEL_RIGHT; + return DirectFunctionCall5(scalarltsel, + ObjectIdGetDatum(ltopid), + ObjectIdGetDatum(relid), + Int16GetDatum(attno), + value, + Int32GetDatum(flag)); } /* @@ -476,7 +641,7 @@ patternsel(PG_FUNCTION_ARGS, Pattern_Type ptype) result = DEFAULT_MATCH_SEL; else { - HeapTuple oprtuple; + HeapTuple oprTuple; Oid ltype, rtype; char *patt; @@ -488,14 +653,14 @@ patternsel(PG_FUNCTION_ARGS, Pattern_Type ptype) * Get left and right datatypes of the operator so we know what * type the attribute is. */ - oprtuple = SearchSysCache(OPEROID, + oprTuple = SearchSysCache(OPEROID, ObjectIdGetDatum(opid), 0, 0, 0); - if (!HeapTupleIsValid(oprtuple)) + if (!HeapTupleIsValid(oprTuple)) elog(ERROR, "patternsel: no tuple for operator %u", opid); - ltype = ((Form_pg_operator) GETSTRUCT(oprtuple))->oprleft; - rtype = ((Form_pg_operator) GETSTRUCT(oprtuple))->oprright; - ReleaseSysCache(oprtuple); + ltype = ((Form_pg_operator) GETSTRUCT(oprTuple))->oprleft; + rtype = ((Form_pg_operator) GETSTRUCT(oprTuple))->oprright; + ReleaseSysCache(oprTuple); /* the right-hand const is type text for all supported operators */ Assert(rtype == TEXTOID); @@ -659,42 +824,88 @@ eqjoinsel(PG_FUNCTION_ARGS) AttrNumber attno1 = PG_GETARG_INT16(2); Oid relid2 = PG_GETARG_OID(3); AttrNumber attno2 = PG_GETARG_INT16(4); - float8 result; - float8 num1, - num2, - min; bool unknown1 = NONVALUE(relid1) || NONVALUE(attno1); bool unknown2 = NONVALUE(relid2) || NONVALUE(attno2); + double selec; if (unknown1 && unknown2) - result = DEFAULT_EQ_SEL; + selec = DEFAULT_EQ_SEL; else { - num1 = unknown1 ? 1.0 : get_attdispersion(relid1, attno1, 0.01); - num2 = unknown2 ? 1.0 : get_attdispersion(relid2, attno2, 0.01); + Oid typid1; + Oid typid2; + int32 typmod1; + int32 typmod2; + HeapTuple statsTuple1 = NULL; + HeapTuple statsTuple2 = NULL; + Form_pg_statistic stats1 = NULL; + Form_pg_statistic stats2 = NULL; + double nd1, + nd2; + + if (unknown1) + { + nd1 = 100.0; + } + else + { + /* get info about the attribute */ + getattproperties(relid1, attno1, &typid1, &typmod1); + + /* get stats for the attribute, if available */ + statsTuple1 = SearchSysCache(STATRELATT, + ObjectIdGetDatum(relid1), + Int16GetDatum(attno1), + 0, 0); + if (HeapTupleIsValid(statsTuple1)) + stats1 = (Form_pg_statistic) GETSTRUCT(statsTuple1); + + nd1 = get_att_numdistinct(relid1, attno1, typid1, stats1); + } + + if (unknown2) + { + nd2 = 100.0; + } + else + { + /* get info about the attribute */ + getattproperties(relid2, attno2, &typid2, &typmod2); + + /* get stats for the attribute, if available */ + statsTuple2 = SearchSysCache(STATRELATT, + ObjectIdGetDatum(relid2), + Int16GetDatum(attno2), + 0, 0); + if (HeapTupleIsValid(statsTuple2)) + stats2 = (Form_pg_statistic) GETSTRUCT(statsTuple2); + + nd2 = get_att_numdistinct(relid2, attno2, typid2, stats2); + } /* - * The join selectivity cannot be more than num2, since each tuple - * in table 1 could match no more than num2 fraction of tuples in - * table 2 (and that's only if the table-1 tuple matches the most - * common value in table 2, so probably it's less). By the same - * reasoning it is not more than num1. The min is therefore an - * upper bound. + * Estimate the join selectivity as 1 / sqrt(nd1*nd2) + * (can we produce any theory for this)? * - * If we know the dispersion of only one side, use it; the reasoning - * above still works. + * XXX possibility to do better: if both attributes have histograms + * then we could determine the exact join selectivity between the + * MCV sets, and only have to assume the join behavior of the non-MCV + * values. This could be a big win when the MCVs cover a large part + * of the population. * - * XXX can we make a better estimate here? Using the nullfrac - * statistic might be helpful, for example. Assuming the operator - * is strict (does not succeed for null inputs) then the - * selectivity couldn't be more than (1-nullfrac1)*(1-nullfrac2), - * which might be usefully small if there are many nulls. How - * about applying the operator to the most common values? + * XXX what about nulls? */ - min = (num1 < num2) ? num1 : num2; - result = min; + selec = 1.0 / sqrt(nd1 * nd2); + if (selec > 1.0) + selec = 1.0; + + if (HeapTupleIsValid(statsTuple1)) + ReleaseSysCache(statsTuple1); + if (HeapTupleIsValid(statsTuple2)) + ReleaseSysCache(statsTuple2); + } - PG_RETURN_FLOAT8(result); + PG_RETURN_FLOAT8((float8) selec); } /* @@ -829,7 +1040,8 @@ icnlikejoinsel(PG_FUNCTION_ARGS) * Returns "true" if successful. * * All numeric datatypes are simply converted to their equivalent - * "double" values. + * "double" values. XXX what about NUMERIC values that are outside + * the range of "double"? * * String datatypes are converted by convert_string_to_scalar(), * which is explained below. The reason why this routine deals with @@ -917,7 +1129,7 @@ convert_numeric_to_scalar(Datum value, Oid typid) { switch (typid) { - case BOOLOID: + case BOOLOID: return (double) DatumGetBool(value); case INT2OID: return (double) DatumGetInt16(value); @@ -963,6 +1175,8 @@ convert_numeric_to_scalar(Datum value, Oid typid) * three strings before computing the scaled values. This allows us to * "zoom in" when we encounter a narrow data range. An example is a phone * number database where all the values begin with the same area code. + * (Actually, the bounds will be adjacent histogram-bin-boundary values, + * so this is more likely to happen than you might think.) */ static void convert_string_to_scalar(unsigned char *value, @@ -1208,11 +1422,11 @@ convert_timevalue_to_scalar(Datum value, Oid typid) /* * getattproperties * Retrieve pg_attribute properties for an attribute, - * including type OID, type len, type byval flag, typmod. + * including type OID and typmod. */ static void getattproperties(Oid relid, AttrNumber attnum, - Oid *typid, int *typlen, bool *typbyval, int32 *typmod) + Oid *typid, int32 *typmod) { HeapTuple atp; Form_pg_attribute att_tup; @@ -1227,164 +1441,87 @@ getattproperties(Oid relid, AttrNumber attnum, att_tup = (Form_pg_attribute) GETSTRUCT(atp); *typid = att_tup->atttypid; - *typlen = att_tup->attlen; - *typbyval = att_tup->attbyval; *typmod = att_tup->atttypmod; ReleaseSysCache(atp); } /* - * getattstatistics - * Retrieve the pg_statistic data for an attribute. - * Returns 'false' if no stats are available. + * get_att_numdistinct * - * Inputs: - * 'relid' and 'attnum' are the relation and attribute number. - * 'typid' and 'typmod' are the type and typmod of the column, - * which the caller must already have looked up. + * Estimate the number of distinct values of an attribute. * - * Outputs: - * The available stats are nullfrac, commonfrac, commonval, loval, hival. - * The caller need not retrieve all five --- pass NULL pointers for the - * unwanted values. + * relid, attnum: identify the attribute to examine. + * typid: type of attribute. + * stats: pg_statistic tuple for attribute, or NULL if not available. * - * commonval, loval, hival are returned as Datums holding the internal - * representation of the values. (Note that these should be pfree'd - * after use if the data type is not by-value.) + * XXX possible future improvement: look to see if there is a unique + * index on the attribute. If so, we can estimate ndistinct = ntuples. + * This should probably override any info from pg_statistic. */ -static bool -getattstatistics(Oid relid, - AttrNumber attnum, - Oid typid, - int32 typmod, - double *nullfrac, - double *commonfrac, - Datum *commonval, - Datum *loval, - Datum *hival) +static double +get_att_numdistinct(Oid relid, AttrNumber attnum, Oid typid, + Form_pg_statistic stats) { - HeapTuple tuple; - HeapTuple typeTuple; - FmgrInfo inputproc; - Oid typelem; - bool isnull; + HeapTuple reltup; + double ntuples; /* - * We assume that there will only be one entry in pg_statistic for the - * given rel/att, so we search WITHOUT considering the staop column. - * Someday, VACUUM might store more than one entry per rel/att, - * corresponding to more than one possible sort ordering defined for - * the column type. However, to make that work we will need to figure - * out which staop to search for --- it's not necessarily the one we - * have at hand! (For example, we might have a '>' operator rather - * than the '<' operator that will appear in staop.) + * Special-case boolean columns: presumably, two distinct values. + * + * Are there any other cases we should wire in special estimates for? */ - tuple = SearchSysCache(STATRELID, - ObjectIdGetDatum(relid), - Int16GetDatum((int16) attnum), - 0, 0); - if (!HeapTupleIsValid(tuple)) - { - /* no such stats entry */ - return false; - } + if (typid == BOOLOID) + return 2.0; - if (nullfrac) - *nullfrac = ((Form_pg_statistic) GETSTRUCT(tuple))->stanullfrac; - if (commonfrac) - *commonfrac = ((Form_pg_statistic) GETSTRUCT(tuple))->stacommonfrac; - - /* Get the type input proc for the column datatype */ - typeTuple = SearchSysCache(TYPEOID, - ObjectIdGetDatum(typid), - 0, 0, 0); - if (!HeapTupleIsValid(typeTuple)) - elog(ERROR, "getattstatistics: Cache lookup failed for type %u", - typid); - fmgr_info(((Form_pg_type) GETSTRUCT(typeTuple))->typinput, &inputproc); - typelem = ((Form_pg_type) GETSTRUCT(typeTuple))->typelem; - ReleaseSysCache(typeTuple); + /* + * If VACUUM ANALYZE determined a fixed estimate, use it. + */ + if (stats && stats->stadistinct > 0.0) + return stats->stadistinct; /* - * Values are variable-length fields, so cannot access as struct - * fields. Must do it the hard way with SysCacheGetAttr. + * Otherwise we need to get the relation size. */ - if (commonval) - { - Datum val = SysCacheGetAttr(STATRELID, tuple, - Anum_pg_statistic_stacommonval, - &isnull); + reltup = SearchSysCache(RELOID, + ObjectIdGetDatum(relid), + 0, 0, 0); + if (!HeapTupleIsValid(reltup)) + elog(ERROR, "get_att_numdistinct: no relation tuple %u", relid); - if (isnull) - { - elog(DEBUG, "getattstatistics: stacommonval is null"); - *commonval = PointerGetDatum(NULL); - } - else - { - char *strval = DatumGetCString(DirectFunctionCall1(textout, - val)); - - *commonval = FunctionCall3(&inputproc, - CStringGetDatum(strval), - ObjectIdGetDatum(typelem), - Int32GetDatum(typmod)); - pfree(strval); - } - } + ntuples = ((Form_pg_class) GETSTRUCT(reltup))->reltuples; - if (loval) - { - Datum val = SysCacheGetAttr(STATRELID, tuple, - Anum_pg_statistic_staloval, - &isnull); + ReleaseSysCache(reltup); - if (isnull) - { - elog(DEBUG, "getattstatistics: staloval is null"); - *loval = PointerGetDatum(NULL); - } - else - { - char *strval = DatumGetCString(DirectFunctionCall1(textout, - val)); - - *loval = FunctionCall3(&inputproc, - CStringGetDatum(strval), - ObjectIdGetDatum(typelem), - Int32GetDatum(typmod)); - pfree(strval); - } - } + if (ntuples <= 0.0) + return 100.0; /* no data available; return a default */ - if (hival) - { - Datum val = SysCacheGetAttr(STATRELID, tuple, - Anum_pg_statistic_stahival, - &isnull); + /* + * If VACUUM ANALYZE determined a scaled estimate, use it. + */ + if (stats && stats->stadistinct < 0.0) + return - stats->stadistinct * ntuples; - if (isnull) - { - elog(DEBUG, "getattstatistics: stahival is null"); - *hival = PointerGetDatum(NULL); - } - else - { - char *strval = DatumGetCString(DirectFunctionCall1(textout, - val)); - - *hival = FunctionCall3(&inputproc, - CStringGetDatum(strval), - ObjectIdGetDatum(typelem), - Int32GetDatum(typmod)); - pfree(strval); - } + /* + * VACUUM ANALYZE does not compute stats for system attributes, + * but some of them can reasonably be assumed unique anyway. + */ + switch (attnum) + { + case ObjectIdAttributeNumber: + case SelfItemPointerAttributeNumber: + return ntuples; + case TableOidAttributeNumber: + return 1.0; } - ReleaseSysCache(tuple); + /* + * Estimate ndistinct = ntuples if the table is small, else 100. + */ + if (ntuples < 100.0) + return ntuples; - return true; + return 100.0; } /*------------------------------------------------------------------------- diff --git a/src/backend/utils/cache/lsyscache.c b/src/backend/utils/cache/lsyscache.c index 82d55866215..3995de5d7a1 100644 --- a/src/backend/utils/cache/lsyscache.c +++ b/src/backend/utils/cache/lsyscache.c @@ -7,7 +7,7 @@ * Portions Copyright (c) 1994, Regents of the University of California * * IDENTIFICATION - * $Header: /cvsroot/pgsql/src/backend/utils/cache/lsyscache.c,v 1.52 2001/03/23 04:49:55 momjian Exp $ + * $Header: /cvsroot/pgsql/src/backend/utils/cache/lsyscache.c,v 1.53 2001/05/07 00:43:24 tgl Exp $ * * NOTES * Eventually, the index information should go through here, too. @@ -18,7 +18,10 @@ #include "access/tupmacs.h" #include "catalog/pg_operator.h" #include "catalog/pg_proc.h" +#include "catalog/pg_statistic.h" #include "catalog/pg_type.h" +#include "utils/array.h" +#include "utils/builtins.h" #include "utils/lsyscache.h" #include "utils/syscache.h" @@ -182,106 +185,6 @@ get_atttypmod(Oid relid, AttrNumber attnum) return -1; } -/* - * get_attdispersion - * - * Retrieve the dispersion statistic for an attribute, - * or produce an estimate if no info is available. - * - * min_estimate is the minimum estimate to return if insufficient data - * is available to produce a reliable value. This value may vary - * depending on context. (For example, when deciding whether it is - * safe to use a hashjoin, we want to be more conservative than when - * estimating the number of tuples produced by an equijoin.) - */ -double -get_attdispersion(Oid relid, AttrNumber attnum, double min_estimate) -{ - HeapTuple atp; - Form_pg_attribute att_tup; - double dispersion; - Oid atttypid; - int32 ntuples; - - atp = SearchSysCache(ATTNUM, - ObjectIdGetDatum(relid), - Int16GetDatum(attnum), - 0, 0); - if (!HeapTupleIsValid(atp)) - { - /* this should not happen */ - elog(ERROR, "get_attdispersion: no attribute tuple %u %d", - relid, attnum); - return min_estimate; - } - - att_tup = (Form_pg_attribute) GETSTRUCT(atp); - - dispersion = att_tup->attdispersion; - atttypid = att_tup->atttypid; - - ReleaseSysCache(atp); - - if (dispersion > 0.0) - return dispersion; /* we have a specific estimate from VACUUM */ - - /* - * Special-case boolean columns: the dispersion of a boolean is highly - * unlikely to be anywhere near 1/numtuples, instead it's probably - * more like 0.5. - * - * Are there any other cases we should wire in special estimates for? - */ - if (atttypid == BOOLOID) - return 0.5; - - /* - * Dispersion is either 0 (no data available) or -1 (dispersion is - * 1/numtuples). Either way, we need the relation size. - */ - - atp = SearchSysCache(RELOID, - ObjectIdGetDatum(relid), - 0, 0, 0); - if (!HeapTupleIsValid(atp)) - { - /* this should not happen */ - elog(ERROR, "get_attdispersion: no relation tuple %u", relid); - return min_estimate; - } - - ntuples = ((Form_pg_class) GETSTRUCT(atp))->reltuples; - - ReleaseSysCache(atp); - - if (ntuples == 0) - return min_estimate; /* no data available */ - - if (dispersion < 0.0) /* VACUUM thinks there are no duplicates */ - return 1.0 / (double) ntuples; - - /* - * VACUUM ANALYZE does not compute dispersion for system attributes, - * but some of them can reasonably be assumed unique anyway. - */ - if (attnum == ObjectIdAttributeNumber || - attnum == SelfItemPointerAttributeNumber) - return 1.0 / (double) ntuples; - if (attnum == TableOidAttributeNumber) - return 1.0; - - /* - * VACUUM ANALYZE has not been run for this table. Produce an estimate - * of 1/numtuples. This may produce unreasonably small estimates for - * large tables, so limit the estimate to no less than min_estimate. - */ - dispersion = 1.0 / (double) ntuples; - if (dispersion < min_estimate) - dispersion = min_estimate; - - return dispersion; -} - /* ---------- INDEX CACHE ---------- */ /* watch this space... @@ -876,3 +779,157 @@ get_typtype(Oid typid) } #endif + +/* ---------- STATISTICS CACHE ---------- */ + +/* + * get_attstatsslot + * + * Extract the contents of a "slot" of a pg_statistic tuple. + * Returns TRUE if requested slot type was found, else FALSE. + * + * Unlike other routines in this file, this takes a pointer to an + * already-looked-up tuple in the pg_statistic cache. We do this since + * most callers will want to extract more than one value from the cache + * entry, and we don't want to repeat the cache lookup unnecessarily. + * + * statstuple: pg_statistics tuple to be examined. + * atttype: type OID of attribute. + * atttypmod: typmod of attribute. + * reqkind: STAKIND code for desired statistics slot kind. + * reqop: STAOP value wanted, or InvalidOid if don't care. + * values, nvalues: if not NULL, the slot's stavalues are extracted. + * numbers, nnumbers: if not NULL, the slot's stanumbers are extracted. + * + * If assigned, values and numbers are set to point to palloc'd arrays. + * If the attribute type is pass-by-reference, the values referenced by + * the values array are themselves palloc'd. The palloc'd stuff can be + * freed by calling free_attstatsslot. + */ +bool +get_attstatsslot(HeapTuple statstuple, + Oid atttype, int32 atttypmod, + int reqkind, Oid reqop, + Datum **values, int *nvalues, + float4 **numbers, int *nnumbers) +{ + Form_pg_statistic stats = (Form_pg_statistic) GETSTRUCT(statstuple); + int i, + j; + Datum val; + bool isnull; + ArrayType *statarray; + int narrayelem; + HeapTuple typeTuple; + FmgrInfo inputproc; + Oid typelem; + + for (i = 0; i < STATISTIC_NUM_SLOTS; i++) + { + if ((&stats->stakind1)[i] == reqkind && + (reqop == InvalidOid || (&stats->staop1)[i] == reqop)) + break; + } + if (i >= STATISTIC_NUM_SLOTS) + return false; /* not there */ + + if (values) + { + val = SysCacheGetAttr(STATRELATT, statstuple, + Anum_pg_statistic_stavalues1 + i, + &isnull); + if (isnull) + elog(ERROR, "get_attstatsslot: stavalues is null"); + statarray = DatumGetArrayTypeP(val); + /* + * Do initial examination of the array. This produces a list + * of text Datums --- ie, pointers into the text array value. + */ + deconstruct_array(statarray, false, -1, 'i', values, nvalues); + narrayelem = *nvalues; + /* + * We now need to replace each text Datum by its internal equivalent. + * + * Get the type input proc and typelem for the column datatype. + */ + typeTuple = SearchSysCache(TYPEOID, + ObjectIdGetDatum(atttype), + 0, 0, 0); + if (!HeapTupleIsValid(typeTuple)) + elog(ERROR, "get_attstatsslot: Cache lookup failed for type %u", + atttype); + fmgr_info(((Form_pg_type) GETSTRUCT(typeTuple))->typinput, &inputproc); + typelem = ((Form_pg_type) GETSTRUCT(typeTuple))->typelem; + ReleaseSysCache(typeTuple); + /* + * Do the conversions. The palloc'd array of Datums is reused + * in place. + */ + for (j = 0; j < narrayelem; j++) + { + char *strval; + + strval = DatumGetCString(DirectFunctionCall1(textout, + (*values)[j])); + (*values)[j] = FunctionCall3(&inputproc, + CStringGetDatum(strval), + ObjectIdGetDatum(typelem), + Int32GetDatum(atttypmod)); + pfree(strval); + } + /* + * Free statarray if it's a detoasted copy. + */ + if ((Pointer) statarray != DatumGetPointer(val)) + pfree(statarray); + } + + if (numbers) + { + val = SysCacheGetAttr(STATRELATT, statstuple, + Anum_pg_statistic_stanumbers1 + i, + &isnull); + if (isnull) + elog(ERROR, "get_attstatsslot: stanumbers is null"); + statarray = DatumGetArrayTypeP(val); + /* + * We expect the array to be a 1-D float4 array; verify that. + * We don't need to use deconstruct_array() since the array + * data is just going to look like a C array of float4 values. + */ + narrayelem = ARR_DIMS(statarray)[0]; + if (ARR_NDIM(statarray) != 1 || narrayelem <= 0 || + ARR_SIZE(statarray) != (ARR_OVERHEAD(1) + narrayelem * sizeof(float4))) + elog(ERROR, "get_attstatsslot: stanumbers is bogus"); + *numbers = (float4 *) palloc(narrayelem * sizeof(float4)); + memcpy(*numbers, ARR_DATA_PTR(statarray), narrayelem * sizeof(float4)); + *nnumbers = narrayelem; + /* + * Free statarray if it's a detoasted copy. + */ + if ((Pointer) statarray != DatumGetPointer(val)) + pfree(statarray); + } + + return true; +} + +void +free_attstatsslot(Oid atttype, + Datum *values, int nvalues, + float4 *numbers, int nnumbers) +{ + if (values) + { + if (! get_typbyval(atttype)) + { + int i; + + for (i = 0; i < nvalues; i++) + pfree(DatumGetPointer(values[i])); + } + pfree(values); + } + if (numbers) + pfree(numbers); +} diff --git a/src/backend/utils/cache/syscache.c b/src/backend/utils/cache/syscache.c index 75ef3179202..4e35b3fb35b 100644 --- a/src/backend/utils/cache/syscache.c +++ b/src/backend/utils/cache/syscache.c @@ -8,7 +8,7 @@ * * * IDENTIFICATION - * $Header: /cvsroot/pgsql/src/backend/utils/cache/syscache.c,v 1.60 2001/03/22 03:59:57 momjian Exp $ + * $Header: /cvsroot/pgsql/src/backend/utils/cache/syscache.c,v 1.61 2001/05/07 00:43:24 tgl Exp $ * * NOTES * These routines allow the parser/planner/executor to perform @@ -313,7 +313,7 @@ static struct cachedesc cacheinfo[] = { 0, 0 }}, - {StatisticRelationName, /* STATRELID */ + {StatisticRelationName, /* STATRELATT */ StatisticRelidAttnumIndex, 2, { diff --git a/src/backend/utils/sort/tuplesort.c b/src/backend/utils/sort/tuplesort.c index d27bfb29668..5a77c47c200 100644 --- a/src/backend/utils/sort/tuplesort.c +++ b/src/backend/utils/sort/tuplesort.c @@ -78,7 +78,7 @@ * Portions Copyright (c) 1994, Regents of the University of California * * IDENTIFICATION - * $Header: /cvsroot/pgsql/src/backend/utils/sort/tuplesort.c,v 1.15 2001/03/23 04:49:55 momjian Exp $ + * $Header: /cvsroot/pgsql/src/backend/utils/sort/tuplesort.c,v 1.16 2001/05/07 00:43:24 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -87,7 +87,11 @@ #include "access/heapam.h" #include "access/nbtree.h" +#include "catalog/catname.h" +#include "catalog/pg_amop.h" +#include "catalog/pg_amproc.h" #include "miscadmin.h" +#include "utils/fmgroids.h" #include "utils/logtape.h" #include "utils/lsyscache.h" #include "utils/tuplesort.h" @@ -263,6 +267,7 @@ struct Tuplesortstate TupleDesc tupDesc; int nKeys; ScanKey scanKeys; + SortFunctionKind *sortFnKinds; /* * These variables are specific to the IndexTuple case; they are set @@ -279,6 +284,7 @@ struct Tuplesortstate Oid datumType; Oid sortOperator; FmgrInfo sortOpFn; /* cached lookup data for sortOperator */ + SortFunctionKind sortFnKind; /* we need typelen and byval in order to know how to copy the Datums. */ int datumTypeLen; bool datumTypeByVal; @@ -458,14 +464,14 @@ tuplesort_begin_common(bool randomAccess) Tuplesortstate * tuplesort_begin_heap(TupleDesc tupDesc, - int nkeys, ScanKey keys, + int nkeys, + Oid *sortOperators, AttrNumber *attNums, bool randomAccess) { Tuplesortstate *state = tuplesort_begin_common(randomAccess); + int i; - AssertArg(nkeys >= 1); - AssertArg(keys[0].sk_attno != 0); - AssertArg(keys[0].sk_procedure != 0); + AssertArg(nkeys > 0); state->comparetup = comparetup_heap; state->copytup = copytup_heap; @@ -475,7 +481,29 @@ tuplesort_begin_heap(TupleDesc tupDesc, state->tupDesc = tupDesc; state->nKeys = nkeys; - state->scanKeys = keys; + state->scanKeys = (ScanKey) palloc(nkeys * sizeof(ScanKeyData)); + MemSet(state->scanKeys, 0, nkeys * sizeof(ScanKeyData)); + state->sortFnKinds = (SortFunctionKind *) + palloc(nkeys * sizeof(SortFunctionKind)); + MemSet(state->sortFnKinds, 0, nkeys * sizeof(SortFunctionKind)); + + for (i = 0; i < nkeys; i++) + { + RegProcedure sortFunction; + + AssertArg(sortOperators[i] != 0); + AssertArg(attNums[i] != 0); + + /* select a function that implements the sort operator */ + SelectSortFunction(sortOperators[i], &sortFunction, + &state->sortFnKinds[i]); + + ScanKeyEntryInitialize(&state->scanKeys[i], + 0x0, + attNums[i], + sortFunction, + (Datum) 0); + } return state; } @@ -507,6 +535,7 @@ tuplesort_begin_datum(Oid datumType, bool randomAccess) { Tuplesortstate *state = tuplesort_begin_common(randomAccess); + RegProcedure sortFunction; int16 typlen; bool typbyval; @@ -518,8 +547,12 @@ tuplesort_begin_datum(Oid datumType, state->datumType = datumType; state->sortOperator = sortOperator; - /* lookup the function that implements the sort operator */ - fmgr_info(get_opcode(sortOperator), &state->sortOpFn); + + /* select a function that implements the sort operator */ + SelectSortFunction(sortOperator, &sortFunction, &state->sortFnKind); + /* and look up the function */ + fmgr_info(sortFunction, &state->sortOpFn); + /* lookup necessary attributes of the datum type */ get_typlenbyval(datumType, &typlen, &typbyval); state->datumTypeLen = typlen; @@ -548,6 +581,13 @@ tuplesort_end(Tuplesortstate *state) } if (state->memtupindex) pfree(state->memtupindex); + + /* this stuff might better belong in a variant-specific shutdown routine */ + if (state->scanKeys) + pfree(state->scanKeys); + if (state->sortFnKinds) + pfree(state->sortFnKinds); + pfree(state); } @@ -1692,6 +1732,7 @@ comparetup_heap(Tuplesortstate *state, const void *a, const void *b) for (nkey = 0; nkey < state->nKeys; nkey++) { ScanKey scanKey = state->scanKeys + nkey; + SortFunctionKind fnKind = state->sortFnKinds[nkey]; AttrNumber attno = scanKey->sk_attno; Datum lattr, rattr; @@ -1708,23 +1749,36 @@ comparetup_heap(Tuplesortstate *state, const void *a, const void *b) } else if (isnull2) return -1; - else if (scanKey->sk_flags & SK_COMMUTE) - { - if (DatumGetBool(FunctionCall2(&scanKey->sk_func, - rattr, lattr))) - return -1; /* a < b after commute */ - if (DatumGetBool(FunctionCall2(&scanKey->sk_func, - lattr, rattr))) - return 1; /* a > b after commute */ - } else { - if (DatumGetBool(FunctionCall2(&scanKey->sk_func, - lattr, rattr))) - return -1; /* a < b */ - if (DatumGetBool(FunctionCall2(&scanKey->sk_func, - rattr, lattr))) - return 1; /* a > b */ + int32 compare; + + if (fnKind == SORTFUNC_LT) + { + if (DatumGetBool(FunctionCall2(&scanKey->sk_func, + lattr, rattr))) + compare = -1; /* a < b */ + else if (DatumGetBool(FunctionCall2(&scanKey->sk_func, + rattr, lattr))) + compare = 1; /* a > b */ + else + compare = 0; + } + else + { + /* sort function is CMP or REVCMP */ + compare = DatumGetInt32(FunctionCall2(&scanKey->sk_func, + lattr, rattr)); + if (fnKind == SORTFUNC_REVCMP) + compare = -compare; + } + + if (compare != 0) + { + if (scanKey->sk_flags & SK_COMMUTE) + compare = -compare; + return compare; + } } } @@ -1852,8 +1906,10 @@ comparetup_index(Tuplesortstate *state, const void *a, const void *b) } else { + /* the comparison function is always of CMP type */ compare = DatumGetInt32(FunctionCall2(&entry->sk_func, - attrDatum1, attrDatum2)); + attrDatum1, + attrDatum2)); } if (compare != 0) @@ -1954,7 +2010,7 @@ comparetup_datum(Tuplesortstate *state, const void *a, const void *b) } else if (rtup->isNull) return -1; - else + else if (state->sortFnKind == SORTFUNC_LT) { if (DatumGetBool(FunctionCall2(&state->sortOpFn, ltup->val, rtup->val))) @@ -1964,6 +2020,17 @@ comparetup_datum(Tuplesortstate *state, const void *a, const void *b) return 1; /* a > b */ return 0; } + else + { + /* sort function is CMP or REVCMP */ + int32 compare; + + compare = DatumGetInt32(FunctionCall2(&state->sortOpFn, + ltup->val, rtup->val)); + if (state->sortFnKind == SORTFUNC_REVCMP) + compare = -compare; + return compare; + } } static void * @@ -2032,3 +2099,119 @@ tuplesize_datum(Tuplesortstate *state, void *tup) return (unsigned int) tuplelen; } } + + +/* + * This routine selects an appropriate sorting function to implement + * a sort operator as efficiently as possible. The straightforward + * method is to use the operator's implementation proc --- ie, "<" + * comparison. However, that way often requires two calls of the function + * per comparison. If we can find a btree three-way comparator function + * associated with the operator, we can use it to do the comparisons + * more efficiently. We also support the possibility that the operator + * is ">" (descending sort), in which case we have to reverse the output + * of the btree comparator. + * + * Possibly this should live somewhere else (backend/catalog/, maybe?). + */ +void +SelectSortFunction(Oid sortOperator, + RegProcedure *sortFunction, + SortFunctionKind *kind) +{ + Relation relation; + HeapScanDesc scan; + ScanKeyData skey[3]; + HeapTuple tuple; + Oid opclass = InvalidOid; + + /* + * Scan pg_amop to see if the target operator is registered as the + * "<" or ">" operator of any btree opclass. It's possible that it + * might be registered both ways (eg, if someone were to build a + * "reverse sort" opclass for some reason); prefer the "<" case if so. + * If the operator is registered the same way in multiple opclasses, + * assume we can use the associated comparator function from any one. + */ + relation = heap_openr(AccessMethodOperatorRelationName, + AccessShareLock); + + ScanKeyEntryInitialize(&skey[0], 0, + Anum_pg_amop_amopid, + F_OIDEQ, + ObjectIdGetDatum(BTREE_AM_OID)); + + ScanKeyEntryInitialize(&skey[1], 0, + Anum_pg_amop_amopopr, + F_OIDEQ, + ObjectIdGetDatum(sortOperator)); + + scan = heap_beginscan(relation, false, SnapshotNow, 2, skey); + + while (HeapTupleIsValid(tuple = heap_getnext(scan, 0))) + { + Form_pg_amop aform = (Form_pg_amop) GETSTRUCT(tuple); + + if (aform->amopstrategy == BTLessStrategyNumber) + { + opclass = aform->amopclaid; + *kind = SORTFUNC_CMP; + break; /* done looking */ + } + else if (aform->amopstrategy == BTGreaterStrategyNumber) + { + opclass = aform->amopclaid; + *kind = SORTFUNC_REVCMP; + /* keep scanning in hopes of finding a BTLess entry */ + } + } + + heap_endscan(scan); + heap_close(relation, AccessShareLock); + + if (OidIsValid(opclass)) + { + /* Found a suitable opclass, get its comparator support function */ + relation = heap_openr(AccessMethodProcedureRelationName, + AccessShareLock); + + ScanKeyEntryInitialize(&skey[0], 0, + Anum_pg_amproc_amid, + F_OIDEQ, + ObjectIdGetDatum(BTREE_AM_OID)); + + ScanKeyEntryInitialize(&skey[1], 0, + Anum_pg_amproc_amopclaid, + F_OIDEQ, + ObjectIdGetDatum(opclass)); + + ScanKeyEntryInitialize(&skey[2], 0, + Anum_pg_amproc_amprocnum, + F_INT2EQ, + Int16GetDatum(BTORDER_PROC)); + + scan = heap_beginscan(relation, false, SnapshotNow, 3, skey); + + *sortFunction = InvalidOid; + + if (HeapTupleIsValid(tuple = heap_getnext(scan, 0))) + { + Form_pg_amproc aform = (Form_pg_amproc) GETSTRUCT(tuple); + *sortFunction = aform->amproc; + } + + heap_endscan(scan); + heap_close(relation, AccessShareLock); + + if (RegProcedureIsValid(*sortFunction)) + return; + } + + /* Can't find a comparator, so use the operator as-is */ + + *kind = SORTFUNC_LT; + *sortFunction = get_opcode(sortOperator); + if (!RegProcedureIsValid(*sortFunction)) + elog(ERROR, "SelectSortFunction: operator %u has no implementation", + sortOperator); +} |