diff options
Diffstat (limited to 'src/include')
-rw-r--r-- | src/include/lib/hyperloglog.h | 67 | ||||
-rw-r--r-- | src/include/pg_config_manual.h | 14 | ||||
-rw-r--r-- | src/include/utils/sortsupport.h | 134 |
3 files changed, 206 insertions, 9 deletions
diff --git a/src/include/lib/hyperloglog.h b/src/include/lib/hyperloglog.h new file mode 100644 index 00000000000..a6cbffc4c32 --- /dev/null +++ b/src/include/lib/hyperloglog.h @@ -0,0 +1,67 @@ +/* + * hyperloglog.h + * + * A simple HyperLogLog cardinality estimator implementation + * + * Portions Copyright (c) 2014, PostgreSQL Global Development Group + * + * Based on Hideaki Ohno's C++ implementation. The copyright terms of Ohno's + * original version (the MIT license) follow. + * + * src/include/lib/hyperloglog.h + */ + +/* + * Copyright (c) 2013 Hideaki Ohno <hide.o.j55{at}gmail.com> + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the 'Software'), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED 'AS IS', WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#ifndef HYPERLOGLOG_H +#define HYPERLOGLOG_H + +/* + * HyperLogLog is an approximate technique for computing the number of distinct + * entries in a set. Importantly, it does this by using a fixed amount of + * memory. See the 2007 paper "HyperLogLog: the analysis of a near-optimal + * cardinality estimation algorithm" for more. + * + * hyperLogLogState + * + * registerWidth register width, in bits ("k") + * nRegisters number of registers + * alphaMM alpha * m ^ 2 (see initHyperLogLog()) + * hashesArr array of hashes + * arrSize size of hashesArr + */ +typedef struct hyperLogLogState +{ + uint8 registerWidth; + Size nRegisters; + double alphaMM; + uint8 *hashesArr; + Size arrSize; +} hyperLogLogState; + +extern void initHyperLogLog(hyperLogLogState *cState, uint8 bwidth); +extern void addHyperLogLog(hyperLogLogState *cState, uint32 hash); +extern double estimateHyperLogLog(hyperLogLogState *cState); +extern void mergeHyperLogLog(hyperLogLogState *cState, const hyperLogLogState *oState); + +#endif /* HYPERLOGLOG_H */ diff --git a/src/include/pg_config_manual.h b/src/include/pg_config_manual.h index cb35697ad07..5cfc0ae1e80 100644 --- a/src/include/pg_config_manual.h +++ b/src/include/pg_config_manual.h @@ -214,13 +214,13 @@ #endif /* - * Assumed cache line size. This doesn't affect correctness, but can be - * used for low-level optimizations. Currently, this is only used to pad - * some data structures in xlog.c, to ensure that highly-contended fields - * are on different cache lines. Too small a value can hurt performance due - * to false sharing, while the only downside of too large a value is a few - * bytes of wasted memory. The default is 128, which should be large enough - * for all supported platforms. + * Assumed cache line size. This doesn't affect correctness, but can be used + * for low-level optimizations. Currently, this is used to pad some data + * structures in xlog.c, to ensure that highly-contended fields are on + * different cache lines. Too small a value can hurt performance due to false + * sharing, while the only downside of too large a value is a few bytes of + * wasted memory. The default is 128, which should be large enough for all + * supported platforms. */ #define PG_CACHE_LINE_SIZE 128 diff --git a/src/include/utils/sortsupport.h b/src/include/utils/sortsupport.h index f379bb47029..62fedfaaad4 100644 --- a/src/include/utils/sortsupport.h +++ b/src/include/utils/sortsupport.h @@ -21,7 +21,12 @@ * required to provide all of them. The BTSORTSUPPORT function should * simply not set any function pointers for mechanisms it doesn't support. * Opclasses that provide BTSORTSUPPORT and don't provide a comparator - * function will have a shim set up by sort support automatically. + * function will have a shim set up by sort support automatically. However, + * opclasses that support the optional additional abbreviated key capability + * must always provide an authoritative comparator used to tie-break + * inconclusive abbreviated comparisons and also used when aborting + * abbreviation. Furthermore, a converter and abort/costing function must be + * provided. * * All sort support functions will be passed the address of the * SortSupportData struct when called, so they can use it to store @@ -93,12 +98,96 @@ typedef struct SortSupportData * than, equal to, or greater than y. Note that x and y are guaranteed * not null, and there is no way to return null either. Do not return * INT_MIN, as callers are allowed to negate the result before using it. + * + * This may be either the authoritative comparator, or the abbreviated + * comparator. Core code may switch this over the initial preference of an + * opclass support function despite originally indicating abbreviation was + * applicable, by assigning the authoritative comparator back. */ int (*comparator) (Datum x, Datum y, SortSupport ssup); /* - * Additional sort-acceleration functions might be added here later. + * "Abbreviated key" infrastructure follows. + * + * All callbacks must be set by sortsupport opclasses that make use of this + * optional additional infrastructure (unless for whatever reasons the + * opclass doesn't proceed with abbreviation, in which case + * abbrev_converter must not be set). + * + * This allows opclass authors to supply a conversion routine, used to + * create an alternative representation of the underlying type (an + * "abbreviated key"). Typically, this representation is an ad-hoc, + * pass-by-value Datum format that only the opclass has knowledge of. An + * alternative comparator, used only with this alternative representation + * must also be provided (which is assigned to "comparator"). This + * representation is a simple approximation of the original Datum. It must + * be possible to compare datums of this representation with each other + * using the supplied alternative comparator, and have any non-zero return + * value be a reliable proxy for what a proper comparison would indicate. + * Returning zero from the alternative comparator does not indicate + * equality, as with a conventional support routine 1, though -- it + * indicates that it wasn't possible to determine how the two abbreviated + * values compared. A proper comparison, using "abbrev_full_comparator"/ + * ApplySortAbbrevFullComparator() is therefore required. In many cases + * this results in most or all comparisons only using the cheap alternative + * comparison func, which is typically implemented as code that compiles to + * just a few CPU instructions. CPU cache miss penalties are expensive; to + * get good overall performance, sort infrastructure must heavily weigh + * cache performance. + * + * Opclass authors must consider the final cardinality of abbreviated keys + * when devising an encoding scheme. It's possible for a strategy to work + * better than an alternative strategy with one usage pattern, while the + * reverse might be true for another usage pattern. All of these factors + * must be considered. */ + + /* + * "abbreviate" concerns whether or not the abbreviated key optimization is + * applicable in principle (that is, the sortsupport routine needs to know + * if its dealing with a key where an abbreviated representation can + * usefully be packed together. Conventionally, this is the leading + * attribute key). Note, however, that in order to determine that + * abbreviation is not in play, the core code always checks whether or not + * the opclass has set abbrev_converter. This is a one way, one time + * message to the opclass. + */ + bool abbreviate; + + /* + * Converter to abbreviated format, from original representation. Core + * code uses this callback to convert from a pass-by-reference "original" + * Datum to a pass-by-value abbreviated key Datum. Note that original is + * guaranteed NOT NULL, because it doesn't make sense to factor NULLness + * into ad-hoc cost model. + * + * abbrev_converter is tested to see if abbreviation is in play. Core code + * may set it to NULL to indicate abbreviation should not be used (which is + * something sortsupport routines need not concern themselves with). + * However, sortsupport routines must not set it when it is immediately + * established that abbreviation should not proceed (for abbreviation + * calls, or platform-specific impediments to using abbreviation). + */ + Datum (*abbrev_converter) (Datum original, SortSupport ssup); + + /* + * abbrev_abort callback allows clients to verify that the current strategy + * is working out, using a sortsupport routine defined ad-hoc cost model. + * If there is a lot of duplicate abbreviated keys in practice, it's useful + * to be able to abandon the strategy before paying too high a cost in + * conversion (perhaps certain opclass-specific adaptations are useful + * too). + */ + bool (*abbrev_abort) (int memtupcount, SortSupport ssup); + + /* + * Full, authoritative comparator for key that an abbreviated + * representation was generated for, used when an abbreviated comparison + * was inconclusive (by calling ApplySortComparatorFull()), or used to + * replace "comparator" when core system ultimately decides against + * abbreviation. + */ + int (*abbrev_full_comparator) (Datum x, Datum y, SortSupport ssup); } SortSupportData; @@ -110,6 +199,9 @@ typedef struct SortSupportData extern int ApplySortComparator(Datum datum1, bool isNull1, Datum datum2, bool isNull2, SortSupport ssup); +extern int ApplySortAbbrevFullComparator(Datum datum1, bool isNull1, + Datum datum2, bool isNull2, + SortSupport ssup); #endif /* !PG_USE_INLINE */ #if defined(PG_USE_INLINE) || defined(SORTSUPPORT_INCLUDE_DEFINITIONS) /* @@ -148,6 +240,44 @@ ApplySortComparator(Datum datum1, bool isNull1, return compare; } + +/* + * Apply a sort comparator function and return a 3-way comparison using full, + * authoritative comparator. This takes care of handling reverse-sort and + * NULLs-ordering properly. + */ +STATIC_IF_INLINE int +ApplySortAbbrevFullComparator(Datum datum1, bool isNull1, + Datum datum2, bool isNull2, + SortSupport ssup) +{ + int compare; + + if (isNull1) + { + if (isNull2) + compare = 0; /* NULL "=" NULL */ + else if (ssup->ssup_nulls_first) + compare = -1; /* NULL "<" NOT_NULL */ + else + compare = 1; /* NULL ">" NOT_NULL */ + } + else if (isNull2) + { + if (ssup->ssup_nulls_first) + compare = 1; /* NOT_NULL ">" NULL */ + else + compare = -1; /* NOT_NULL "<" NULL */ + } + else + { + compare = (*ssup->abbrev_full_comparator) (datum1, datum2, ssup); + if (ssup->ssup_reverse) + compare = -compare; + } + + return compare; +} #endif /*-- PG_USE_INLINE || SORTSUPPORT_INCLUDE_DEFINITIONS */ /* Other functions in utils/sort/sortsupport.c */ |