aboutsummaryrefslogtreecommitdiff
path: root/src/include
diff options
context:
space:
mode:
Diffstat (limited to 'src/include')
-rw-r--r--src/include/lib/hyperloglog.h67
-rw-r--r--src/include/pg_config_manual.h14
-rw-r--r--src/include/utils/sortsupport.h134
3 files changed, 206 insertions, 9 deletions
diff --git a/src/include/lib/hyperloglog.h b/src/include/lib/hyperloglog.h
new file mode 100644
index 00000000000..a6cbffc4c32
--- /dev/null
+++ b/src/include/lib/hyperloglog.h
@@ -0,0 +1,67 @@
+/*
+ * hyperloglog.h
+ *
+ * A simple HyperLogLog cardinality estimator implementation
+ *
+ * Portions Copyright (c) 2014, PostgreSQL Global Development Group
+ *
+ * Based on Hideaki Ohno's C++ implementation. The copyright terms of Ohno's
+ * original version (the MIT license) follow.
+ *
+ * src/include/lib/hyperloglog.h
+ */
+
+/*
+ * Copyright (c) 2013 Hideaki Ohno <hide.o.j55{at}gmail.com>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the 'Software'), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED 'AS IS', WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#ifndef HYPERLOGLOG_H
+#define HYPERLOGLOG_H
+
+/*
+ * HyperLogLog is an approximate technique for computing the number of distinct
+ * entries in a set. Importantly, it does this by using a fixed amount of
+ * memory. See the 2007 paper "HyperLogLog: the analysis of a near-optimal
+ * cardinality estimation algorithm" for more.
+ *
+ * hyperLogLogState
+ *
+ * registerWidth register width, in bits ("k")
+ * nRegisters number of registers
+ * alphaMM alpha * m ^ 2 (see initHyperLogLog())
+ * hashesArr array of hashes
+ * arrSize size of hashesArr
+ */
+typedef struct hyperLogLogState
+{
+ uint8 registerWidth;
+ Size nRegisters;
+ double alphaMM;
+ uint8 *hashesArr;
+ Size arrSize;
+} hyperLogLogState;
+
+extern void initHyperLogLog(hyperLogLogState *cState, uint8 bwidth);
+extern void addHyperLogLog(hyperLogLogState *cState, uint32 hash);
+extern double estimateHyperLogLog(hyperLogLogState *cState);
+extern void mergeHyperLogLog(hyperLogLogState *cState, const hyperLogLogState *oState);
+
+#endif /* HYPERLOGLOG_H */
diff --git a/src/include/pg_config_manual.h b/src/include/pg_config_manual.h
index cb35697ad07..5cfc0ae1e80 100644
--- a/src/include/pg_config_manual.h
+++ b/src/include/pg_config_manual.h
@@ -214,13 +214,13 @@
#endif
/*
- * Assumed cache line size. This doesn't affect correctness, but can be
- * used for low-level optimizations. Currently, this is only used to pad
- * some data structures in xlog.c, to ensure that highly-contended fields
- * are on different cache lines. Too small a value can hurt performance due
- * to false sharing, while the only downside of too large a value is a few
- * bytes of wasted memory. The default is 128, which should be large enough
- * for all supported platforms.
+ * Assumed cache line size. This doesn't affect correctness, but can be used
+ * for low-level optimizations. Currently, this is used to pad some data
+ * structures in xlog.c, to ensure that highly-contended fields are on
+ * different cache lines. Too small a value can hurt performance due to false
+ * sharing, while the only downside of too large a value is a few bytes of
+ * wasted memory. The default is 128, which should be large enough for all
+ * supported platforms.
*/
#define PG_CACHE_LINE_SIZE 128
diff --git a/src/include/utils/sortsupport.h b/src/include/utils/sortsupport.h
index f379bb47029..62fedfaaad4 100644
--- a/src/include/utils/sortsupport.h
+++ b/src/include/utils/sortsupport.h
@@ -21,7 +21,12 @@
* required to provide all of them. The BTSORTSUPPORT function should
* simply not set any function pointers for mechanisms it doesn't support.
* Opclasses that provide BTSORTSUPPORT and don't provide a comparator
- * function will have a shim set up by sort support automatically.
+ * function will have a shim set up by sort support automatically. However,
+ * opclasses that support the optional additional abbreviated key capability
+ * must always provide an authoritative comparator used to tie-break
+ * inconclusive abbreviated comparisons and also used when aborting
+ * abbreviation. Furthermore, a converter and abort/costing function must be
+ * provided.
*
* All sort support functions will be passed the address of the
* SortSupportData struct when called, so they can use it to store
@@ -93,12 +98,96 @@ typedef struct SortSupportData
* than, equal to, or greater than y. Note that x and y are guaranteed
* not null, and there is no way to return null either. Do not return
* INT_MIN, as callers are allowed to negate the result before using it.
+ *
+ * This may be either the authoritative comparator, or the abbreviated
+ * comparator. Core code may switch this over the initial preference of an
+ * opclass support function despite originally indicating abbreviation was
+ * applicable, by assigning the authoritative comparator back.
*/
int (*comparator) (Datum x, Datum y, SortSupport ssup);
/*
- * Additional sort-acceleration functions might be added here later.
+ * "Abbreviated key" infrastructure follows.
+ *
+ * All callbacks must be set by sortsupport opclasses that make use of this
+ * optional additional infrastructure (unless for whatever reasons the
+ * opclass doesn't proceed with abbreviation, in which case
+ * abbrev_converter must not be set).
+ *
+ * This allows opclass authors to supply a conversion routine, used to
+ * create an alternative representation of the underlying type (an
+ * "abbreviated key"). Typically, this representation is an ad-hoc,
+ * pass-by-value Datum format that only the opclass has knowledge of. An
+ * alternative comparator, used only with this alternative representation
+ * must also be provided (which is assigned to "comparator"). This
+ * representation is a simple approximation of the original Datum. It must
+ * be possible to compare datums of this representation with each other
+ * using the supplied alternative comparator, and have any non-zero return
+ * value be a reliable proxy for what a proper comparison would indicate.
+ * Returning zero from the alternative comparator does not indicate
+ * equality, as with a conventional support routine 1, though -- it
+ * indicates that it wasn't possible to determine how the two abbreviated
+ * values compared. A proper comparison, using "abbrev_full_comparator"/
+ * ApplySortAbbrevFullComparator() is therefore required. In many cases
+ * this results in most or all comparisons only using the cheap alternative
+ * comparison func, which is typically implemented as code that compiles to
+ * just a few CPU instructions. CPU cache miss penalties are expensive; to
+ * get good overall performance, sort infrastructure must heavily weigh
+ * cache performance.
+ *
+ * Opclass authors must consider the final cardinality of abbreviated keys
+ * when devising an encoding scheme. It's possible for a strategy to work
+ * better than an alternative strategy with one usage pattern, while the
+ * reverse might be true for another usage pattern. All of these factors
+ * must be considered.
*/
+
+ /*
+ * "abbreviate" concerns whether or not the abbreviated key optimization is
+ * applicable in principle (that is, the sortsupport routine needs to know
+ * if its dealing with a key where an abbreviated representation can
+ * usefully be packed together. Conventionally, this is the leading
+ * attribute key). Note, however, that in order to determine that
+ * abbreviation is not in play, the core code always checks whether or not
+ * the opclass has set abbrev_converter. This is a one way, one time
+ * message to the opclass.
+ */
+ bool abbreviate;
+
+ /*
+ * Converter to abbreviated format, from original representation. Core
+ * code uses this callback to convert from a pass-by-reference "original"
+ * Datum to a pass-by-value abbreviated key Datum. Note that original is
+ * guaranteed NOT NULL, because it doesn't make sense to factor NULLness
+ * into ad-hoc cost model.
+ *
+ * abbrev_converter is tested to see if abbreviation is in play. Core code
+ * may set it to NULL to indicate abbreviation should not be used (which is
+ * something sortsupport routines need not concern themselves with).
+ * However, sortsupport routines must not set it when it is immediately
+ * established that abbreviation should not proceed (for abbreviation
+ * calls, or platform-specific impediments to using abbreviation).
+ */
+ Datum (*abbrev_converter) (Datum original, SortSupport ssup);
+
+ /*
+ * abbrev_abort callback allows clients to verify that the current strategy
+ * is working out, using a sortsupport routine defined ad-hoc cost model.
+ * If there is a lot of duplicate abbreviated keys in practice, it's useful
+ * to be able to abandon the strategy before paying too high a cost in
+ * conversion (perhaps certain opclass-specific adaptations are useful
+ * too).
+ */
+ bool (*abbrev_abort) (int memtupcount, SortSupport ssup);
+
+ /*
+ * Full, authoritative comparator for key that an abbreviated
+ * representation was generated for, used when an abbreviated comparison
+ * was inconclusive (by calling ApplySortComparatorFull()), or used to
+ * replace "comparator" when core system ultimately decides against
+ * abbreviation.
+ */
+ int (*abbrev_full_comparator) (Datum x, Datum y, SortSupport ssup);
} SortSupportData;
@@ -110,6 +199,9 @@ typedef struct SortSupportData
extern int ApplySortComparator(Datum datum1, bool isNull1,
Datum datum2, bool isNull2,
SortSupport ssup);
+extern int ApplySortAbbrevFullComparator(Datum datum1, bool isNull1,
+ Datum datum2, bool isNull2,
+ SortSupport ssup);
#endif /* !PG_USE_INLINE */
#if defined(PG_USE_INLINE) || defined(SORTSUPPORT_INCLUDE_DEFINITIONS)
/*
@@ -148,6 +240,44 @@ ApplySortComparator(Datum datum1, bool isNull1,
return compare;
}
+
+/*
+ * Apply a sort comparator function and return a 3-way comparison using full,
+ * authoritative comparator. This takes care of handling reverse-sort and
+ * NULLs-ordering properly.
+ */
+STATIC_IF_INLINE int
+ApplySortAbbrevFullComparator(Datum datum1, bool isNull1,
+ Datum datum2, bool isNull2,
+ SortSupport ssup)
+{
+ int compare;
+
+ if (isNull1)
+ {
+ if (isNull2)
+ compare = 0; /* NULL "=" NULL */
+ else if (ssup->ssup_nulls_first)
+ compare = -1; /* NULL "<" NOT_NULL */
+ else
+ compare = 1; /* NULL ">" NOT_NULL */
+ }
+ else if (isNull2)
+ {
+ if (ssup->ssup_nulls_first)
+ compare = 1; /* NOT_NULL ">" NULL */
+ else
+ compare = -1; /* NOT_NULL "<" NULL */
+ }
+ else
+ {
+ compare = (*ssup->abbrev_full_comparator) (datum1, datum2, ssup);
+ if (ssup->ssup_reverse)
+ compare = -compare;
+ }
+
+ return compare;
+}
#endif /*-- PG_USE_INLINE || SORTSUPPORT_INCLUDE_DEFINITIONS */
/* Other functions in utils/sort/sortsupport.c */