aboutsummaryrefslogtreecommitdiff
path: root/src/interfaces/ecpg/preproc
diff options
context:
space:
mode:
authorTom Lane <tgl@sss.pgh.pa.us>2019-01-09 19:47:38 -0500
committerTom Lane <tgl@sss.pgh.pa.us>2019-01-09 19:47:46 -0500
commitc64d0cd5ce24a344798534f1bc5827a9199b7a6e (patch)
tree3968456d54c3f18d07976e5a139ca60589a8fbf0 /src/interfaces/ecpg/preproc
parent5d59a6c5eaff4a58322683e450e76a11d943d322 (diff)
downloadpostgresql-c64d0cd5ce24a344798534f1bc5827a9199b7a6e.tar.gz
postgresql-c64d0cd5ce24a344798534f1bc5827a9199b7a6e.zip
Use perfect hashing, instead of binary search, for keyword lookup.
We've been speculating for a long time that hash-based keyword lookup ought to be faster than binary search, but up to now we hadn't found a suitable tool for generating the hash function. Joerg Sonnenberger provided the inspiration, and sample code, to show us that rolling our own generator wasn't a ridiculous idea. Hence, do that. The method used here requires a lookup table of approximately 4 bytes per keyword, but that's less than what we saved in the predecessor commit afb0d0712, so it's not a big problem. The time savings is indeed significant: preliminary testing suggests that the total time for raw parsing (flex + bison phases) drops by ~20%. Patch by me, but it owes its existence to Joerg Sonnenberger; thanks also to John Naylor for review. Discussion: https://postgr.es/m/20190103163340.GA15803@britannica.bec.de
Diffstat (limited to 'src/interfaces/ecpg/preproc')
-rw-r--r--src/interfaces/ecpg/preproc/Makefile13
-rw-r--r--src/interfaces/ecpg/preproc/c_keywords.c51
-rw-r--r--src/interfaces/ecpg/preproc/c_kwlist.h3
-rw-r--r--src/interfaces/ecpg/preproc/ecpg_kwlist.h3
4 files changed, 34 insertions, 36 deletions
diff --git a/src/interfaces/ecpg/preproc/Makefile b/src/interfaces/ecpg/preproc/Makefile
index b5b74a3b81e..20e3b478746 100644
--- a/src/interfaces/ecpg/preproc/Makefile
+++ b/src/interfaces/ecpg/preproc/Makefile
@@ -28,7 +28,10 @@ OBJS= preproc.o pgc.o type.o ecpg.o output.o parser.o \
keywords.o c_keywords.o ecpg_keywords.o typename.o descriptor.o variable.o \
$(WIN32RES)
-GEN_KEYWORDLIST = $(top_srcdir)/src/tools/gen_keywordlist.pl
+# where to find gen_keywordlist.pl and subsidiary files
+TOOLSDIR = $(top_srcdir)/src/tools
+GEN_KEYWORDLIST = $(PERL) -I $(TOOLSDIR) $(TOOLSDIR)/gen_keywordlist.pl
+GEN_KEYWORDLIST_DEPS = $(TOOLSDIR)/gen_keywordlist.pl $(TOOLSDIR)/PerfectHash.pm
# Suppress parallel build to avoid a bug in GNU make 3.82
# (see comments in ../Makefile)
@@ -56,11 +59,11 @@ preproc.y: ../../../backend/parser/gram.y parse.pl ecpg.addons ecpg.header ecpg.
$(PERL) $(srcdir)/check_rules.pl $(srcdir) $<
# generate keyword headers
-c_kwlist_d.h: c_kwlist.h $(GEN_KEYWORDLIST)
- $(PERL) $(GEN_KEYWORDLIST) --varname ScanCKeywords $<
+c_kwlist_d.h: c_kwlist.h $(GEN_KEYWORDLIST_DEPS)
+ $(GEN_KEYWORDLIST) --varname ScanCKeywords --no-case-fold $<
-ecpg_kwlist_d.h: ecpg_kwlist.h $(GEN_KEYWORDLIST)
- $(PERL) $(GEN_KEYWORDLIST) --varname ScanECPGKeywords $<
+ecpg_kwlist_d.h: ecpg_kwlist.h $(GEN_KEYWORDLIST_DEPS)
+ $(GEN_KEYWORDLIST) --varname ScanECPGKeywords $<
# Force these dependencies to be known even without dependency info built:
ecpg_keywords.o c_keywords.o keywords.o preproc.o pgc.o parser.o: preproc.h
diff --git a/src/interfaces/ecpg/preproc/c_keywords.c b/src/interfaces/ecpg/preproc/c_keywords.c
index 38ddf6f1359..80aa7d5339c 100644
--- a/src/interfaces/ecpg/preproc/c_keywords.c
+++ b/src/interfaces/ecpg/preproc/c_keywords.c
@@ -9,8 +9,6 @@
*/
#include "postgres_fe.h"
-#include <ctype.h>
-
#include "preproc_extern.h"
#include "preproc.h"
@@ -32,39 +30,38 @@ static const uint16 ScanCKeywordTokens[] = {
*
* Returns the token value of the keyword, or -1 if no match.
*
- * Do a binary search using plain strcmp() comparison. This is much like
+ * Do a hash search using plain strcmp() comparison. This is much like
* ScanKeywordLookup(), except we want case-sensitive matching.
*/
int
-ScanCKeywordLookup(const char *text)
+ScanCKeywordLookup(const char *str)
{
- const char *kw_string;
- const uint16 *kw_offsets;
- const uint16 *low;
- const uint16 *high;
+ size_t len;
+ int h;
+ const char *kw;
+
+ /*
+ * Reject immediately if too long to be any keyword. This saves useless
+ * hashing work on long strings.
+ */
+ len = strlen(str);
+ if (len > ScanCKeywords.max_kw_len)
+ return -1;
- if (strlen(text) > ScanCKeywords.max_kw_len)
- return -1; /* too long to be any keyword */
+ /*
+ * Compute the hash function. Since it's a perfect hash, we need only
+ * match to the specific keyword it identifies.
+ */
+ h = ScanCKeywords_hash_func(str, len);
- kw_string = ScanCKeywords.kw_string;
- kw_offsets = ScanCKeywords.kw_offsets;
- low = kw_offsets;
- high = kw_offsets + (ScanCKeywords.num_keywords - 1);
+ /* An out-of-range result implies no match */
+ if (h < 0 || h >= ScanCKeywords.num_keywords)
+ return -1;
- while (low <= high)
- {
- const uint16 *middle;
- int difference;
+ kw = GetScanKeyword(h, &ScanCKeywords);
- middle = low + (high - low) / 2;
- difference = strcmp(kw_string + *middle, text);
- if (difference == 0)
- return ScanCKeywordTokens[middle - kw_offsets];
- else if (difference < 0)
- low = middle + 1;
- else
- high = middle - 1;
- }
+ if (strcmp(kw, str) == 0)
+ return ScanCKeywordTokens[h];
return -1;
}
diff --git a/src/interfaces/ecpg/preproc/c_kwlist.h b/src/interfaces/ecpg/preproc/c_kwlist.h
index 45455052982..610a4b1e053 100644
--- a/src/interfaces/ecpg/preproc/c_kwlist.h
+++ b/src/interfaces/ecpg/preproc/c_kwlist.h
@@ -20,8 +20,7 @@
/*
* List of (keyword-name, keyword-token-value) pairs.
*
- * !!WARNING!!: This list must be sorted by ASCII name, because binary
- * search is used to locate entries.
+ * Note: gen_keywordlist.pl requires the entries to appear in ASCII order.
*/
/* name, value */
diff --git a/src/interfaces/ecpg/preproc/ecpg_kwlist.h b/src/interfaces/ecpg/preproc/ecpg_kwlist.h
index 97ef254166d..bdd98549254 100644
--- a/src/interfaces/ecpg/preproc/ecpg_kwlist.h
+++ b/src/interfaces/ecpg/preproc/ecpg_kwlist.h
@@ -20,8 +20,7 @@
/*
* List of (keyword-name, keyword-token-value) pairs.
*
- * !!WARNING!!: This list must be sorted by ASCII name, because binary
- * search is used to locate entries.
+ * Note: gen_keywordlist.pl requires the entries to appear in ASCII order.
*/
/* name, value */