ecpg: re-implement preprocessor's string management.

Most productions in the preprocessor grammar construct strings representing SQL or C statements or fragments thereof. Instead of returning these as <str> results of the productions, return them as "location" values, taking advantage of Bison's flexibility about what a location is. We aren't really giving up anything thereby, since ecpg's error reports have always just given line numbers, and that's tracked separately. The advantage of this is that a single instance of the YYLLOC_DEFAULT macro can perform all the work needed by the vast majority of productions, including all the ones made automatically by parse.pl. This avoids having large numbers of effectively-identical productions, which tickles an optimization inefficiency in recent versions of clang. (This patch reduces the compilation time for preproc.o by more than 100-fold with clang 16, and is visibly helpful with gcc too.) The compiled parser is noticeably smaller as well.

A disadvantage of this approach is that YYLLOC_DEFAULT is applied before running the production's semantic action (if any). This means it cannot use the method favored by cat_str() of free'ing all the input strings; if the action needs to look at the input strings, it'd be looking at dangling storage. As this stands, therefore, it leaks memory like a sieve. This is already a big patch though, and fixing the memory management seems like a separable problem, so let's leave that for the next step. (This does remove some free() calls that I'd have had to touch anyway, in the expectation that the next step will manage memory reclamation quite differently.)

Most of the changes here are mindless substitution of "@N" for "$N" in grammar rules; see the changes to README.parser for an explanation.

Discussion: https://postgr.es/m/2011420.1713493114@sss.pgh.pa.us
author: Tom Lane <tgl@sss.pgh.pa.us> 2024-10-14 13:44:42 -0400
committer: Tom Lane <tgl@sss.pgh.pa.us> 2024-10-14 13:44:42 -0400
commit: a542d5614bdb6430094556162b9ca2f01d35f9dc (patch)
tree: a3171cd3c51af3c76b49c23876636dc601197bb0 /src/interfaces/ecpg/preproc/parser.c
parent: 6b005499447512abfa1e4add87dcf3e2859c4f9e (diff)
download: postgresql-a542d5614bdb6430094556162b9ca2f01d35f9dc.tar.gz
postgresql-a542d5614bdb6430094556162b9ca2f01d35f9dc.zip
1 files changed, 55 insertions, 3 deletions
diff --git a/src/interfaces/ecpg/preproc/parser.c b/src/interfaces/ecpg/preproc/parser.c
index 9daeee33034..ca0dead26d0 100644
--- a/src/interfaces/ecpg/preproc/parser.c
+++ b/src/interfaces/ecpg/preproc/parser.c
@@ -31,6 +31,7 @@ static YYSTYPE lookahead_yylval;	/* yylval for lookahead token */
 static YYLTYPE lookahead_yylloc;	/* yylloc for lookahead token */
 static char *lookahead_yytext;	/* start current token */
 
+static int	base_yylex_location(void);
 static bool check_uescapechar(unsigned char escape);
 static bool ecpg_isspace(char ch);
 
@@ -71,7 +72,7 @@ filtered_base_yylex(void)
 		have_lookahead = false;
 	}
 	else
-		cur_token = base_yylex();
+		cur_token = base_yylex_location();
 
 	/*
 	 * If this token isn't one that requires lookahead, just return it.
@@ -96,7 +97,7 @@ filtered_base_yylex(void)
 	cur_yytext = base_yytext;
 
 	/* Get next token, saving outputs into lookahead variables */
-	next_token = base_yylex();
+	next_token = base_yylex_location();
 
 	lookahead_token = next_token;
 	lookahead_yylval = base_yylval;
@@ -184,7 +185,7 @@ filtered_base_yylex(void)
 				cur_yytext = base_yytext;
 
 				/* Get third token */
-				next_token = base_yylex();
+				next_token = base_yylex_location();
 
 				if (next_token != SCONST)
 					mmerror(PARSE_ERROR, ET_ERROR, "UESCAPE must be followed by a simple string literal");
@@ -203,6 +204,7 @@ filtered_base_yylex(void)
 
 				/* Combine 3 tokens into 1 */
 				base_yylval.str = psprintf("%s UESCAPE %s", base_yylval.str, escstr);
+				base_yylloc = mm_strdup(base_yylval.str);
 
 				/* Clear have_lookahead, thereby consuming all three tokens */
 				have_lookahead = false;
@@ -219,6 +221,56 @@ filtered_base_yylex(void)
 }
 
 /*
+ * Call base_yylex() and fill in base_yylloc.
+ *
+ * pgc.l does not worry about setting yylloc, and given what we want for
+ * that, trying to set it there would be pretty inconvenient.  What we
+ * want is: if the returned token has type <str>, then duplicate its
+ * string value as yylloc; otherwise, make a downcased copy of yytext.
+ * The downcasing is ASCII-only because all that we care about there
+ * is producing uniformly-cased output of keywords.  (That's mostly
+ * cosmetic, but there are places in ecpglib that expect to receive
+ * downcased keywords, plus it keeps us regression-test-compatible
+ * with the pre-v18 implementation of ecpg.)
+ */
+static int
+base_yylex_location(void)
+{
+	int			token = base_yylex();
+
+	switch (token)
+	{
+			/* List a token here if pgc.l assigns to base_yylval.str for it */
+		case Op:
+		case CSTRING:
+		case CPP_LINE:
+		case CVARIABLE:
+		case BCONST:
+		case SCONST:
+		case USCONST:
+		case XCONST:
+		case FCONST:
+		case IDENT:
+		case UIDENT:
+		case IP:
+			/* Duplicate the <str> value */
+			base_yylloc = mm_strdup(base_yylval.str);
+			break;
+		default:
+			/* Else just use the input, i.e., yytext */
+			base_yylloc = mm_strdup(base_yytext);
+			/* Apply an ASCII-only downcasing */
+			for (unsigned char *ptr = (unsigned char *) base_yylloc; *ptr; ptr++)
+			{
+				if (*ptr >= 'A' && *ptr <= 'Z')
+					*ptr += 'a' - 'A';
+			}
+			break;
+	}
+	return token;
+}
+
+/*
  * check_uescapechar() and ecpg_isspace() should match their equivalents
  * in pgc.l.
  */
author	Tom Lane <tgl@sss.pgh.pa.us>	2024-10-14 13:44:42 -0400
committer	Tom Lane <tgl@sss.pgh.pa.us>	2024-10-14 13:44:42 -0400
commit	a542d5614bdb6430094556162b9ca2f01d35f9dc (patch)
tree	a3171cd3c51af3c76b49c23876636dc601197bb0 /src/interfaces/ecpg/preproc/parser.c
parent	6b005499447512abfa1e4add87dcf3e2859c4f9e (diff)
download	postgresql-a542d5614bdb6430094556162b9ca2f01d35f9dc.tar.gz postgresql-a542d5614bdb6430094556162b9ca2f01d35f9dc.zip