aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r-- src/backend/parser/gram.y | 10
-rw-r--r-- src/backend/parser/parser.c | 282
-rw-r--r-- src/backend/parser/scan.l | 414
-rw-r--r-- src/fe_utils/psqlscan.l | 124
-rw-r--r-- src/include/fe_utils/psqlscan_int.h | 1
-rw-r--r-- src/include/mb/pg_wchar.h | 22
-rw-r--r-- src/include/parser/kwlist.h | 1
-rw-r--r-- src/include/parser/scanner.h | 3
-rw-r--r-- src/interfaces/ecpg/preproc/ecpg.tokens | 1
-rw-r--r-- src/interfaces/ecpg/preproc/ecpg.trailer | 38
-rw-r--r-- src/interfaces/ecpg/preproc/ecpg.type | 6
-rw-r--r-- src/interfaces/ecpg/preproc/parse.pl | 4
-rw-r--r-- src/interfaces/ecpg/preproc/parser.c | 118
-rw-r--r-- src/interfaces/ecpg/preproc/pgc.l | 247
-rw-r--r-- src/interfaces/ecpg/test/expected/preproc-strings.c | 2
-rw-r--r-- src/interfaces/ecpg/test/expected/preproc-strings.stderr | 2
-rw-r--r-- src/pl/plpgsql/src/pl_gram.y | 2
-rw-r--r-- src/test/regress/expected/strings.out | 12
-rw-r--r-- src/test/regress/sql/strings.sql | 1
19 files changed, 671 insertions, 619 deletions
diff --git a/src/backend/parser/gram.y b/src/backend/parser/gram.y
index ad5be902b07..3806687ae37 100644
--- a/src/backend/parser/gram.y
+++ b/src/backend/parser/gram.y
@@ -598,10 +598,13 @@ static Node *makeRecursiveViewSelect(char *relname, List *aliases, Node *query);
* the set of keywords. PL/pgSQL depends on this so that it can share the
* same lexer. If you add/change tokens here, fix PL/pgSQL to match!
*
+ * UIDENT and USCONST are reduced to IDENT and SCONST in parser.c, so that
+ * they need no productions here; but we must assign token codes to them.
+ *
* DOT_DOT is unused in the core SQL grammar, and so will always provoke
* parse errors. It is needed by PL/pgSQL.
*/
-%token <str> IDENT FCONST SCONST BCONST XCONST Op
+%token <str> IDENT UIDENT FCONST SCONST USCONST BCONST XCONST Op
%token <ival> ICONST PARAM
%token TYPECAST DOT_DOT COLON_EQUALS EQUALS_GREATER
%token LESS_EQUALS GREATER_EQUALS NOT_EQUALS
@@ -691,8 +694,8 @@ static Node *makeRecursiveViewSelect(char *relname, List *aliases, Node *query);
TREAT TRIGGER TRIM TRUE_P
TRUNCATE TRUSTED TYPE_P TYPES_P
- UNBOUNDED UNCOMMITTED UNENCRYPTED UNION UNIQUE UNKNOWN UNLISTEN UNLOGGED
- UNTIL UPDATE USER USING
+ UESCAPE UNBOUNDED UNCOMMITTED UNENCRYPTED UNION UNIQUE UNKNOWN
+ UNLISTEN UNLOGGED UNTIL UPDATE USER USING
VACUUM VALID VALIDATE VALIDATOR VALUE_P VALUES VARCHAR VARIADIC VARYING
VERBOSE VERSION_P VIEW VIEWS VOLATILE
@@ -15374,6 +15377,7 @@ unreserved_keyword:
| TRUSTED
| TYPE_P
| TYPES_P
+ | UESCAPE
| UNBOUNDED
| UNCOMMITTED
| UNENCRYPTED
diff --git a/src/backend/parser/parser.c b/src/backend/parser/parser.c
index bc3f812da8e..1bf1144c4fd 100644
--- a/src/backend/parser/parser.c
+++ b/src/backend/parser/parser.c
@@ -21,8 +21,14 @@
#include "postgres.h"
+#include "mb/pg_wchar.h"
#include "parser/gramparse.h"
#include "parser/parser.h"
+#include "parser/scansup.h"
+
+static bool check_uescapechar(unsigned char escape);
+static char *str_udeescape(const char *str, char escape,
+ int position, core_yyscan_t yyscanner);
/*
@@ -75,6 +81,10 @@ raw_parser(const char *str)
* scanner backtrack, which would cost more performance than this filter
* layer does.
*
+ * We also use this filter to convert UIDENT and USCONST sequences into
+ * plain IDENT and SCONST tokens. While that could be handled by additional
+ * productions in the main grammar, it's more efficient to do it like this.
+ *
* The filter also provides a convenient place to translate between
* the core_YYSTYPE and YYSTYPE representations (which are really the
* same thing anyway, but notationally they're different).
@@ -104,7 +114,7 @@ base_yylex(YYSTYPE *lvalp, YYLTYPE *llocp, core_yyscan_t yyscanner)
* If this token isn't one that requires lookahead, just return it. If it
* does, determine the token length. (We could get that via strlen(), but
* since we have such a small set of possibilities, hardwiring seems
- * feasible and more efficient.)
+ * feasible and more efficient --- at least for the fixed-length cases.)
*/
switch (cur_token)
{
@@ -117,6 +127,10 @@ base_yylex(YYSTYPE *lvalp, YYLTYPE *llocp, core_yyscan_t yyscanner)
case WITH:
cur_token_length = 4;
break;
+ case UIDENT:
+ case USCONST:
+ cur_token_length = strlen(yyextra->core_yy_extra.scanbuf + *llocp);
+ break;
default:
return cur_token;
}
@@ -190,7 +204,273 @@ base_yylex(YYSTYPE *lvalp, YYLTYPE *llocp, core_yyscan_t yyscanner)
break;
}
break;
+
+ case UIDENT:
+ case USCONST:
+ /* Look ahead for UESCAPE */
+ if (next_token == UESCAPE)
+ {
+ /* Yup, so get third token, which had better be SCONST */
+ const char *escstr;
+
+ /* Again save and restore *llocp */
+ cur_yylloc = *llocp;
+
+ /* Un-truncate current token so errors point to third token */
+ *(yyextra->lookahead_end) = yyextra->lookahead_hold_char;
+
+ /* Get third token */
+ next_token = core_yylex(&(yyextra->lookahead_yylval),
+ llocp, yyscanner);
+
+ /* If we throw error here, it will point to third token */
+ if (next_token != SCONST)
+ scanner_yyerror("UESCAPE must be followed by a simple string literal",
+ yyscanner);
+
+ escstr = yyextra->lookahead_yylval.str;
+ if (strlen(escstr) != 1 || !check_uescapechar(escstr[0]))
+ scanner_yyerror("invalid Unicode escape character",
+ yyscanner);
+
+ /* Now restore *llocp; errors will point to first token */
+ *llocp = cur_yylloc;
+
+ /* Apply Unicode conversion */
+ lvalp->core_yystype.str =
+ str_udeescape(lvalp->core_yystype.str,
+ escstr[0],
+ *llocp,
+ yyscanner);
+
+ /*
+ * We don't need to revert the un-truncation of UESCAPE. What
+ * we do want to do is clear have_lookahead, thereby consuming
+ * all three tokens.
+ */
+ yyextra->have_lookahead = false;
+ }
+ else
+ {
+ /* No UESCAPE, so convert using default escape character */
+ lvalp->core_yystype.str =
+ str_udeescape(lvalp->core_yystype.str,
+ '\\',
+ *llocp,
+ yyscanner);
+ }
+
+ if (cur_token == UIDENT)
+ {
+ /* It's an identifier, so truncate as appropriate */
+ truncate_identifier(lvalp->core_yystype.str,
+ strlen(lvalp->core_yystype.str),
+ true);
+ cur_token = IDENT;
+ }
+ else if (cur_token == USCONST)
+ {
+ cur_token = SCONST;
+ }
+ break;
}
return cur_token;
}
+
+/* convert hex digit c to its value 0..15; any other character reports ERROR via elog, so the caller should have verified it is a hex digit */
+static unsigned int
+hexval(unsigned char c)
+{
+ if (c >= '0' && c <= '9') /* decimal digit: value 0..9 */
+ return c - '0';
+ if (c >= 'a' && c <= 'f') /* lower-case letter: value 10..15 */
+ return c - 'a' + 0xA;
+ if (c >= 'A' && c <= 'F') /* upper-case letter: value 10..15 */
+ return c - 'A' + 0xA;
+ elog(ERROR, "invalid hexadecimal digit");
+ return 0; /* not reached */
+}
+
+/* Validate Unicode code point c; returns normally if acceptable, otherwise reports a syntax error whose cursor is derived from pos via scanner_errposition */
+static void
+check_unicode_value(pg_wchar c, int pos, core_yyscan_t yyscanner)
+{
+ /* See also addunicode() in scan.l */
+ if (c == 0 || c > 0x10FFFF) /* zero and anything above 0x10FFFF are always rejected */
+ ereport(ERROR,
+ (errcode(ERRCODE_SYNTAX_ERROR),
+ errmsg("invalid Unicode escape value"),
+ scanner_errposition(pos, yyscanner)));
+
+ if (c > 0x7F && GetDatabaseEncoding() != PG_UTF8) /* values above 0x7F are accepted only when the encoding is PG_UTF8 */
+ ereport(ERROR,
+ (errcode(ERRCODE_SYNTAX_ERROR),
+ errmsg("Unicode escape values cannot be used for code point values above 007F when the server encoding is not UTF8"),
+ scanner_errposition(pos, yyscanner)));
+}
+
+/* is escape acceptable as Unicode escape character (UESCAPE syntax) ? Returns false for a hex digit, plus sign, single quote, double quote, or scanner whitespace; true for anything else */
+static bool
+check_uescapechar(unsigned char escape)
+{
+ if (isxdigit(escape) /* presumably rejected because hex digits and the plus sign occur inside escape sequences themselves */
+ || escape == '+'
+ || escape == '\''
+ || escape == '"'
+ || scanner_isspace(escape))
+ return false;
+ else
+ return true;
+}
+
+/*
+ * Process Unicode escapes in "str", producing a palloc'd plain string
+ * (forms handled: doubled escape char, escape + 4 hex digits, escape + plus sign + 6 hex digits)
+ * escape: the escape character to use
+ * position: start position of U&'' or U&"" string token (used here only to compute error cursor positions)
+ * yyscanner: context information needed for error reports (escape-syntax failures are raised with ereport(ERROR))
+ */
+static char *
+str_udeescape(const char *str, char escape,
+ int position, core_yyscan_t yyscanner)
+{
+ const char *in; /* read cursor within str */
+ char *new, /* start of the result buffer */
+ *out; /* write cursor within the result buffer */
+ pg_wchar pair_first = 0; /* pending first surrogate awaiting its partner, or 0 if none */
+
+ /*
+ * This relies on the subtle assumption that a UTF-8 expansion cannot be
+ * longer than its escaped representation.
+ */
+ new = palloc(strlen(str) + 1);
+
+ in = str;
+ out = new;
+ while (*in)
+ {
+ if (in[0] == escape)
+ {
+ if (in[1] == escape) /* doubled escape char stands for one literal escape char */
+ {
+ if (pair_first) /* a literal may not follow an unpaired first surrogate */
+ goto invalid_pair;
+ *out++ = escape;
+ in += 2;
+ }
+ else if (isxdigit((unsigned char) in[1]) && /* escape followed by 4 hex digits */
+ isxdigit((unsigned char) in[2]) &&
+ isxdigit((unsigned char) in[3]) &&
+ isxdigit((unsigned char) in[4]))
+ {
+ pg_wchar unicode;
+
+ unicode = (hexval(in[1]) << 12) +
+ (hexval(in[2]) << 8) +
+ (hexval(in[3]) << 4) +
+ hexval(in[4]);
+ check_unicode_value(unicode,
+ in - str + position + 3, /* 3 for U&" */
+ yyscanner);
+ if (pair_first)
+ {
+ if (is_utf16_surrogate_second(unicode)) /* pair is complete: merge both halves into one code point */
+ {
+ unicode = surrogate_pair_to_codepoint(pair_first, unicode);
+ pair_first = 0;
+ }
+ else
+ goto invalid_pair;
+ }
+ else if (is_utf16_surrogate_second(unicode)) /* second surrogate with no first before it */
+ goto invalid_pair;
+
+ if (is_utf16_surrogate_first(unicode)) /* emit nothing yet; remember it until the next escape */
+ pair_first = unicode;
+ else
+ {
+ unicode_to_utf8(unicode, (unsigned char *) out);
+ out += pg_mblen(out); /* advance by the length pg_mblen reports for the character just stored */
+ }
+ in += 5; /* escape char plus 4 hex digits */
+ }
+ else if (in[1] == '+' && /* escape, plus sign, then 6 hex digits */
+ isxdigit((unsigned char) in[2]) &&
+ isxdigit((unsigned char) in[3]) &&
+ isxdigit((unsigned char) in[4]) &&
+ isxdigit((unsigned char) in[5]) &&
+ isxdigit((unsigned char) in[6]) &&
+ isxdigit((unsigned char) in[7]))
+ {
+ pg_wchar unicode;
+
+ unicode = (hexval(in[2]) << 20) +
+ (hexval(in[3]) << 16) +
+ (hexval(in[4]) << 12) +
+ (hexval(in[5]) << 8) +
+ (hexval(in[6]) << 4) +
+ hexval(in[7]);
+ check_unicode_value(unicode,
+ in - str + position + 3, /* 3 for U&" */
+ yyscanner);
+ if (pair_first) /* same surrogate handling as the 4-digit form */
+ {
+ if (is_utf16_surrogate_second(unicode))
+ {
+ unicode = surrogate_pair_to_codepoint(pair_first, unicode);
+ pair_first = 0;
+ }
+ else
+ goto invalid_pair;
+ }
+ else if (is_utf16_surrogate_second(unicode))
+ goto invalid_pair;
+
+ if (is_utf16_surrogate_first(unicode))
+ pair_first = unicode;
+ else
+ {
+ unicode_to_utf8(unicode, (unsigned char *) out);
+ out += pg_mblen(out);
+ }
+ in += 8; /* escape char, plus sign, and 6 hex digits */
+ }
+ else /* escape char not followed by any recognized form */
+ ereport(ERROR,
+ (errcode(ERRCODE_SYNTAX_ERROR),
+ errmsg("invalid Unicode escape value"),
+ scanner_errposition(in - str + position + 3, /* 3 for U&" */
+ yyscanner)));
+ }
+ else
+ {
+ if (pair_first) /* ordinary character may not follow an unpaired first surrogate */
+ goto invalid_pair;
+
+ *out++ = *in++; /* copy ordinary character unchanged */
+ }
+ }
+
+ /* unfinished surrogate pair? */
+ if (pair_first)
+ goto invalid_pair;
+
+ *out = '\0';
+
+ /*
+ * We could skip pg_verifymbstr if we didn't process any non-7-bit-ASCII
+ * codes; but it's probably not worth the trouble, since this isn't likely
+ * to be a performance-critical path.
+ */
+ pg_verifymbstr(new, out - new, false);
+ return new;
+
+invalid_pair: /* shared error exit for every surrogate-pairing violation; cursor is computed from the current value of in */
+ ereport(ERROR,
+ (errcode(ERRCODE_SYNTAX_ERROR),
+ errmsg("invalid Unicode surrogate pair"),
+ scanner_errposition(in - str + position + 3, /* 3 for U&" */
+ yyscanner)));
+ return NULL; /* keep compiler quiet */
+}
diff --git a/src/backend/parser/scan.l b/src/backend/parser/scan.l
index e25e12e4614..84c73914a85 100644
--- a/src/backend/parser/scan.l
+++ b/src/backend/parser/scan.l
@@ -13,8 +13,8 @@
* in the sense that there is always a rule that can match the input
* consumed so far (the rule action may internally throw back some input
* with yyless(), however). As explained in the flex manual, this makes
- * for a useful speed increase --- about a third faster than a plain -CF
- * lexer, in simple testing. The extra complexity is mostly in the rules
+ * for a useful speed increase --- several percent faster when measuring
+ * raw parsing (Flex + Bison). The extra complexity is mostly in the rules
* for handling float numbers and continued string literals. If you change
* the lexical rules, verify that you haven't broken the no-backtrack
* property by running flex with the "-b" option and checking that the
@@ -110,14 +110,9 @@ const uint16 ScanKeywordTokens[] = {
static void addlit(char *ytext, int yleng, core_yyscan_t yyscanner);
static void addlitchar(unsigned char ychar, core_yyscan_t yyscanner);
static char *litbufdup(core_yyscan_t yyscanner);
-static char *litbuf_udeescape(unsigned char escape, core_yyscan_t yyscanner);
static unsigned char unescape_single_char(unsigned char c, core_yyscan_t yyscanner);
static int process_integer_literal(const char *token, YYSTYPE *lval);
-static bool is_utf16_surrogate_first(pg_wchar c);
-static bool is_utf16_surrogate_second(pg_wchar c);
-static pg_wchar surrogate_pair_to_codepoint(pg_wchar first, pg_wchar second);
static void addunicode(pg_wchar c, yyscan_t yyscanner);
-static bool check_uescapechar(unsigned char escape);
#define yyerror(msg) scanner_yyerror(msg, yyscanner)
@@ -168,12 +163,11 @@ extern void core_yyset_column(int column_no, yyscan_t yyscanner);
* <xd> delimited identifiers (double-quoted identifiers)
* <xh> hexadecimal numeric string
* <xq> standard quoted strings
+ * <xqs> quote stop (detect continued strings)
* <xe> extended quoted strings (support backslash escape sequences)
* <xdolq> $foo$ quoted strings
* <xui> quoted identifier with Unicode escapes
- * <xuiend> end of a quoted identifier with Unicode escapes, UESCAPE can follow
* <xus> quoted string with Unicode escapes
- * <xusend> end of a quoted string with Unicode escapes, UESCAPE can follow
* <xeu> Unicode surrogate pair in extended quoted string
*
* Remember to add an <<EOF>> case whenever you add a new exclusive state!
@@ -185,12 +179,11 @@ extern void core_yyset_column(int column_no, yyscan_t yyscanner);
%x xd
%x xh
%x xq
+%x xqs
%x xe
%x xdolq
%x xui
-%x xuiend
%x xus
-%x xusend
%x xeu
/*
@@ -231,19 +224,18 @@ special_whitespace ({space}+|{comment}{newline})
horiz_whitespace ({horiz_space}|{comment})
whitespace_with_newline ({horiz_whitespace}*{newline}{special_whitespace}*)
+quote '
+/* If we see {quote} then {quotecontinue}, the quoted string continues */
+quotecontinue {whitespace_with_newline}{quote}
+
/*
- * To ensure that {quotecontinue} can be scanned without having to back up
- * if the full pattern isn't matched, we include trailing whitespace in
- * {quotestop}. This matches all cases where {quotecontinue} fails to match,
- * except for {quote} followed by whitespace and just one "-" (not two,
- * which would start a {comment}). To cover that we have {quotefail}.
- * The actions for {quotestop} and {quotefail} must throw back characters
- * beyond the quote proper.
+ * {quotecontinuefail} is needed to avoid lexer backup when we fail to match
+ * {quotecontinue}. It might seem that this could just be {whitespace}*,
+ * but if there's a dash after {whitespace_with_newline}, it must be consumed
+ * to see if there's another dash --- which would start a {comment} and thus
+ * allow continuation of the {quotecontinue} token.
*/
-quote '
-quotestop {quote}{whitespace}*
-quotecontinue {quote}{whitespace_with_newline}{quote}
-quotefail {quote}{whitespace}*"-"
+quotecontinuefail {whitespace}*"-"?
/* Bit string
* It is tempting to scan the string for only those characters
@@ -304,21 +296,12 @@ xdstop {dquote}
xddouble {dquote}{dquote}
xdinside [^"]+
-/* Unicode escapes */
-uescape [uU][eE][sS][cC][aA][pP][eE]{whitespace}*{quote}[^']{quote}
-/* error rule to avoid backup */
-uescapefail [uU][eE][sS][cC][aA][pP][eE]{whitespace}*"-"|[uU][eE][sS][cC][aA][pP][eE]{whitespace}*{quote}[^']|[uU][eE][sS][cC][aA][pP][eE]{whitespace}*{quote}|[uU][eE][sS][cC][aA][pP][eE]{whitespace}*|[uU][eE][sS][cC][aA][pP]|[uU][eE][sS][cC][aA]|[uU][eE][sS][cC]|[uU][eE][sS]|[uU][eE]|[uU]
-
/* Quoted identifier with Unicode escapes */
xuistart [uU]&{dquote}
/* Quoted string with Unicode escapes */
xusstart [uU]&{quote}
-/* Optional UESCAPE after a quoted string or identifier with Unicode escapes. */
-xustop1 {uescapefail}?
-xustop2 {uescape}
-
/* error rule to avoid backup */
xufailed [uU]&
@@ -476,21 +459,10 @@ other .
startlit();
addlitchar('b', yyscanner);
}
-<xb>{quotestop} |
-<xb>{quotefail} {
- yyless(1);
- BEGIN(INITIAL);
- yylval->str = litbufdup(yyscanner);
- return BCONST;
- }
<xh>{xhinside} |
<xb>{xbinside} {
addlit(yytext, yyleng, yyscanner);
}
-<xh>{quotecontinue} |
-<xb>{quotecontinue} {
- /* ignore */
- }
<xb><<EOF>> { yyerror("unterminated bit string literal"); }
{xhstart} {
@@ -505,13 +477,6 @@ other .
startlit();
addlitchar('x', yyscanner);
}
-<xh>{quotestop} |
-<xh>{quotefail} {
- yyless(1);
- BEGIN(INITIAL);
- yylval->str = litbufdup(yyscanner);
- return XCONST;
- }
<xh><<EOF>> { yyerror("unterminated hexadecimal string literal"); }
{xnstart} {
@@ -568,53 +533,66 @@ other .
BEGIN(xus);
startlit();
}
-<xq,xe>{quotestop} |
-<xq,xe>{quotefail} {
- yyless(1);
- BEGIN(INITIAL);
+
+<xb,xh,xq,xe,xus>{quote} {
/*
- * check that the data remains valid if it might have been
- * made invalid by unescaping any chars.
+ * When we are scanning a quoted string and see an end
+ * quote, we must look ahead for a possible continuation.
+ * If we don't see one, we know the end quote was in fact
+ * the end of the string. To reduce the lexer table size,
+ * we use a single "xqs" state to do the lookahead for all
+ * types of strings.
*/
- if (yyextra->saw_non_ascii)
- pg_verifymbstr(yyextra->literalbuf,
- yyextra->literallen,
- false);
- yylval->str = litbufdup(yyscanner);
- return SCONST;
- }
-<xus>{quotestop} |
-<xus>{quotefail} {
- /* throw back all but the quote */
- yyless(1);
- /* xusend state looks for possible UESCAPE */
- BEGIN(xusend);
+ yyextra->state_before_str_stop = YYSTATE;
+ BEGIN(xqs);
}
-<xusend>{whitespace} {
- /* stay in xusend state over whitespace */
+<xqs>{quotecontinue} {
+ /*
+ * Found a quote continuation, so return to the in-quote
+ * state and continue scanning the literal. Nothing is
+ * added to the literal's contents.
+ */
+ BEGIN(yyextra->state_before_str_stop);
}
-<xusend><<EOF>> |
-<xusend>{other} |
-<xusend>{xustop1} {
- /* no UESCAPE after the quote, throw back everything */
+<xqs>{quotecontinuefail} |
+<xqs>{other} |
+<xqs><<EOF>> {
+ /*
+ * Failed to see a quote continuation. Throw back
+ * everything after the end quote, and handle the string
+ * according to the state we were in previously.
+ */
yyless(0);
BEGIN(INITIAL);
- yylval->str = litbuf_udeescape('\\', yyscanner);
- return SCONST;
- }
-<xusend>{xustop2} {
- /* found UESCAPE after the end quote */
- BEGIN(INITIAL);
- if (!check_uescapechar(yytext[yyleng - 2]))
+
+ switch (yyextra->state_before_str_stop)
{
- SET_YYLLOC();
- ADVANCE_YYLLOC(yyleng - 2);
- yyerror("invalid Unicode escape character");
+ case xb:
+ yylval->str = litbufdup(yyscanner);
+ return BCONST;
+ case xh:
+ yylval->str = litbufdup(yyscanner);
+ return XCONST;
+ case xq:
+ case xe:
+ /*
+ * Check that the data remains valid, if it might
+ * have been made invalid by unescaping any chars.
+ */
+ if (yyextra->saw_non_ascii)
+ pg_verifymbstr(yyextra->literalbuf,
+ yyextra->literallen,
+ false);
+ yylval->str = litbufdup(yyscanner);
+ return SCONST;
+ case xus:
+ yylval->str = litbufdup(yyscanner);
+ return USCONST;
+ default:
+ yyerror("unhandled previous state in xqs");
}
- yylval->str = litbuf_udeescape(yytext[yyleng - 2],
- yyscanner);
- return SCONST;
}
+
<xq,xe,xus>{xqdouble} {
addlitchar('\'', yyscanner);
}
@@ -693,9 +671,6 @@ other .
if (c == '\0' || IS_HIGHBIT_SET(c))
yyextra->saw_non_ascii = true;
}
-<xq,xe,xus>{quotecontinue} {
- /* ignore */
- }
<xe>. {
/* This is only needed for \ just before EOF */
addlitchar(yytext[0], yyscanner);
@@ -769,53 +744,13 @@ other .
yylval->str = ident;
return IDENT;
}
-<xui>{dquote} {
- yyless(1);
- /* xuiend state looks for possible UESCAPE */
- BEGIN(xuiend);
- }
-<xuiend>{whitespace} {
- /* stay in xuiend state over whitespace */
- }
-<xuiend><<EOF>> |
-<xuiend>{other} |
-<xuiend>{xustop1} {
- /* no UESCAPE after the quote, throw back everything */
- char *ident;
- int identlen;
-
- yyless(0);
-
- BEGIN(INITIAL);
- if (yyextra->literallen == 0)
- yyerror("zero-length delimited identifier");
- ident = litbuf_udeescape('\\', yyscanner);
- identlen = strlen(ident);
- if (identlen >= NAMEDATALEN)
- truncate_identifier(ident, identlen, true);
- yylval->str = ident;
- return IDENT;
- }
-<xuiend>{xustop2} {
- /* found UESCAPE after the end quote */
- char *ident;
- int identlen;
-
+<xui>{dquote} {
BEGIN(INITIAL);
if (yyextra->literallen == 0)
yyerror("zero-length delimited identifier");
- if (!check_uescapechar(yytext[yyleng - 2]))
- {
- SET_YYLLOC();
- ADVANCE_YYLLOC(yyleng - 2);
- yyerror("invalid Unicode escape character");
- }
- ident = litbuf_udeescape(yytext[yyleng - 2], yyscanner);
- identlen = strlen(ident);
- if (identlen >= NAMEDATALEN)
- truncate_identifier(ident, identlen, true);
- yylval->str = ident;
- return IDENT;
+ /* can't truncate till after we de-escape the ident */
+ yylval->str = litbufdup(yyscanner);
+ return UIDENT;
}
<xd,xui>{xddouble} {
addlitchar('"', yyscanner);
@@ -1288,55 +1223,12 @@ process_integer_literal(const char *token, YYSTYPE *lval)
return ICONST;
}
-static unsigned int
-hexval(unsigned char c)
-{
- if (c >= '0' && c <= '9')
- return c - '0';
- if (c >= 'a' && c <= 'f')
- return c - 'a' + 0xA;
- if (c >= 'A' && c <= 'F')
- return c - 'A' + 0xA;
- elog(ERROR, "invalid hexadecimal digit");
- return 0; /* not reached */
-}
-
-static void
-check_unicode_value(pg_wchar c, char *loc, core_yyscan_t yyscanner)
-{
- if (GetDatabaseEncoding() == PG_UTF8)
- return;
-
- if (c > 0x7F)
- {
- ADVANCE_YYLLOC(loc - yyextra->literalbuf + 3); /* 3 for U&" */
- yyerror("Unicode escape values cannot be used for code point values above 007F when the server encoding is not UTF8");
- }
-}
-
-static bool
-is_utf16_surrogate_first(pg_wchar c)
-{
- return (c >= 0xD800 && c <= 0xDBFF);
-}
-
-static bool
-is_utf16_surrogate_second(pg_wchar c)
-{
- return (c >= 0xDC00 && c <= 0xDFFF);
-}
-
-static pg_wchar
-surrogate_pair_to_codepoint(pg_wchar first, pg_wchar second)
-{
- return ((first & 0x3FF) << 10) + 0x10000 + (second & 0x3FF);
-}
-
static void
addunicode(pg_wchar c, core_yyscan_t yyscanner)
{
char buf[8];
+ /* See also check_unicode_value() in parser.c */
if (c == 0 || c > 0x10FFFF)
yyerror("invalid Unicode escape value");
if (c > 0x7F)
@@ -1349,172 +1241,6 @@ addunicode(pg_wchar c, core_yyscan_t yyscanner)
addlit(buf, pg_mblen(buf), yyscanner);
}
-/* is 'escape' acceptable as Unicode escape character (UESCAPE syntax) ? */
-static bool
-check_uescapechar(unsigned char escape)
-{
- if (isxdigit(escape)
- || escape == '+'
- || escape == '\''
- || escape == '"'
- || scanner_isspace(escape))
- {
- return false;
- }
- else
- return true;
-}
-
-/* like litbufdup, but handle unicode escapes */
-static char *
-litbuf_udeescape(unsigned char escape, core_yyscan_t yyscanner)
-{
- char *new;
- char *litbuf,
- *in,
- *out;
- pg_wchar pair_first = 0;
-
- /* Make literalbuf null-terminated to simplify the scanning loop */
- litbuf = yyextra->literalbuf;
- litbuf[yyextra->literallen] = '\0';
-
- /*
- * This relies on the subtle assumption that a UTF-8 expansion cannot be
- * longer than its escaped representation.
- */
- new = palloc(yyextra->literallen + 1);
-
- in = litbuf;
- out = new;
- while (*in)
- {
- if (in[0] == escape)
- {
- if (in[1] == escape)
- {
- if (pair_first)
- {
- ADVANCE_YYLLOC(in - litbuf + 3); /* 3 for U&" */
- yyerror("invalid Unicode surrogate pair");
- }
- *out++ = escape;
- in += 2;
- }
- else if (isxdigit((unsigned char) in[1]) &&
- isxdigit((unsigned char) in[2]) &&
- isxdigit((unsigned char) in[3]) &&
- isxdigit((unsigned char) in[4]))
- {
- pg_wchar unicode;
-
- unicode = (hexval(in[1]) << 12) +
- (hexval(in[2]) << 8) +
- (hexval(in[3]) << 4) +
- hexval(in[4]);
- check_unicode_value(unicode, in, yyscanner);
- if (pair_first)
- {
- if (is_utf16_surrogate_second(unicode))
- {
- unicode = surrogate_pair_to_codepoint(pair_first, unicode);
- pair_first = 0;
- }
- else
- {
- ADVANCE_YYLLOC(in - litbuf + 3); /* 3 for U&" */
- yyerror("invalid Unicode surrogate pair");
- }
- }
- else if (is_utf16_surrogate_second(unicode))
- yyerror("invalid Unicode surrogate pair");
-
- if (is_utf16_surrogate_first(unicode))
- pair_first = unicode;
- else
- {
- unicode_to_utf8(unicode, (unsigned char *) out);
- out += pg_mblen(out);
- }
- in += 5;
- }
- else if (in[1] == '+' &&
- isxdigit((unsigned char) in[2]) &&
- isxdigit((unsigned char) in[3]) &&
- isxdigit((unsigned char) in[4]) &&
- isxdigit((unsigned char) in[5]) &&
- isxdigit((unsigned char) in[6]) &&
- isxdigit((unsigned char) in[7]))
- {
- pg_wchar unicode;
-
- unicode = (hexval(in[2]) << 20) +
- (hexval(in[3]) << 16) +
- (hexval(in[4]) << 12) +
- (hexval(in[5]) << 8) +
- (hexval(in[6]) << 4) +
- hexval(in[7]);
- check_unicode_value(unicode, in, yyscanner);
- if (pair_first)
- {
- if (is_utf16_surrogate_second(unicode))
- {
- unicode = surrogate_pair_to_codepoint(pair_first, unicode);
- pair_first = 0;
- }
- else
- {
- ADVANCE_YYLLOC(in - litbuf + 3); /* 3 for U&" */
- yyerror("invalid Unicode surrogate pair");
- }
- }
- else if (is_utf16_surrogate_second(unicode))
- yyerror("invalid Unicode surrogate pair");
-
- if (is_utf16_surrogate_first(unicode))
- pair_first = unicode;
- else
- {
- unicode_to_utf8(unicode, (unsigned char *) out);
- out += pg_mblen(out);
- }
- in += 8;
- }
- else
- {
- ADVANCE_YYLLOC(in - litbuf + 3); /* 3 for U&" */
- yyerror("invalid Unicode escape value");
- }
- }
- else
- {
- if (pair_first)
- {
- ADVANCE_YYLLOC(in - litbuf + 3); /* 3 for U&" */
- yyerror("invalid Unicode surrogate pair");
- }
- *out++ = *in++;
- }
- }
-
- /* unfinished surrogate pair? */
- if (pair_first)
- {
- ADVANCE_YYLLOC(in - litbuf + 3); /* 3 for U&" */
- yyerror("invalid Unicode surrogate pair");
- }
-
- *out = '\0';
-
- /*
- * We could skip pg_verifymbstr if we didn't process any non-7-bit-ASCII
- * codes; but it's probably not worth the trouble, since this isn't likely
- * to be a performance-critical path.
- */
- pg_verifymbstr(new, out - new, false);
- return new;
-}
-
static unsigned char
unescape_single_char(unsigned char c, core_yyscan_t yyscanner)
{
diff --git a/src/fe_utils/psqlscan.l b/src/fe_utils/psqlscan.l
index 02cb356f343..08dffde1ba0 100644
--- a/src/fe_utils/psqlscan.l
+++ b/src/fe_utils/psqlscan.l
@@ -114,12 +114,11 @@ extern void psql_yyset_column(int column_no, yyscan_t yyscanner);
* <xd> delimited identifiers (double-quoted identifiers)
* <xh> hexadecimal numeric string
* <xq> standard quoted strings
+ * <xqs> quote stop (detect continued strings)
* <xe> extended quoted strings (support backslash escape sequences)
* <xdolq> $foo$ quoted strings
* <xui> quoted identifier with Unicode escapes
- * <xuiend> end of a quoted identifier with Unicode escapes, UESCAPE can follow
* <xus> quoted string with Unicode escapes
- * <xusend> end of a quoted string with Unicode escapes, UESCAPE can follow
*
* Note: we intentionally don't mimic the backend's <xeu> state; we have
* no need to distinguish it from <xe> state, and no good way to get out
@@ -132,12 +131,11 @@ extern void psql_yyset_column(int column_no, yyscan_t yyscanner);
%x xd
%x xh
%x xq
+%x xqs
%x xe
%x xdolq
%x xui
-%x xuiend
%x xus
-%x xusend
/*
* In order to make the world safe for Windows and Mac clients as well as
@@ -177,19 +175,18 @@ special_whitespace ({space}+|{comment}{newline})
horiz_whitespace ({horiz_space}|{comment})
whitespace_with_newline ({horiz_whitespace}*{newline}{special_whitespace}*)
+quote '
+/* If we see {quote} then {quotecontinue}, the quoted string continues */
+quotecontinue {whitespace_with_newline}{quote}
+
/*
- * To ensure that {quotecontinue} can be scanned without having to back up
- * if the full pattern isn't matched, we include trailing whitespace in
- * {quotestop}. This matches all cases where {quotecontinue} fails to match,
- * except for {quote} followed by whitespace and just one "-" (not two,
- * which would start a {comment}). To cover that we have {quotefail}.
- * The actions for {quotestop} and {quotefail} must throw back characters
- * beyond the quote proper.
+ * {quotecontinuefail} is needed to avoid lexer backup when we fail to match
+ * {quotecontinue}. It might seem that this could just be {whitespace}*,
+ * but if there's a dash after {whitespace_with_newline}, it must be consumed
+ * to see if there's another dash --- which would start a {comment} and thus
+ * allow continuation of the {quotecontinue} token.
*/
-quote '
-quotestop {quote}{whitespace}*
-quotecontinue {quote}{whitespace_with_newline}{quote}
-quotefail {quote}{whitespace}*"-"
+quotecontinuefail {whitespace}*"-"?
/* Bit string
* It is tempting to scan the string for only those characters
@@ -250,21 +247,12 @@ xdstop {dquote}
xddouble {dquote}{dquote}
xdinside [^"]+
-/* Unicode escapes */
-uescape [uU][eE][sS][cC][aA][pP][eE]{whitespace}*{quote}[^']{quote}
-/* error rule to avoid backup */
-uescapefail [uU][eE][sS][cC][aA][pP][eE]{whitespace}*"-"|[uU][eE][sS][cC][aA][pP][eE]{whitespace}*{quote}[^']|[uU][eE][sS][cC][aA][pP][eE]{whitespace}*{quote}|[uU][eE][sS][cC][aA][pP][eE]{whitespace}*|[uU][eE][sS][cC][aA][pP]|[uU][eE][sS][cC][aA]|[uU][eE][sS][cC]|[uU][eE][sS]|[uU][eE]|[uU]
-
/* Quoted identifier with Unicode escapes */
xuistart [uU]&{dquote}
/* Quoted string with Unicode escapes */
xusstart [uU]&{quote}
-/* Optional UESCAPE after a quoted string or identifier with Unicode escapes. */
-xustop1 {uescapefail}?
-xustop2 {uescape}
-
/* error rule to avoid backup */
xufailed [uU]&
@@ -438,20 +426,10 @@ other .
BEGIN(xb);
ECHO;
}
-<xb>{quotestop} |
-<xb>{quotefail} {
- yyless(1);
- BEGIN(INITIAL);
- ECHO;
- }
<xh>{xhinside} |
<xb>{xbinside} {
ECHO;
}
-<xh>{quotecontinue} |
-<xb>{quotecontinue} {
- ECHO;
- }
{xhstart} {
/* Hexadecimal bit type.
@@ -463,12 +441,6 @@ other .
BEGIN(xh);
ECHO;
}
-<xh>{quotestop} |
-<xh>{quotefail} {
- yyless(1);
- BEGIN(INITIAL);
- ECHO;
- }
{xnstart} {
yyless(1); /* eat only 'n' this time */
@@ -490,32 +462,41 @@ other .
BEGIN(xus);
ECHO;
}
-<xq,xe>{quotestop} |
-<xq,xe>{quotefail} {
- yyless(1);
- BEGIN(INITIAL);
- ECHO;
- }
-<xus>{quotestop} |
-<xus>{quotefail} {
- /* throw back all but the quote */
- yyless(1);
- BEGIN(xusend);
+
+<xb,xh,xq,xe,xus>{quote} {
+ /*
+ * When we are scanning a quoted string and see an end
+ * quote, we must look ahead for a possible continuation.
+ * If we don't see one, we know the end quote was in fact
+ * the end of the string. To reduce the lexer table size,
+ * we use a single "xqs" state to do the lookahead for all
+ * types of strings.
+ */
+ cur_state->state_before_str_stop = YYSTATE;
+ BEGIN(xqs);
ECHO;
}
-<xusend>{whitespace} {
+<xqs>{quotecontinue} {
+ /*
+ * Found a quote continuation, so return to the in-quote
+ * state and continue scanning the literal. Nothing is
+ * added to the literal's contents.
+ */
+ BEGIN(cur_state->state_before_str_stop);
ECHO;
}
-<xusend>{other} |
-<xusend>{xustop1} {
+<xqs>{quotecontinuefail} |
+<xqs>{other} {
+ /*
+ * Failed to see a quote continuation. Throw back
+ * everything after the end quote, and handle the string
+ * according to the state we were in previously.
+ */
yyless(0);
BEGIN(INITIAL);
- ECHO;
- }
-<xusend>{xustop2} {
- BEGIN(INITIAL);
- ECHO;
+ /* There's nothing to echo ... */
}
+
<xq,xe,xus>{xqdouble} {
ECHO;
}
@@ -540,9 +521,6 @@ other .
<xe>{xehexesc} {
ECHO;
}
-<xq,xe,xus>{quotecontinue} {
- ECHO;
- }
<xe>. {
/* This is only needed for \ just before EOF */
ECHO;
@@ -599,21 +577,7 @@ other .
BEGIN(INITIAL);
ECHO;
}
-<xui>{dquote} {
- yyless(1);
- BEGIN(xuiend);
- ECHO;
- }
-<xuiend>{whitespace} {
- ECHO;
- }
-<xuiend>{other} |
-<xuiend>{xustop1} {
- yyless(0);
- BEGIN(INITIAL);
- ECHO;
- }
-<xuiend>{xustop2} {
+<xui>{dquote} {
BEGIN(INITIAL);
ECHO;
}
@@ -1084,8 +1048,7 @@ psql_scan(PsqlScanState state,
switch (state->start_state)
{
case INITIAL:
- case xuiend: /* we treat these like INITIAL */
- case xusend:
+ case xqs: /* we treat this like INITIAL */
if (state->paren_depth > 0)
{
result = PSCAN_INCOMPLETE;
@@ -1240,7 +1203,8 @@ psql_scan_reselect_sql_lexer(PsqlScanState state)
bool
psql_scan_in_quote(PsqlScanState state)
{
- return state->start_state != INITIAL;
+ return state->start_state != INITIAL &&
+ state->start_state != xqs;
}
/*
diff --git a/src/include/fe_utils/psqlscan_int.h b/src/include/fe_utils/psqlscan_int.h
index 98481e65531..311f80394a4 100644
--- a/src/include/fe_utils/psqlscan_int.h
+++ b/src/include/fe_utils/psqlscan_int.h
@@ -110,6 +110,7 @@ typedef struct PsqlScanStateData
* and updated with its finishing state on exit.
*/
int start_state; /* yylex's starting/finishing state */
+ int state_before_str_stop; /* start cond. before end quote */
int paren_depth; /* depth of nesting in parentheses */
int xcdepth; /* depth of nesting in slash-star comments */
char *dolqstart; /* current $foo$ quote start string */
diff --git a/src/include/mb/pg_wchar.h b/src/include/mb/pg_wchar.h
index 07ebc6365b1..7fb5fa41117 100644
--- a/src/include/mb/pg_wchar.h
+++ b/src/include/mb/pg_wchar.h
@@ -509,6 +509,28 @@ typedef uint32 (*utf_local_conversion_func) (uint32 code);
/*
+ * Some handy functions for Unicode-specific tests.
+ *
+ * is_utf16_surrogate_first() is true for values in 0xD800..0xDBFF and
+ * is_utf16_surrogate_second() is true for values in 0xDC00..0xDFFF.
+ *
+ * surrogate_pair_to_codepoint() takes the low ten bits of each argument,
+ * places those of "first" above those of "second", and adds 0x10000.
+ * It performs no validation of its own, so callers should check both
+ * arguments with the two predicates above before combining them.
+ */
+static inline bool
+is_utf16_surrogate_first(pg_wchar c)
+{
+ return (c >= 0xD800 && c <= 0xDBFF);
+}
+
+static inline bool
+is_utf16_surrogate_second(pg_wchar c)
+{
+ return (c >= 0xDC00 && c <= 0xDFFF);
+}
+
+static inline pg_wchar
+surrogate_pair_to_codepoint(pg_wchar first, pg_wchar second)
+{
+ return ((first & 0x3FF) << 10) + 0x10000 + (second & 0x3FF);
+}
+
+
+/*
* These functions are considered part of libpq's exported API and
* are also declared in libpq-fe.h.
*/
diff --git a/src/include/parser/kwlist.h b/src/include/parser/kwlist.h
index 0fe4e6cb20d..9097f6748bd 100644
--- a/src/include/parser/kwlist.h
+++ b/src/include/parser/kwlist.h
@@ -416,6 +416,7 @@ PG_KEYWORD("truncate", TRUNCATE, UNRESERVED_KEYWORD)
PG_KEYWORD("trusted", TRUSTED, UNRESERVED_KEYWORD)
PG_KEYWORD("type", TYPE_P, UNRESERVED_KEYWORD)
PG_KEYWORD("types", TYPES_P, UNRESERVED_KEYWORD)
+PG_KEYWORD("uescape", UESCAPE, UNRESERVED_KEYWORD)
PG_KEYWORD("unbounded", UNBOUNDED, UNRESERVED_KEYWORD)
PG_KEYWORD("uncommitted", UNCOMMITTED, UNRESERVED_KEYWORD)
PG_KEYWORD("unencrypted", UNENCRYPTED, UNRESERVED_KEYWORD)
diff --git a/src/include/parser/scanner.h b/src/include/parser/scanner.h
index da729fc42b3..7a0e5e5d982 100644
--- a/src/include/parser/scanner.h
+++ b/src/include/parser/scanner.h
@@ -48,7 +48,7 @@ typedef union core_YYSTYPE
* However, those are not defined in this file, because bison insists on
* defining them for itself. The token codes used by the core scanner are
* the ASCII characters plus these:
- * %token <str> IDENT FCONST SCONST BCONST XCONST Op
+ * %token <str> IDENT UIDENT FCONST SCONST USCONST BCONST XCONST Op
* %token <ival> ICONST PARAM
* %token TYPECAST DOT_DOT COLON_EQUALS EQUALS_GREATER
* %token LESS_EQUALS GREATER_EQUALS NOT_EQUALS
@@ -99,6 +99,7 @@ typedef struct core_yy_extra_type
int literallen; /* actual current string length */
int literalalloc; /* current allocated buffer size */
+ int state_before_str_stop; /* start cond. before end quote */
int xcdepth; /* depth of nesting in slash-star comments */
char *dolqstart; /* current $foo$ quote start string */
diff --git a/src/interfaces/ecpg/preproc/ecpg.tokens b/src/interfaces/ecpg/preproc/ecpg.tokens
index 1d613af02f6..8e0527fdb77 100644
--- a/src/interfaces/ecpg/preproc/ecpg.tokens
+++ b/src/interfaces/ecpg/preproc/ecpg.tokens
@@ -24,4 +24,3 @@
S_TYPEDEF
%token CSTRING CVARIABLE CPP_LINE IP
-%token DOLCONST ECONST NCONST UCONST UIDENT
diff --git a/src/interfaces/ecpg/preproc/ecpg.trailer b/src/interfaces/ecpg/preproc/ecpg.trailer
index f58b41e6751..0dbdfdc1223 100644
--- a/src/interfaces/ecpg/preproc/ecpg.trailer
+++ b/src/interfaces/ecpg/preproc/ecpg.trailer
@@ -1719,46 +1719,12 @@ ecpg_bconst: BCONST { $$ = make_name(); } ;
ecpg_fconst: FCONST { $$ = make_name(); } ;
-ecpg_sconst:
- SCONST
- {
- /* could have been input as '' or $$ */
- $$ = (char *)mm_alloc(strlen($1) + 3);
- $$[0]='\'';
- strcpy($$+1, $1);
- $$[strlen($1)+1]='\'';
- $$[strlen($1)+2]='\0';
- free($1);
- }
- | ECONST
- {
- $$ = (char *)mm_alloc(strlen($1) + 4);
- $$[0]='E';
- $$[1]='\'';
- strcpy($$+2, $1);
- $$[strlen($1)+2]='\'';
- $$[strlen($1)+3]='\0';
- free($1);
- }
- | NCONST
- {
- $$ = (char *)mm_alloc(strlen($1) + 4);
- $$[0]='N';
- $$[1]='\'';
- strcpy($$+2, $1);
- $$[strlen($1)+2]='\'';
- $$[strlen($1)+3]='\0';
- free($1);
- }
- | UCONST { $$ = $1; }
- | DOLCONST { $$ = $1; }
- ;
+ecpg_sconst: SCONST { $$ = $1; } ;
ecpg_xconst: XCONST { $$ = make_name(); } ;
-ecpg_ident: IDENT { $$ = make_name(); }
+ecpg_ident: IDENT { $$ = $1; }
| CSTRING { $$ = make3_str(mm_strdup("\""), $1, mm_strdup("\"")); }
- | UIDENT { $$ = $1; }
;
quoted_ident_stringvar: name
diff --git a/src/interfaces/ecpg/preproc/ecpg.type b/src/interfaces/ecpg/preproc/ecpg.type
index 9497b91b9db..ffafa82af9c 100644
--- a/src/interfaces/ecpg/preproc/ecpg.type
+++ b/src/interfaces/ecpg/preproc/ecpg.type
@@ -122,12 +122,8 @@
%type <str> CSTRING
%type <str> CPP_LINE
%type <str> CVARIABLE
-%type <str> DOLCONST
-%type <str> ECONST
-%type <str> NCONST
%type <str> SCONST
-%type <str> UCONST
-%type <str> UIDENT
+%type <str> IDENT
%type <struct_union> s_struct_union_symbol
diff --git a/src/interfaces/ecpg/preproc/parse.pl b/src/interfaces/ecpg/preproc/parse.pl
index 7d6c70dcf4e..1a76b2d326b 100644
--- a/src/interfaces/ecpg/preproc/parse.pl
+++ b/src/interfaces/ecpg/preproc/parse.pl
@@ -218,8 +218,8 @@ sub main
if ($a eq 'IDENT' && $prior eq '%nonassoc')
{
- # add two more tokens to the list
- $str = $str . "\n%nonassoc CSTRING\n%nonassoc UIDENT";
+ # add more tokens to the list
+ $str = $str . "\n%nonassoc CSTRING";
}
$prior = $a;
}
diff --git a/src/interfaces/ecpg/preproc/parser.c b/src/interfaces/ecpg/preproc/parser.c
index c27de59828a..a2eeeba2174 100644
--- a/src/interfaces/ecpg/preproc/parser.c
+++ b/src/interfaces/ecpg/preproc/parser.c
@@ -6,6 +6,9 @@
* This should match src/backend/parser/parser.c, except that we do not
* need to bother with re-entrant interfaces.
*
+ * Note: ECPG doesn't report error location like the backend does.
+ * This file will need work if we ever want it to.
+ *
*
* Portions Copyright (c) 1996-2020, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
@@ -27,8 +30,9 @@ static int lookahead_token; /* one-token lookahead */
static YYSTYPE lookahead_yylval; /* yylval for lookahead token */
static YYLTYPE lookahead_yylloc; /* yylloc for lookahead token */
static char *lookahead_yytext; /* start current token */
-static char *lookahead_end; /* end of current token */
-static char lookahead_hold_char; /* to be put back at *lookahead_end */
+
+static bool check_uescapechar(unsigned char escape);
+static bool ecpg_isspace(char ch);
/*
@@ -43,13 +47,16 @@ static char lookahead_hold_char; /* to be put back at *lookahead_end */
* words. Furthermore it's not clear how to do that without re-introducing
* scanner backtrack, which would cost more performance than this filter
* layer does.
+ *
+ * We also use this filter to convert UIDENT and USCONST sequences into
+ * plain IDENT and SCONST tokens. While that could be handled by additional
+ * productions in the main grammar, it's more efficient to do it like this.
*/
int
filtered_base_yylex(void)
{
int cur_token;
int next_token;
- int cur_token_length;
YYSTYPE cur_yylval;
YYLTYPE cur_yylloc;
char *cur_yytext;
@@ -61,41 +68,26 @@ filtered_base_yylex(void)
base_yylval = lookahead_yylval;
base_yylloc = lookahead_yylloc;
base_yytext = lookahead_yytext;
- *lookahead_end = lookahead_hold_char;
have_lookahead = false;
}
else
cur_token = base_yylex();
/*
- * If this token isn't one that requires lookahead, just return it. If it
- * does, determine the token length. (We could get that via strlen(), but
- * since we have such a small set of possibilities, hardwiring seems
- * feasible and more efficient.)
+ * If this token isn't one that requires lookahead, just return it.
*/
switch (cur_token)
{
case NOT:
- cur_token_length = 3;
- break;
case NULLS_P:
- cur_token_length = 5;
- break;
case WITH:
- cur_token_length = 4;
+ case UIDENT:
+ case USCONST:
break;
default:
return cur_token;
}
- /*
- * Identify end+1 of current token. base_yylex() has temporarily stored a
- * '\0' here, and will undo that when we call it again. We need to redo
- * it to fully revert the lookahead call for error reporting purposes.
- */
- lookahead_end = base_yytext + cur_token_length;
- Assert(*lookahead_end == '\0');
-
/* Save and restore lexer output variables around the call */
cur_yylval = base_yylval;
cur_yylloc = base_yylloc;
@@ -113,10 +105,6 @@ filtered_base_yylex(void)
base_yylloc = cur_yylloc;
base_yytext = cur_yytext;
- /* Now revert the un-truncation of the current token */
- lookahead_hold_char = *lookahead_end;
- *lookahead_end = '\0';
-
have_lookahead = true;
/* Replace cur_token if needed, based on lookahead */
@@ -157,7 +145,87 @@ filtered_base_yylex(void)
break;
}
break;
+ case UIDENT:
+ case USCONST:
+ /*
+ * Look ahead for UESCAPE. If present, the UIDENT/USCONST token,
+ * the UESCAPE keyword and the following string literal are merged
+ * into a single token whose text is "<literal> UESCAPE <escape>".
+ * Either way the token is then reported as plain IDENT or SCONST.
+ */
+ if (next_token == UESCAPE)
+ {
+ /* Yup, so get third token, which had better be SCONST */
+ const char *escstr = NULL;
+
+ /*
+ * Again save and restore lexer output variables around the
+ * call
+ */
+ cur_yylval = base_yylval;
+ cur_yylloc = base_yylloc;
+ cur_yytext = base_yytext;
+
+ /* Get third token */
+ next_token = base_yylex();
+
+ if (next_token != SCONST)
+ mmerror(PARSE_ERROR, ET_ERROR, "UESCAPE must be followed by a simple string literal");
+ else
+ {
+ /*
+ * Save and check escape string, which the scanner returns
+ * with quotes. This is done only for a real SCONST: the
+ * code after mmerror() keeps running, and for any other
+ * token the scanner may not have stored a string in
+ * base_yylval.str, so it must not be handed to strlen().
+ */
+ escstr = base_yylval.str;
+ if (strlen(escstr) != 3 || !check_uescapechar(escstr[1]))
+ mmerror(PARSE_ERROR, ET_ERROR, "invalid Unicode escape character");
+ }
+
+ base_yylval = cur_yylval;
+ base_yylloc = cur_yylloc;
+ base_yytext = cur_yytext;
+
+ /*
+ * Combine 3 tokens into 1. When the third token was not a
+ * string literal an error has already been reported, and the
+ * first token's text is left unchanged.
+ */
+ if (escstr != NULL)
+ base_yylval.str = psprintf("%s UESCAPE %s", base_yylval.str, escstr);
+
+ /* Clear have_lookahead, thereby consuming all three tokens */
+ have_lookahead = false;
+ }
+
+ if (cur_token == UIDENT)
+ cur_token = IDENT;
+ else if (cur_token == USCONST)
+ cur_token = SCONST;
+ break;
}
return cur_token;
}
+
+/*
+ * check_uescapechar() and ecpg_isspace() should match their equivalents
+ * in pgc.l.
+ */
+
+/*
+ * is 'escape' acceptable as Unicode escape character (UESCAPE syntax) ?
+ *
+ * Returns false when 'escape' is a hexadecimal digit, a plus sign, a single
+ * or double quote, or a character that ecpg_isspace() accepts; returns true
+ * for every other character.
+ */
+static bool
+check_uescapechar(unsigned char escape)
+{
+ if (isxdigit(escape)
+ || escape == '+'
+ || escape == '\''
+ || escape == '"'
+ || ecpg_isspace(escape))
+ return false;
+ else
+ return true;
+}
+
+/*
+ * ecpg_isspace() --- return true if flex scanner considers char whitespace
+ *
+ * The characters accepted are space, tab, newline, carriage return and
+ * form feed; anything else yields false.
+ */
+static bool
+ecpg_isspace(char ch)
+{
+ if (ch == ' ' ||
+ ch == '\t' ||
+ ch == '\n' ||
+ ch == '\r' ||
+ ch == '\f')
+ return true;
+ return false;
+}
+}
diff --git a/src/interfaces/ecpg/preproc/pgc.l b/src/interfaces/ecpg/preproc/pgc.l
index 0385fde7196..0e8621a05e2 100644
--- a/src/interfaces/ecpg/preproc/pgc.l
+++ b/src/interfaces/ecpg/preproc/pgc.l
@@ -6,6 +6,9 @@
*
* This is a modified version of src/backend/parser/scan.l
*
+ * The ecpg scanner is not backup-free, so the fail rules are
+ * only here to simplify syncing this file with scan.l.
+ *
*
* Portions Copyright (c) 1996-2020, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
@@ -61,7 +64,10 @@ static bool isdefine(void);
static bool isinformixdefine(void);
char *token_start;
-static int state_before;
+
+/* vars to keep track of start conditions when scanning literals */
+static int state_before_str_start;
+static int state_before_str_stop;
struct _yy_buffer
{
@@ -105,13 +111,13 @@ static struct _if_value
* and to eliminate parsing troubles for numeric strings.
* Exclusive states:
* <xb> bit string literal
- * <xcc> extended C-style comments in C
- * <xcsql> extended C-style comments in SQL
+ * <xc> extended C-style comments
* <xd> delimited identifiers (double-quoted identifiers)
* <xdc> double-quoted strings in C
* <xh> hexadecimal numeric string
* <xn> national character quoted strings
* <xq> standard quoted strings
+ * <xqs> quote stop (detect continued strings)
* <xe> extended quoted strings (support backslash escape sequences)
* <xqc> single-quoted strings in C
* <xdolq> $foo$ quoted strings
@@ -120,18 +126,21 @@ static struct _if_value
* <xcond> condition of an EXEC SQL IFDEF construct
* <xskip> skipping the inactive part of an EXEC SQL IFDEF construct
*
+ * Note: we intentionally don't mimic the backend's <xeu> state; we have
+ * no need to distinguish it from <xe> state.
+ *
* Remember to add an <<EOF>> case whenever you add a new exclusive state!
* The default one is probably not the right thing.
*/
%x xb
-%x xcc
-%x xcsql
+%x xc
%x xd
%x xdc
%x xh
%x xn
%x xq
+%x xqs
%x xe
%x xqc
%x xdolq
@@ -181,9 +190,17 @@ horiz_whitespace ({horiz_space}|{comment})
whitespace_with_newline ({horiz_whitespace}*{newline}{whitespace}*)
quote '
-quotestop {quote}{whitespace}*
-quotecontinue {quote}{whitespace_with_newline}{quote}
-quotefail {quote}{whitespace}*"-"
+/* If we see {quote} then {quotecontinue}, the quoted string continues */
+quotecontinue {whitespace_with_newline}{quote}
+
+/*
+ * {quotecontinuefail} is needed to avoid lexer backup when we fail to match
+ * {quotecontinue}. It might seem that this could just be {whitespace}*,
+ * but if there's a dash after {whitespace_with_newline}, it must be consumed
+ * to see if there's another dash --- which would start a {comment} and thus
+ * allow continuation of the {quotecontinue} token.
+ */
+quotecontinuefail {whitespace}*"-"?
/* Bit string
*/
@@ -237,19 +254,11 @@ xdstop {dquote}
xddouble {dquote}{dquote}
xdinside [^"]+
-/* Unicode escapes */
-/* (The ecpg scanner is not backup-free, so the fail rules in scan.l are
- * not needed here, but could be added if desired.)
- */
-uescape [uU][eE][sS][cC][aA][pP][eE]{whitespace}*{quote}[^']{quote}
-
/* Quoted identifier with Unicode escapes */
xuistart [uU]&{dquote}
-xuistop {dquote}({whitespace}*{uescape})?
/* Quoted string with Unicode escapes */
xusstart [uU]&{quote}
-xusstop {quote}({whitespace}*{uescape})?
/* special stuff for C strings */
xdcqq \\\\
@@ -408,54 +417,58 @@ cppline {space}*#([^i][A-Za-z]*|{if}|{ifdef}|{ifndef}|{import})((\/\*[^*/]*\*+
{whitespace} {
/* ignore */
}
+} /* <SQL> */
+<C,SQL>{
{xcstart} {
token_start = yytext;
- state_before = YYSTATE;
+ state_before_str_start = YYSTATE;
xcdepth = 0;
- BEGIN(xcsql);
+ BEGIN(xc);
/* Put back any characters past slash-star; see above */
yyless(2);
fputs("/*", yyout);
}
-} /* <SQL> */
+} /* <C,SQL> */
-<C>{xcstart} {
- token_start = yytext;
- state_before = YYSTATE;
- xcdepth = 0;
- BEGIN(xcc);
- /* Put back any characters past slash-star; see above */
- yyless(2);
- fputs("/*", yyout);
- }
-<xcc>{xcstart} { ECHO; }
-<xcsql>{xcstart} {
- xcdepth++;
- /* Put back any characters past slash-star; see above */
- yyless(2);
- fputs("/_*", yyout);
- }
-<xcsql>{xcstop} {
- if (xcdepth <= 0)
+<xc>{
+{xcstart} {
+ if (state_before_str_start == SQL)
{
- ECHO;
- BEGIN(state_before);
- token_start = NULL;
+ xcdepth++;
+ /* Put back any characters past slash-star; see above */
+ yyless(2);
+ fputs("/_*", yyout);
}
- else
+ else if (state_before_str_start == C)
{
- xcdepth--;
- fputs("*_/", yyout);
+ ECHO;
}
}
-<xcc>{xcstop} {
- ECHO;
- BEGIN(state_before);
- token_start = NULL;
+
+{xcstop} {
+ if (state_before_str_start == SQL)
+ {
+ if (xcdepth <= 0)
+ {
+ ECHO;
+ BEGIN(SQL);
+ token_start = NULL;
+ }
+ else
+ {
+ xcdepth--;
+ fputs("*_/", yyout);
+ }
+ }
+ else if (state_before_str_start == C)
+ {
+ ECHO;
+ BEGIN(C);
+ token_start = NULL;
+ }
}
-<xcc,xcsql>{
{xcinside} {
ECHO;
}
@@ -471,7 +484,7 @@ cppline {space}*#([^i][A-Za-z]*|{if}|{ifdef}|{ifndef}|{import})((\/\*[^*/]*\*+
<<EOF>> {
mmfatal(PARSE_ERROR, "unterminated /* comment");
}
-} /* <xcc,xcsql> */
+} /* <xc> */
<SQL>{
{xbstart} {
@@ -482,23 +495,10 @@ cppline {space}*#([^i][A-Za-z]*|{if}|{ifdef}|{ifndef}|{import})((\/\*[^*/]*\*+
}
} /* <SQL> */
-<xb>{quotestop} |
-<xb>{quotefail} {
- yyless(1);
- BEGIN(SQL);
- if (literalbuf[strspn(literalbuf, "01") + 1] != '\0')
- mmerror(PARSE_ERROR, ET_ERROR, "invalid bit string literal");
- base_yylval.str = mm_strdup(literalbuf);
- return BCONST;
- }
<xh>{xhinside} |
<xb>{xbinside} {
addlit(yytext, yyleng);
}
-<xh>{quotecontinue} |
-<xb>{quotecontinue} {
- /* ignore */
- }
<xb><<EOF>> { mmfatal(PARSE_ERROR, "unterminated bit string literal"); }
<SQL>{xhstart} {
@@ -507,19 +507,11 @@ cppline {space}*#([^i][A-Za-z]*|{if}|{ifdef}|{ifndef}|{import})((\/\*[^*/]*\*+
startlit();
addlitchar('x');
}
-<xh>{quotestop} |
-<xh>{quotefail} {
- yyless(1);
- BEGIN(SQL);
- base_yylval.str = mm_strdup(literalbuf);
- return XCONST;
- }
-
<xh><<EOF>> { mmfatal(PARSE_ERROR, "unterminated hexadecimal string literal"); }
<C>{xqstart} {
token_start = yytext;
- state_before = YYSTATE;
+ state_before_str_start = YYSTATE;
BEGIN(xqc);
startlit();
}
@@ -530,59 +522,91 @@ cppline {space}*#([^i][A-Za-z]*|{if}|{ifdef}|{ifndef}|{import})((\/\*[^*/]*\*+
* Transfer it as-is to the backend.
*/
token_start = yytext;
- state_before = YYSTATE;
+ state_before_str_start = YYSTATE;
BEGIN(xn);
startlit();
}
{xqstart} {
token_start = yytext;
- state_before = YYSTATE;
+ state_before_str_start = YYSTATE;
BEGIN(xq);
startlit();
}
{xestart} {
token_start = yytext;
- state_before = YYSTATE;
+ state_before_str_start = YYSTATE;
BEGIN(xe);
startlit();
}
{xusstart} {
token_start = yytext;
- state_before = YYSTATE;
+ state_before_str_start = YYSTATE;
BEGIN(xus);
startlit();
- addlit(yytext, yyleng);
}
} /* <SQL> */
-<xq,xqc>{quotestop} |
-<xq,xqc>{quotefail} {
- yyless(1);
- BEGIN(state_before);
- base_yylval.str = mm_strdup(literalbuf);
- return SCONST;
- }
-<xe>{quotestop} |
-<xe>{quotefail} {
- yyless(1);
- BEGIN(state_before);
- base_yylval.str = mm_strdup(literalbuf);
- return ECONST;
+<xb,xh,xq,xqc,xe,xn,xus>{quote} {
+ /*
+ * When we are scanning a quoted string and see an end
+ * quote, we must look ahead for a possible continuation.
+ * If we don't see one, we know the end quote was in fact
+ * the end of the string. To reduce the lexer table size,
+ * we use a single "xqs" state to do the lookahead for all
+ * types of strings.
+ */
+ state_before_str_stop = YYSTATE;
+ BEGIN(xqs);
}
-<xn>{quotestop} |
-<xn>{quotefail} {
- yyless(1);
- BEGIN(state_before);
- base_yylval.str = mm_strdup(literalbuf);
- return NCONST;
+<xqs>{quotecontinue} {
+ /*
+ * Found a quote continuation, so return to the in-quote
+ * state and continue scanning the literal. Nothing is
+ * added to the literal's contents.
+ */
+ BEGIN(state_before_str_stop);
}
-<xus>{xusstop} {
- addlit(yytext, yyleng);
- BEGIN(state_before);
- base_yylval.str = mm_strdup(literalbuf);
- return UCONST;
+<xqs>{quotecontinuefail} |
+<xqs>{other} |
+<xqs><<EOF>> {
+ /*
+ * Failed to see a quote continuation. Throw back
+ * everything after the end quote, and handle the string
+ * according to the state we were in previously.
+ *
+ * state_before_str_stop says which kind of literal was
+ * being scanned and so selects the token to return;
+ * state_before_str_start is the state to resume in.
+ */
+ yyless(0);
+ BEGIN(state_before_str_start);
+
+ switch (state_before_str_stop)
+ {
+ case xb:
+ {
+ /*
+ * Everything in the literal must be a binary digit.
+ * NOTE(review): the xbstart action is not visible
+ * here; presumably it stores a leading 'b' marker
+ * the way xhstart stores 'x', so skip one if present.
+ * Then test the first character that is not 0 or 1,
+ * rather than the one after it as the old code did.
+ */
+ const char *bits = literalbuf;
+
+ if (*bits == 'b')
+ bits++;
+ if (bits[strspn(bits, "01")] != '\0')
+ mmerror(PARSE_ERROR, ET_ERROR, "invalid bit string literal");
+ base_yylval.str = mm_strdup(literalbuf);
+ return BCONST;
+ }
+ case xh:
+ base_yylval.str = mm_strdup(literalbuf);
+ return XCONST;
+ case xq:
+ /* fallthrough */
+ case xqc:
+ base_yylval.str = psprintf("'%s'", literalbuf);
+ return SCONST;
+ case xe:
+ base_yylval.str = psprintf("E'%s'", literalbuf);
+ return SCONST;
+ case xn:
+ base_yylval.str = psprintf("N'%s'", literalbuf);
+ return SCONST;
+ case xus:
+ base_yylval.str = psprintf("U&'%s'", literalbuf);
+ return USCONST;
+ default:
+ /* no trailing newline, like every other mmfatal() message here */
+ mmfatal(PARSE_ERROR, "unhandled previous state in xqs");
+ }
+ }
+
<xq,xe,xn,xus>{xqdouble} { addlitchar('\''); }
<xqc>{xqcquote} {
addlitchar('\\');
@@ -604,9 +628,6 @@ cppline {space}*#([^i][A-Za-z]*|{if}|{ifdef}|{ifndef}|{import})((\/\*[^*/]*\*+
<xe>{xehexesc} {
addlit(yytext, yyleng);
}
-<xq,xqc,xe,xn,xus>{quotecontinue} {
- /* ignore */
- }
<xe>. {
/* This is only needed for \ just before EOF */
addlitchar(yytext[0]);
@@ -639,7 +660,7 @@ cppline {space}*#([^i][A-Za-z]*|{if}|{ifdef}|{ifndef}|{import})((\/\*[^*/]*\*+
dolqstart = NULL;
BEGIN(SQL);
base_yylval.str = mm_strdup(literalbuf);
- return DOLCONST;
+ return SCONST;
}
else
{
@@ -666,20 +687,19 @@ cppline {space}*#([^i][A-Za-z]*|{if}|{ifdef}|{ifndef}|{import})((\/\*[^*/]*\*+
<SQL>{
{xdstart} {
- state_before = YYSTATE;
+ state_before_str_start = YYSTATE;
BEGIN(xd);
startlit();
}
{xuistart} {
- state_before = YYSTATE;
+ state_before_str_start = YYSTATE;
BEGIN(xui);
startlit();
- addlit(yytext, yyleng);
}
} /* <SQL> */
<xd>{xdstop} {
- BEGIN(state_before);
+ BEGIN(state_before_str_start);
if (literallen == 0)
mmerror(PARSE_ERROR, ET_ERROR, "zero-length delimited identifier");
/* The backend will truncate the identifier here. We do not as it does not change the result. */
@@ -687,17 +707,16 @@ cppline {space}*#([^i][A-Za-z]*|{if}|{ifdef}|{ifndef}|{import})((\/\*[^*/]*\*+
return CSTRING;
}
<xdc>{xdstop} {
- BEGIN(state_before);
+ BEGIN(state_before_str_start);
base_yylval.str = mm_strdup(literalbuf);
return CSTRING;
}
-<xui>{xuistop} {
- BEGIN(state_before);
+<xui>{dquote} {
+ BEGIN(state_before_str_start);
- if (literallen == 2) /* "U&" */
+ if (literallen == 0) /* xuistart no longer adds "U&" to literalbuf, so empty means length 0 */
mmerror(PARSE_ERROR, ET_ERROR, "zero-length delimited identifier");
/* The backend will truncate the identifier here. We do not as it does not change the result. */
- addlit(yytext, yyleng);
- base_yylval.str = mm_strdup(literalbuf);
+ base_yylval.str = psprintf("U&\"%s\"", literalbuf);
return UIDENT;
}
<xd,xui>{xddouble} {
@@ -708,7 +727,7 @@ cppline {space}*#([^i][A-Za-z]*|{if}|{ifdef}|{ifndef}|{import})((\/\*[^*/]*\*+
}
<xd,xui><<EOF>> { mmfatal(PARSE_ERROR, "unterminated quoted identifier"); }
<C>{xdstart} {
- state_before = YYSTATE;
+ state_before_str_start = YYSTATE;
BEGIN(xdc);
startlit();
}
diff --git a/src/interfaces/ecpg/test/expected/preproc-strings.c b/src/interfaces/ecpg/test/expected/preproc-strings.c
index 2053443e818..e695007b133 100644
--- a/src/interfaces/ecpg/test/expected/preproc-strings.c
+++ b/src/interfaces/ecpg/test/expected/preproc-strings.c
@@ -45,7 +45,7 @@ int main(void)
#line 13 "strings.pgc"
- { ECPGdo(__LINE__, 0, 1, NULL, 0, ECPGst_normal, "select 'abcdef' , N'abcdef' as foo , E'abc\\bdef' as \"foo\" , U&'d\\0061t\\0061' as U&\"foo\" , U&'d!+000061t!+000061' uescape '!' , $foo$abc$def$foo$", ECPGt_EOIT,
+ { ECPGdo(__LINE__, 0, 1, NULL, 0, ECPGst_normal, "select 'abcdef' , N'abcdef' as foo , E'abc\\bdef' as \"foo\" , U&'d\\0061t\\0061' as U&\"foo\" , U&'d!+000061t!+000061' UESCAPE '!' , $foo$abc$def$foo$", ECPGt_EOIT,
ECPGt_char,&(s1),(long)0,(long)1,(1)*sizeof(char),
ECPGt_NO_INDICATOR, NULL , 0L, 0L, 0L,
ECPGt_char,&(s2),(long)0,(long)1,(1)*sizeof(char),
diff --git a/src/interfaces/ecpg/test/expected/preproc-strings.stderr b/src/interfaces/ecpg/test/expected/preproc-strings.stderr
index 0478fd84aeb..dbc9e5c0b8d 100644
--- a/src/interfaces/ecpg/test/expected/preproc-strings.stderr
+++ b/src/interfaces/ecpg/test/expected/preproc-strings.stderr
@@ -8,7 +8,7 @@
[NO_PID]: sqlca: code: 0, state: 00000
[NO_PID]: ecpg_process_output on line 13: OK: SET
[NO_PID]: sqlca: code: 0, state: 00000
-[NO_PID]: ecpg_execute on line 15: query: select 'abcdef' , N'abcdef' as foo , E'abc\bdef' as "foo" , U&'d\0061t\0061' as U&"foo" , U&'d!+000061t!+000061' uescape '!' , $foo$abc$def$foo$; with 0 parameter(s) on connection ecpg1_regression
+[NO_PID]: ecpg_execute on line 15: query: select 'abcdef' , N'abcdef' as foo , E'abc\bdef' as "foo" , U&'d\0061t\0061' as U&"foo" , U&'d!+000061t!+000061' UESCAPE '!' , $foo$abc$def$foo$; with 0 parameter(s) on connection ecpg1_regression
[NO_PID]: sqlca: code: 0, state: 00000
[NO_PID]: ecpg_execute on line 15: using PQexec
[NO_PID]: sqlca: code: 0, state: 00000
diff --git a/src/pl/plpgsql/src/pl_gram.y b/src/pl/plpgsql/src/pl_gram.y
index ef0a5d5d166..6778d0e7714 100644
--- a/src/pl/plpgsql/src/pl_gram.y
+++ b/src/pl/plpgsql/src/pl_gram.y
@@ -232,7 +232,7 @@ static void check_raise_parameters(PLpgSQL_stmt_raise *stmt);
* Some of these are not directly referenced in this file, but they must be
* here anyway.
*/
-%token <str> IDENT FCONST SCONST BCONST XCONST Op
+%token <str> IDENT UIDENT FCONST SCONST USCONST BCONST XCONST Op
%token <ival> ICONST PARAM
%token TYPECAST DOT_DOT COLON_EQUALS EQUALS_GREATER
%token LESS_EQUALS GREATER_EQUALS NOT_EQUALS
diff --git a/src/test/regress/expected/strings.out b/src/test/regress/expected/strings.out
index 6d96843e5b5..60cb86193c7 100644
--- a/src/test/regress/expected/strings.out
+++ b/src/test/regress/expected/strings.out
@@ -48,17 +48,21 @@ SELECT 'tricky' AS U&"\" UESCAPE '!';
(1 row)
SELECT U&'wrong: \061';
-ERROR: invalid Unicode escape value at or near "\061'"
+ERROR: invalid Unicode escape value
LINE 1: SELECT U&'wrong: \061';
^
SELECT U&'wrong: \+0061';
-ERROR: invalid Unicode escape value at or near "\+0061'"
+ERROR: invalid Unicode escape value
LINE 1: SELECT U&'wrong: \+0061';
^
+SELECT U&'wrong: +0061' UESCAPE +;
+ERROR: UESCAPE must be followed by a simple string literal at or near "+"
+LINE 1: SELECT U&'wrong: +0061' UESCAPE +;
+ ^
SELECT U&'wrong: +0061' UESCAPE '+';
-ERROR: invalid Unicode escape character at or near "+'"
+ERROR: invalid Unicode escape character at or near "'+'"
LINE 1: SELECT U&'wrong: +0061' UESCAPE '+';
- ^
+ ^
SET standard_conforming_strings TO off;
SELECT U&'d\0061t\+000061' AS U&"d\0061t\+000061";
ERROR: unsafe use of string constant with Unicode escapes
diff --git a/src/test/regress/sql/strings.sql b/src/test/regress/sql/strings.sql
index 0afb94964b1..c5cd15142a5 100644
--- a/src/test/regress/sql/strings.sql
+++ b/src/test/regress/sql/strings.sql
@@ -27,6 +27,7 @@ SELECT 'tricky' AS U&"\" UESCAPE '!';
SELECT U&'wrong: \061';
SELECT U&'wrong: \+0061';
+SELECT U&'wrong: +0061' UESCAPE +;
SELECT U&'wrong: +0061' UESCAPE '+';
SET standard_conforming_strings TO off;