aboutsummaryrefslogtreecommitdiff
path: root/src
diff options
context:
space:
mode:
authorTom Lane <tgl@sss.pgh.pa.us>2017-11-18 12:42:52 -0500
committerTom Lane <tgl@sss.pgh.pa.us>2017-11-18 12:42:52 -0500
commit976a1a48fc35cde3c750982be64f872c4de4d343 (patch)
treede499afa5cf118d8e5615b23bd4bf22be286c8bf /src
parent63ca86318dc3d6a768eed78efbc6ca014a0622a8 (diff)
downloadpostgresql-976a1a48fc35cde3c750982be64f872c4de4d343.tar.gz
postgresql-976a1a48fc35cde3c750982be64f872c4de4d343.zip
Improve to_date/to_number/to_timestamp behavior with multibyte characters.
The documentation says that these functions skip one input character per literal (non-pattern) format character. Actually, though, they skipped one input *byte* per literal *byte*, which could be hugely confusing if either data or format contained multibyte characters. To fix, adjust the FormatNode representation and parse_format() so that multibyte format characters are stored as one FormatNode not several, and adjust the data-skipping bits to advance by pg_mblen() not necessarily one byte. There's no user-visible behavior change on the to_char() side, although the internal representation changes. Commit e87d4965b had already fixed most places where we skip characters on the basis of non-literal format patterns to advance by characters not bytes, but this gets one more place, the SKIP_THth macro. I think everything in formatting.c gets that right now. It'd be nice to have some regression test cases covering this behavior; but of course there's no way to do so in an encoding-agnostic way, and many of the interesting aspects would also require unportable locale selections. So I've not bothered here. Discussion: https://postgr.es/m/28186.1510957703@sss.pgh.pa.us
Diffstat (limited to 'src')
-rw-r--r--src/backend/utils/adt/formatting.c68
1 files changed, 41 insertions, 27 deletions
diff --git a/src/backend/utils/adt/formatting.c b/src/backend/utils/adt/formatting.c
index cb0dbf748e5..ec97de0ad27 100644
--- a/src/backend/utils/adt/formatting.c
+++ b/src/backend/utils/adt/formatting.c
@@ -151,8 +151,6 @@ typedef enum
FROM_CHAR_DATE_ISOWEEK /* ISO 8601 week date */
} FromCharDateMode;
-typedef struct FormatNode FormatNode;
-
typedef struct
{
const char *name;
@@ -162,13 +160,13 @@ typedef struct
FromCharDateMode date_mode;
} KeyWord;
-struct FormatNode
+typedef struct
{
- int type; /* node type */
- const KeyWord *key; /* if node type is KEYWORD */
- char character; /* if node type is CHAR */
- int suffix; /* keyword suffix */
-};
+ int type; /* NODE_TYPE_XXX, see below */
+ const KeyWord *key; /* if type is ACTION */
+ char character[MAX_MULTIBYTE_CHAR_LEN + 1]; /* if type is CHAR */
+ int suffix; /* keyword prefix/suffix code, if any */
+} FormatNode;
#define NODE_TYPE_END 1
#define NODE_TYPE_ACTION 2
@@ -1282,12 +1280,15 @@ parse_format(FormatNode *node, const char *str, const KeyWord *kw,
}
else if (*str)
{
+ int chlen;
+
/*
* Process double-quoted literal string, if any
*/
if (*str == '"')
{
- while (*(++str))
+ str++;
+ while (*str)
{
if (*str == '"')
{
@@ -1297,11 +1298,14 @@ parse_format(FormatNode *node, const char *str, const KeyWord *kw,
/* backslash quotes the next character, if any */
if (*str == '\\' && *(str + 1))
str++;
+ chlen = pg_mblen(str);
n->type = NODE_TYPE_CHAR;
- n->character = *str;
+ memcpy(n->character, str, chlen);
+ n->character[chlen] = '\0';
n->key = NULL;
n->suffix = 0;
n++;
+ str += chlen;
}
}
else
@@ -1312,12 +1316,14 @@ parse_format(FormatNode *node, const char *str, const KeyWord *kw,
*/
if (*str == '\\' && *(str + 1) == '"')
str++;
+ chlen = pg_mblen(str);
n->type = NODE_TYPE_CHAR;
- n->character = *str;
+ memcpy(n->character, str, chlen);
+ n->character[chlen] = '\0';
n->key = NULL;
n->suffix = 0;
n++;
- str++;
+ str += chlen;
}
}
}
@@ -1349,7 +1355,8 @@ dump_node(FormatNode *node, int max)
elog(DEBUG_elog_output, "%d:\t NODE_TYPE_ACTION '%s'\t(%s,%s)",
a, n->key->name, DUMP_THth(n->suffix), DUMP_FM(n->suffix));
else if (n->type == NODE_TYPE_CHAR)
- elog(DEBUG_elog_output, "%d:\t NODE_TYPE_CHAR '%c'", a, n->character);
+ elog(DEBUG_elog_output, "%d:\t NODE_TYPE_CHAR '%s'",
+ a, n->character);
else if (n->type == NODE_TYPE_END)
{
elog(DEBUG_elog_output, "%d:\t NODE_TYPE_END", a);
@@ -2008,8 +2015,8 @@ asc_toupper_z(const char *buff)
do { \
if (S_THth(_suf)) \
{ \
- if (*(ptr)) (ptr)++; \
- if (*(ptr)) (ptr)++; \
+ if (*(ptr)) (ptr) += pg_mblen(ptr); \
+ if (*(ptr)) (ptr) += pg_mblen(ptr); \
} \
} while (0)
@@ -2076,7 +2083,8 @@ is_next_separator(FormatNode *n)
return true;
}
- else if (isdigit((unsigned char) n->character))
+ else if (n->character[1] == '\0' &&
+ isdigit((unsigned char) n->character[0]))
return false;
return true; /* some non-digit input (separator) */
@@ -2405,8 +2413,8 @@ DCH_to_char(FormatNode *node, bool is_interval, TmToChar *in, char *out, Oid col
{
if (n->type != NODE_TYPE_ACTION)
{
- *s = n->character;
- s++;
+ strcpy(s, n->character);
+ s += strlen(s);
continue;
}
@@ -2974,7 +2982,7 @@ DCH_from_char(FormatNode *node, char *in, TmFromChar *out)
* we don't insist that the consumed character match the format's
* character.
*/
- s++;
+ s += pg_mblen(s);
continue;
}
@@ -4217,7 +4225,7 @@ get_last_relevant_decnum(char *num)
/*
* These macros are used in NUM_processor() and its subsidiary routines.
* OVERLOAD_TEST: true if we've reached end of input string
- * AMOUNT_TEST(s): true if at least s characters remain in string
+ * AMOUNT_TEST(s): true if at least s bytes remain in string
*/
#define OVERLOAD_TEST (Np->inout_p >= Np->inout + input_len)
#define AMOUNT_TEST(s) (Np->inout_p <= Np->inout + (input_len - (s)))
@@ -4821,9 +4829,9 @@ NUM_processor(FormatNode *node, NUMDesc *Num, char *inout,
if (!Np->is_to_char)
{
/*
- * Check at least one character remains to be scanned. (In
- * actions below, must use AMOUNT_TEST if we want to read more
- * characters than that.)
+ * Check at least one byte remains to be scanned. (In actions
+ * below, must use AMOUNT_TEST if we want to read more bytes than
+ * that.)
*/
if (OVERLOAD_TEST)
break;
@@ -5081,12 +5089,18 @@ NUM_processor(FormatNode *node, NUMDesc *Num, char *inout,
* In TO_CHAR, non-pattern characters in the format are copied to
* the output. In TO_NUMBER, we skip one input character for each
* non-pattern format character, whether or not it matches the
- * format character. (Currently, that's actually implemented as
- * skipping one input byte per non-pattern format byte, which is
- * wrong...)
+ * format character.
*/
if (Np->is_to_char)
- *Np->inout_p = n->character;
+ {
+ strcpy(Np->inout_p, n->character);
+ Np->inout_p += strlen(Np->inout_p);
+ }
+ else
+ {
+ Np->inout_p += pg_mblen(Np->inout_p);
+ }
+ continue;
}
Np->inout_p++;
}