Improve behavior of to_timestamp()/to_date() functions

to_timestamp()/to_date() functions were introduced mainly for Oracle compatibility, and became very popular among PostgreSQL users. However, some behavior of to_timestamp()/to_date() functions are both incompatible with Oracle and confusing for our users. This behavior is related to handling of spaces and separators in non FX (fixed format) mode. This commit reworks this behavior making less confusing, better documented and more compatible with Oracle. Nevertheless, there are still following incompatibilities with Oracle. 1) We don't insist that there are no format string patterns unmatched to input string. 2) In FX mode we don't insist space and separators in format string to exactly match input string. 3) When format string patterns are divided by mix of spaces and separators, we don't distinguish them, while Oracle takes into account only last group of spaces/separators. Discussion: https://postgr.es/m/1873520224.1784572.1465833145330.JavaMail.yahoo%40mail.yahoo.com Author: Artur Zakirov, Alexander Korotkov, Liudmila Mantrova Review: Amul Sul, Robert Haas, Tom Lane, Dmitry Dolgov, David G. Johnston
author: Alexander Korotkov <akorotkov@postgresql.org> 2018-09-09 21:19:51 +0300
committer: Alexander Korotkov <akorotkov@postgresql.org> 2018-09-09 21:19:51 +0300
commit: cf984672427ebc4446213fe8f71d8899452802b6 (patch)
tree: 034f104000ecda901d2ab7a9f459d250d1a7406f /src/backend/utils/adt/formatting.c
parent: 5f08accdad2b03e0107bdc73d48783a01fe51c8c (diff)
download: postgresql-cf984672427ebc4446213fe8f71d8899452802b6.tar.gz
postgresql-cf984672427ebc4446213fe8f71d8899452802b6.zip
1 files changed, 98 insertions, 14 deletions
diff --git a/src/backend/utils/adt/formatting.c b/src/backend/utils/adt/formatting.c
index 30696e3575d..2ed8ca675bd 100644
--- a/src/backend/utils/adt/formatting.c
+++ b/src/backend/utils/adt/formatting.c
@@ -165,6 +165,8 @@ typedef struct
 #define NODE_TYPE_END		1
 #define NODE_TYPE_ACTION	2
 #define NODE_TYPE_CHAR		3
+#define NODE_TYPE_SEPARATOR	4
+#define NODE_TYPE_SPACE		5
 
 #define SUFFTYPE_PREFIX		1
 #define SUFFTYPE_POSTFIX	2
@@ -955,6 +957,7 @@ typedef struct NUMProc
 static const KeyWord *index_seq_search(const char *str, const KeyWord *kw,
 				 const int *index);
 static const KeySuffix *suff_search(const char *str, const KeySuffix *suf, int type);
+static bool is_separator_char(const char *str);
 static void NUMDesc_prepare(NUMDesc *num, FormatNode *n);
 static void parse_format(FormatNode *node, const char *str, const KeyWord *kw,
 			 const KeySuffix *suf, const int *index, int ver, NUMDesc *Num);
@@ -1044,6 +1047,16 @@ suff_search(const char *str, const KeySuffix *suf, int type)
 	return NULL;
 }
 
+static bool
+is_separator_char(const char *str)
+{
+	/* ASCII printable character, but not letter or digit */
+	return (*str > 0x20 && *str < 0x7F &&
+			!(*str >= 'A' && *str <= 'Z') &&
+			!(*str >= 'a' && *str <= 'z') &&
+			!(*str >= '0' && *str <= '9'));
+}
+
 /* ----------
  * Prepare NUMDesc (number description struct) via FormatNode struct
  * ----------
@@ -1319,7 +1332,14 @@ parse_format(FormatNode *node, const char *str, const KeyWord *kw,
 				if (*str == '\\' && *(str + 1) == '"')
 					str++;
 				chlen = pg_mblen(str);
-				n->type = NODE_TYPE_CHAR;
+
+				if (ver == DCH_TYPE && is_separator_char(str))
+					n->type = NODE_TYPE_SEPARATOR;
+				else if (isspace((unsigned char) *str))
+					n->type = NODE_TYPE_SPACE;
+				else
+					n->type = NODE_TYPE_CHAR;
+
 				memcpy(n->character, str, chlen);
 				n->character[chlen] = '\0';
 				n->key = NULL;
@@ -2987,27 +3007,66 @@ DCH_from_char(FormatNode *node, char *in, TmFromChar *out)
 	int			len,
 				value;
 	bool		fx_mode = false;
+	/* number of extra skipped characters (more than given in format string) */
+	int			extra_skip = 0;
 
 	for (n = node, s = in; n->type != NODE_TYPE_END && *s != '\0'; n++)
 	{
-		if (n->type != NODE_TYPE_ACTION)
+		/*
+		 * Ignore spaces at the beginning of the string and before fields when
+		 * not in FX (fixed width) mode.
+		 */
+		if (!fx_mode && (n->type != NODE_TYPE_ACTION || n->key->id != DCH_FX) &&
+			(n->type == NODE_TYPE_ACTION || n == node))
+		{
+			while (*s != '\0' && isspace((unsigned char) *s))
+			{
+				s++;
+				extra_skip++;
+			}
+		}
+
+		if (n->type == NODE_TYPE_SPACE || n->type == NODE_TYPE_SEPARATOR)
+		{
+			if (!fx_mode)
+			{
+				/*
+				 * In non FX (fixed format) mode one format string space or
+				 * separator match to one space or separator in input string.
+				 * Or match nothing if there is no space or separator in
+				 * the current position of input string.
+				 */
+				extra_skip--;
+				if (isspace((unsigned char) *s) || is_separator_char(s))
+				{
+					s++;
+					extra_skip++;
+				}
+			}
+			else
+			{
+				/*
+				 * In FX mode, on format string space or separator we consume
+				 * exactly one character from input string.  Notice we don't
+				 * insist that the consumed character match the format's
+				 * character.
+				 */
+				s += pg_mblen(s);
+			}
+			continue;
+		}
+		else if (n->type != NODE_TYPE_ACTION)
 		{
 			/*
-			 * Separator, so consume one character from input string.  Notice
-			 * we don't insist that the consumed character match the format's
-			 * character.
+			 * Text character, so consume one character from input string.
+			 * Notice we don't insist that the consumed character match the
+			 * format's character.
+			 * Text field ignores FX mode.
 			 */
 			s += pg_mblen(s);
 			continue;
 		}
 
-		/* Ignore spaces before fields when not in FX (fixed width) mode */
-		if (!fx_mode && n->key->id != DCH_FX)
-		{
-			while (*s != '\0' && isspace((unsigned char) *s))
-				s++;
-		}
-
 		from_char_set_mode(out, n->key->date_mode);
 
 		switch (n->key->id)
@@ -3086,10 +3145,24 @@ DCH_from_char(FormatNode *node, char *in, TmFromChar *out)
 								n->key->name)));
 				break;
 			case DCH_TZH:
-				out->tzsign = *s == '-' ? -1 : +1;
-
+				/*
+				 * Value of TZH might be negative.  And the issue is that we
+				 * might swallow minus sign as the separator.  So, if we have
+				 * skipped more characters than specified in the format string,
+				 * then we consider prepending last skipped minus to TZH.
+				 */
 				if (*s == '+' || *s == '-' || *s == ' ')
+				{
+					out->tzsign = *s == '-' ? -1 : +1;
 					s++;
+				}
+				else
+				{
+					if (extra_skip > 0 && *(s - 1) == '-')
+						out->tzsign = -1;
+					else
+						out->tzsign = +1;
+				}
 
 				from_char_parse_int_len(&out->tzh, &s, 2, n);
 				break;
@@ -3261,6 +3334,17 @@ DCH_from_char(FormatNode *node, char *in, TmFromChar *out)
 				SKIP_THth(s, n->suffix);
 				break;
 		}
+
+		/* Ignore all spaces after fields */
+		if (!fx_mode)
+		{
+			extra_skip = 0;
+			while (*s != '\0' && isspace((unsigned char) *s))
+			{
+				s++;
+				extra_skip++;
+			}
+		}
 	}
 }
author	Alexander Korotkov <akorotkov@postgresql.org>	2018-09-09 21:19:51 +0300
committer	Alexander Korotkov <akorotkov@postgresql.org>	2018-09-09 21:19:51 +0300
commit	cf984672427ebc4446213fe8f71d8899452802b6 (patch)
tree	034f104000ecda901d2ab7a9f459d250d1a7406f /src/backend/utils/adt/formatting.c
parent	5f08accdad2b03e0107bdc73d48783a01fe51c8c (diff)
download	postgresql-cf984672427ebc4446213fe8f71d8899452802b6.tar.gz postgresql-cf984672427ebc4446213fe8f71d8899452802b6.zip