1 files changed, 665 insertions, 0 deletions
diff --git a/src/backend/utils/adt/json.c b/src/backend/utils/adt/json.c
new file mode 100644
index 00000000000..cbb81d1bf37
--- /dev/null
+++ b/src/backend/utils/adt/json.c
@@ -0,0 +1,665 @@
+/*-------------------------------------------------------------------------
+ *
+ * json.c
+ *		JSON data type support.
+ *
+ * Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * IDENTIFICATION
+ *	  src/backend/utils/adt/json.c
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include "lib/stringinfo.h"
+#include "libpq/pqformat.h"
+#include "mb/pg_wchar.h"
+#include "utils/builtins.h"
+#include "utils/json.h"
+
+typedef enum					/* token classification stored by json_lex() */
+{
+	JSON_VALUE_INVALID,			/* not a value: punctuation or end of input */
+	JSON_VALUE_STRING,			/* token beginning with a double quote */
+	JSON_VALUE_NUMBER,			/* token beginning with '-' or a digit */
+	JSON_VALUE_OBJECT,			/* never assigned by the lexer in this file */
+	JSON_VALUE_ARRAY,			/* never assigned by the lexer in this file */
+	JSON_VALUE_TRUE,			/* the word "true" */
+	JSON_VALUE_FALSE,			/* the word "false" */
+	JSON_VALUE_NULL				/* the word "null" */
+} JsonValueType;
+
+typedef struct					/* lexer state shared by the json_lex* routines */
+{
+	char	   *input;				/* start of the NUL-terminated input */
+	char	   *token_start;		/* first byte of current token; NULL at end of input */
+	char	   *token_terminator;	/* byte just past current token; next scan starts here */
+	JsonValueType	token_type;		/* classification of the current token */
+	int			line_number;		/* starts at 1; bumped per '\n' skipped as whitespace */
+	char	   *line_start;			/* set to input at setup; not updated in this file */
+} JsonLexContext;
+
+typedef enum					/* state held by each parse-stack entry */
+{
+	JSON_PARSE_VALUE,			/* expecting a value */
+	JSON_PARSE_ARRAY_START,		/* saw '[', expecting value or ']' */
+	JSON_PARSE_ARRAY_NEXT,		/* saw array element, expecting ',' or ']' */
+	JSON_PARSE_OBJECT_START,	/* saw '{', expecting label or '}' */
+	JSON_PARSE_OBJECT_LABEL,	/* saw object label, expecting ':' */
+	JSON_PARSE_OBJECT_NEXT,		/* saw object value, expecting ',' or '}' */
+	JSON_PARSE_OBJECT_COMMA		/* saw object ',', expecting next label */
+} JsonParseState;
+
+typedef struct JsonParseStack	/* element of json_validate_cstring()'s stack */
+{
+	JsonParseState	state;		/* what this entry expects to see next */
+} JsonParseStack;
+
+typedef enum					/* stack action chosen for each token */
+{
+	JSON_STACKOP_NONE,					/* leave the stack alone */
+	JSON_STACKOP_PUSH,					/* push a new entry expecting a value */
+	JSON_STACKOP_PUSH_WITH_PUSHBACK,	/* push, then reprocess the same token */
+	JSON_STACKOP_POP					/* pop; at the bottom, expect end of input */
+} JsonStackOp;
+
+static void json_validate_cstring(char *input);	/* parse loop */
+static void json_lex(JsonLexContext *lex);		/* fetch next token */
+static void json_lex_string(JsonLexContext *lex);	/* string token */
+static void json_lex_number(JsonLexContext *lex, char *s);	/* number token */
+static void report_parse_error(JsonParseStack *stack, JsonLexContext *lex);	/* unexpected token */
+static void report_invalid_token(JsonLexContext *lex);	/* malformed token */
+static char *extract_mb_char(char *s);			/* copy one character */
+
+extern Datum json_in(PG_FUNCTION_ARGS);			/* defined below */
+
+/*
+ * json_in: validate a C string as JSON, then return it as a text datum.
+ */
+Datum
+json_in(PG_FUNCTION_ARGS)
+{
+	char	   *json = PG_GETARG_CSTRING(0);
+
+	/* Reject anything that is not well-formed JSON (raises an error). */
+	json_validate_cstring(json);
+	PG_RETURN_TEXT_P(cstring_to_text(json));
+}
+
+/*
+ * json_out: convert the stored text datum back into a C string.
+ */
+Datum
+json_out(PG_FUNCTION_ARGS)
+{
+	char	   *cstr = TextDatumGetCString(PG_GETARG_DATUM(0));
+
+	PG_RETURN_CSTRING(cstr);
+}
+
+/*
+ * json_send: emit the JSON text in binary-protocol form via pq_sendtext().
+ */
+Datum
+json_send(PG_FUNCTION_ARGS)
+{
+	text	   *json = PG_GETARG_TEXT_PP(0);
+	StringInfoData out;
+
+	pq_begintypsend(&out);
+	pq_sendtext(&out, VARDATA_ANY(json), VARSIZE_ANY_EXHDR(json));
+	PG_RETURN_BYTEA_P(pq_endtypsend(&out));
+}
+
+/*
+ * json_recv: read the remaining message bytes as text, validate, return text.
+ */
+Datum
+json_recv(PG_FUNCTION_ARGS)
+{
+	StringInfo	msg = (StringInfo) PG_GETARG_POINTER(0);
+	int			len;
+	char	   *raw;
+	char	   *payload;
+	text	   *result;
+
+	/* Consume everything from the cursor to the end of the message. */
+	raw = pq_getmsgtext(msg, msg->len - msg->cursor, &len);
+
+	/*
+	 * Allocate the varlena with one spare byte beyond its declared size, so
+	 * the payload can be NUL-terminated and validated in place without
+	 * making a second copy.
+	 */
+	result = palloc(VARHDRSZ + len + 1);
+	SET_VARSIZE(result, VARHDRSZ + len);
+	payload = VARDATA(result);
+	memcpy(payload, raw, len);
+	payload[len] = '\0';
+
+	json_validate_cstring(payload);
+	PG_RETURN_TEXT_P(result);
+}
+
+/*
+ * Validate a NUL-terminated string as JSON; an error is raised if it is not.
+ */
+static void
+json_validate_cstring(char *input)
+{
+	JsonLexContext	lex;
+	JsonParseStack *stack,
+				   *stacktop;
+	int				stacksize;
+
+	/* Set up lexing context; json_lex() resumes at token_terminator. */
+	lex.input = input;
+	lex.token_terminator = lex.input;
+	lex.line_number = 1;
+	lex.line_start = input;
+
+	/* Set up parse stack: "stacktop" is its base, "stack" the current entry. */
+	stacksize = 32;
+	stacktop = palloc(sizeof(JsonParseStack) * stacksize);
+	stack = stacktop;
+	stack->state = JSON_PARSE_VALUE;
+
+	/* Main parsing loop: one token per pass; left only by return or error. */
+	for (;;)
+	{
+		JsonStackOp	op;
+
+		/* Fetch next token. */
+		json_lex(&lex);
+
+		/* Check for unexpected end of input (json_lex left token_start NULL). */
+		if (lex.token_start == NULL)
+			report_parse_error(stack, &lex);
+
+redo:
+		/* Figure out what to do with this token (or with a pushed-back one). */
+		op = JSON_STACKOP_NONE;
+		switch (stack->state)
+		{
+			case JSON_PARSE_VALUE:
+				if (lex.token_type != JSON_VALUE_INVALID)
+					op = JSON_STACKOP_POP;	/* scalar value is complete */
+				else if (lex.token_start[0] == '[')
+					stack->state = JSON_PARSE_ARRAY_START;
+				else if (lex.token_start[0] == '{')
+					stack->state = JSON_PARSE_OBJECT_START;
+				else
+					report_parse_error(stack, &lex);
+				break;
+			case JSON_PARSE_ARRAY_START:
+				if (lex.token_type != JSON_VALUE_INVALID)
+					stack->state = JSON_PARSE_ARRAY_NEXT;
+				else if (lex.token_start[0] == ']')
+					op = JSON_STACKOP_POP;
+				else if (lex.token_start[0] == '['
+					|| lex.token_start[0] == '{')
+				{
+					stack->state = JSON_PARSE_ARRAY_NEXT;
+					op = JSON_STACKOP_PUSH_WITH_PUSHBACK;	/* new entry re-reads '[' or '{' */
+				}
+				else
+					report_parse_error(stack, &lex);
+				break;
+			case JSON_PARSE_ARRAY_NEXT:
+				if (lex.token_type != JSON_VALUE_INVALID)
+					report_parse_error(stack, &lex);
+				else if (lex.token_start[0] == ']')
+					op = JSON_STACKOP_POP;
+				else if (lex.token_start[0] == ',')
+					op = JSON_STACKOP_PUSH;	/* a value must follow the comma */
+				else
+					report_parse_error(stack, &lex);
+				break;
+			case JSON_PARSE_OBJECT_START:
+				if (lex.token_type == JSON_VALUE_STRING)
+					stack->state = JSON_PARSE_OBJECT_LABEL;
+				else if (lex.token_type == JSON_VALUE_INVALID
+					&& lex.token_start[0] == '}')
+					op = JSON_STACKOP_POP;
+				else
+					report_parse_error(stack, &lex);
+				break;
+			case JSON_PARSE_OBJECT_LABEL:
+				if (lex.token_type == JSON_VALUE_INVALID
+					&& lex.token_start[0] == ':')
+				{
+					stack->state = JSON_PARSE_OBJECT_NEXT;
+					op = JSON_STACKOP_PUSH;	/* a value must follow the colon */
+				}
+				else
+					report_parse_error(stack, &lex);
+				break;
+			case JSON_PARSE_OBJECT_NEXT:
+				if (lex.token_type != JSON_VALUE_INVALID)
+					report_parse_error(stack, &lex);
+				else if (lex.token_start[0] == '}')
+					op = JSON_STACKOP_POP;
+				else if (lex.token_start[0] == ',')
+					stack->state = JSON_PARSE_OBJECT_COMMA;
+				else
+					report_parse_error(stack, &lex);
+				break;
+			case JSON_PARSE_OBJECT_COMMA:
+				if (lex.token_type == JSON_VALUE_STRING)
+					stack->state = JSON_PARSE_OBJECT_LABEL;
+				else
+					report_parse_error(stack, &lex);
+				break;
+			default:
+				elog(ERROR, "unexpected json parse state: %d",
+						(int) stack->state);
+		}
+
+		/* Push or pop the stack, if needed. */
+		switch (op)
+		{
+			case JSON_STACKOP_PUSH:
+			case JSON_STACKOP_PUSH_WITH_PUSHBACK:
+				++stack;
+				if (stack >= &stacktop[stacksize])
+				{
+					int		stackoffset = stack - stacktop;	/* keep position across repalloc */
+					stacksize = stacksize + 32;	/* grow in fixed steps of 32 entries */
+					stacktop = repalloc(stacktop,
+										sizeof(JsonParseStack) * stacksize);
+					stack = stacktop + stackoffset;
+				}
+				stack->state = JSON_PARSE_VALUE;
+				if (op == JSON_STACKOP_PUSH_WITH_PUSHBACK)
+					goto redo;	/* same token, now seen by the new entry */
+				break;
+			case JSON_STACKOP_POP:
+				if (stack == stacktop)
+				{
+					/* Top-level value is complete: expect end of input. */
+					json_lex(&lex);
+					if (lex.token_start != NULL)
+						report_parse_error(NULL, &lex);	/* NULL: "Expected end of input" */
+					return;
+				}
+				--stack;	/* resume the enclosing entry */
+				break;
+			case JSON_STACKOP_NONE:
+				/* nothing to do */
+				break;
+		}
+	}
+}
+
+/*
+ * json_lex: advance the lexer to the next token.
+ *
+ * Scanning resumes at lex->token_terminator.  On return, token_start and
+ * token_terminator bracket the token and token_type classifies it.
+ * Punctuation ("{}[],:") is reported with type JSON_VALUE_INVALID, as is
+ * end of input, for which both pointers are set to NULL.  A malformed
+ * token is reported as an error rather than returned.  line_number is
+ * bumped for each newline skipped as inter-token whitespace.
+ */
+static void
+json_lex(JsonLexContext *lex)
+{
+	char	   *cur = lex->token_terminator;
+	char	   *end;
+
+	/* Step over whitespace, counting newlines as we go. */
+	for (;;)
+	{
+		if (*cur == '\n')
+			++lex->line_number;
+		else if (*cur != ' ' && *cur != '\t' && *cur != '\r')
+			break;
+		++cur;
+	}
+	lex->token_start = cur;
+
+	/* End of input: there is no token to return. */
+	if (*cur == '\0')
+	{
+		lex->token_start = NULL;
+		lex->token_terminator = NULL;
+		lex->token_type = JSON_VALUE_INVALID;
+		return;
+	}
+
+	/* Structural punctuation is always exactly one character long. */
+	if (strchr("{}[],:", *cur) != NULL)
+	{
+		lex->token_terminator = cur + 1;
+		lex->token_type = JSON_VALUE_INVALID;
+		return;
+	}
+
+	/* A double quote opens a string. */
+	if (*cur == '"')
+	{
+		json_lex_string(lex);
+		lex->token_type = JSON_VALUE_STRING;
+		return;
+	}
+
+	/* A minus sign or a digit opens a number. */
+	if (*cur == '-' || (*cur >= '0' && *cur <= '9'))
+	{
+		/* The number lexer expects to start just past any minus sign. */
+		if (*cur == '-')
+			json_lex_number(lex, cur + 1);
+		else
+			json_lex_number(lex, cur);
+		lex->token_type = JSON_VALUE_NUMBER;
+		return;
+	}
+
+	/*
+	 * Anything else can only legally be true, false, or null.  For the
+	 * sake of error reporting, scan the whole word-like run (letters,
+	 * digits, underscores, high-bit bytes) so that a complaint names the
+	 * entire word rather than some unintuitive prefix of it.
+	 */
+	end = cur;
+	while ((*end >= 'a' && *end <= 'z') || (*end >= 'A' && *end <= 'Z') ||
+		   (*end >= '0' && *end <= '9') || *end == '_' ||
+		   IS_HIGHBIT_SET(*end))
+		++end;
+
+	/*
+	 * An empty run means we hit unexpected punctuation or some other stray
+	 * character; complain about just that one character.
+	 */
+	if (end == cur)
+	{
+		lex->token_terminator = cur + 1;
+		report_invalid_token(lex);
+	}
+
+	/*
+	 * We have a real word.  It is acceptable only if it is exactly one of
+	 * the three JSON literals; anything else is an invalid token.
+	 */
+	lex->token_terminator = end;
+	if (end - cur == 4 && memcmp(cur, "true", 4) == 0)
+		lex->token_type = JSON_VALUE_TRUE;
+	else if (end - cur == 4 && memcmp(cur, "null", 4) == 0)
+		lex->token_type = JSON_VALUE_NULL;
+	else if (end - cur == 5 && memcmp(cur, "false", 5) == 0)
+		lex->token_type = JSON_VALUE_FALSE;
+	else
+		report_invalid_token(lex);
+}
+
+/*
+ * Lex a string token starting at lex->token_start (the opening quote) and
+ * set token_terminator just past the closing quote.  Unterminated strings,
+ * unescaped control bytes (< 32) and bad escapes are reported as errors.
+ */
+static void
+json_lex_string(JsonLexContext *lex)
+{
+	char	   *s;
+
+	for (s = lex->token_start + 1; *s != '"'; ++s)
+	{
+		/*
+		 * Per RFC4627, control characters MUST be escaped.  Compare as
+		 * unsigned char: where plain char is signed, multibyte-character
+		 * bytes are negative and must not be mistaken for control codes.
+		 */
+		if ((unsigned char) *s < 32)
+		{
+			/* A NUL byte marks the (premature) end of the string. */
+			if (*s == '\0')
+			{
+				lex->token_terminator = s;
+				report_invalid_token(lex);
+			}
+			ereport(ERROR,
+					(errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
+					 errmsg("invalid input syntax for type json"),
+					 errdetail_internal("line %d: Character \"%c\" must be escaped.",
+						lex->line_number, *s)));
+		}
+		else if (*s == '\\')
+		{
+			/* OK, we have an escape character. */
+			++s;
+			if (*s == '\0')
+			{
+				lex->token_terminator = s;
+				report_invalid_token(lex);
+			}
+			else if (*s == 'u')
+			{
+				int		i;
+
+				/* Four hex digits must follow; we only check their form. */
+				for (i = 1; i <= 4; ++i)
+				{
+					if (s[i] == '\0')
+					{
+						lex->token_terminator = s + i;
+						report_invalid_token(lex);
+					}
+					else if (!((s[i] >= '0' && s[i] <= '9') ||
+							   (s[i] >= 'a' && s[i] <= 'f') ||
+							   (s[i] >= 'A' && s[i] <= 'F')))
+						ereport(ERROR,
+								(errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
+								 errmsg("invalid input syntax for type json"),
+								 errdetail_internal("line %d: \"\\u\" must be followed by four hexadecimal digits.",
+									lex->line_number)));
+				}
+
+				/* Account for the four additional bytes we just parsed. */
+				s += 4;
+			}
+			else if (!strchr("\"\\/bfnrt", *s))
+			{
+				/* Error out. */
+				ereport(ERROR,
+						(errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
+						 errmsg("invalid input syntax for type json"),
+						 errdetail_internal("line %d: Invalid escape \"\\%s\".",
+							lex->line_number, extract_mb_char(s))));
+			}
+		}
+	}
+
+	/* Hooray, we found the end of the string! */
+	lex->token_terminator = s + 1;
+}
+
+/*-------------------------------------------------------------------------
+ * The next token in the input stream is known to be a number; lex it.
+ *
+ * In JSON, a number consists of four parts:
+ *
+ * (1) An optional minus sign ('-').
+ *
+ * (2) Either a single '0', or a string of one or more digits that does not
+ *     begin with a '0'.
+ *
+ * (3) An optional decimal part, consisting of a period ('.') followed by
+ *     one or more digits.  (Note: While this part can be omitted
+ *     completely, it's not OK to have only the decimal point without
+ *     any digits afterwards.)
+ *
+ * (4) An optional exponent part, consisting of 'e' or 'E', optionally
+ *     followed by '+' or '-', followed by one or more digits.  (Note:
+ *     As with the decimal part, if 'e' or 'E' is present, it must be
+ *     followed by at least one digit.)
+ *
+ * The 's' argument to this function points to the ostensible beginning
+ * of part 2 - i.e. the character after any optional minus sign, and the
+ * first character of the string if there is none.
+ *
+ * On success, lex->token_terminator is set just past the number.  A
+ * malformed number, or one directly followed by letters, digits,
+ * underscores or high-bit bytes, is reported as an invalid token.
+ *
+ *-------------------------------------------------------------------------
+ */
+static void
+json_lex_number(JsonLexContext *lex, char *s)
+{
+	bool	error = false;
+	char   *p;
+
+	/* Part (1): leading sign indicator; the caller already consumed it. */
+
+	/* Part (2): parse main digit string. */
+	if (*s == '0')
+		++s;
+	else if (*s >= '1' && *s <= '9')
+	{
+		do
+		{
+			++s;
+		} while (*s >= '0' && *s <= '9');
+	}
+	else
+		error = true;
+
+	/* Part (3): parse optional decimal portion. */
+	if (*s == '.')
+	{
+		++s;
+
+		/*
+		 * At least one digit is required.  Consume only bytes that really
+		 * are digits, so input like "1." cannot run us past its NUL.
+		 */
+		if (*s < '0' || *s > '9')
+			error = true;
+		while (*s >= '0' && *s <= '9')
+			++s;
+	}
+
+	/* Part (4): parse optional exponent. */
+	if (*s == 'e' || *s == 'E')
+	{
+		++s;
+		if (*s == '+' || *s == '-')
+			++s;
+
+		/* Same rule as above: the exponent needs at least one digit. */
+		if (*s < '0' || *s > '9')
+			error = true;
+		while (*s >= '0' && *s <= '9')
+			++s;
+	}
+
+	/* Check for trailing garbage. */
+	for (p = s; (*p >= 'a' && *p <= 'z') || (*p >= 'A' && *p <= 'Z')
+		|| (*p >= '0' && *p <= '9') || *p == '_' || IS_HIGHBIT_SET(*p); ++p)
+		;
+	lex->token_terminator = p;
+	if (p > s || error)
+		report_invalid_token(lex);
+}
+
+/*
+ * report_parse_error: raise an error for a token the parser cannot accept.
+ *
+ * "stack" is the parse-stack entry whose state says what was expected at
+ * this point; NULL means that only end of input was acceptable.  "lex"
+ * supplies the offending token, the line number, and the full input text
+ * for the message.  A NULL lex->token_start means the input ended early.
+ *
+ * Every path through this function ends in ereport(ERROR).
+ */
+static void
+report_parse_error(JsonParseStack *stack, JsonLexContext *lex)
+{
+	const char *fmt = NULL;
+	char	   *found;
+	int			foundlen;
+
+	/* Running out of input gets a message of its own. */
+	if (lex->token_start == NULL)
+		ereport(ERROR,
+				(errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
+				 errmsg("invalid input syntax for type json: \"%s\"",
+						lex->input),
+				 errdetail_internal("The input string ended unexpectedly.")));
+
+	/*
+	 * Make a NUL-terminated copy of the offending token, which spans
+	 * token_start up to (not including) token_terminator.
+	 */
+	foundlen = lex->token_terminator - lex->token_start;
+	found = palloc(foundlen + 1);
+	memcpy(found, lex->token_start, foundlen);
+	found[foundlen] = '\0';
+
+	/*
+	 * Pick the detail message that matches what the parser expected.  Each
+	 * format takes the line number and then the offending token.
+	 */
+	if (stack == NULL)
+		fmt = "line %d: Expected end of input, but found \"%s\".";
+	else if (stack->state == JSON_PARSE_VALUE)
+		fmt = "line %d: Expected string, number, object, array, true, false, or null, but found \"%s\".";
+	else if (stack->state == JSON_PARSE_ARRAY_START)
+		fmt = "line %d: Expected array element or \"]\", but found \"%s\".";
+	else if (stack->state == JSON_PARSE_ARRAY_NEXT)
+		fmt = "line %d: Expected \",\" or \"]\", but found \"%s\".";
+	else if (stack->state == JSON_PARSE_OBJECT_START)
+		fmt = "line %d: Expected string or \"}\", but found \"%s\".";
+	else if (stack->state == JSON_PARSE_OBJECT_LABEL)
+		fmt = "line %d: Expected \":\", but found \"%s\".";
+	else if (stack->state == JSON_PARSE_OBJECT_NEXT)
+		fmt = "line %d: Expected \",\" or \"}\", but found \"%s\".";
+	else if (stack->state == JSON_PARSE_OBJECT_COMMA)
+		fmt = "line %d: Expected string, but found \"%s\".";
+
+	/* Report it, quoting the whole input in the primary message. */
+	ereport(ERROR,
+			(errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
+			 errmsg("invalid input syntax for type json: \"%s\"",
+					lex->input),
+			 errdetail_internal(fmt, lex->line_number, found)));
+}
+
+/*
+ * report_invalid_token: raise an error naming the malformed token that
+ * spans lex->token_start up to (not including) lex->token_terminator.
+ */
+static void
+report_invalid_token(JsonLexContext *lex)
+{
+	int			len = lex->token_terminator - lex->token_start;
+	char	   *bad = palloc(len + 1);
+
+	/* Build a NUL-terminated copy, since the token sits inside the input. */
+	memcpy(bad, lex->token_start, len);
+	bad[len] = '\0';
+
+	ereport(ERROR,
+			(errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
+			 errmsg("invalid input syntax for type json"),
+			 errdetail_internal("line %d: Token \"%s\" is invalid.",
+								lex->line_number, bad)));
+}
+
+/*
+ * extract_mb_char: return a palloc'd, NUL-terminated copy of the single
+ * (possibly multi-byte) character at s, sized by pg_mblen().
+ */
+static char *
+extract_mb_char(char *s)
+{
+	int			nbytes = pg_mblen(s);
+	char	   *copy = palloc(nbytes + 1);
+
+	/* Copy just the bytes of the first character, then terminate. */
+	memcpy(copy, s, nbytes);
+	copy[nbytes] = '\0';
+
+	return copy;
+}