Built-in JSON data type.

Like the XML data type, we simply store JSON data as text, after checking that it is valid. More complex operations such as canonicalization and comparison may come later, but this is enough for not. There are a few open issues here, such as whether we should attempt to detect UTF-8 surrogate pairs represented as \uXXXX\uYYYY, but this gets the basic framework in place.
author: Robert Haas <rhaas@postgresql.org> 2012-01-31 11:48:23 -0500
committer: Robert Haas <rhaas@postgresql.org> 2012-01-31 11:48:23 -0500
commit: 5384a73f98d9829725186a7b65bf4f8adb3cfaf1 (patch)
tree: 7d0906859b26e82bb5d54a5297c072b708f82dbd /src/backend/utils/adt/json.c
parent: 4c6cedd1b014abf2046886a9a92e10e18f0d658e (diff)
download: postgresql-5384a73f98d9829725186a7b65bf4f8adb3cfaf1.tar.gz
postgresql-5384a73f98d9829725186a7b65bf4f8adb3cfaf1.zip
1 files changed, 665 insertions, 0 deletions
diff --git a/src/backend/utils/adt/json.c b/src/backend/utils/adt/json.c
new file mode 100644
index 00000000000..cbb81d1bf37
--- /dev/null
+++ b/src/backend/utils/adt/json.c
@@ -0,0 +1,665 @@
+/*-------------------------------------------------------------------------
+ *
+ * json.c
+ *		JSON data type support.
+ *
+ * Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * IDENTIFICATION
+ *	  src/backend/utils/adt/json.c
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include "lib/stringinfo.h"
+#include "libpq/pqformat.h"
+#include "mb/pg_wchar.h"
+#include "utils/builtins.h"
+#include "utils/json.h"
+
+typedef enum
+{
+	JSON_VALUE_INVALID,
+	JSON_VALUE_STRING,
+	JSON_VALUE_NUMBER,
+	JSON_VALUE_OBJECT,
+	JSON_VALUE_ARRAY,
+	JSON_VALUE_TRUE,
+	JSON_VALUE_FALSE,
+	JSON_VALUE_NULL
+} JsonValueType;
+
+typedef struct
+{
+	char	   *input;
+	char	   *token_start;
+	char	   *token_terminator;
+	JsonValueType	token_type;
+	int			line_number;
+	char	   *line_start;
+} JsonLexContext;
+
+typedef enum
+{
+	JSON_PARSE_VALUE,			/* expecting a value */
+	JSON_PARSE_ARRAY_START,		/* saw '[', expecting value or ']' */
+	JSON_PARSE_ARRAY_NEXT,		/* saw array element, expecting ',' or ']' */
+	JSON_PARSE_OBJECT_START,	/* saw '{', expecting label or '}' */
+	JSON_PARSE_OBJECT_LABEL,	/* saw object label, expecting ':' */
+	JSON_PARSE_OBJECT_NEXT,		/* saw object value, expecting ',' or '}' */
+	JSON_PARSE_OBJECT_COMMA		/* saw object ',', expecting next label */
+} JsonParseState;
+
+typedef struct JsonParseStack
+{
+	JsonParseState	state;
+} JsonParseStack;
+
+typedef enum
+{
+	JSON_STACKOP_NONE,
+	JSON_STACKOP_PUSH,
+	JSON_STACKOP_PUSH_WITH_PUSHBACK,
+	JSON_STACKOP_POP
+} JsonStackOp;
+
+static void json_validate_cstring(char *input);
+static void json_lex(JsonLexContext *lex);
+static void json_lex_string(JsonLexContext *lex);
+static void json_lex_number(JsonLexContext *lex, char *s);
+static void report_parse_error(JsonParseStack *stack, JsonLexContext *lex);
+static void report_invalid_token(JsonLexContext *lex);
+static char *extract_mb_char(char *s);
+
+extern Datum json_in(PG_FUNCTION_ARGS);
+
+/*
+ * Input.
+ */
+Datum
+json_in(PG_FUNCTION_ARGS)
+{
+	char    *text = PG_GETARG_CSTRING(0);
+
+	json_validate_cstring(text);
+
+	PG_RETURN_TEXT_P(cstring_to_text(text));
+}
+
+/*
+ * Output.
+ */
+Datum
+json_out(PG_FUNCTION_ARGS)
+{
+	Datum	txt = PG_GETARG_DATUM(0);
+
+	PG_RETURN_CSTRING(TextDatumGetCString(txt));
+}
+
+/*
+ * Binary send.
+ */
+Datum
+json_send(PG_FUNCTION_ARGS)
+{
+	StringInfoData buf;
+	text   *t = PG_GETARG_TEXT_PP(0);
+
+	pq_begintypsend(&buf);
+	pq_sendtext(&buf, VARDATA_ANY(t), VARSIZE_ANY_EXHDR(t));
+	PG_RETURN_BYTEA_P(pq_endtypsend(&buf));
+}
+
+/*
+ * Binary receive.
+ */
+Datum
+json_recv(PG_FUNCTION_ARGS)
+{
+	StringInfo	buf = (StringInfo) PG_GETARG_POINTER(0);
+	text	   *result;
+	char	   *str;
+	int			nbytes;
+
+	str = pq_getmsgtext(buf, buf->len - buf->cursor, &nbytes);
+
+	/*
+	 * We need a null-terminated string to pass to json_validate_cstring().
+	 * Rather than make a separate copy, make the temporary result one byte
+	 * bigger than it needs to be.
+	 */
+	result = palloc(nbytes + 1 + VARHDRSZ);
+	SET_VARSIZE(result, nbytes + VARHDRSZ);
+	memcpy(VARDATA(result), str, nbytes);
+	str = VARDATA(result);
+	str[nbytes] = '\0';
+
+	/* Validate it. */
+	json_validate_cstring(str);
+
+	PG_RETURN_TEXT_P(result);
+}
+
+/*
+ * Check whether supplied input is valid JSON.
+ */
+static void
+json_validate_cstring(char *input)
+{
+	JsonLexContext	lex;
+	JsonParseStack *stack,
+				   *stacktop;
+	int				stacksize;
+
+	/* Set up lexing context. */
+	lex.input = input;
+	lex.token_terminator = lex.input;
+	lex.line_number = 1;
+	lex.line_start = input;
+
+	/* Set up parse stack. */
+	stacksize = 32;
+	stacktop = palloc(sizeof(JsonParseStack) * stacksize);
+	stack = stacktop;
+	stack->state = JSON_PARSE_VALUE;
+
+	/* Main parsing loop. */
+	for (;;)
+	{
+		JsonStackOp	op;
+
+		/* Fetch next token. */
+		json_lex(&lex);
+
+		/* Check for unexpected end of input. */
+		if (lex.token_start == NULL)
+			report_parse_error(stack, &lex);
+
+redo:
+		/* Figure out what to do with this token. */
+		op = JSON_STACKOP_NONE;
+		switch (stack->state)
+		{
+			case JSON_PARSE_VALUE:
+				if (lex.token_type != JSON_VALUE_INVALID)
+					op = JSON_STACKOP_POP;
+				else if (lex.token_start[0] == '[')
+					stack->state = JSON_PARSE_ARRAY_START;
+				else if (lex.token_start[0] == '{')
+					stack->state = JSON_PARSE_OBJECT_START;
+				else
+					report_parse_error(stack, &lex);
+				break;
+			case JSON_PARSE_ARRAY_START:
+				if (lex.token_type != JSON_VALUE_INVALID)
+					stack->state = JSON_PARSE_ARRAY_NEXT;
+				else if (lex.token_start[0] == ']')
+					op = JSON_STACKOP_POP;
+				else if (lex.token_start[0] == '['
+					|| lex.token_start[0] == '{')
+				{
+					stack->state = JSON_PARSE_ARRAY_NEXT;
+					op = JSON_STACKOP_PUSH_WITH_PUSHBACK;
+				}
+				else
+					report_parse_error(stack, &lex);
+				break;
+			case JSON_PARSE_ARRAY_NEXT:
+				if (lex.token_type != JSON_VALUE_INVALID)
+					report_parse_error(stack, &lex);
+				else if (lex.token_start[0] == ']')
+					op = JSON_STACKOP_POP;
+				else if (lex.token_start[0] == ',')
+					op = JSON_STACKOP_PUSH;
+				else
+					report_parse_error(stack, &lex);
+				break;
+			case JSON_PARSE_OBJECT_START:
+				if (lex.token_type == JSON_VALUE_STRING)
+					stack->state = JSON_PARSE_OBJECT_LABEL;
+				else if (lex.token_type == JSON_VALUE_INVALID
+					&& lex.token_start[0] == '}')
+					op = JSON_STACKOP_POP;
+				else
+					report_parse_error(stack, &lex);
+				break;
+			case JSON_PARSE_OBJECT_LABEL:
+				if (lex.token_type == JSON_VALUE_INVALID
+					&& lex.token_start[0] == ':')
+				{
+					stack->state = JSON_PARSE_OBJECT_NEXT;
+					op = JSON_STACKOP_PUSH;
+				}
+				else
+					report_parse_error(stack, &lex);
+				break;
+			case JSON_PARSE_OBJECT_NEXT:
+				if (lex.token_type != JSON_VALUE_INVALID)
+					report_parse_error(stack, &lex);
+				else if (lex.token_start[0] == '}')
+					op = JSON_STACKOP_POP;
+				else if (lex.token_start[0] == ',')
+					stack->state = JSON_PARSE_OBJECT_COMMA;
+				else
+					report_parse_error(stack, &lex);
+				break;
+			case JSON_PARSE_OBJECT_COMMA:
+				if (lex.token_type == JSON_VALUE_STRING)
+					stack->state = JSON_PARSE_OBJECT_LABEL;
+				else
+					report_parse_error(stack, &lex);
+				break;
+			default:
+				elog(ERROR, "unexpected json parse state: %d",
+						(int) stack->state);
+		}
+
+		/* Push or pop the stack, if needed. */
+		switch (op)
+		{
+			case JSON_STACKOP_PUSH:
+			case JSON_STACKOP_PUSH_WITH_PUSHBACK:
+				++stack;
+				if (stack >= &stacktop[stacksize])
+				{
+					int		stackoffset = stack - stacktop;
+					stacksize = stacksize + 32;
+					stacktop = repalloc(stacktop,
+										sizeof(JsonParseStack) * stacksize);
+					stack = stacktop + stackoffset;
+				}
+				stack->state = JSON_PARSE_VALUE;
+				if (op == JSON_STACKOP_PUSH_WITH_PUSHBACK)
+					goto redo;
+				break;
+			case JSON_STACKOP_POP:
+				if (stack == stacktop)
+				{
+					/* Expect end of input. */
+					json_lex(&lex);
+					if (lex.token_start != NULL)
+						report_parse_error(NULL, &lex);
+					return;
+				}
+				--stack;
+				break;
+			case JSON_STACKOP_NONE:
+				/* nothing to do */
+				break;
+		}
+	}
+}
+
+/*
+ * Lex one token from the input stream.
+ */
+static void
+json_lex(JsonLexContext *lex)
+{
+	char	   *s;
+
+	/* Skip leading whitespace. */
+	s = lex->token_terminator;
+	while (*s == ' ' || *s == '\t' || *s == '\n' || *s == '\r')
+	{
+		if (*s == '\n')
+			++lex->line_number;
+		++s;
+	}
+	lex->token_start = s;
+
+	/* Determine token type. */
+	if (strchr("{}[],:", s[0]))
+	{
+		/* strchr() doesn't return false on a NUL input. */
+		if (s[0] == '\0')
+		{
+			/* End of string. */
+			lex->token_start = NULL;
+			lex->token_terminator = NULL;
+		}
+		else
+		{
+			/* Single-character token, some kind of punctuation mark. */
+			lex->token_terminator = s + 1;
+		}
+		lex->token_type = JSON_VALUE_INVALID;
+	}
+	else if (*s == '"')
+	{
+		/* String. */
+		json_lex_string(lex);
+		lex->token_type = JSON_VALUE_STRING;
+	}
+	else if (*s == '-')
+	{
+		/* Negative number. */
+		json_lex_number(lex, s + 1);
+		lex->token_type = JSON_VALUE_NUMBER;
+	}
+	else if (*s >= '0' && *s <= '9')
+	{
+		/* Positive number. */
+		json_lex_number(lex, s);
+		lex->token_type = JSON_VALUE_NUMBER;
+	}
+	else
+	{
+		char   *p;
+
+		/*
+		 * We're not dealing with a string, number, legal punctuation mark,
+		 * or end of string.  The only legal tokens we might find here are
+		 * true, false, and null, but for error reporting purposes we scan
+		 * until we see a non-alphanumeric character.  That way, we can report
+		 * the whole word as an unexpected token, rather than just some
+		 * unintuitive prefix thereof.
+		 */
+ 		for (p = s; (*p >= 'a' && *p <= 'z') || (*p >= 'A' && *p <= 'Z')
+			|| (*p >= '0' && *p <= '9') || *p == '_' || IS_HIGHBIT_SET(*p);
+			++p)
+			;
+
+		/*
+		 * We got some sort of unexpected punctuation or an otherwise
+		 * unexpected character, so just complain about that one character.
+		 */
+		if (p == s)
+		{
+			lex->token_terminator = s + 1;
+			report_invalid_token(lex);
+		}
+
+		/*
+		 * We've got a real alphanumeric token here.  If it happens to be
+		 * true, false, or null, all is well.  If not, error out.
+		 */
+		lex->token_terminator = p;
+		if (p - s == 4)
+		{
+			if (memcmp(s, "true", 4) == 0)
+				lex->token_type = JSON_VALUE_TRUE;
+			else if (memcmp(s, "null", 4) == 0)
+				lex->token_type = JSON_VALUE_NULL;
+			else
+				report_invalid_token(lex);
+		}
+		else if (p - s == 5 && memcmp(s, "false", 5) == 0)
+			lex->token_type = JSON_VALUE_FALSE;
+		else
+			report_invalid_token(lex);
+	}
+}
+
+/*
+ * The next token in the input stream is known to be a string; lex it.
+ */
+static void
+json_lex_string(JsonLexContext *lex)
+{
+	char	   *s = lex->token_start + 1;
+
+	for (s = lex->token_start + 1; *s != '"'; ++s)
+	{
+		/* Per RFC4627, these characters MUST be escaped. */
+		if (*s < 32)
+		{
+			/* A NUL byte marks the (premature) end of the string. */
+			if (*s == '\0')
+			{
+				lex->token_terminator = s;
+				report_invalid_token(lex);
+			}
+			ereport(ERROR,
+					(errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
+					 errmsg("invalid input syntax for type json"),
+					 errdetail_internal("line %d: Character \"%c\" must be escaped.",
+						lex->line_number, *s)));
+		}
+		else if (*s == '\\')
+		{
+			/* OK, we have an escape character. */
+			++s;
+			if (*s == '\0')
+			{
+				lex->token_terminator = s;
+				report_invalid_token(lex);
+			}
+			else if (*s == 'u')
+			{
+				int		i;
+				int		ch = 0;
+
+				for (i = 1; i <= 4; ++i)
+				{
+					if (s[i] == '\0')
+					{
+						lex->token_terminator = s + i;
+						report_invalid_token(lex);
+					}
+					else if (s[i] >= '0' && s[i] <= '9')
+						ch = (ch * 16) + (s[i] - '0');
+					else if (s[i] >= 'a' && s[i] <= 'f')
+						ch = (ch * 16) + (s[i] - 'a') + 10;
+					else if (s[i] >= 'A' && s[i] <= 'F')
+						ch = (ch * 16) + (s[i] - 'A') + 10;
+					else
+					{
+						ereport(ERROR,
+								(errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
+								 errmsg("invalid input syntax for type json"),
+								 errdetail_internal("line %d: \"\\u\" must be followed by four hexadecimal digits.",
+									lex->line_number)));
+					}
+				}
+
+				/* Account for the four additional bytes we just parsed. */
+				s += 4;
+			}
+			else if (!strchr("\"\\/bfnrt", *s))
+			{
+				/* Error out. */
+				ereport(ERROR,
+						(errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
+						 errmsg("invalid input syntax for type json"),
+						 errdetail_internal("line %d: Invalid escape \"\\%s\".",
+							lex->line_number, extract_mb_char(s))));
+			}
+		}
+	}
+
+	/* Hooray, we found the end of the string! */
+	lex->token_terminator = s + 1;
+}
+
+/*-------------------------------------------------------------------------
+ * The next token in the input stream is known to be a number; lex it.
+ *
+ * In JSON, a number consists of four parts:
+ *
+ * (1) An optional minus sign ('-').
+ *
+ * (2) Either a single '0', or a string of one or more digits that does not
+ *     begin with a '0'.
+ *
+ * (3) An optional decimal part, consisting of a period ('.') followed by
+ *     one or more digits.  (Note: While this part can be omitted
+ *     completely, it's not OK to have only the decimal point without
+ *     any digits afterwards.)
+ *
+ * (4) An optional exponent part, consisting of 'e' or 'E', optionally
+ *     followed by '+' or '-', followed by one or more digits.  (Note:
+ *     As with the decimal part, if 'e' or 'E' is present, it must be
+ *     followed by at least one digit.)
+ *
+ * The 's' argument to this function points to the ostensible beginning
+ * of part 2 - i.e. the character after any optional minus sign, and the
+ * first character of the string if there is none.
+ *
+ *-------------------------------------------------------------------------
+ */
+static void
+json_lex_number(JsonLexContext *lex, char *s)
+{
+	bool	error = false;
+	char   *p;
+
+	/* Part (1): leading sign indicator. */
+	/* Caller already did this for us; so do nothing. */
+
+	/* Part (2): parse main digit string. */
+	if (*s == '0')
+		++s;
+	else if (*s >= '1' && *s <= '9')
+	{
+		do
+		{
+			++s;
+		} while (*s >= '0' && *s <= '9');
+	}
+	else
+		error = true;
+
+	/* Part (3): parse optional decimal portion. */
+	if (*s == '.')
+	{
+		++s;
+		if (*s < '0' && *s > '9')
+			error = true;
+		else
+		{
+			do
+			{
+				++s;
+			} while (*s >= '0' && *s <= '9');
+		}
+	}
+
+	/* Part (4): parse optional exponent. */
+	if (*s == 'e' || *s == 'E')
+	{
+		++s;
+		if (*s == '+' || *s == '-')
+			++s;
+		if (*s < '0' && *s > '9')
+			error = true;
+		else
+		{
+			do
+			{
+				++s;
+			} while (*s >= '0' && *s <= '9');
+		}
+	}
+
+	/* Check for trailing garbage. */
+	for (p = s; (*p >= 'a' && *p <= 'z') || (*p >= 'A' && *p <= 'Z')
+		|| (*p >= '0' && *p <= '9') || *p == '_' || IS_HIGHBIT_SET(*p); ++p)
+		;
+	lex->token_terminator = p;
+	if (p > s || error)
+		report_invalid_token(lex);
+}
+
+/*
+ * Report a parse error.
+ */
+static void
+report_parse_error(JsonParseStack *stack, JsonLexContext *lex)
+{
+	char   *detail = NULL;
+	char   *token = NULL;
+	int		toklen;
+
+	/* Handle case where the input ended prematurely. */
+	if (lex->token_start == NULL)
+		ereport(ERROR,
+				(errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
+				 errmsg("invalid input syntax for type json: \"%s\"",
+					lex->input),
+	 			 errdetail_internal("The input string ended unexpectedly.")));
+
+	/* Work out the offending token. */
+	toklen = lex->token_terminator - lex->token_start;
+	token = palloc(toklen + 1);
+	memcpy(token, lex->token_start, toklen);
+	token[toklen] = '\0';
+
+	/* Select correct detail message. */
+	if (stack == NULL)
+		detail = "line %d: Expected end of input, but found \"%s\".";
+	else
+	{
+		switch (stack->state)
+		{
+			case JSON_PARSE_VALUE:
+				detail = "line %d: Expected string, number, object, array, true, false, or null, but found \"%s\".";
+				break;
+			case JSON_PARSE_ARRAY_START:
+				detail = "line %d: Expected array element or \"]\", but found \"%s\".";
+				break;
+			case JSON_PARSE_ARRAY_NEXT:
+				detail = "line %d: Expected \",\" or \"]\", but found \"%s\".";
+				break;
+			case JSON_PARSE_OBJECT_START:
+				detail = "line %d: Expected string or \"}\", but found \"%s\".";
+				break;
+			case JSON_PARSE_OBJECT_LABEL:
+				detail = "line %d: Expected \":\", but found \"%s\".";
+				break;
+			case JSON_PARSE_OBJECT_NEXT:
+				detail = "line %d: Expected \",\" or \"}\", but found \"%s\".";
+				break;
+			case JSON_PARSE_OBJECT_COMMA:
+				detail = "line %d: Expected string, but found \"%s\".";
+				break;
+		}
+	}
+
+	ereport(ERROR,
+			(errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
+			 errmsg("invalid input syntax for type json: \"%s\"",
+				lex->input),
+ 			 errdetail_internal(detail, lex->line_number, token)));
+}
+
+/*
+ * Report an invalid input token.
+ */
+static void
+report_invalid_token(JsonLexContext *lex)
+{
+	char   *token;
+	int		toklen;
+
+	toklen = lex->token_terminator - lex->token_start;
+	token = palloc(toklen + 1);
+	memcpy(token, lex->token_start, toklen);
+	token[toklen] = '\0';
+
+	ereport(ERROR,
+			(errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
+			 errmsg("invalid input syntax for type json"),
+			 errdetail_internal("line %d: Token \"%s\" is invalid.",
+				lex->line_number, token)));
+}
+
+/*
+ * Extract a single, possibly multi-byte char from the input string.
+ */
+static char *
+extract_mb_char(char *s)
+{
+	char   *res;
+	int		len;
+
+	len = pg_mblen(s);
+	res = palloc(len + 1);
+	memcpy(res, s, len);
+	res[len] = '\0';
+
+	return res;
+}
author	Robert Haas <rhaas@postgresql.org>	2012-01-31 11:48:23 -0500
committer	Robert Haas <rhaas@postgresql.org>	2012-01-31 11:48:23 -0500
commit	5384a73f98d9829725186a7b65bf4f8adb3cfaf1 (patch)
tree	7d0906859b26e82bb5d54a5297c072b708f82dbd /src/backend/utils/adt/json.c
parent	4c6cedd1b014abf2046886a9a92e10e18f0d658e (diff)
download	postgresql-5384a73f98d9829725186a7b65bf4f8adb3cfaf1.tar.gz postgresql-5384a73f98d9829725186a7b65bf4f8adb3cfaf1.zip