diff options
Diffstat (limited to 'src/backend/utils/adt/json.c')
-rw-r--r-- | src/backend/utils/adt/json.c | 665 |
1 files changed, 665 insertions, 0 deletions
diff --git a/src/backend/utils/adt/json.c b/src/backend/utils/adt/json.c new file mode 100644 index 00000000000..cbb81d1bf37 --- /dev/null +++ b/src/backend/utils/adt/json.c @@ -0,0 +1,665 @@ +/*------------------------------------------------------------------------- + * + * json.c + * JSON data type support. + * + * Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * IDENTIFICATION + * src/backend/utils/adt/json.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "lib/stringinfo.h" +#include "libpq/pqformat.h" +#include "mb/pg_wchar.h" +#include "utils/builtins.h" +#include "utils/json.h" + +typedef enum +{ + JSON_VALUE_INVALID, + JSON_VALUE_STRING, + JSON_VALUE_NUMBER, + JSON_VALUE_OBJECT, + JSON_VALUE_ARRAY, + JSON_VALUE_TRUE, + JSON_VALUE_FALSE, + JSON_VALUE_NULL +} JsonValueType; + +typedef struct +{ + char *input; + char *token_start; + char *token_terminator; + JsonValueType token_type; + int line_number; + char *line_start; +} JsonLexContext; + +typedef enum +{ + JSON_PARSE_VALUE, /* expecting a value */ + JSON_PARSE_ARRAY_START, /* saw '[', expecting value or ']' */ + JSON_PARSE_ARRAY_NEXT, /* saw array element, expecting ',' or ']' */ + JSON_PARSE_OBJECT_START, /* saw '{', expecting label or '}' */ + JSON_PARSE_OBJECT_LABEL, /* saw object label, expecting ':' */ + JSON_PARSE_OBJECT_NEXT, /* saw object value, expecting ',' or '}' */ + JSON_PARSE_OBJECT_COMMA /* saw object ',', expecting next label */ +} JsonParseState; + +typedef struct JsonParseStack +{ + JsonParseState state; +} JsonParseStack; + +typedef enum +{ + JSON_STACKOP_NONE, + JSON_STACKOP_PUSH, + JSON_STACKOP_PUSH_WITH_PUSHBACK, + JSON_STACKOP_POP +} JsonStackOp; + +static void json_validate_cstring(char *input); +static void json_lex(JsonLexContext *lex); +static void json_lex_string(JsonLexContext *lex); +static void json_lex_number(JsonLexContext *lex, char *s); +static void report_parse_error(JsonParseStack *stack, JsonLexContext *lex); +static void report_invalid_token(JsonLexContext *lex); +static char *extract_mb_char(char *s); + +extern Datum json_in(PG_FUNCTION_ARGS); + +/* + * Input. + */ +Datum +json_in(PG_FUNCTION_ARGS) +{ + char *text = PG_GETARG_CSTRING(0); + + json_validate_cstring(text); + + PG_RETURN_TEXT_P(cstring_to_text(text)); +} + +/* + * Output. + */ +Datum +json_out(PG_FUNCTION_ARGS) +{ + Datum txt = PG_GETARG_DATUM(0); + + PG_RETURN_CSTRING(TextDatumGetCString(txt)); +} + +/* + * Binary send. + */ +Datum +json_send(PG_FUNCTION_ARGS) +{ + StringInfoData buf; + text *t = PG_GETARG_TEXT_PP(0); + + pq_begintypsend(&buf); + pq_sendtext(&buf, VARDATA_ANY(t), VARSIZE_ANY_EXHDR(t)); + PG_RETURN_BYTEA_P(pq_endtypsend(&buf)); +} + +/* + * Binary receive. + */ +Datum +json_recv(PG_FUNCTION_ARGS) +{ + StringInfo buf = (StringInfo) PG_GETARG_POINTER(0); + text *result; + char *str; + int nbytes; + + str = pq_getmsgtext(buf, buf->len - buf->cursor, &nbytes); + + /* + * We need a null-terminated string to pass to json_validate_cstring(). + * Rather than make a separate copy, make the temporary result one byte + * bigger than it needs to be. + */ + result = palloc(nbytes + 1 + VARHDRSZ); + SET_VARSIZE(result, nbytes + VARHDRSZ); + memcpy(VARDATA(result), str, nbytes); + str = VARDATA(result); + str[nbytes] = '\0'; + + /* Validate it. */ + json_validate_cstring(str); + + PG_RETURN_TEXT_P(result); +} + +/* + * Check whether supplied input is valid JSON. + */ +static void +json_validate_cstring(char *input) +{ + JsonLexContext lex; + JsonParseStack *stack, + *stacktop; + int stacksize; + + /* Set up lexing context. */ + lex.input = input; + lex.token_terminator = lex.input; + lex.line_number = 1; + lex.line_start = input; + + /* Set up parse stack. */ + stacksize = 32; + stacktop = palloc(sizeof(JsonParseStack) * stacksize); + stack = stacktop; + stack->state = JSON_PARSE_VALUE; + + /* Main parsing loop. */ + for (;;) + { + JsonStackOp op; + + /* Fetch next token. */ + json_lex(&lex); + + /* Check for unexpected end of input. */ + if (lex.token_start == NULL) + report_parse_error(stack, &lex); + +redo: + /* Figure out what to do with this token. */ + op = JSON_STACKOP_NONE; + switch (stack->state) + { + case JSON_PARSE_VALUE: + if (lex.token_type != JSON_VALUE_INVALID) + op = JSON_STACKOP_POP; + else if (lex.token_start[0] == '[') + stack->state = JSON_PARSE_ARRAY_START; + else if (lex.token_start[0] == '{') + stack->state = JSON_PARSE_OBJECT_START; + else + report_parse_error(stack, &lex); + break; + case JSON_PARSE_ARRAY_START: + if (lex.token_type != JSON_VALUE_INVALID) + stack->state = JSON_PARSE_ARRAY_NEXT; + else if (lex.token_start[0] == ']') + op = JSON_STACKOP_POP; + else if (lex.token_start[0] == '[' + || lex.token_start[0] == '{') + { + stack->state = JSON_PARSE_ARRAY_NEXT; + op = JSON_STACKOP_PUSH_WITH_PUSHBACK; + } + else + report_parse_error(stack, &lex); + break; + case JSON_PARSE_ARRAY_NEXT: + if (lex.token_type != JSON_VALUE_INVALID) + report_parse_error(stack, &lex); + else if (lex.token_start[0] == ']') + op = JSON_STACKOP_POP; + else if (lex.token_start[0] == ',') + op = JSON_STACKOP_PUSH; + else + report_parse_error(stack, &lex); + break; + case JSON_PARSE_OBJECT_START: + if (lex.token_type == JSON_VALUE_STRING) + stack->state = JSON_PARSE_OBJECT_LABEL; + else if (lex.token_type == JSON_VALUE_INVALID + && lex.token_start[0] == '}') + op = JSON_STACKOP_POP; + else + report_parse_error(stack, &lex); + break; + case JSON_PARSE_OBJECT_LABEL: + if (lex.token_type == JSON_VALUE_INVALID + && lex.token_start[0] == ':') + { + stack->state = JSON_PARSE_OBJECT_NEXT; + op = JSON_STACKOP_PUSH; + } + else + report_parse_error(stack, &lex); + break; + case JSON_PARSE_OBJECT_NEXT: + if (lex.token_type != JSON_VALUE_INVALID) + report_parse_error(stack, &lex); + else if (lex.token_start[0] == '}') + op = JSON_STACKOP_POP; + else if (lex.token_start[0] == ',') + stack->state = JSON_PARSE_OBJECT_COMMA; + else + report_parse_error(stack, &lex); + break; + case JSON_PARSE_OBJECT_COMMA: + if (lex.token_type == JSON_VALUE_STRING) + stack->state = JSON_PARSE_OBJECT_LABEL; + else + report_parse_error(stack, &lex); + break; + default: + elog(ERROR, "unexpected json parse state: %d", + (int) stack->state); + } + + /* Push or pop the stack, if needed. */ + switch (op) + { + case JSON_STACKOP_PUSH: + case JSON_STACKOP_PUSH_WITH_PUSHBACK: + ++stack; + if (stack >= &stacktop[stacksize]) + { + int stackoffset = stack - stacktop; + stacksize = stacksize + 32; + stacktop = repalloc(stacktop, + sizeof(JsonParseStack) * stacksize); + stack = stacktop + stackoffset; + } + stack->state = JSON_PARSE_VALUE; + if (op == JSON_STACKOP_PUSH_WITH_PUSHBACK) + goto redo; + break; + case JSON_STACKOP_POP: + if (stack == stacktop) + { + /* Expect end of input. */ + json_lex(&lex); + if (lex.token_start != NULL) + report_parse_error(NULL, &lex); + return; + } + --stack; + break; + case JSON_STACKOP_NONE: + /* nothing to do */ + break; + } + } +} + +/* + * Lex one token from the input stream. + */ +static void +json_lex(JsonLexContext *lex) +{ + char *s; + + /* Skip leading whitespace. */ + s = lex->token_terminator; + while (*s == ' ' || *s == '\t' || *s == '\n' || *s == '\r') + { + if (*s == '\n') + ++lex->line_number; + ++s; + } + lex->token_start = s; + + /* Determine token type. */ + if (strchr("{}[],:", s[0])) + { + /* strchr() doesn't return false on a NUL input. */ + if (s[0] == '\0') + { + /* End of string. */ + lex->token_start = NULL; + lex->token_terminator = NULL; + } + else + { + /* Single-character token, some kind of punctuation mark. */ + lex->token_terminator = s + 1; + } + lex->token_type = JSON_VALUE_INVALID; + } + else if (*s == '"') + { + /* String. */ + json_lex_string(lex); + lex->token_type = JSON_VALUE_STRING; + } + else if (*s == '-') + { + /* Negative number. */ + json_lex_number(lex, s + 1); + lex->token_type = JSON_VALUE_NUMBER; + } + else if (*s >= '0' && *s <= '9') + { + /* Positive number. */ + json_lex_number(lex, s); + lex->token_type = JSON_VALUE_NUMBER; + } + else + { + char *p; + + /* + * We're not dealing with a string, number, legal punctuation mark, + * or end of string. The only legal tokens we might find here are + * true, false, and null, but for error reporting purposes we scan + * until we see a non-alphanumeric character. That way, we can report + * the whole word as an unexpected token, rather than just some + * unintuitive prefix thereof. + */ + for (p = s; (*p >= 'a' && *p <= 'z') || (*p >= 'A' && *p <= 'Z') + || (*p >= '0' && *p <= '9') || *p == '_' || IS_HIGHBIT_SET(*p); + ++p) + ; + + /* + * We got some sort of unexpected punctuation or an otherwise + * unexpected character, so just complain about that one character. + */ + if (p == s) + { + lex->token_terminator = s + 1; + report_invalid_token(lex); + } + + /* + * We've got a real alphanumeric token here. If it happens to be + * true, false, or null, all is well. If not, error out. + */ + lex->token_terminator = p; + if (p - s == 4) + { + if (memcmp(s, "true", 4) == 0) + lex->token_type = JSON_VALUE_TRUE; + else if (memcmp(s, "null", 4) == 0) + lex->token_type = JSON_VALUE_NULL; + else + report_invalid_token(lex); + } + else if (p - s == 5 && memcmp(s, "false", 5) == 0) + lex->token_type = JSON_VALUE_FALSE; + else + report_invalid_token(lex); + } +} + +/* + * The next token in the input stream is known to be a string; lex it. + */ +static void +json_lex_string(JsonLexContext *lex) +{ + char *s = lex->token_start + 1; + + for (s = lex->token_start + 1; *s != '"'; ++s) + { + /* Per RFC4627, these characters MUST be escaped. */ + if (*s < 32) + { + /* A NUL byte marks the (premature) end of the string. */ + if (*s == '\0') + { + lex->token_terminator = s; + report_invalid_token(lex); + } + ereport(ERROR, + (errcode(ERRCODE_INVALID_TEXT_REPRESENTATION), + errmsg("invalid input syntax for type json"), + errdetail_internal("line %d: Character \"%c\" must be escaped.", + lex->line_number, *s))); + } + else if (*s == '\\') + { + /* OK, we have an escape character. */ + ++s; + if (*s == '\0') + { + lex->token_terminator = s; + report_invalid_token(lex); + } + else if (*s == 'u') + { + int i; + int ch = 0; + + for (i = 1; i <= 4; ++i) + { + if (s[i] == '\0') + { + lex->token_terminator = s + i; + report_invalid_token(lex); + } + else if (s[i] >= '0' && s[i] <= '9') + ch = (ch * 16) + (s[i] - '0'); + else if (s[i] >= 'a' && s[i] <= 'f') + ch = (ch * 16) + (s[i] - 'a') + 10; + else if (s[i] >= 'A' && s[i] <= 'F') + ch = (ch * 16) + (s[i] - 'A') + 10; + else + { + ereport(ERROR, + (errcode(ERRCODE_INVALID_TEXT_REPRESENTATION), + errmsg("invalid input syntax for type json"), + errdetail_internal("line %d: \"\\u\" must be followed by four hexadecimal digits.", + lex->line_number))); + } + } + + /* Account for the four additional bytes we just parsed. */ + s += 4; + } + else if (!strchr("\"\\/bfnrt", *s)) + { + /* Error out. */ + ereport(ERROR, + (errcode(ERRCODE_INVALID_TEXT_REPRESENTATION), + errmsg("invalid input syntax for type json"), + errdetail_internal("line %d: Invalid escape \"\\%s\".", + lex->line_number, extract_mb_char(s)))); + } + } + } + + /* Hooray, we found the end of the string! */ + lex->token_terminator = s + 1; +} + +/*------------------------------------------------------------------------- + * The next token in the input stream is known to be a number; lex it. + * + * In JSON, a number consists of four parts: + * + * (1) An optional minus sign ('-'). + * + * (2) Either a single '0', or a string of one or more digits that does not + * begin with a '0'. + * + * (3) An optional decimal part, consisting of a period ('.') followed by + * one or more digits. (Note: While this part can be omitted + * completely, it's not OK to have only the decimal point without + * any digits afterwards.) + * + * (4) An optional exponent part, consisting of 'e' or 'E', optionally + * followed by '+' or '-', followed by one or more digits. (Note: + * As with the decimal part, if 'e' or 'E' is present, it must be + * followed by at least one digit.) + * + * The 's' argument to this function points to the ostensible beginning + * of part 2 - i.e. the character after any optional minus sign, and the + * first character of the string if there is none. + * + *------------------------------------------------------------------------- + */ +static void +json_lex_number(JsonLexContext *lex, char *s) +{ + bool error = false; + char *p; + + /* Part (1): leading sign indicator. */ + /* Caller already did this for us; so do nothing. */ + + /* Part (2): parse main digit string. */ + if (*s == '0') + ++s; + else if (*s >= '1' && *s <= '9') + { + do + { + ++s; + } while (*s >= '0' && *s <= '9'); + } + else + error = true; + + /* Part (3): parse optional decimal portion. */ + if (*s == '.') + { + ++s; + if (*s < '0' && *s > '9') + error = true; + else + { + do + { + ++s; + } while (*s >= '0' && *s <= '9'); + } + } + + /* Part (4): parse optional exponent. */ + if (*s == 'e' || *s == 'E') + { + ++s; + if (*s == '+' || *s == '-') + ++s; + if (*s < '0' && *s > '9') + error = true; + else + { + do + { + ++s; + } while (*s >= '0' && *s <= '9'); + } + } + + /* Check for trailing garbage. */ + for (p = s; (*p >= 'a' && *p <= 'z') || (*p >= 'A' && *p <= 'Z') + || (*p >= '0' && *p <= '9') || *p == '_' || IS_HIGHBIT_SET(*p); ++p) + ; + lex->token_terminator = p; + if (p > s || error) + report_invalid_token(lex); +} + +/* + * Report a parse error. + */ +static void +report_parse_error(JsonParseStack *stack, JsonLexContext *lex) +{ + char *detail = NULL; + char *token = NULL; + int toklen; + + /* Handle case where the input ended prematurely. */ + if (lex->token_start == NULL) + ereport(ERROR, + (errcode(ERRCODE_INVALID_TEXT_REPRESENTATION), + errmsg("invalid input syntax for type json: \"%s\"", + lex->input), + errdetail_internal("The input string ended unexpectedly."))); + + /* Work out the offending token. */ + toklen = lex->token_terminator - lex->token_start; + token = palloc(toklen + 1); + memcpy(token, lex->token_start, toklen); + token[toklen] = '\0'; + + /* Select correct detail message. */ + if (stack == NULL) + detail = "line %d: Expected end of input, but found \"%s\"."; + else + { + switch (stack->state) + { + case JSON_PARSE_VALUE: + detail = "line %d: Expected string, number, object, array, true, false, or null, but found \"%s\"."; + break; + case JSON_PARSE_ARRAY_START: + detail = "line %d: Expected array element or \"]\", but found \"%s\"."; + break; + case JSON_PARSE_ARRAY_NEXT: + detail = "line %d: Expected \",\" or \"]\", but found \"%s\"."; + break; + case JSON_PARSE_OBJECT_START: + detail = "line %d: Expected string or \"}\", but found \"%s\"."; + break; + case JSON_PARSE_OBJECT_LABEL: + detail = "line %d: Expected \":\", but found \"%s\"."; + break; + case JSON_PARSE_OBJECT_NEXT: + detail = "line %d: Expected \",\" or \"}\", but found \"%s\"."; + break; + case JSON_PARSE_OBJECT_COMMA: + detail = "line %d: Expected string, but found \"%s\"."; + break; + } + } + + ereport(ERROR, + (errcode(ERRCODE_INVALID_TEXT_REPRESENTATION), + errmsg("invalid input syntax for type json: \"%s\"", + lex->input), + errdetail_internal(detail, lex->line_number, token))); +} + +/* + * Report an invalid input token. + */ +static void +report_invalid_token(JsonLexContext *lex) +{ + char *token; + int toklen; + + toklen = lex->token_terminator - lex->token_start; + token = palloc(toklen + 1); + memcpy(token, lex->token_start, toklen); + token[toklen] = '\0'; + + ereport(ERROR, + (errcode(ERRCODE_INVALID_TEXT_REPRESENTATION), + errmsg("invalid input syntax for type json"), + errdetail_internal("line %d: Token \"%s\" is invalid.", + lex->line_number, token))); +} + +/* + * Extract a single, possibly multi-byte char from the input string. + */ +static char * +extract_mb_char(char *s) +{ + char *res; + int len; + + len = pg_mblen(s); + res = palloc(len + 1); + memcpy(res, s, len); + res[len] = '\0'; + + return res; +} |