/* * Copyright (C) Igor Sysoev * Copyright (C) NGINX, Inc. */ #ifndef _NJS_STRING_H_INCLUDED_ #define _NJS_STRING_H_INCLUDED_ /* * nJSVM supports two string variants: * * 1) short strings which size is less than or equal to 14 (NJS_STRING_SHORT) * bytes, these strings are stored inside njs_value_t (see njs_vm.h for * details); * * 2) and long strings using additional njs_string_t structure. * This structure has the start field to support external strings. * The long strings can have optional UTF-8 offset map. * * The number of the string variants is limited to 2 variants to minimize * overhead of processing string fields. */ /* The maximum signed int32_t. */ #define NJS_STRING_MAX_LENGTH 0x7fffffff /* * NJS_STRING_MAP_STRIDE should be power of two to use shift and binary * AND operations instead of division and remainder operations but no * less than 16 because the maximum length of short string inlined in * njs_value_t is less than 16 bytes. */ #define NJS_STRING_MAP_STRIDE 32 #define njs_string_map_offset(size) njs_align_size((size), sizeof(uint32_t)) #define njs_string_map_start(p) \ ((uint32_t *) njs_align_ptr((p), sizeof(uint32_t))) #define njs_string_map_size(length) \ (((length - 1) / NJS_STRING_MAP_STRIDE) * sizeof(uint32_t)) /* * ECMAScript strings are stored in UTF-16. nJSVM however, allows to store * any byte sequences in strings. A size of string in bytes is stored in the * size field. If byte sequence is valid UTF-8 string then its length is * stored in the UTF-8 length field. Otherwise, the length field is zero. * If a string is UTF-8 string then string functions use UTF-8 characters * positions and lengths. Otherwise they use with byte positions and lengths. * Using UTF-8 encoding does not allow to get quickly a character at specified * position. To speed up this search a map of offsets is stored after the * UTF-8 string. The map is aligned to uint32_t and contains byte positions * of each NJS_STRING_MAP_STRIDE UTF-8 character except zero position. The * map can be initialized on demand. Unitialized map is marked with zero * value in the first map element. If string comes outside JavaScript as * byte string just to be concatenated or to match regular expressions the * offset map is not required. * * The map is not allocated: * 1) if string length is zero hence string is a byte string; * 2) if string size and length are equal so the string contains only * ASCII characters and map is not required; * 3) if string length is less than NJS_STRING_MAP_STRIDE. * * The current implementation does not support Unicode surrogate pairs. * It can be implemented later if it will be required using the following * algorithm: if offset in map points to surrogate pair then the previous * offset should be used and so on until start of the string. */ struct njs_string_s { u_char *start; uint32_t length; /* Length in UTF-8 characters. */ uint32_t size; }; typedef struct { size_t size; size_t length; u_char *start; } njs_string_prop_t; typedef struct { size_t start; size_t length; size_t string_length; } njs_slice_prop_t; typedef enum { NJS_STRING_ASCII = 0, NJS_STRING_UTF8, } njs_utf8_t; typedef enum { NJS_TRIM_START = 1, NJS_TRIM_END = 2, } njs_trim_t; u_char *njs_string_alloc(njs_vm_t *vm, njs_value_t *value, uint64_t size, uint64_t length); njs_int_t njs_string_new(njs_vm_t *vm, njs_value_t *value, const u_char *start, uint32_t size, uint32_t length); njs_int_t njs_string_create(njs_vm_t *vm, njs_value_t *value, const u_char *src, size_t size); njs_int_t njs_string_create_chb(njs_vm_t *vm, njs_value_t *value, njs_chb_t *chain); size_t njs_string_prop(njs_vm_t *vm, njs_string_prop_t *string, const njs_value_t *value); void njs_encode_hex(njs_str_t *dst, const njs_str_t *src); size_t njs_encode_hex_length(const njs_str_t *src, size_t *out_size); void njs_encode_base64(njs_str_t *dst, const njs_str_t *src); size_t njs_encode_base64_length(const njs_str_t *src, size_t *out_size); void njs_decode_utf8(njs_str_t *dst, const njs_str_t *src); size_t njs_decode_utf8_length(const njs_str_t *src, size_t *out_size); void njs_decode_hex(njs_str_t *dst, const njs_str_t *src); size_t njs_decode_hex_length(const njs_str_t *src, size_t *out_size); void njs_decode_base64(njs_str_t *dst, const njs_str_t *src); size_t njs_decode_base64_length(const njs_str_t *src, size_t *out_size); void njs_decode_base64url(njs_str_t *dst, const njs_str_t *src); size_t njs_decode_base64url_length(const njs_str_t *src, size_t *out_size); njs_int_t njs_string_hex(njs_vm_t *vm, njs_value_t *value, const njs_str_t *src); njs_int_t njs_string_base64(njs_vm_t *vm, njs_value_t *value, const njs_str_t *src); njs_int_t njs_string_base64url(njs_vm_t *vm, njs_value_t *value, const njs_str_t *src); njs_int_t njs_string_decode_utf8(njs_vm_t *vm, njs_value_t *value, const njs_str_t *src); njs_int_t njs_string_decode_hex(njs_vm_t *vm, njs_value_t *value, const njs_str_t *src); njs_int_t njs_string_decode_base64(njs_vm_t *vm, njs_value_t *value, const njs_str_t *src); njs_int_t njs_string_decode_base64url(njs_vm_t *vm, njs_value_t *value, const njs_str_t *src); void njs_string_truncate(njs_value_t *value, uint32_t size, uint32_t length); uint32_t njs_string_trim(njs_vm_t *vm, const njs_value_t *value, njs_string_prop_t *string, unsigned mode); void njs_string_copy(njs_value_t *dst, njs_value_t *src); njs_int_t njs_string_cmp(njs_vm_t *vm, const njs_value_t *val1, const njs_value_t *val2); void njs_string_slice_string_prop(njs_string_prop_t *dst, const njs_string_prop_t *string, const njs_slice_prop_t *slice); njs_int_t njs_string_slice(njs_vm_t *vm, njs_value_t *dst, const njs_string_prop_t *string, const njs_slice_prop_t *slice); const u_char *njs_string_utf8_offset(const u_char *start, const u_char *end, size_t index); uint32_t njs_string_index(njs_string_prop_t *string, uint32_t offset); void njs_string_utf8_offset_map_init(const u_char *start, size_t size); double njs_string_to_index(const njs_value_t *value); njs_int_t njs_string_encode_uri(njs_vm_t *vm, njs_value_t *args, njs_uint_t nargs, njs_index_t component, njs_value_t *retval); njs_int_t njs_string_decode_uri(njs_vm_t *vm, njs_value_t *args, njs_uint_t nargs, njs_index_t component, njs_value_t *retval); njs_int_t njs_string_btoa(njs_vm_t *vm, njs_value_t *args, njs_uint_t nargs, njs_index_t unused, njs_value_t *retval); njs_int_t njs_string_atob(njs_vm_t *vm, njs_value_t *args, njs_uint_t nargs, njs_index_t unused, njs_value_t *retval); njs_int_t njs_string_prototype_concat(njs_vm_t *vm, njs_value_t *args, njs_uint_t nargs, njs_index_t unused, njs_value_t *retval); njs_int_t njs_string_get_substitution(njs_vm_t *vm, njs_value_t *matched, njs_value_t *string, int64_t pos, njs_value_t *captures, int64_t ncaptures, njs_value_t *groups, njs_value_t *replacement, njs_value_t *retval); njs_inline njs_int_t njs_atom_string_create(njs_vm_t *vm, njs_value_t *value, const u_char *src, size_t size) { njs_int_t ret; ret = njs_string_create(vm, value, src, size); if (njs_slow_path(ret != NJS_OK)) { return ret; } return njs_atom_atomize_key(vm, value); } njs_inline njs_bool_t njs_is_ascii_string(njs_string_prop_t *string) { return string->length == string->size; } njs_inline uint32_t njs_string_calc_length(njs_utf8_t utf8, const u_char *start, size_t size) { ssize_t length; switch (utf8) { case NJS_STRING_ASCII: return size; case NJS_STRING_UTF8: default: length = njs_utf8_length(start, size); return length; } } njs_inline njs_bool_t njs_need_escape(const uint32_t *escape, uint32_t byte) { return ((escape[byte >> 5] & ((uint32_t) 1 << (byte & 0x1f))) != 0); } njs_inline u_char * njs_string_encode(const uint32_t *escape, size_t size, const u_char *src, u_char *dst) { uint8_t byte; static const u_char hex[] = "0123456789ABCDEF"; do { byte = *src++; if (njs_need_escape(escape, byte)) { *dst++ = '%'; *dst++ = hex[byte >> 4]; *dst++ = hex[byte & 0xf]; } else { *dst++ = byte; } size--; } while (size != 0); return dst; } njs_inline const u_char * njs_string_offset(njs_string_prop_t *string, int64_t index) { if (njs_is_ascii_string(string)) { return string->start + index; } /* UTF-8 string. */ if (index == (int64_t) string->length) { return string->start + string->size; } return njs_string_utf8_offset(string->start, string->start + string->size, index); } extern const njs_object_init_t njs_string_instance_init; extern const njs_object_type_init_t njs_string_type_init; #endif /* _NJS_STRING_H_INCLUDED_ */