diff options
author | bellard <6490144+bellard@users.noreply.github.com> | 2020-09-06 18:53:08 +0200 |
---|---|---|
committer | bellard <6490144+bellard@users.noreply.github.com> | 2020-09-06 18:53:08 +0200 |
commit | 91459fb6723e29e923380cec0023af93819ae69d (patch) | |
tree | 6a1aff8d9b290ed184d1481da50d0e6b4a9a324c /unicode_gen.c | |
parent | 9096e544ba2357eeadc6f09fc6e5cf58db7751bc (diff) | |
download | quickjs-91459fb6723e29e923380cec0023af93819ae69d.tar.gz quickjs-91459fb6723e29e923380cec0023af93819ae69d.zip |
2020-01-05 release
Diffstat (limited to 'unicode_gen.c')
-rw-r--r-- | unicode_gen.c | 3057 |
1 file changed, 3057 insertions, 0 deletions
diff --git a/unicode_gen.c b/unicode_gen.c new file mode 100644 index 0000000..f18aaa0 --- /dev/null +++ b/unicode_gen.c @@ -0,0 +1,3057 @@ +/* + * Generation of Unicode tables + * + * Copyright (c) 2017-2018 Fabrice Bellard + * Copyright (c) 2017-2018 Charlie Gordon + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
+ */ +#include <stdlib.h> +#include <stdio.h> +#include <stdarg.h> +#include <inttypes.h> +#include <string.h> +#include <assert.h> +#include <ctype.h> +#include <time.h> + +#include "cutils.h" + +/* define it to be able to test unicode.c */ +//#define USE_TEST +/* profile tests */ +//#define PROFILE + +//#define DUMP_CASE_CONV_TABLE +//#define DUMP_TABLE_SIZE +//#define DUMP_CC_TABLE +//#define DUMP_DECOMP_TABLE + +/* Ideas: + - Generalize run length encoding + index for all tables + - remove redundant tables for ID_start, ID_continue, Case_Ignorable, Cased + + Case conversion: + - use a single entry for consecutive U/LF runs + - allow EXT runs of length > 1 + + Decomposition: + - Greek lower case (+1f10/1f10) ? + - allow holes in B runs + - suppress more upper / lower case redundancy +*/ + +#ifdef USE_TEST +#include "libunicode.c" +#endif + +#define CHARCODE_MAX 0x10ffff +#define CC_LEN_MAX 3 + +void *mallocz(size_t size) +{ + void *ptr; + ptr = malloc(size); + memset(ptr, 0, size); + return ptr; +} + +const char *get_field(const char *p, int n) +{ + int i; + for(i = 0; i < n; i++) { + while (*p != ';' && *p != '\0') + p++; + if (*p == '\0') + return NULL; + p++; + } + return p; +} + +const char *get_field_buf(char *buf, size_t buf_size, const char *p, int n) +{ + char *q; + p = get_field(p, n); + q = buf; + while (*p != ';' && *p != '\0') { + if ((q - buf) < buf_size - 1) + *q++ = *p; + p++; + } + *q = '\0'; + return buf; +} + +void add_char(int **pbuf, int *psize, int *plen, int c) +{ + int len, size, *buf; + buf = *pbuf; + size = *psize; + len = *plen; + if (len >= size) { + size = *psize; + size = max_int(len + 1, size * 3 / 2); + buf = realloc(buf, sizeof(buf[0]) * size); + *pbuf = buf; + *psize = size; + } + buf[len++] = c; + *plen = len; +} + +int *get_field_str(int *plen, const char *str, int n) +{ + const char *p; + int *buf, len, size; + p = get_field(str, n); + if (!p) { + *plen = 0; + return NULL; + } + len = 0; + size = 0; + buf = NULL; + for(;;) { + 
while (isspace(*p)) + p++; + if (!isxdigit(*p)) + break; + add_char(&buf, &size, &len, strtoul(p, (char **)&p, 16)); + } + *plen = len; + return buf; +} + +char *get_line(char *buf, int buf_size, FILE *f) +{ + int len; + if (!fgets(buf, buf_size, f)) + return NULL; + len = strlen(buf); + if (len > 0 && buf[len - 1] == '\n') + buf[len - 1] = '\0'; + return buf; +} + +#define UNICODE_GENERAL_CATEGORY + +typedef enum { +#define DEF(id, str) GCAT_ ## id, +#include "unicode_gen_def.h" +#undef DEF + GCAT_COUNT, +} UnicodeGCEnum1; + +static const char *unicode_gc_name[] = { +#define DEF(id, str) #id, +#include "unicode_gen_def.h" +#undef DEF +}; + +static const char *unicode_gc_short_name[] = { +#define DEF(id, str) str, +#include "unicode_gen_def.h" +#undef DEF +}; + +#undef UNICODE_GENERAL_CATEGORY + +#define UNICODE_SCRIPT + +typedef enum { +#define DEF(id, str) SCRIPT_ ## id, +#include "unicode_gen_def.h" +#undef DEF + SCRIPT_COUNT, +} UnicodeScriptEnum1; + +static const char *unicode_script_name[] = { +#define DEF(id, str) #id, +#include "unicode_gen_def.h" +#undef DEF +}; + +const char *unicode_script_short_name[] = { +#define DEF(id, str) str, +#include "unicode_gen_def.h" +#undef DEF +}; + +#undef UNICODE_SCRIPT + +#define UNICODE_PROP_LIST + +typedef enum { +#define DEF(id, str) PROP_ ## id, +#include "unicode_gen_def.h" +#undef DEF + PROP_COUNT, +} UnicodePropEnum1; + +static const char *unicode_prop_name[] = { +#define DEF(id, str) #id, +#include "unicode_gen_def.h" +#undef DEF +}; + +static const char *unicode_prop_short_name[] = { +#define DEF(id, str) str, +#include "unicode_gen_def.h" +#undef DEF +}; + +#undef UNICODE_SPROP_LIST + +typedef struct { + /* case conv */ + uint8_t u_len; + uint8_t l_len; + int u_data[CC_LEN_MAX]; + int l_data[CC_LEN_MAX]; + int f_code; + + uint8_t combining_class; + uint8_t is_compat:1; + uint8_t is_excluded:1; + uint8_t general_category; + uint8_t script; + uint8_t script_ext_len; + uint8_t *script_ext; + uint32_t 
prop_bitmap_tab[3]; + /* decomposition */ + int decomp_len; + int *decomp_data; +} CCInfo; + +CCInfo *unicode_db; + +int find_name(const char **tab, int tab_len, const char *name) +{ + int i, len, name_len; + const char *p, *r; + + name_len = strlen(name); + for(i = 0; i < tab_len; i++) { + p = tab[i]; + for(;;) { + r = strchr(p, ','); + if (!r) + len = strlen(p); + else + len = r - p; + if (len == name_len && memcmp(p, name, len) == 0) + return i; + if (!r) + break; + p = r + 1; + } + } + return -1; +} + +static int get_prop(uint32_t c, int prop_idx) +{ + return (unicode_db[c].prop_bitmap_tab[prop_idx >> 5] >> (prop_idx & 0x1f)) & 1; +} + +static void set_prop(uint32_t c, int prop_idx, int val) +{ + uint32_t mask; + mask = 1U << (prop_idx & 0x1f); + if (val) + unicode_db[c].prop_bitmap_tab[prop_idx >> 5] |= mask; + else + unicode_db[c].prop_bitmap_tab[prop_idx >> 5] &= ~mask; +} + +void parse_unicode_data(const char *filename) +{ + FILE *f; + char line[1024]; + char buf1[256]; + const char *p; + int code, lc, uc, last_code; + CCInfo *ci, *tab = unicode_db; + + f = fopen(filename, "rb"); + if (!f) { + perror(filename); + exit(1); + } + + last_code = 0; + for(;;) { + if (!get_line(line, sizeof(line), f)) + break; + p = line; + while (isspace(*p)) + p++; + if (*p == '#') + continue; + + p = get_field(line, 0); + if (!p) + continue; + code = strtoul(p, NULL, 16); + lc = 0; + uc = 0; + + p = get_field(line, 12); + if (p && *p != ';') { + uc = strtoul(p, NULL, 16); + } + + p = get_field(line, 13); + if (p && *p != ';') { + lc = strtoul(p, NULL, 16); + } + ci = &tab[code]; + if (uc > 0 || lc > 0) { + assert(code <= CHARCODE_MAX); + if (uc > 0) { + assert(ci->u_len == 0); + ci->u_len = 1; + ci->u_data[0] = uc; + } + if (lc > 0) { + assert(ci->l_len == 0); + ci->l_len = 1; + ci->l_data[0] = lc; + } + } + + { + int i; + get_field_buf(buf1, sizeof(buf1), line, 2); + i = find_name(unicode_gc_name, countof(unicode_gc_name), buf1); + if (i < 0) { + fprintf(stderr, "General 
category '%s' not found\n", + buf1); + exit(1); + } + ci->general_category = i; + } + + p = get_field(line, 3); + if (p && *p != ';' && *p != '\0') { + int cc; + cc = strtoul(p, NULL, 0); + if (cc != 0) { + assert(code <= CHARCODE_MAX); + ci->combining_class = cc; + // printf("%05x: %d\n", code, ci->combining_class); + } + } + + p = get_field(line, 5); + if (p && *p != ';' && *p != '\0') { + int size; + assert(code <= CHARCODE_MAX); + ci->is_compat = 0; + if (*p == '<') { + while (*p != '\0' && *p != '>') + p++; + if (*p == '>') + p++; + ci->is_compat = 1; + } + size = 0; + for(;;) { + while (isspace(*p)) + p++; + if (!isxdigit(*p)) + break; + add_char(&ci->decomp_data, &size, &ci->decomp_len, strtoul(p, (char **)&p, 16)); + } +#if 0 + { + int i; + static int count, d_count; + + printf("%05x: %c", code, ci->is_compat ? 'C': ' '); + for(i = 0; i < ci->decomp_len; i++) + printf(" %05x", ci->decomp_data[i]); + printf("\n"); + count++; + d_count += ci->decomp_len; + // printf("%d %d\n", count, d_count); + } +#endif + } + + p = get_field(line, 9); + if (p && *p == 'Y') { + set_prop(code, PROP_Bidi_Mirrored, 1); + } + + /* handle ranges */ + get_field_buf(buf1, sizeof(buf1), line, 1); + if (strstr(buf1, " Last>")) { + int i; + // printf("range: 0x%x-%0x\n", last_code, code); + assert(ci->decomp_len == 0); + assert(ci->script_ext_len == 0); + for(i = last_code + 1; i < code; i++) { + unicode_db[i] = *ci; + } + } + last_code = code; + } + + fclose(f); +} + +void parse_special_casing(CCInfo *tab, const char *filename) +{ + FILE *f; + char line[1024]; + const char *p; + int code; + CCInfo *ci; + + f = fopen(filename, "rb"); + if (!f) { + perror(filename); + exit(1); + } + + for(;;) { + if (!get_line(line, sizeof(line), f)) + break; + p = line; + while (isspace(*p)) + p++; + if (*p == '#') + continue; + + p = get_field(line, 0); + if (!p) + continue; + code = strtoul(p, NULL, 16); + assert(code <= CHARCODE_MAX); + ci = &tab[code]; + + p = get_field(line, 4); + if (p) { + /* 
locale dependent casing */ + while (isspace(*p)) + p++; + if (*p != '#' && *p != '\0') + continue; + } + + + p = get_field(line, 1); + if (p && *p != ';') { + ci->l_len = 0; + for(;;) { + while (isspace(*p)) + p++; + if (*p == ';') + break; + assert(ci->l_len < CC_LEN_MAX); + ci->l_data[ci->l_len++] = strtoul(p, (char **)&p, 16); + } + + if (ci->l_len == 1 && ci->l_data[0] == code) + ci->l_len = 0; + } + + p = get_field(line, 3); + if (p && *p != ';') { + ci->u_len = 0; + for(;;) { + while (isspace(*p)) + p++; + if (*p == ';') + break; + assert(ci->u_len < CC_LEN_MAX); + ci->u_data[ci->u_len++] = strtoul(p, (char **)&p, 16); + } + + if (ci->u_len == 1 && ci->u_data[0] == code) + ci->u_len = 0; + } + } + + fclose(f); +} + +void parse_case_folding(CCInfo *tab, const char *filename) +{ + FILE *f; + char line[1024]; + const char *p; + int code; + CCInfo *ci; + + f = fopen(filename, "rb"); + if (!f) { + perror(filename); + exit(1); + } + + for(;;) { + if (!get_line(line, sizeof(line), f)) + break; + p = line; + while (isspace(*p)) + p++; + if (*p == '#') + continue; + + p = get_field(line, 0); + if (!p) + continue; + code = strtoul(p, NULL, 16); + assert(code <= CHARCODE_MAX); + ci = &tab[code]; + + p = get_field(line, 1); + if (!p) + continue; + /* locale dependent casing */ + while (isspace(*p)) + p++; + if (*p != 'C' && *p != 'S') + continue; + + p = get_field(line, 2); + assert(p != 0); + assert(ci->f_code == 0); + ci->f_code = strtoul(p, NULL, 16); + assert(ci->f_code != 0 && ci->f_code != code); + } + + fclose(f); +} + +void parse_composition_exclusions(const char *filename) +{ + FILE *f; + char line[4096], *p; + uint32_t c0; + + f = fopen(filename, "rb"); + if (!f) { + perror(filename); + exit(1); + } + + for(;;) { + if (!get_line(line, sizeof(line), f)) + break; + p = line; + while (isspace(*p)) + p++; + if (*p == '#' || *p == '@' || *p == '\0') + continue; + c0 = strtoul(p, (char **)&p, 16); + assert(c0 > 0 && c0 <= CHARCODE_MAX); + unicode_db[c0].is_excluded = 
TRUE; + } + fclose(f); +} + +void parse_derived_core_properties(const char *filename) +{ + FILE *f; + char line[4096], *p, buf[256], *q; + uint32_t c0, c1, c; + int i; + + f = fopen(filename, "rb"); + if (!f) { + perror(filename); + exit(1); + } + + for(;;) { + if (!get_line(line, sizeof(line), f)) + break; + p = line; + while (isspace(*p)) + p++; + if (*p == '#' || *p == '@' || *p == '\0') + continue; + c0 = strtoul(p, (char **)&p, 16); + if (*p == '.' && p[1] == '.') { + p += 2; + c1 = strtoul(p, (char **)&p, 16); + } else { + c1 = c0; + } + assert(c1 <= CHARCODE_MAX); + p += strspn(p, " \t"); + if (*p == ';') { + p++; + p += strspn(p, " \t"); + q = buf; + while (*p != '\0' && *p != ' ' && *p != '#' && *p != '\t') { + if ((q - buf) < sizeof(buf) - 1) + *q++ = *p; + p++; + } + *q = '\0'; + i = find_name(unicode_prop_name, + countof(unicode_prop_name), buf); + if (i < 0) { + if (!strcmp(buf, "Grapheme_Link")) + goto next; + fprintf(stderr, "Property not found: %s\n", buf); + exit(1); + } + for(c = c0; c <= c1; c++) { + set_prop(c, i, 1); + } +next: ; + } + } + fclose(f); +} + +void parse_derived_norm_properties(const char *filename) +{ + FILE *f; + char line[4096], *p, buf[256], *q; + uint32_t c0, c1, c; + + f = fopen(filename, "rb"); + if (!f) { + perror(filename); + exit(1); + } + + for(;;) { + if (!get_line(line, sizeof(line), f)) + break; + p = line; + while (isspace(*p)) + p++; + if (*p == '#' || *p == '@' || *p == '\0') + continue; + c0 = strtoul(p, (char **)&p, 16); + if (*p == '.' 
&& p[1] == '.') { + p += 2; + c1 = strtoul(p, (char **)&p, 16); + } else { + c1 = c0; + } + assert(c1 <= CHARCODE_MAX); + p += strspn(p, " \t"); + if (*p == ';') { + p++; + p += strspn(p, " \t"); + q = buf; + while (*p != '\0' && *p != ' ' && *p != '#' && *p != '\t') { + if ((q - buf) < sizeof(buf) - 1) + *q++ = *p; + p++; + } + *q = '\0'; + if (!strcmp(buf, "Changes_When_NFKC_Casefolded")) { + for(c = c0; c <= c1; c++) { + set_prop(c, PROP_Changes_When_NFKC_Casefolded, 1); + } + } + } + } + fclose(f); +} + +void parse_prop_list(const char *filename) +{ + FILE *f; + char line[4096], *p, buf[256], *q; + uint32_t c0, c1, c; + int i; + + f = fopen(filename, "rb"); + if (!f) { + perror(filename); + exit(1); + } + + for(;;) { + if (!get_line(line, sizeof(line), f)) + break; + p = line; + while (isspace(*p)) + p++; + if (*p == '#' || *p == '@' || *p == '\0') + continue; + c0 = strtoul(p, (char **)&p, 16); + if (*p == '.' && p[1] == '.') { + p += 2; + c1 = strtoul(p, (char **)&p, 16); + } else { + c1 = c0; + } + assert(c1 <= CHARCODE_MAX); + p += strspn(p, " \t"); + if (*p == ';') { + p++; + p += strspn(p, " \t"); + q = buf; + while (*p != '\0' && *p != ' ' && *p != '#' && *p != '\t') { + if ((q - buf) < sizeof(buf) - 1) + *q++ = *p; + p++; + } + *q = '\0'; + i = find_name(unicode_prop_name, + countof(unicode_prop_name), buf); + if (i < 0) { + fprintf(stderr, "Property not found: %s\n", buf); + exit(1); + } + for(c = c0; c <= c1; c++) { + set_prop(c, i, 1); + } + } + } + fclose(f); +} + +void parse_scripts(const char *filename) +{ + FILE *f; + char line[4096], *p, buf[256], *q; + uint32_t c0, c1, c; + int i; + + f = fopen(filename, "rb"); + if (!f) { + perror(filename); + exit(1); + } + + for(;;) { + if (!get_line(line, sizeof(line), f)) + break; + p = line; + while (isspace(*p)) + p++; + if (*p == '#' || *p == '@' || *p == '\0') + continue; + c0 = strtoul(p, (char **)&p, 16); + if (*p == '.' 
&& p[1] == '.') { + p += 2; + c1 = strtoul(p, (char **)&p, 16); + } else { + c1 = c0; + } + assert(c1 <= CHARCODE_MAX); + p += strspn(p, " \t"); + if (*p == ';') { + p++; + p += strspn(p, " \t"); + q = buf; + while (*p != '\0' && *p != ' ' && *p != '#' && *p != '\t') { + if ((q - buf) < sizeof(buf) - 1) + *q++ = *p; + p++; + } + *q = '\0'; + i = find_name(unicode_script_name, + countof(unicode_script_name), buf); + if (i < 0) { + fprintf(stderr, "Unknown script: '%s'\n", buf); + exit(1); + } + for(c = c0; c <= c1; c++) + unicode_db[c].script = i; + } + } + fclose(f); +} + +void parse_script_extensions(const char *filename) +{ + FILE *f; + char line[4096], *p, buf[256], *q; + uint32_t c0, c1, c; + int i; + uint8_t script_ext[255]; + int script_ext_len; + + f = fopen(filename, "rb"); + if (!f) { + perror(filename); + exit(1); + } + + for(;;) { + if (!get_line(line, sizeof(line), f)) + break; + p = line; + while (isspace(*p)) + p++; + if (*p == '#' || *p == '@' || *p == '\0') + continue; + c0 = strtoul(p, (char **)&p, 16); + if (*p == '.' 
&& p[1] == '.') { + p += 2; + c1 = strtoul(p, (char **)&p, 16); + } else { + c1 = c0; + } + assert(c1 <= CHARCODE_MAX); + p += strspn(p, " \t"); + script_ext_len = 0; + if (*p == ';') { + p++; + for(;;) { + p += strspn(p, " \t"); + q = buf; + while (*p != '\0' && *p != ' ' && *p != '#' && *p != '\t') { + if ((q - buf) < sizeof(buf) - 1) + *q++ = *p; + p++; + } + *q = '\0'; + if (buf[0] == '\0') + break; + i = find_name(unicode_script_short_name, + countof(unicode_script_short_name), buf); + if (i < 0) { + fprintf(stderr, "Script not found: %s\n", buf); + exit(1); + } + assert(script_ext_len < sizeof(script_ext)); + script_ext[script_ext_len++] = i; + } + for(c = c0; c <= c1; c++) { + CCInfo *ci = &unicode_db[c]; + ci->script_ext_len = script_ext_len; + ci->script_ext = malloc(sizeof(ci->script_ext[0]) * script_ext_len); + for(i = 0; i < script_ext_len; i++) + ci->script_ext[i] = script_ext[i]; + } + } + } + fclose(f); +} + +void dump_cc_info(CCInfo *ci, int i) +{ + int j; + printf("%05x:", i); + if (ci->u_len != 0) { + printf(" U:"); + for(j = 0; j < ci->u_len; j++) + printf(" %05x", ci->u_data[j]); + } + if (ci->l_len != 0) { + printf(" L:"); + for(j = 0; j < ci->l_len; j++) + printf(" %05x", ci->l_data[j]); + } + if (ci->f_code != 0) { + printf(" F: %05x", ci->f_code); + } + printf("\n"); +} + +void dump_data(CCInfo *tab) +{ + int i; + CCInfo *ci; + for(i = 0; i <= CHARCODE_MAX; i++) { + ci = &tab[i]; + if (ci->u_len != 0 || ci->l_len != 0 || ci->f_code != 0) { + dump_cc_info(ci, i); + } + } +} + +BOOL is_complicated_case(const CCInfo *ci) +{ + return (ci->u_len > 1 || ci->l_len > 1 || + (ci->u_len > 0 && ci->l_len > 0) || + (ci->f_code != 0) != ci->l_len || + (ci->f_code != 0 && ci->l_data[0] != ci->f_code)); +} + +#ifndef USE_TEST +enum { + RUN_TYPE_U, + RUN_TYPE_L, + RUN_TYPE_UF, + RUN_TYPE_LF, + RUN_TYPE_UL, + RUN_TYPE_LSU, + RUN_TYPE_U2L_399_EXT2, + RUN_TYPE_UF_D20, + RUN_TYPE_UF_D1_EXT, + RUN_TYPE_U_EXT, + RUN_TYPE_LF_EXT, + RUN_TYPE_U_EXT2, + 
RUN_TYPE_L_EXT2, + RUN_TYPE_U_EXT3, +}; +#endif + +const char *run_type_str[] = { + "U", + "L", + "UF", + "LF", + "UL", + "LSU", + "U2L_399_EXT2", + "UF_D20", + "UF_D1_EXT", + "U_EXT", + "LF_EXT", + "U_EXT2", + "L_EXT2", + "U_EXT3", +}; + +typedef struct { + int code; + int len; + int type; + int data; + int ext_len; + int ext_data[3]; + int data_index; /* 'data' coming from the table */ +} TableEntry; + +/* code (17), len (7), type (4) */ + +void find_run_type(TableEntry *te, CCInfo *tab, int code) +{ + int is_lower, len; + CCInfo *ci, *ci1, *ci2; + + ci = &tab[code]; + ci1 = &tab[code + 1]; + ci2 = &tab[code + 2]; + te->code = code; + + if (ci->l_len == 1 && ci->l_data[0] == code + 2 && + ci->f_code == ci->l_data[0] && + ci->u_len == 0 && + + ci1->l_len == 1 && ci1->l_data[0] == code + 2 && + ci1->f_code == ci1->l_data[0] && + ci1->u_len == 1 && ci1->u_data[0] == code && + + ci2->l_len == 0 && + ci2->f_code == 0 && + ci2->u_len == 1 && ci2->u_data[0] == code) { + te->len = 3; + te->data = 0; + te->type = RUN_TYPE_LSU; + return; + } + + if (is_complicated_case(ci)) { + len = 1; + while (code + len <= CHARCODE_MAX) { + ci1 = &tab[code + len]; + if (ci1->u_len != 1 || + ci1->u_data[0] != ci->u_data[0] + len || + ci1->l_len != 0 || + ci1->f_code != ci1->u_data[0]) + break; + len++; + } + if (len > 1) { + te->len = len; + te->type = RUN_TYPE_UF; + te->data = ci->u_data[0]; + return; + } + + if (ci->u_len == 2 && ci->u_data[1] == 0x399 && + ci->f_code == 0 && ci->l_len == 0) { + len = 1; + while (code + len <= CHARCODE_MAX) { + ci1 = &tab[code + len]; + if (!(ci1->u_len == 2 && + ci1->u_data[1] == 0x399 && + ci1->u_data[0] == ci->u_data[0] + len && + ci1->f_code == 0 && + ci1->l_len == 0)) + break; + len++; + } + te->len = len; + te->type = RUN_TYPE_U_EXT2; + te->ext_data[0] = ci->u_data[0]; + te->ext_data[1] = ci->u_data[1]; + te->ext_len = 2; + return; + } + + if (ci->u_len == 2 && ci->u_data[1] == 0x399 && + ci->l_len == 1 && ci->f_code == ci->l_data[0]) { + len = 
1; + while (code + len <= CHARCODE_MAX) { + ci1 = &tab[code + len]; + if (!(ci1->u_len == 2 && + ci1->u_data[1] == 0x399 && + ci1->u_data[0] == ci->u_data[0] + len && + ci1->l_len == 1 && + ci1->l_data[0] == ci->l_data[0] + len && + ci1->f_code == ci1->l_data[0])) + break; + len++; + } + te->len = len; + te->type = RUN_TYPE_U2L_399_EXT2; + te->ext_data[0] = ci->u_data[0]; + te->ext_data[1] = ci->l_data[0]; + te->ext_len = 2; + return; + } + + if (ci->l_len == 1 && ci->u_len == 0 && ci->f_code == 0) { + len = 1; + while (code + len <= CHARCODE_MAX) { + ci1 = &tab[code + len]; + if (!(ci1->l_len == 1 && + ci1->l_data[0] == ci->l_data[0] + len && + ci1->u_len == 0 && ci1->f_code == 0)) + break; + len++; + } + te->len = len; + te->type = RUN_TYPE_L; + te->data = ci->l_data[0]; + return; + } + + if (ci->l_len == 0 && + ci->u_len == 1 && + ci->u_data[0] < 0x1000 && + ci->f_code == ci->u_data[0] + 0x20) { + te->len = 1; + te->type = RUN_TYPE_UF_D20; + te->data = ci->u_data[0]; + } else if (ci->l_len == 0 && + ci->u_len == 1 && + ci->f_code == ci->u_data[0] + 1) { + te->len = 1; + te->type = RUN_TYPE_UF_D1_EXT; + te->ext_data[0] = ci->u_data[0]; + te->ext_len = 1; + } else if (ci->l_len == 2 && ci->u_len == 0 && ci->f_code == 0) { + te->len = 1; + te->type = RUN_TYPE_L_EXT2; + te->ext_data[0] = ci->l_data[0]; + te->ext_data[1] = ci->l_data[1]; + te->ext_len = 2; + } else if (ci->u_len == 2 && ci->l_len == 0 && ci->f_code == 0) { + te->len = 1; + te->type = RUN_TYPE_U_EXT2; + te->ext_data[0] = ci->u_data[0]; + te->ext_data[1] = ci->u_data[1]; + te->ext_len = 2; + } else if (ci->u_len == 3 && ci->l_len == 0 && ci->f_code == 0) { + te->len = 1; + te->type = RUN_TYPE_U_EXT3; + te->ext_data[0] = ci->u_data[0]; + te->ext_data[1] = ci->u_data[1]; + te->ext_data[2] = ci->u_data[2]; + te->ext_len = 3; + } else { + printf("unsupported encoding case:\n"); + dump_cc_info(ci, code); + abort(); + } + } else { + /* look for a run of identical conversions */ + len = 0; + for(;;) { + if 
(code >= CHARCODE_MAX || len >= 126) + break; + ci = &tab[code + len]; + ci1 = &tab[code + len + 1]; + if (is_complicated_case(ci) || is_complicated_case(ci1)) { + break; + } + if (ci->l_len != 1 || ci->l_data[0] != code + len + 1) + break; + if (ci1->u_len != 1 || ci1->u_data[0] != code + len) + break; + len += 2; + } + if (len > 0) { + te->len = len; + te->type = RUN_TYPE_UL; + te->data = 0; + return; + } + + ci = &tab[code]; + is_lower = ci->l_len > 0; + len = 1; + while (code + len <= CHARCODE_MAX) { + ci1 = &tab[code + len]; + if (is_complicated_case(ci1)) + break; + if (is_lower) { + if (ci1->l_len != 1 || + ci1->l_data[0] != ci->l_data[0] + len) + break; + } else { + if (ci1->u_len != 1 || + ci1->u_data[0] != ci->u_data[0] + len) + break; + } + len++; + } + te->len = len; + if (is_lower) { + te->type = RUN_TYPE_LF; + te->data = ci->l_data[0]; + } else { + te->type = RUN_TYPE_U; + te->data = ci->u_data[0]; + } + } +} + +TableEntry conv_table[1000]; +int conv_table_len; +int ext_data[1000]; +int ext_data_len; + +void dump_case_conv_table1(void) +{ + int i, j; + const TableEntry *te; + + for(i = 0; i < conv_table_len; i++) { + te = &conv_table[i]; + printf("%05x %02x %-10s %05x", + te->code, te->len, run_type_str[te->type], te->data); + for(j = 0; j < te->ext_len; j++) { + printf(" %05x", te->ext_data[j]); + } + printf("\n"); + } + printf("table_len=%d ext_len=%d\n", conv_table_len, ext_data_len); +} + +int find_data_index(const TableEntry *conv_table, int len, int data) +{ + int i; + const TableEntry *te; + for(i = 0; i < len; i++) { + te = &conv_table[i]; + if (te->code == data) + return i; + } + return -1; +} + +int find_ext_data_index(int data) +{ + int i; + for(i = 0; i < ext_data_len; i++) { + if (ext_data[i] == data) + return i; + } + assert(ext_data_len < countof(ext_data)); + ext_data[ext_data_len++] = data; + return ext_data_len - 1; +} + +void build_conv_table(CCInfo *tab) +{ + int code, i, j; + CCInfo *ci; + TableEntry *te; + + te = conv_table; + 
for(code = 0; code <= CHARCODE_MAX; code++) { + ci = &tab[code]; + if (ci->u_len == 0 && ci->l_len == 0 && ci->f_code == 0) + continue; + assert(te - conv_table < countof(conv_table)); + find_run_type(te, tab, code); +#if 0 + if (te->type == RUN_TYPE_TODO) { + printf("TODO: "); + dump_cc_info(ci, code); + } +#endif + assert(te->len <= 127); + code += te->len - 1; + te++; + } + conv_table_len = te - conv_table; + + /* find the data index */ + for(i = 0; i < conv_table_len; i++) { + int data_index; + te = &conv_table[i]; + + switch(te->type) { + case RUN_TYPE_U: + case RUN_TYPE_L: + case RUN_TYPE_UF: + case RUN_TYPE_LF: + data_index = find_data_index(conv_table, conv_table_len, te->data); + if (data_index < 0) { + switch(te->type) { + case RUN_TYPE_U: + te->type = RUN_TYPE_U_EXT; + te->ext_len = 1; + te->ext_data[0] = te->data; + break; + case RUN_TYPE_LF: + te->type = RUN_TYPE_LF_EXT; + te->ext_len = 1; + te->ext_data[0] = te->data; + break; + default: + printf("%05x: index not found\n", te->code); + exit(1); + } + } else { + te->data_index = data_index; + } + break; + case RUN_TYPE_UF_D20: + te->data_index = te->data; + break; + } + } + + /* find the data index for ext_data */ + for(i = 0; i < conv_table_len; i++) { + te = &conv_table[i]; + if (te->type == RUN_TYPE_U_EXT3) { + int p, v; + v = 0; + for(j = 0; j < 3; j++) { + p = find_ext_data_index(te->ext_data[j]); + assert(p < 16); + v = (v << 4) | p; + } + te->data_index = v; + } + } + + for(i = 0; i < conv_table_len; i++) { + te = &conv_table[i]; + if (te->type == RUN_TYPE_L_EXT2 || + te->type == RUN_TYPE_U_EXT2 || + te->type == RUN_TYPE_U2L_399_EXT2) { + int p, v; + v = 0; + for(j = 0; j < 2; j++) { + p = find_ext_data_index(te->ext_data[j]); + assert(p < 64); + v = (v << 6) | p; + } + te->data_index = v; + } + } + + for(i = 0; i < conv_table_len; i++) { + te = &conv_table[i]; + if (te->type == RUN_TYPE_UF_D1_EXT || + te->type == RUN_TYPE_U_EXT || + te->type == RUN_TYPE_LF_EXT) { + te->data_index = 
find_ext_data_index(te->ext_data[0]); + } + } +#ifdef DUMP_CASE_CONV_TABLE + dump_case_conv_table1(); +#endif +} + +void dump_case_conv_table(FILE *f) +{ + int i; + uint32_t v; + const TableEntry *te; + + fprintf(f, "static const uint32_t case_conv_table1[%u] = {", conv_table_len); + for(i = 0; i < conv_table_len; i++) { + if (i % 4 == 0) + fprintf(f, "\n "); + te = &conv_table[i]; + v = te->code << (32 - 17); + v |= te->len << (32 - 17 - 7); + v |= te->type << (32 - 17 - 7 - 4); + v |= te->data_index >> 8; + fprintf(f, " 0x%08x,", v); + } + fprintf(f, "\n};\n\n"); + + fprintf(f, "static const uint8_t case_conv_table2[%u] = {", conv_table_len); + for(i = 0; i < conv_table_len; i++) { + if (i % 8 == 0) + fprintf(f, "\n "); + te = &conv_table[i]; + fprintf(f, " 0x%02x,", te->data_index & 0xff); + } + fprintf(f, "\n};\n\n"); + + fprintf(f, "static const uint16_t case_conv_ext[%u] = {", ext_data_len); + for(i = 0; i < ext_data_len; i++) { + if (i % 8 == 0) + fprintf(f, "\n "); + fprintf(f, " 0x%04x,", ext_data[i]); + } + fprintf(f, "\n};\n\n"); +} + +int tabcmp(const int *tab1, const int *tab2, int n) +{ + int i; + for(i = 0; i < n; i++) { + if (tab1[i] != tab2[i]) + return -1; + } + return 0; +} + +void dump_str(const char *str, const int *buf, int len) +{ + int i; + printf("%s=", str); + for(i = 0; i < len; i++) + printf(" %05x", buf[i]); + printf("\n"); +} + +void compute_internal_props(void) +{ + int i; + BOOL has_ul; + + for(i = 0; i <= CHARCODE_MAX; i++) { + CCInfo *ci = &unicode_db[i]; + has_ul = (ci->u_len != 0 || ci->l_len != 0 || ci->f_code != 0); + if (has_ul) { + assert(get_prop(i, PROP_Cased)); + } else { + set_prop(i, PROP_Cased1, get_prop(i, PROP_Cased)); + } + set_prop(i, PROP_ID_Continue1, + get_prop(i, PROP_ID_Continue) & (get_prop(i, PROP_ID_Start) ^ 1)); + set_prop(i, PROP_XID_Start1, + get_prop(i, PROP_ID_Start) ^ get_prop(i, PROP_XID_Start)); + set_prop(i, PROP_XID_Continue1, + get_prop(i, PROP_ID_Continue) ^ get_prop(i, PROP_XID_Continue)); + 
set_prop(i, PROP_Changes_When_Titlecased1, + get_prop(i, PROP_Changes_When_Titlecased) ^ (ci->u_len != 0)); + set_prop(i, PROP_Changes_When_Casefolded1, + get_prop(i, PROP_Changes_When_Casefolded) ^ (ci->f_code != 0)); + /* XXX: reduce table size (438 bytes) */ + set_prop(i, PROP_Changes_When_NFKC_Casefolded1, + get_prop(i, PROP_Changes_When_NFKC_Casefolded) ^ (ci->f_code != 0)); +#if 0 + /* TEST */ +#define M(x) (1U << GCAT_ ## x) + { + int b; + b = ((M(Mn) | M(Cf) | M(Lm) | M(Sk)) >> + unicode_db[i].general_category) & 1; + set_prop(i, PROP_Cased1, + get_prop(i, PROP_Case_Ignorable) ^ b); + } +#undef M +#endif + } +} + +void dump_byte_table(FILE *f, const char *cname, const uint8_t *tab, int len) +{ + int i; + fprintf(f, "static const uint8_t %s[%d] = {", cname, len); + for(i = 0; i < len; i++) { + if (i % 8 == 0) + fprintf(f, "\n "); + fprintf(f, " 0x%02x,", tab[i]); + } + fprintf(f, "\n};\n\n"); +} + +#define PROP_BLOCK_LEN 32 + +void build_prop_table(FILE *f, int prop_index, BOOL add_index) +{ + int i, j, n, v, offset, code; + DynBuf dbuf_s, *dbuf = &dbuf_s; + DynBuf dbuf1_s, *dbuf1 = &dbuf1_s; + DynBuf dbuf2_s, *dbuf2 = &dbuf2_s; + const uint32_t *buf; + int buf_len, block_end_pos, bit; + char cname[128]; + + dbuf_init(dbuf1); + + for(i = 0; i <= CHARCODE_MAX;) { + v = get_prop(i, prop_index); + j = i + 1; + while (j <= CHARCODE_MAX && get_prop(j, prop_index) == v) { + j++; + } + n = j - i; + if (j == (CHARCODE_MAX + 1) && v == 0) + break; /* no need to encode last zero run */ + //printf("%05x: %d %d\n", i, n, v); + dbuf_put_u32(dbuf1, n - 1); + i += n; + } + + dbuf_init(dbuf); + dbuf_init(dbuf2); + buf = (uint32_t *)dbuf1->buf; + buf_len = dbuf1->size / sizeof(buf[0]); + + /* the first value is assumed to be 0 */ + assert(get_prop(0, prop_index) == 0); + + block_end_pos = PROP_BLOCK_LEN; + i = 0; + code = 0; + bit = 0; + while (i < buf_len) { + if (add_index && dbuf->size >= block_end_pos && bit == 0) { + offset = (dbuf->size - block_end_pos); + /* XXX: 
offset could be larger in case of runs of small + lengths. Could add code to change the encoding to + prevent it at the expense of one byte loss */ + assert(offset <= 7); + v = code | (offset << 21); + dbuf_putc(dbuf2, v); + dbuf_putc(dbuf2, v >> 8); + dbuf_putc(dbuf2, v >> 16); + block_end_pos += PROP_BLOCK_LEN; + } + + v = buf[i]; + code += v + 1; + bit ^= 1; + if (v < 8 && (i + 1) < buf_len && buf[i + 1] < 8) { + code += buf[i + 1] + 1; + bit ^= 1; + dbuf_putc(dbuf, (v << 3) | buf[i + 1]); + i += 2; + } else if (v < 128) { + dbuf_putc(dbuf, 0x80 + v); + i++; + } else if (v < (1 << 13)) { + dbuf_putc(dbuf, 0x40 + (v >> 8)); + dbuf_putc(dbuf, v); + i++; + } else { + assert(v < (1 << 21)); + dbuf_putc(dbuf, 0x60 + (v >> 16)); + dbuf_putc(dbuf, v >> 8); + dbuf_putc(dbuf, v); + i++; + } + } + + if (add_index) { + /* last index entry */ + v = code; + dbuf_putc(dbuf2, v); + dbuf_putc(dbuf2, v >> 8); + dbuf_putc(dbuf2, v >> 16); + } + +#ifdef DUMP_TABLE_SIZE + printf("prop %s: length=%d bytes\n", unicode_prop_name[prop_index], + (int)(dbuf->size + dbuf2->size)); +#endif + snprintf(cname, sizeof(cname), "unicode_prop_%s_table", unicode_prop_name[prop_index]); + dump_byte_table(f, cname, dbuf->buf, dbuf->size); + if (add_index) { + snprintf(cname, sizeof(cname), "unicode_prop_%s_index", unicode_prop_name[prop_index]); + dump_byte_table(f, cname, dbuf2->buf, dbuf2->size); + } + + dbuf_free(dbuf); + dbuf_free(dbuf1); + dbuf_free(dbuf2); +} + +void build_flags_tables(FILE *f) +{ + build_prop_table(f, PROP_Cased1, TRUE); + build_prop_table(f, PROP_Case_Ignorable, TRUE); + build_prop_table(f, PROP_ID_Start, TRUE); + build_prop_table(f, PROP_ID_Continue1, TRUE); +} + +void dump_name_table(FILE *f, const char *cname, const char **tab_name, int len, + const char **tab_short_name) +{ + int i, w, maxw; + + maxw = 0; + for(i = 0; i < len; i++) { + w = strlen(tab_name[i]); + if (tab_short_name[i][0] != '\0') { + w += 1 + strlen(tab_short_name[i]); + } + if (maxw < w) + maxw = w; + } 
+ + /* generate a sequence of strings terminated by an empty string */ + fprintf(f, "static const char %s[] =\n", cname); + for(i = 0; i < len; i++) { + fprintf(f, " \""); + w = fprintf(f, "%s", tab_name[i]); + if (tab_short_name[i][0] != '\0') { + w += fprintf(f, ",%s", tab_short_name[i]); + } + fprintf(f, "\"%*s\"\\0\"\n", 1 + maxw - w, ""); + } + fprintf(f, ";\n\n"); +} + +void build_general_category_table(FILE *f) +{ + int i, v, j, n, n1; + DynBuf dbuf_s, *dbuf = &dbuf_s; + int cw_count, cw_len_count[4], cw_start; + + fprintf(f, "typedef enum {\n"); + for(i = 0; i < GCAT_COUNT; i++) + fprintf(f, " UNICODE_GC_%s,\n", unicode_gc_name[i]); + fprintf(f, " UNICODE_GC_COUNT,\n"); + fprintf(f, "} UnicodeGCEnum;\n\n"); + + dump_name_table(f, "unicode_gc_name_table", + unicode_gc_name, GCAT_COUNT, + unicode_gc_short_name); + + + dbuf_init(dbuf); + cw_count = 0; + for(i = 0; i < 4; i++) + cw_len_count[i] = 0; + for(i = 0; i <= CHARCODE_MAX;) { + v = unicode_db[i].general_category; + j = i + 1; + while (j <= CHARCODE_MAX && unicode_db[j].general_category == v) + j++; + n = j - i; + /* compress Lu/Ll runs */ + if (v == GCAT_Lu) { + n1 = 1; + while ((i + n1) <= CHARCODE_MAX && unicode_db[i + n1].general_category == (v + (n1 & 1))) { + n1++; + } + if (n1 > n) { + v = 31; + n = n1; + } + } + // printf("%05x %05x %d\n", i, n, v); + cw_count++; + n--; + cw_start = dbuf->size; + if (n < 7) { + dbuf_putc(dbuf, (n << 5) | v); + } else if (n < 7 + 128) { + n1 = n - 7; + assert(n1 < 128); + dbuf_putc(dbuf, (0xf << 5) | v); + dbuf_putc(dbuf, n1); + } else if (n < 7 + 128 + (1 << 14)) { + n1 = n - (7 + 128); + assert(n1 < (1 << 14)); + dbuf_putc(dbuf, (0xf << 5) | v); + dbuf_putc(dbuf, (n1 >> 8) + 128); + dbuf_putc(dbuf, n1); + } else { + n1 = n - (7 + 128 + (1 << 14)); + assert(n1 < (1 << 22)); + dbuf_putc(dbuf, (0xf << 5) | v); + dbuf_putc(dbuf, (n1 >> 16) + 128 + 64); + dbuf_putc(dbuf, n1 >> 8); + dbuf_putc(dbuf, n1); + } + cw_len_count[dbuf->size - cw_start - 1]++; + i += n + 1; + 
} +#ifdef DUMP_TABLE_SIZE + printf("general category: %d entries [", + cw_count); + for(i = 0; i < 4; i++) + printf(" %d", cw_len_count[i]); + printf(" ], length=%d bytes\n", (int)dbuf->size); +#endif + + dump_byte_table(f, "unicode_gc_table", dbuf->buf, dbuf->size); + + dbuf_free(dbuf); +} + +void build_script_table(FILE *f) +{ + int i, v, j, n, n1, type; + DynBuf dbuf_s, *dbuf = &dbuf_s; + int cw_count, cw_len_count[4], cw_start; + + fprintf(f, "typedef enum {\n"); + for(i = 0; i < SCRIPT_COUNT; i++) + fprintf(f, " UNICODE_SCRIPT_%s,\n", unicode_script_name[i]); + fprintf(f, " UNICODE_SCRIPT_COUNT,\n"); + fprintf(f, "} UnicodeScriptEnum;\n\n"); + + i = 1; + dump_name_table(f, "unicode_script_name_table", + unicode_script_name + i, SCRIPT_COUNT - i, + unicode_script_short_name + i); + + dbuf_init(dbuf); + cw_count = 0; + for(i = 0; i < 4; i++) + cw_len_count[i] = 0; + for(i = 0; i <= CHARCODE_MAX;) { + v = unicode_db[i].script; + j = i + 1; + while (j <= CHARCODE_MAX && unicode_db[j].script == v) + j++; + n = j - i; + if (v == 0 && j == (CHARCODE_MAX + 1)) + break; + // printf("%05x %05x %d\n", i, n, v); + cw_count++; + n--; + cw_start = dbuf->size; + if (v == 0) + type = 0; + else + type = 1; + if (n < 96) { + dbuf_putc(dbuf, n | (type << 7)); + } else if (n < 96 + (1 << 12)) { + n1 = n - 96; + assert(n1 < (1 << 12)); + dbuf_putc(dbuf, ((n1 >> 8) + 96) | (type << 7)); + dbuf_putc(dbuf, n1); + } else { + n1 = n - (96 + (1 << 12)); + assert(n1 < (1 << 20)); + dbuf_putc(dbuf, ((n1 >> 16) + 112) | (type << 7)); + dbuf_putc(dbuf, n1 >> 8); + dbuf_putc(dbuf, n1); + } + if (type != 0) + dbuf_putc(dbuf, v); + + cw_len_count[dbuf->size - cw_start - 1]++; + i += n + 1; + } +#if defined(DUMP_TABLE_SIZE) + printf("script: %d entries [", + cw_count); + for(i = 0; i < 4; i++) + printf(" %d", cw_len_count[i]); + printf(" ], length=%d bytes\n", (int)dbuf->size); +#endif + + dump_byte_table(f, "unicode_script_table", dbuf->buf, dbuf->size); + + dbuf_free(dbuf); +} + +void 
build_script_ext_table(FILE *f) +{ + int i, j, n, n1, script_ext_len; + DynBuf dbuf_s, *dbuf = &dbuf_s; + int cw_count; + + dbuf_init(dbuf); + cw_count = 0; + for(i = 0; i <= CHARCODE_MAX;) { + script_ext_len = unicode_db[i].script_ext_len; + j = i + 1; + while (j <= CHARCODE_MAX && + unicode_db[j].script_ext_len == script_ext_len && + !memcmp(unicode_db[j].script_ext, unicode_db[i].script_ext, + script_ext_len)) { + j++; + } + n = j - i; + cw_count++; + n--; + if (n < 128) { + dbuf_putc(dbuf, n); + } else if (n < 128 + (1 << 14)) { + n1 = n - 128; + assert(n1 < (1 << 14)); + dbuf_putc(dbuf, (n1 >> 8) + 128); + dbuf_putc(dbuf, n1); + } else { + n1 = n - (128 + (1 << 14)); + assert(n1 < (1 << 22)); + dbuf_putc(dbuf, (n1 >> 16) + 128 + 64); + dbuf_putc(dbuf, n1 >> 8); + dbuf_putc(dbuf, n1); + } + dbuf_putc(dbuf, script_ext_len); + for(j = 0; j < script_ext_len; j++) + dbuf_putc(dbuf, unicode_db[i].script_ext[j]); + i += n + 1; + } +#ifdef DUMP_TABLE_SIZE + printf("script_ext: %d entries", + cw_count); + printf(", length=%d bytes\n", (int)dbuf->size); +#endif + + dump_byte_table(f, "unicode_script_ext_table", dbuf->buf, dbuf->size); + + dbuf_free(dbuf); +} + +/* the following properties are synthetized so no table is necessary */ +#define PROP_TABLE_COUNT PROP_ASCII + +void build_prop_list_table(FILE *f) +{ + int i; + + for(i = 0; i < PROP_TABLE_COUNT; i++) { + if (i == PROP_ID_Start || + i == PROP_Case_Ignorable || + i == PROP_ID_Continue1) { + /* already generated */ + } else { + build_prop_table(f, i, FALSE); + } + } + + fprintf(f, "typedef enum {\n"); + for(i = 0; i < PROP_COUNT; i++) + fprintf(f, " UNICODE_PROP_%s,\n", unicode_prop_name[i]); + fprintf(f, " UNICODE_PROP_COUNT,\n"); + fprintf(f, "} UnicodePropertyEnum;\n\n"); + + i = PROP_ASCII_Hex_Digit; + dump_name_table(f, "unicode_prop_name_table", + unicode_prop_name + i, PROP_XID_Start - i + 1, + unicode_prop_short_name + i); + + fprintf(f, "static const uint8_t * const unicode_prop_table[] = {\n"); + for(i = 
0; i < PROP_TABLE_COUNT; i++) { + fprintf(f, " unicode_prop_%s_table,\n", unicode_prop_name[i]); + } + fprintf(f, "};\n\n"); + + fprintf(f, "static const uint16_t unicode_prop_len_table[] = {\n"); + for(i = 0; i < PROP_TABLE_COUNT; i++) { + fprintf(f, " countof(unicode_prop_%s_table),\n", unicode_prop_name[i]); + } + fprintf(f, "};\n\n"); +} + +#ifdef USE_TEST +int check_conv(uint32_t *res, uint32_t c, int conv_type) +{ + return lre_case_conv(res, c, conv_type); +} + +void check_case_conv(void) +{ + CCInfo *tab = unicode_db; + uint32_t res[3]; + int l, error; + CCInfo ci_s, *ci1, *ci = &ci_s; + int code; + + for(code = 0; code <= CHARCODE_MAX; code++) { + ci1 = &tab[code]; + *ci = *ci1; + if (ci->l_len == 0) { + ci->l_len = 1; + ci->l_data[0] = code; + } + if (ci->u_len == 0) { + ci->u_len = 1; + ci->u_data[0] = code; + } + if (ci->f_code == 0) + ci->f_code = code; + + error = 0; + l = check_conv(res, code, 0); + if (l != ci->u_len || tabcmp((int *)res, ci->u_data, l)) { + printf("ERROR: L\n"); + error++; + } + l = check_conv(res, code, 1); + if (l != ci->l_len || tabcmp((int *)res, ci->l_data, l)) { + printf("ERROR: U\n"); + error++; + } + l = check_conv(res, code, 2); + if (l != 1 || res[0] != ci->f_code) { + printf("ERROR: F\n"); + error++; + } + if (error) { + dump_cc_info(ci, code); + exit(1); + } + } +} + +#ifdef PROFILE +static int64_t get_time_ns(void) +{ + struct timespec ts; + clock_gettime(CLOCK_MONOTONIC, &ts); + return (int64_t)ts.tv_sec * 1000000000 + ts.tv_nsec; +} +#endif + + +void check_flags(void) +{ + int c; + BOOL flag_ref, flag; + for(c = 0; c <= CHARCODE_MAX; c++) { + flag_ref = get_prop(c, PROP_Cased); + flag = lre_is_cased(c); + if (flag != flag_ref) { + printf("ERROR: c=%05x cased=%d ref=%d\n", + c, flag, flag_ref); + exit(1); + } + + flag_ref = get_prop(c, PROP_Case_Ignorable); + flag = lre_is_case_ignorable(c); + if (flag != flag_ref) { + printf("ERROR: c=%05x case_ignorable=%d ref=%d\n", + c, flag, flag_ref); + exit(1); + } + + flag_ref 
= get_prop(c, PROP_ID_Start); + flag = lre_is_id_start(c); + if (flag != flag_ref) { + printf("ERROR: c=%05x id_start=%d ref=%d\n", + c, flag, flag_ref); + exit(1); + } + + flag_ref = get_prop(c, PROP_ID_Continue); + flag = lre_is_id_continue(c); + if (flag != flag_ref) { + printf("ERROR: c=%05x id_cont=%d ref=%d\n", + c, flag, flag_ref); + exit(1); + } + } +#ifdef PROFILE + { + int64_t ti, count; + ti = get_time_ns(); + count = 0; + for(c = 0x20; c <= 0xffff; c++) { + flag_ref = get_prop(c, PROP_ID_Start); + flag = lre_is_id_start(c); + assert(flag == flag_ref); + count++; + } + ti = get_time_ns() - ti; + printf("flags time=%0.1f ns/char\n", + (double)ti / count); + } +#endif +} + +#endif + +#define CC_BLOCK_LEN 32 + +void build_cc_table(FILE *f) +{ + int i, cc, n, cc_table_len, type, n1; + DynBuf dbuf_s, *dbuf = &dbuf_s; + DynBuf dbuf1_s, *dbuf1 = &dbuf1_s; + int cw_len_tab[3], cw_start, block_end_pos; + uint32_t v; + + dbuf_init(dbuf); + dbuf_init(dbuf1); + cc_table_len = 0; + for(i = 0; i < countof(cw_len_tab); i++) + cw_len_tab[i] = 0; + block_end_pos = CC_BLOCK_LEN; + for(i = 0; i <= CHARCODE_MAX;) { + cc = unicode_db[i].combining_class; + assert(cc <= 255); + /* check increasing values */ + n = 1; + while ((i + n) <= CHARCODE_MAX && + unicode_db[i + n].combining_class == (cc + n)) + n++; + if (n >= 2) { + type = 1; + } else { + type = 0; + n = 1; + while ((i + n) <= CHARCODE_MAX && + unicode_db[i + n].combining_class == cc) + n++; + } + /* no need to encode the last run */ + if (cc == 0 && (i + n - 1) == CHARCODE_MAX) + break; +#ifdef DUMP_CC_TABLE + printf("%05x %6d %d %d\n", i, n, type, cc); +#endif + if (type == 0) { + if (cc == 0) + type = 2; + else if (cc == 230) + type = 3; + } + n1 = n - 1; + + /* add an entry to the index if necessary */ + if (dbuf->size >= block_end_pos) { + v = i | ((dbuf->size - block_end_pos) << 21); + dbuf_putc(dbuf1, v); + dbuf_putc(dbuf1, v >> 8); + dbuf_putc(dbuf1, v >> 16); + block_end_pos += CC_BLOCK_LEN; + } + cw_start = 
dbuf->size; + if (n1 < 48) { + dbuf_putc(dbuf, n1 | (type << 6)); + } else if (n1 < 48 + (1 << 11)) { + n1 -= 48; + dbuf_putc(dbuf, ((n1 >> 8) + 48) | (type << 6)); + dbuf_putc(dbuf, n1); + } else { + n1 -= 48 + (1 << 11); + assert(n1 < (1 << 20)); + dbuf_putc(dbuf, ((n1 >> 16) + 56) | (type << 6)); + dbuf_putc(dbuf, n1 >> 8); + dbuf_putc(dbuf, n1); + } + cw_len_tab[dbuf->size - cw_start - 1]++; + if (type == 0 || type == 1) + dbuf_putc(dbuf, cc); + cc_table_len++; + i += n; + } + + /* last index entry */ + v = i; + dbuf_putc(dbuf1, v); + dbuf_putc(dbuf1, v >> 8); + dbuf_putc(dbuf1, v >> 16); + + dump_byte_table(f, "unicode_cc_table", dbuf->buf, dbuf->size); + dump_byte_table(f, "unicode_cc_index", dbuf1->buf, dbuf1->size); + +#if defined(DUMP_CC_TABLE) || defined(DUMP_TABLE_SIZE) + printf("CC table: size=%d (%d entries) [", + (int)(dbuf->size + dbuf1->size), + cc_table_len); + for(i = 0; i < countof(cw_len_tab); i++) + printf(" %d", cw_len_tab[i]); + printf(" ]\n"); +#endif + dbuf_free(dbuf); + dbuf_free(dbuf1); +} + +/* maximum length of decomposition: 18 chars (1), then 8 */ +#ifndef USE_TEST +typedef enum { + DECOMP_TYPE_C1, /* 16 bit char */ + DECOMP_TYPE_L1, /* 16 bit char table */ + DECOMP_TYPE_L2, + DECOMP_TYPE_L3, + DECOMP_TYPE_L4, + DECOMP_TYPE_L5, /* XXX: not used */ + DECOMP_TYPE_L6, /* XXX: could remove */ + DECOMP_TYPE_L7, /* XXX: could remove */ + DECOMP_TYPE_LL1, /* 18 bit char table */ + DECOMP_TYPE_LL2, + DECOMP_TYPE_S1, /* 8 bit char table */ + DECOMP_TYPE_S2, + DECOMP_TYPE_S3, + DECOMP_TYPE_S4, + DECOMP_TYPE_S5, + DECOMP_TYPE_I1, /* increment 16 bit char value */ + DECOMP_TYPE_I2_0, + DECOMP_TYPE_I2_1, + DECOMP_TYPE_I3_1, + DECOMP_TYPE_I3_2, + DECOMP_TYPE_I4_1, + DECOMP_TYPE_I4_2, + DECOMP_TYPE_B1, /* 16 bit base + 8 bit offset */ + DECOMP_TYPE_B2, + DECOMP_TYPE_B3, + DECOMP_TYPE_B4, + DECOMP_TYPE_B5, + DECOMP_TYPE_B6, + DECOMP_TYPE_B7, + DECOMP_TYPE_B8, + DECOMP_TYPE_B18, + DECOMP_TYPE_LS2, + DECOMP_TYPE_PAT3, + DECOMP_TYPE_S2_UL, + 
DECOMP_TYPE_LS2_UL, +} DecompTypeEnum; +#endif + +const char *decomp_type_str[] = { + "C1", + "L1", + "L2", + "L3", + "L4", + "L5", + "L6", + "L7", + "LL1", + "LL2", + "S1", + "S2", + "S3", + "S4", + "S5", + "I1", + "I2_0", + "I2_1", + "I3_1", + "I3_2", + "I4_1", + "I4_2", + "B1", + "B2", + "B3", + "B4", + "B5", + "B6", + "B7", + "B8", + "B18", + "LS2", + "PAT3", + "S2_UL", + "LS2_UL", +}; + +const int decomp_incr_tab[4][4] = { + { DECOMP_TYPE_I1, 0, -1 }, + { DECOMP_TYPE_I2_0, 0, 1, -1 }, + { DECOMP_TYPE_I3_1, 1, 2, -1 }, + { DECOMP_TYPE_I4_1, 1, 2, -1 }, +}; + +/* + entry size: + type bits + code 18 + len 7 + compat 1 + type 5 + index 16 + total 47 +*/ + +typedef struct { + int code; + uint8_t len; + uint8_t type; + uint8_t c_len; + uint16_t c_min; + uint16_t data_index; + int cost; /* size in bytes from this entry to the end */ +} DecompEntry; + +int get_decomp_run_size(const DecompEntry *de) +{ + int s; + s = 6; + if (de->type <= DECOMP_TYPE_C1) { + /* nothing more */ + } else if (de->type <= DECOMP_TYPE_L7) { + s += de->len * de->c_len * 2; + } else if (de->type <= DECOMP_TYPE_LL2) { + /* 18 bits per char */ + s += (de->len * de->c_len * 18 + 7) / 8; + } else if (de->type <= DECOMP_TYPE_S5) { + s += de->len * de->c_len; + } else if (de->type <= DECOMP_TYPE_I4_2) { + s += de->c_len * 2; + } else if (de->type <= DECOMP_TYPE_B18) { + s += 2 + de->len * de->c_len; + } else if (de->type <= DECOMP_TYPE_LS2) { + s += de->len * 3; + } else if (de->type <= DECOMP_TYPE_PAT3) { + s += 4 + de->len * 2; + } else if (de->type <= DECOMP_TYPE_S2_UL) { + s += de->len; + } else if (de->type <= DECOMP_TYPE_LS2_UL) { + s += (de->len / 2) * 3; + } else { + abort(); + } + return s; +} + +static const uint16_t unicode_short_table[2] = { 0x2044, 0x2215 }; + +/* return -1 if not found */ +int get_short_code(int c) +{ + int i; + if (c < 0x80) { + return c; + } else if (c >= 0x300 && c < 0x350) { + return c - 0x300 + 0x80; + } else { + for(i = 0; i < countof(unicode_short_table); i++) { 
+ if (c == unicode_short_table[i]) + return i + 0x80 + 0x50; + } + return -1; + } +} + +static BOOL is_short(int code) +{ + return get_short_code(code) >= 0; +} + +static BOOL is_short_tab(const int *tab, int len) +{ + int i; + for(i = 0; i < len; i++) { + if (!is_short(tab[i])) + return FALSE; + } + return TRUE; +} + +static BOOL is_16bit(const int *tab, int len) +{ + int i; + for(i = 0; i < len; i++) { + if (tab[i] > 0xffff) + return FALSE; + } + return TRUE; +} + +static uint32_t to_lower_simple(uint32_t c) +{ + /* Latin1 and Cyrillic */ + if (c < 0x100 || (c >= 0x410 && c <= 0x42f)) + c += 0x20; + else + c++; + return c; +} + +/* select best encoding with dynamic programming */ +void find_decomp_run(DecompEntry *tab_de, int i) +{ + DecompEntry de_s, *de = &de_s; + CCInfo *ci, *ci1, *ci2; + int l, j, n, len_max; + + ci = &unicode_db[i]; + l = ci->decomp_len; + if (l == 0) { + tab_de[i].cost = tab_de[i + 1].cost; + return; + } + + /* the offset for the compose table has only 6 bits, so we must + limit if it can be used by the compose table */ + if (!ci->is_compat && !ci->is_excluded && l == 2) + len_max = 64; + else + len_max = 127; + + tab_de[i].cost = 0x7fffffff; + + if (!is_16bit(ci->decomp_data, l)) { + assert(l <= 2); + + n = 1; + for(;;) { + de->code = i; + de->len = n; + de->type = DECOMP_TYPE_LL1 + l - 1; + de->c_len = l; + de->cost = get_decomp_run_size(de) + tab_de[i + n].cost; + if (de->cost < tab_de[i].cost) { + tab_de[i] = *de; + } + if (!((i + n) <= CHARCODE_MAX && n < len_max)) + break; + ci1 = &unicode_db[i + n]; + /* Note: we accept a hole */ + if (!(ci1->decomp_len == 0 || + (ci1->decomp_len == l && + ci1->is_compat == ci->is_compat))) + break; + n++; + } + return; + } + + if (l <= 7) { + n = 1; + for(;;) { + de->code = i; + de->len = n; + if (l == 1 && n == 1) { + de->type = DECOMP_TYPE_C1; + } else { + assert(l <= 8); + de->type = DECOMP_TYPE_L1 + l - 1; + } + de->c_len = l; + de->cost = get_decomp_run_size(de) + tab_de[i + n].cost; + if 
(de->cost < tab_de[i].cost) { + tab_de[i] = *de; + } + + if (!((i + n) <= CHARCODE_MAX && n < len_max)) + break; + ci1 = &unicode_db[i + n]; + /* Note: we accept a hole */ + if (!(ci1->decomp_len == 0 || + (ci1->decomp_len == l && + ci1->is_compat == ci->is_compat && + is_16bit(ci1->decomp_data, l)))) + break; + n++; + } + } + + if (l <= 8 || l == 18) { + int c_min, c_max, c; + c_min = c_max = -1; + n = 1; + for(;;) { + ci1 = &unicode_db[i + n - 1]; + for(j = 0; j < l; j++) { + c = ci1->decomp_data[j]; + if (c == 0x20) { + /* we accept space for Arabic */ + } else if (c_min == -1) { + c_min = c_max = c; + } else { + c_min = min_int(c_min, c); + c_max = max_int(c_max, c); + } + } + if ((c_max - c_min) > 254) + break; + de->code = i; + de->len = n; + if (l == 18) + de->type = DECOMP_TYPE_B18; + else + de->type = DECOMP_TYPE_B1 + l - 1; + de->c_len = l; + de->c_min = c_min; + de->cost = get_decomp_run_size(de) + tab_de[i + n].cost; + if (de->cost < tab_de[i].cost) { + tab_de[i] = *de; + } + if (!((i + n) <= CHARCODE_MAX && n < len_max)) + break; + ci1 = &unicode_db[i + n]; + if (!(ci1->decomp_len == l && + ci1->is_compat == ci->is_compat)) + break; + n++; + } + } + + /* find an ascii run */ + if (l <= 5 && is_short_tab(ci->decomp_data, l)) { + n = 1; + for(;;) { + de->code = i; + de->len = n; + de->type = DECOMP_TYPE_S1 + l - 1; + de->c_len = l; + de->cost = get_decomp_run_size(de) + tab_de[i + n].cost; + if (de->cost < tab_de[i].cost) { + tab_de[i] = *de; + } + + if (!((i + n) <= CHARCODE_MAX && n < len_max)) + break; + ci1 = &unicode_db[i + n]; + /* Note: we accept a hole */ + if (!(ci1->decomp_len == 0 || + (ci1->decomp_len == l && + ci1->is_compat == ci->is_compat && + is_short_tab(ci1->decomp_data, l)))) + break; + n++; + } + } + + /* check if a single char is increasing */ + if (l <= 4) { + int idx1, idx; + + for(idx1 = 1; (idx = decomp_incr_tab[l - 1][idx1]) >= 0; idx1++) { + n = 1; + for(;;) { + de->code = i; + de->len = n; + de->type = decomp_incr_tab[l - 
1][0] + idx1 - 1; + de->c_len = l; + de->cost = get_decomp_run_size(de) + tab_de[i + n].cost; + if (de->cost < tab_de[i].cost) { + tab_de[i] = *de; + } + + if (!((i + n) <= CHARCODE_MAX && n < len_max)) + break; + ci1 = &unicode_db[i + n]; + if (!(ci1->decomp_len == l && + ci1->is_compat == ci->is_compat)) + goto next1; + for(j = 0; j < l; j++) { + if (j == idx) { + if (ci1->decomp_data[j] != ci->decomp_data[j] + n) + goto next1; + } else { + if (ci1->decomp_data[j] != ci->decomp_data[j]) + goto next1; + } + } + n++; + } + next1: ; + } + } + + if (l == 3) { + n = 1; + for(;;) { + de->code = i; + de->len = n; + de->type = DECOMP_TYPE_PAT3; + de->c_len = l; + de->cost = get_decomp_run_size(de) + tab_de[i + n].cost; + if (de->cost < tab_de[i].cost) { + tab_de[i] = *de; + } + if (!((i + n) <= CHARCODE_MAX && n < len_max)) + break; + ci1 = &unicode_db[i + n]; + if (!(ci1->decomp_len == l && + ci1->is_compat == ci->is_compat && + ci1->decomp_data[1] <= 0xffff && + ci1->decomp_data[0] == ci->decomp_data[0] && + ci1->decomp_data[l - 1] == ci->decomp_data[l - 1])) + break; + n++; + } + } + + if (l == 2 && is_short(ci->decomp_data[1])) { + n = 1; + for(;;) { + de->code = i; + de->len = n; + de->type = DECOMP_TYPE_LS2; + de->c_len = l; + de->cost = get_decomp_run_size(de) + tab_de[i + n].cost; + if (de->cost < tab_de[i].cost) { + tab_de[i] = *de; + } + if (!((i + n) <= CHARCODE_MAX && n < len_max)) + break; + ci1 = &unicode_db[i + n]; + if (!(ci1->decomp_len == 0 || + (ci1->decomp_len == l && + ci1->is_compat == ci->is_compat && + ci1->decomp_data[0] <= 0xffff && + is_short(ci1->decomp_data[1])))) + break; + n++; + } + } + + if (l == 2) { + BOOL is_16bit; + + n = 0; + is_16bit = FALSE; + for(;;) { + if (!((i + n + 1) <= CHARCODE_MAX && n + 2 <= len_max)) + break; + ci1 = &unicode_db[i + n]; + if (!(ci1->decomp_len == l && + ci1->is_compat == ci->is_compat && + is_short(ci1->decomp_data[1]))) + break; + if (!is_16bit && !is_short(ci1->decomp_data[0])) + is_16bit = TRUE; + ci2 
= &unicode_db[i + n + 1]; + if (!(ci2->decomp_len == l && + ci2->is_compat == ci->is_compat && + ci2->decomp_data[0] == to_lower_simple(ci1->decomp_data[0]) && + ci2->decomp_data[1] == ci1->decomp_data[1])) + break; + n += 2; + de->code = i; + de->len = n; + de->type = DECOMP_TYPE_S2_UL + is_16bit; + de->c_len = l; + de->cost = get_decomp_run_size(de) + tab_de[i + n].cost; + if (de->cost < tab_de[i].cost) { + tab_de[i] = *de; + } + } + } +} + +void put16(uint8_t *data_buf, int *pidx, uint16_t c) +{ + int idx; + idx = *pidx; + data_buf[idx++] = c; + data_buf[idx++] = c >> 8; + *pidx = idx; +} + +void add_decomp_data(uint8_t *data_buf, int *pidx, DecompEntry *de) +{ + int i, j, idx, c; + CCInfo *ci; + + idx = *pidx; + de->data_index = idx; + if (de->type <= DECOMP_TYPE_C1) { + ci = &unicode_db[de->code]; + assert(ci->decomp_len == 1); + de->data_index = ci->decomp_data[0]; + } else if (de->type <= DECOMP_TYPE_L7) { + for(i = 0; i < de->len; i++) { + ci = &unicode_db[de->code + i]; + for(j = 0; j < de->c_len; j++) { + if (ci->decomp_len == 0) + c = 0; + else + c = ci->decomp_data[j]; + put16(data_buf, &idx, c); + } + } + } else if (de->type <= DECOMP_TYPE_LL2) { + int n, p, k; + n = (de->len * de->c_len * 18 + 7) / 8; + p = de->len * de->c_len * 2; + memset(data_buf + idx, 0, n); + k = 0; + for(i = 0; i < de->len; i++) { + ci = &unicode_db[de->code + i]; + for(j = 0; j < de->c_len; j++) { + if (ci->decomp_len == 0) + c = 0; + else + c = ci->decomp_data[j]; + data_buf[idx + k * 2] = c; + data_buf[idx + k * 2 + 1] = c >> 8; + data_buf[idx + p + (k / 4)] |= (c >> 16) << ((k % 4) * 2); + k++; + } + } + idx += n; + } else if (de->type <= DECOMP_TYPE_S5) { + for(i = 0; i < de->len; i++) { + ci = &unicode_db[de->code + i]; + for(j = 0; j < de->c_len; j++) { + if (ci->decomp_len == 0) + c = 0; + else + c = ci->decomp_data[j]; + c = get_short_code(c); + assert(c >= 0); + data_buf[idx++] = c; + } + } + } else if (de->type <= DECOMP_TYPE_I4_2) { + ci = &unicode_db[de->code]; + 
assert(ci->decomp_len == de->c_len); + for(j = 0; j < de->c_len; j++) + put16(data_buf, &idx, ci->decomp_data[j]); + } else if (de->type <= DECOMP_TYPE_B18) { + c = de->c_min; + data_buf[idx++] = c; + data_buf[idx++] = c >> 8; + for(i = 0; i < de->len; i++) { + ci = &unicode_db[de->code + i]; + for(j = 0; j < de->c_len; j++) { + assert(ci->decomp_len == de->c_len); + c = ci->decomp_data[j]; + if (c == 0x20) { + c = 0xff; + } else { + c -= de->c_min; + assert((uint32_t)c <= 254); + } + data_buf[idx++] = c; + } + } + } else if (de->type <= DECOMP_TYPE_LS2) { + assert(de->c_len == 2); + for(i = 0; i < de->len; i++) { + ci = &unicode_db[de->code + i]; + if (ci->decomp_len == 0) + c = 0; + else + c = ci->decomp_data[0]; + put16(data_buf, &idx, c); + + if (ci->decomp_len == 0) + c = 0; + else + c = ci->decomp_data[1]; + c = get_short_code(c); + assert(c >= 0); + data_buf[idx++] = c; + } + } else if (de->type <= DECOMP_TYPE_PAT3) { + ci = &unicode_db[de->code]; + assert(ci->decomp_len == 3); + put16(data_buf, &idx, ci->decomp_data[0]); + put16(data_buf, &idx, ci->decomp_data[2]); + for(i = 0; i < de->len; i++) { + ci = &unicode_db[de->code + i]; + assert(ci->decomp_len == 3); + put16(data_buf, &idx, ci->decomp_data[1]); + } + } else if (de->type <= DECOMP_TYPE_S2_UL) { + for(i = 0; i < de->len; i += 2) { + ci = &unicode_db[de->code + i]; + c = ci->decomp_data[0]; + c = get_short_code(c); + assert(c >= 0); + data_buf[idx++] = c; + c = ci->decomp_data[1]; + c = get_short_code(c); + assert(c >= 0); + data_buf[idx++] = c; + } + } else if (de->type <= DECOMP_TYPE_LS2_UL) { + for(i = 0; i < de->len; i += 2) { + ci = &unicode_db[de->code + i]; + c = ci->decomp_data[0]; + put16(data_buf, &idx, c); + c = ci->decomp_data[1]; + c = get_short_code(c); + assert(c >= 0); + data_buf[idx++] = c; + } + } else { + abort(); + } + *pidx = idx; +} + +#if 0 +void dump_large_char(void) +{ + int i, j; + for(i = 0; i <= CHARCODE_MAX; i++) { + CCInfo *ci = &unicode_db[i]; + for(j = 0; j < 
ci->decomp_len; j++) { + if (ci->decomp_data[j] > 0xffff) + printf("%05x\n", ci->decomp_data[j]); + } + } +} +#endif + +void build_compose_table(FILE *f, const DecompEntry *tab_de); + +void build_decompose_table(FILE *f) +{ + int i, array_len, code_max, data_len, count; + DecompEntry *tab_de, de_s, *de = &de_s; + uint8_t *data_buf; + + code_max = CHARCODE_MAX; + + tab_de = mallocz((code_max + 2) * sizeof(*tab_de)); + + for(i = code_max; i >= 0; i--) { + find_decomp_run(tab_de, i); + } + + /* build the data buffer */ + data_buf = malloc(100000); + data_len = 0; + array_len = 0; + for(i = 0; i <= code_max; i++) { + de = &tab_de[i]; + if (de->len != 0) { + add_decomp_data(data_buf, &data_len, de); + i += de->len - 1; + array_len++; + } + } + +#ifdef DUMP_DECOMP_TABLE + /* dump */ + { + int size, size1; + + printf("START LEN TYPE L C SIZE\n"); + size = 0; + for(i = 0; i <= code_max; i++) { + de = &tab_de[i]; + if (de->len != 0) { + size1 = get_decomp_run_size(de); + printf("%05x %3d %6s %2d %1d %4d\n", i, de->len, + decomp_type_str[de->type], de->c_len, + unicode_db[i].is_compat, size1); + i += de->len - 1; + size += size1; + } + } + + printf("array_len=%d estimated size=%d bytes actual=%d bytes\n", + array_len, size, array_len * 6 + data_len); + } +#endif + + fprintf(f, "static const uint32_t unicode_decomp_table1[%u] = {", + array_len); + count = 0; + for(i = 0; i <= code_max; i++) { + de = &tab_de[i]; + if (de->len != 0) { + uint32_t v; + if (count++ % 4 == 0) + fprintf(f, "\n "); + v = (de->code << (32 - 18)) | + (de->len << (32 - 18 - 7)) | + (de->type << (32 - 18 - 7 - 6)) | + unicode_db[de->code].is_compat; + fprintf(f, " 0x%08x,", v); + i += de->len - 1; + } + } + fprintf(f, "\n};\n\n"); + + fprintf(f, "static const uint16_t unicode_decomp_table2[%u] = {", + array_len); + count = 0; + for(i = 0; i <= code_max; i++) { + de = &tab_de[i]; + if (de->len != 0) { + if (count++ % 8 == 0) + fprintf(f, "\n "); + fprintf(f, " 0x%04x,", de->data_index); + i += de->len - 
1; + } + } + fprintf(f, "\n};\n\n"); + + fprintf(f, "static const uint8_t unicode_decomp_data[%u] = {", + data_len); + for(i = 0; i < data_len; i++) { + if (i % 8 == 0) + fprintf(f, "\n "); + fprintf(f, " 0x%02x,", data_buf[i]); + } + fprintf(f, "\n};\n\n"); + + build_compose_table(f, tab_de); + + free(data_buf); + + free(tab_de); +} + +typedef struct { + uint32_t c[2]; + uint32_t p; +} ComposeEntry; + +#define COMPOSE_LEN_MAX 10000 + +static int ce_cmp(const void *p1, const void *p2) +{ + const ComposeEntry *ce1 = p1; + const ComposeEntry *ce2 = p2; + int i; + + for(i = 0; i < 2; i++) { + if (ce1->c[i] < ce2->c[i]) + return -1; + else if (ce1->c[i] > ce2->c[i]) + return 1; + } + return 0; +} + + +static int get_decomp_pos(const DecompEntry *tab_de, int c) +{ + int i, v, k; + const DecompEntry *de; + + k = 0; + for(i = 0; i <= CHARCODE_MAX; i++) { + de = &tab_de[i]; + if (de->len != 0) { + if (c >= de->code && c < de->code + de->len) { + v = c - de->code; + assert(v < 64); + v |= k << 6; + assert(v < 65536); + return v; + } + i += de->len - 1; + k++; + } + } + return -1; +} + +void build_compose_table(FILE *f, const DecompEntry *tab_de) +{ + int i, v, tab_ce_len; + ComposeEntry *ce, *tab_ce; + + tab_ce = malloc(sizeof(*tab_ce) * COMPOSE_LEN_MAX); + tab_ce_len = 0; + for(i = 0; i <= CHARCODE_MAX; i++) { + CCInfo *ci = &unicode_db[i]; + if (ci->decomp_len == 2 && !ci->is_compat && + !ci->is_excluded) { + assert(tab_ce_len < COMPOSE_LEN_MAX); + ce = &tab_ce[tab_ce_len++]; + ce->c[0] = ci->decomp_data[0]; + ce->c[1] = ci->decomp_data[1]; + ce->p = i; + } + } + qsort(tab_ce, tab_ce_len, sizeof(*tab_ce), ce_cmp); + +#if 0 + { + printf("tab_ce_len=%d\n", tab_ce_len); + for(i = 0; i < tab_ce_len; i++) { + ce = &tab_ce[i]; + printf("%05x %05x %05x\n", ce->c[0], ce->c[1], ce->p); + } + } +#endif + + fprintf(f, "static const uint16_t unicode_comp_table[%u] = {", + tab_ce_len); + for(i = 0; i < tab_ce_len; i++) { + if (i % 8 == 0) + fprintf(f, "\n "); + v = 
get_decomp_pos(tab_de, tab_ce[i].p); + if (v < 0) { + printf("ERROR: entry for c=%04x not found\n", + tab_ce[i].p); + exit(1); + } + fprintf(f, " 0x%04x,", v); + } + fprintf(f, "\n};\n\n"); + + free(tab_ce); +} + +#ifdef USE_TEST +void check_decompose_table(void) +{ + int c; + CCInfo *ci; + int res[UNICODE_DECOMP_LEN_MAX], *ref; + int len, ref_len, is_compat; + + for(is_compat = 0; is_compat <= 1; is_compat++) { + for(c = 0; c < CHARCODE_MAX; c++) { + ci = &unicode_db[c]; + ref_len = ci->decomp_len; + ref = ci->decomp_data; + if (!is_compat && ci->is_compat) { + ref_len = 0; + } + len = unicode_decomp_char((uint32_t *)res, c, is_compat); + if (len != ref_len || + tabcmp(res, ref, ref_len) != 0) { + printf("ERROR c=%05x compat=%d\n", c, is_compat); + dump_str("res", res, len); + dump_str("ref", ref, ref_len); + exit(1); + } + } + } +} + +void check_compose_table(void) +{ + int i, p; + /* XXX: we don't test all the cases */ + + for(i = 0; i <= CHARCODE_MAX; i++) { + CCInfo *ci = &unicode_db[i]; + if (ci->decomp_len == 2 && !ci->is_compat && + !ci->is_excluded) { + p = unicode_compose_pair(ci->decomp_data[0], ci->decomp_data[1]); + if (p != i) { + printf("ERROR compose: c=%05x %05x -> %05x ref=%05x\n", + ci->decomp_data[0], ci->decomp_data[1], p, i); + exit(1); + } + } + } + + + +} + +#endif + + + +#ifdef USE_TEST + +void check_str(const char *msg, int num, const int *in_buf, int in_len, + const int *buf1, int len1, + const int *buf2, int len2) +{ + if (len1 != len2 || tabcmp(buf1, buf2, len1) != 0) { + printf("%d: ERROR %s:\n", num, msg); + dump_str(" in", in_buf, in_len); + dump_str("res", buf1, len1); + dump_str("ref", buf2, len2); + exit(1); + } +} + +void check_cc_table(void) +{ + int cc, cc_ref, c; + + for(c = 0; c <= CHARCODE_MAX; c++) { + cc_ref = unicode_db[c].combining_class; + cc = unicode_get_cc(c); + if (cc != cc_ref) { + printf("ERROR: c=%04x cc=%d cc_ref=%d\n", + c, cc, cc_ref); + exit(1); + } + } +#ifdef PROFILE + { + int64_t ti, count; + + ti = 
get_time_ns(); + count = 0; + /* only do it on meaningful chars */ + for(c = 0x20; c <= 0xffff; c++) { + cc_ref = unicode_db[c].combining_class; + cc = unicode_get_cc(c); + count++; + } + ti = get_time_ns() - ti; + printf("cc time=%0.1f ns/char\n", + (double)ti / count); + } +#endif +} + +void normalization_test(const char *filename) +{ + FILE *f; + char line[4096], *p; + int *in_str, *nfc_str, *nfd_str, *nfkc_str, *nfkd_str; + int in_len, nfc_len, nfd_len, nfkc_len, nfkd_len; + int *buf, buf_len, pos; + + f = fopen(filename, "rb"); + if (!f) { + perror(filename); + exit(1); + } + pos = 0; + for(;;) { + if (!get_line(line, sizeof(line), f)) + break; + pos++; + p = line; + while (isspace(*p)) + p++; + if (*p == '#' || *p == '@') + continue; + in_str = get_field_str(&in_len, p, 0); + nfc_str = get_field_str(&nfc_len, p, 1); + nfd_str = get_field_str(&nfd_len, p, 2); + nfkc_str = get_field_str(&nfkc_len, p, 3); + nfkd_str = get_field_str(&nfkd_len, p, 4); + + // dump_str("in", in_str, in_len); + + buf_len = unicode_normalize((uint32_t **)&buf, (uint32_t *)in_str, in_len, UNICODE_NFD, NULL, NULL); + check_str("nfd", pos, in_str, in_len, buf, buf_len, nfd_str, nfd_len); + free(buf); + + buf_len = unicode_normalize((uint32_t **)&buf, (uint32_t *)in_str, in_len, UNICODE_NFKD, NULL, NULL); + check_str("nfkd", pos, in_str, in_len, buf, buf_len, nfkd_str, nfkd_len); + free(buf); + + buf_len = unicode_normalize((uint32_t **)&buf, (uint32_t *)in_str, in_len, UNICODE_NFC, NULL, NULL); + check_str("nfc", pos, in_str, in_len, buf, buf_len, nfc_str, nfc_len); + free(buf); + + buf_len = unicode_normalize((uint32_t **)&buf, (uint32_t *)in_str, in_len, UNICODE_NFKC, NULL, NULL); + check_str("nfkc", pos, in_str, in_len, buf, buf_len, nfkc_str, nfkc_len); + free(buf); + + free(in_str); + free(nfc_str); + free(nfd_str); + free(nfkc_str); + free(nfkd_str); + } + fclose(f); +} +#endif + +int main(int argc, char **argv) +{ + const char *unicode_db_path, *outfilename; + char filename[1024]; 
+ + if (argc < 2) { + printf("usage: %s unicode_db_path [output_file]\n" + "\n" + "If no output_file is given, a self test is done using the current unicode library\n", + argv[0]); + exit(1); + } + unicode_db_path = argv[1]; + outfilename = NULL; + if (argc >= 3) + outfilename = argv[2]; + + unicode_db = mallocz(sizeof(unicode_db[0]) * (CHARCODE_MAX + 1)); + + snprintf(filename, sizeof(filename), "%s/UnicodeData.txt", unicode_db_path); + + parse_unicode_data(filename); + + snprintf(filename, sizeof(filename), "%s/SpecialCasing.txt", unicode_db_path); + parse_special_casing(unicode_db, filename); + + snprintf(filename, sizeof(filename), "%s/CaseFolding.txt", unicode_db_path); + parse_case_folding(unicode_db, filename); + + snprintf(filename, sizeof(filename), "%s/CompositionExclusions.txt", unicode_db_path); + parse_composition_exclusions(filename); + + snprintf(filename, sizeof(filename), "%s/DerivedCoreProperties.txt", unicode_db_path); + parse_derived_core_properties(filename); + + snprintf(filename, sizeof(filename), "%s/DerivedNormalizationProps.txt", unicode_db_path); + parse_derived_norm_properties(filename); + + snprintf(filename, sizeof(filename), "%s/PropList.txt", unicode_db_path); + parse_prop_list(filename); + + snprintf(filename, sizeof(filename), "%s/Scripts.txt", unicode_db_path); + parse_scripts(filename); + + snprintf(filename, sizeof(filename), "%s/ScriptExtensions.txt", + unicode_db_path); + parse_script_extensions(filename); + + snprintf(filename, sizeof(filename), "%s/emoji-data.txt", + unicode_db_path); + parse_prop_list(filename); + + // dump_data(unicode_db); + + build_conv_table(unicode_db); + + // dump_table(); + + if (!outfilename) { +#ifdef USE_TEST + check_case_conv(); + check_flags(); + check_decompose_table(); + check_compose_table(); + check_cc_table(); + snprintf(filename, sizeof(filename), "%s/NormalizationTest.txt", unicode_db_path); + normalization_test(filename); +#else + fprintf(stderr, "Tests are not compiled\n"); + exit(1); 
+#endif + } else + { + FILE *fo = fopen(outfilename, "wb"); + + if (!fo) { + perror(outfilename); + exit(1); + } + fprintf(fo, + "/* Compressed unicode tables */\n" + "/* Automatically generated file - do not edit */\n" + "\n" + "#include <stdint.h>\n" + "\n"); + dump_case_conv_table(fo); + compute_internal_props(); + build_flags_tables(fo); + fprintf(fo, "#ifdef CONFIG_ALL_UNICODE\n\n"); + build_cc_table(fo); + build_decompose_table(fo); + build_general_category_table(fo); + build_script_table(fo); + build_script_ext_table(fo); + build_prop_list_table(fo); + fprintf(fo, "#endif /* CONFIG_ALL_UNICODE */\n"); + fclose(fo); + } + return 0; +} |