diff options
Diffstat (limited to 'src/backend/utils/adt/varlena.c')
-rw-r--r-- | src/backend/utils/adt/varlena.c | 368 |
1 files changed, 47 insertions, 321 deletions
diff --git a/src/backend/utils/adt/varlena.c b/src/backend/utils/adt/varlena.c index 170b3a3820b..4ca823ca7b1 100644 --- a/src/backend/utils/adt/varlena.c +++ b/src/backend/utils/adt/varlena.c @@ -1553,10 +1553,6 @@ varstr_cmp(const char *arg1, int len1, const char *arg2, int len2, Oid collid) } else { - char a1buf[TEXTBUFLEN]; - char a2buf[TEXTBUFLEN]; - char *a1p, - *a2p; pg_locale_t mylocale; mylocale = pg_newlocale_from_collation(collid); @@ -1573,171 +1569,16 @@ varstr_cmp(const char *arg1, int len1, const char *arg2, int len2, Oid collid) if (len1 == len2 && memcmp(arg1, arg2, len1) == 0) return 0; -#ifdef WIN32 - /* Win32 does not have UTF-8, so we need to map to UTF-16 */ - if (GetDatabaseEncoding() == PG_UTF8 - && (!mylocale || mylocale->provider == COLLPROVIDER_LIBC)) - { - int a1len; - int a2len; - int r; - - if (len1 >= TEXTBUFLEN / 2) - { - a1len = len1 * 2 + 2; - a1p = palloc(a1len); - } - else - { - a1len = TEXTBUFLEN; - a1p = a1buf; - } - if (len2 >= TEXTBUFLEN / 2) - { - a2len = len2 * 2 + 2; - a2p = palloc(a2len); - } - else - { - a2len = TEXTBUFLEN; - a2p = a2buf; - } - - /* stupid Microsloth API does not work for zero-length input */ - if (len1 == 0) - r = 0; - else - { - r = MultiByteToWideChar(CP_UTF8, 0, arg1, len1, - (LPWSTR) a1p, a1len / 2); - if (!r) - ereport(ERROR, - (errmsg("could not convert string to UTF-16: error code %lu", - GetLastError()))); - } - ((LPWSTR) a1p)[r] = 0; - - if (len2 == 0) - r = 0; - else - { - r = MultiByteToWideChar(CP_UTF8, 0, arg2, len2, - (LPWSTR) a2p, a2len / 2); - if (!r) - ereport(ERROR, - (errmsg("could not convert string to UTF-16: error code %lu", - GetLastError()))); - } - ((LPWSTR) a2p)[r] = 0; - - errno = 0; -#ifdef HAVE_LOCALE_T - if (mylocale) - result = wcscoll_l((LPWSTR) a1p, (LPWSTR) a2p, mylocale->info.lt); - else -#endif - result = wcscoll((LPWSTR) a1p, (LPWSTR) a2p); - if (result == 2147483647) /* _NLSCMPERROR; missing from mingw - * headers */ - ereport(ERROR, - (errmsg("could not compare Unicode strings: %m"))); - - /* Break tie if necessary. */ - if (result == 0 && - (!mylocale || mylocale->deterministic)) - { - result = memcmp(arg1, arg2, Min(len1, len2)); - if ((result == 0) && (len1 != len2)) - result = (len1 < len2) ? -1 : 1; - } - - if (a1p != a1buf) - pfree(a1p); - if (a2p != a2buf) - pfree(a2p); - - return result; - } -#endif /* WIN32 */ - - if (len1 >= TEXTBUFLEN) - a1p = (char *) palloc(len1 + 1); - else - a1p = a1buf; - if (len2 >= TEXTBUFLEN) - a2p = (char *) palloc(len2 + 1); - else - a2p = a2buf; - - memcpy(a1p, arg1, len1); - a1p[len1] = '\0'; - memcpy(a2p, arg2, len2); - a2p[len2] = '\0'; - - if (mylocale) - { - if (mylocale->provider == COLLPROVIDER_ICU) - { -#ifdef USE_ICU -#ifdef HAVE_UCOL_STRCOLLUTF8 - if (GetDatabaseEncoding() == PG_UTF8) - { - UErrorCode status; - - status = U_ZERO_ERROR; - result = ucol_strcollUTF8(mylocale->info.icu.ucol, - arg1, len1, - arg2, len2, - &status); - if (U_FAILURE(status)) - ereport(ERROR, - (errmsg("collation failed: %s", u_errorName(status)))); - } - else -#endif - { - int32_t ulen1, - ulen2; - UChar *uchar1, - *uchar2; - - ulen1 = icu_to_uchar(&uchar1, arg1, len1); - ulen2 = icu_to_uchar(&uchar2, arg2, len2); - - result = ucol_strcoll(mylocale->info.icu.ucol, - uchar1, ulen1, - uchar2, ulen2); - - pfree(uchar1); - pfree(uchar2); - } -#else /* not USE_ICU */ - /* shouldn't happen */ - elog(ERROR, "unsupported collprovider: %c", mylocale->provider); -#endif /* not USE_ICU */ - } - else - { -#ifdef HAVE_LOCALE_T - result = strcoll_l(a1p, a2p, mylocale->info.lt); -#else - /* shouldn't happen */ - elog(ERROR, "unsupported collprovider: %c", mylocale->provider); -#endif - } - } - else - result = strcoll(a1p, a2p); + result = pg_strncoll(arg1, len1, arg2, len2, mylocale); /* Break tie if necessary. */ if (result == 0 && (!mylocale || mylocale->deterministic)) - result = strcmp(a1p, a2p); - - if (a1p != a1buf) - pfree(a1p); - if (a2p != a2buf) - pfree(a2p); + { + result = memcmp(arg1, arg2, Min(len1, len2)); + if ((result == 0) && (len1 != len2)) + result = (len1 < len2) ? -1 : 1; + } } return result; @@ -2074,20 +1915,6 @@ varstr_sortsupport(SortSupport ssup, Oid typid, Oid collid) locale = pg_newlocale_from_collation(collid); /* - * There is a further exception on Windows. When the database - * encoding is UTF-8 and we are not using the C collation, complex - * hacks are required. We don't currently have a comparator that - * handles that case, so we fall back on the slow method of having the - * sort code invoke bttextcmp() (in the case of text) via the fmgr - * trampoline. ICU locales work just the same on Windows, however. - */ -#ifdef WIN32 - if (GetDatabaseEncoding() == PG_UTF8 && - !(locale && locale->provider == COLLPROVIDER_ICU)) - return; -#endif - - /* * We use varlenafastcmp_locale except for type NAME. */ if (typid == NAMEOID) @@ -2102,13 +1929,7 @@ varstr_sortsupport(SortSupport ssup, Oid typid, Oid collid) /* * Unfortunately, it seems that abbreviation for non-C collations is - * broken on many common platforms; testing of multiple versions of glibc - * reveals that, for many locales, strcoll() and strxfrm() do not return - * consistent results, which is fatal to this optimization. While no - * other libc other than Cygwin has so far been shown to have a problem, - * we take the conservative course of action for right now and disable - * this categorically. (Users who are certain this isn't a problem on - * their system can define TRUST_STRXFRM.) + * broken on many common platforms; see pg_strxfrm_enabled(). * * Even apart from the risk of broken locales, it's possible that there * are platforms where the use of abbreviated keys should be disabled at @@ -2121,10 +1942,8 @@ varstr_sortsupport(SortSupport ssup, Oid typid, Oid collid) * categorically, we may still want or need to disable it for particular * platforms. */ -#ifndef TRUST_STRXFRM - if (!collate_c && !(locale && locale->provider == COLLPROVIDER_ICU)) + if (!collate_c && !pg_strxfrm_enabled(locale)) abbreviate = false; -#endif /* * If we're using abbreviated keys, or if we're using a locale-aware @@ -2395,60 +2214,7 @@ varstrfastcmp_locale(char *a1p, int len1, char *a2p, int len2, SortSupport ssup) return sss->last_returned; } - if (sss->locale) - { - if (sss->locale->provider == COLLPROVIDER_ICU) - { -#ifdef USE_ICU -#ifdef HAVE_UCOL_STRCOLLUTF8 - if (GetDatabaseEncoding() == PG_UTF8) - { - UErrorCode status; - - status = U_ZERO_ERROR; - result = ucol_strcollUTF8(sss->locale->info.icu.ucol, - a1p, len1, - a2p, len2, - &status); - if (U_FAILURE(status)) - ereport(ERROR, - (errmsg("collation failed: %s", u_errorName(status)))); - } - else -#endif - { - int32_t ulen1, - ulen2; - UChar *uchar1, - *uchar2; - - ulen1 = icu_to_uchar(&uchar1, a1p, len1); - ulen2 = icu_to_uchar(&uchar2, a2p, len2); - - result = ucol_strcoll(sss->locale->info.icu.ucol, - uchar1, ulen1, - uchar2, ulen2); - - pfree(uchar1); - pfree(uchar2); - } -#else /* not USE_ICU */ - /* shouldn't happen */ - elog(ERROR, "unsupported collprovider: %c", sss->locale->provider); -#endif /* not USE_ICU */ - } - else - { -#ifdef HAVE_LOCALE_T - result = strcoll_l(sss->buf1, sss->buf2, sss->locale->info.lt); -#else - /* shouldn't happen */ - elog(ERROR, "unsupported collprovider: %c", sss->locale->provider); -#endif - } - } - else - result = strcoll(sss->buf1, sss->buf2); + result = pg_strcoll(sss->buf1, sss->buf2, sss->locale); /* Break tie if necessary. */ if (result == 0 && @@ -2471,6 +2237,7 @@ varstrfastcmp_locale(char *a1p, int len1, char *a2p, int len2, SortSupport ssup) static Datum varstr_abbrev_convert(Datum original, SortSupport ssup) { + const size_t max_prefix_bytes = sizeof(Datum); VarStringSortSupport *sss = (VarStringSortSupport *) ssup->ssup_extra; VarString *authoritative = DatumGetVarStringPP(original); char *authoritative_data = VARDATA_ANY(authoritative); @@ -2483,7 +2250,7 @@ varstr_abbrev_convert(Datum original, SortSupport ssup) pres = (char *) &res; /* memset(), so any non-overwritten bytes are NUL */ - memset(pres, 0, sizeof(Datum)); + memset(pres, 0, max_prefix_bytes); len = VARSIZE_ANY_EXHDR(authoritative); /* Get number of bytes, ignoring trailing spaces */ @@ -2518,14 +2285,10 @@ varstr_abbrev_convert(Datum original, SortSupport ssup) * thing: explicitly consider string length. */ if (sss->collate_c) - memcpy(pres, authoritative_data, Min(len, sizeof(Datum))); + memcpy(pres, authoritative_data, Min(len, max_prefix_bytes)); else { Size bsize; -#ifdef USE_ICU - int32_t ulen = -1; - UChar *uchar = NULL; -#endif /* * We're not using the C collation, so fall back on strxfrm or ICU @@ -2543,7 +2306,7 @@ varstr_abbrev_convert(Datum original, SortSupport ssup) if (sss->last_len1 == len && sss->cache_blob && memcmp(sss->buf1, authoritative_data, len) == 0) { - memcpy(pres, sss->buf2, Min(sizeof(Datum), sss->last_len2)); + memcpy(pres, sss->buf2, Min(max_prefix_bytes, sss->last_len2)); /* No change affecting cardinality, so no hashing required */ goto done; } @@ -2551,81 +2314,49 @@ varstr_abbrev_convert(Datum original, SortSupport ssup) memcpy(sss->buf1, authoritative_data, len); /* - * Just like strcoll(), strxfrm() expects a NUL-terminated string. Not - * necessary for ICU, but doesn't hurt. + * pg_strxfrm() and pg_strxfrm_prefix expect NUL-terminated + * strings. */ sss->buf1[len] = '\0'; sss->last_len1 = len; -#ifdef USE_ICU - /* When using ICU and not UTF8, convert string to UChar. */ - if (sss->locale && sss->locale->provider == COLLPROVIDER_ICU && - GetDatabaseEncoding() != PG_UTF8) - ulen = icu_to_uchar(&uchar, sss->buf1, len); -#endif - - /* - * Loop: Call strxfrm() or ucol_getSortKey(), possibly enlarge buffer, - * and try again. Both of these functions have the result buffer - * content undefined if the result did not fit, so we need to retry - * until everything fits, even though we only need the first few bytes - * in the end. When using ucol_nextSortKeyPart(), however, we only - * ask for as many bytes as we actually need. - */ - for (;;) + if (pg_strxfrm_prefix_enabled(sss->locale)) { -#ifdef USE_ICU - if (sss->locale && sss->locale->provider == COLLPROVIDER_ICU) + if (sss->buflen2 < max_prefix_bytes) { - /* - * When using UTF8, use the iteration interface so we only - * need to produce as many bytes as we actually need. - */ - if (GetDatabaseEncoding() == PG_UTF8) - { - UCharIterator iter; - uint32_t state[2]; - UErrorCode status; - - uiter_setUTF8(&iter, sss->buf1, len); - state[0] = state[1] = 0; /* won't need that again */ - status = U_ZERO_ERROR; - bsize = ucol_nextSortKeyPart(sss->locale->info.icu.ucol, - &iter, - state, - (uint8_t *) sss->buf2, - Min(sizeof(Datum), sss->buflen2), - &status); - if (U_FAILURE(status)) - ereport(ERROR, - (errmsg("sort key generation failed: %s", - u_errorName(status)))); - } - else - bsize = ucol_getSortKey(sss->locale->info.icu.ucol, - uchar, ulen, - (uint8_t *) sss->buf2, sss->buflen2); + sss->buflen2 = Max(max_prefix_bytes, + Min(sss->buflen2 * 2, MaxAllocSize)); + sss->buf2 = repalloc(sss->buf2, sss->buflen2); } - else -#endif -#ifdef HAVE_LOCALE_T - if (sss->locale && sss->locale->provider == COLLPROVIDER_LIBC) - bsize = strxfrm_l(sss->buf2, sss->buf1, - sss->buflen2, sss->locale->info.lt); - else -#endif - bsize = strxfrm(sss->buf2, sss->buf1, sss->buflen2); - - sss->last_len2 = bsize; - if (bsize < sss->buflen2) - break; + bsize = pg_strxfrm_prefix(sss->buf2, sss->buf1, + max_prefix_bytes, sss->locale); + } + else + { /* - * Grow buffer and retry. + * Loop: Call pg_strxfrm(), possibly enlarge buffer, and try + * again. The pg_strxfrm() function leaves the result buffer + * content undefined if the result did not fit, so we need to + * retry until everything fits, even though we only need the first + * few bytes in the end. */ - sss->buflen2 = Max(bsize + 1, - Min(sss->buflen2 * 2, MaxAllocSize)); - sss->buf2 = repalloc(sss->buf2, sss->buflen2); + for (;;) + { + bsize = pg_strxfrm(sss->buf2, sss->buf1, sss->buflen2, + sss->locale); + + sss->last_len2 = bsize; + if (bsize < sss->buflen2) + break; + + /* + * Grow buffer and retry. + */ + sss->buflen2 = Max(bsize + 1, + Min(sss->buflen2 * 2, MaxAllocSize)); + sss->buf2 = repalloc(sss->buf2, sss->buflen2); + } } /* @@ -2637,12 +2368,7 @@ varstr_abbrev_convert(Datum original, SortSupport ssup) * (Actually, even if there were NUL bytes in the blob it would be * okay. See remarks on bytea case above.) */ - memcpy(pres, sss->buf2, Min(sizeof(Datum), bsize)); - -#ifdef USE_ICU - if (uchar) - pfree(uchar); -#endif + memcpy(pres, sss->buf2, Min(max_prefix_bytes, bsize)); } /* |