The builtin provider uses built-in operations. Only
- the C and C.UTF-8 locales are
- supported for this provider.
+ the C, C.UTF-8, and
+ PG_UNICODE_FAST locales are supported for this
+ provider.
The C locale behavior is identical to the
regular expression character classes are based on the "POSIX
Compatible" semantics, and the case mapping is the "simple" variant.
+ The PG_UNICODE_FAST locale is available only when
+ the database encoding is UTF-8, and the behavior is
+ based on Unicode. The collation uses the code point values only. The
+ regular expression character classes are based on the "Standard"
+ semantics, and the case mapping is the "full" variant.
+
+
+ pg_unicode_fast
+
+ This collation sorts by Unicode code point values rather than natural
+ language order. For the functions lower,
+ initcap, and upper, it uses
+ Unicode full case mapping. For pattern matching (including regular
+ expressions), it uses the Standard variant of Unicode
+ <ulink url="https://www.unicode.org/reports/tr18/#Compatibility_Properties">Compatibility
+ Properties</ulink>. Behavior is efficient and stable within a
+ Postgres major version. It is only
+ available for encoding UTF8.
+
+
+
+
pg_c_utf8
If provider is builtin ,
then locale must be specified and set to
- either C or C.UTF-8 .
+ either C, C.UTF-8, or
+ PG_UNICODE_FAST.
If locale_provider is
builtin, then locale or
builtin_locale must be specified and set to
- either C or C.UTF-8 .
+ either C, C.UTF-8, or
+ PG_UNICODE_FAST.
The locales available for the builtin provider are
- C and C.UTF-8 .
+ C, C.UTF-8, and
+ PG_UNICODE_FAST.
If --locale-provider is builtin ,
--locale or --builtin-locale must be
- specified and set to C or
- C.UTF-8 .
+ specified and set to C, C.UTF-8,
+ or PG_UNICODE_FAST.
return (c <= (pg_wchar) 127 &&
(pg_char_properties[c] & PG_ISDIGIT));
case PG_REGEX_STRATEGY_BUILTIN:
- return pg_u_isdigit(c, true );
+ return pg_u_isdigit(c, !pg_regex_locale->info.builtin.casemap_full );
case PG_REGEX_STRATEGY_LIBC_WIDE:
if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
return iswdigit_l((wint_t) c, pg_regex_locale->info.lt);
return (c <= (pg_wchar) 127 &&
(pg_char_properties[c] & PG_ISALNUM));
case PG_REGEX_STRATEGY_BUILTIN:
- return pg_u_isalnum(c, true );
+ return pg_u_isalnum(c, !pg_regex_locale->info.builtin.casemap_full );
case PG_REGEX_STRATEGY_LIBC_WIDE:
if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
return iswalnum_l((wint_t) c, pg_regex_locale->info.lt);
return (c <= (pg_wchar) 127 &&
(pg_char_properties[c] & PG_ISPUNCT));
case PG_REGEX_STRATEGY_BUILTIN:
- return pg_u_ispunct(c, true );
+ return pg_u_ispunct(c, !pg_regex_locale->info.builtin.casemap_full );
case PG_REGEX_STRATEGY_LIBC_WIDE:
if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
return iswpunct_l((wint_t) c, pg_regex_locale->info.lt);
{
if (strcmp(locale, "C") == 0)
return -1;
- if (strcmp(locale, "C.UTF-8") == 0)
+ else if (strcmp(locale, "C.UTF-8") == 0)
return PG_UTF8;
+ else if (strcmp(locale, "PG_UNICODE_FAST") == 0)
+ return PG_UTF8;
+
ereport(ERROR,
(errcode(ERRCODE_WRONG_OBJECT_TYPE),
canonical_name = "C";
else if (strcmp(locale, "C.UTF-8") == 0 || strcmp(locale, "C.UTF8") == 0)
canonical_name = "C.UTF-8";
+ else if (strcmp(locale, "PG_UNICODE_FAST") == 0)
+ canonical_name = "PG_UNICODE_FAST";
if (!canonical_name)
ereport(ERROR,
strlower_builtin(char *dest, size_t destsize, const char *src, ssize_t srclen,
pg_locale_t locale)
{
- return unicode_strlower(dest, destsize, src, srclen, false);
+ return unicode_strlower(dest, destsize, src, srclen,
+ locale->info.builtin.casemap_full);
}
size_t
.prev_alnum = false,
};
- return unicode_strtitle(dest, destsize, src, srclen, false,
+ return unicode_strtitle(dest, destsize, src, srclen,
+ locale->info.builtin.casemap_full,
initcap_wbnext, &wbstate);
}
strupper_builtin(char *dest, size_t destsize, const char *src, ssize_t srclen,
pg_locale_t locale)
{
- return unicode_strupper(dest, destsize, src, srclen, false);
+ return unicode_strupper(dest, destsize, src, srclen,
+ locale->info.builtin.casemap_full);
}
pg_locale_t
result = MemoryContextAllocZero(context, sizeof(struct pg_locale_struct));
result->info.builtin.locale = MemoryContextStrdup(context, locstr);
+ result->info.builtin.casemap_full = (strcmp(locstr, "PG_UNICODE_FAST") == 0);
result->provider = COLLPROVIDER_BUILTIN;
result->deterministic = true;
result->collate_is_c = true;
return "1";
else if (strcmp(collcollate, "C.UTF-8") == 0)
return "1";
+ else if (strcmp(collcollate, "PG_UNICODE_FAST") == 0)
+ return "1";
else
ereport(ERROR,
(errcode(ERRCODE_WRONG_OBJECT_TYPE),
else if (strcmp(datlocale, "C.UTF-8") == 0 ||
strcmp(datlocale, "C.UTF8") == 0)
canonname = "C.UTF-8";
+ else if (strcmp(datlocale, "PG_UNICODE_FAST") == 0)
+ canonname = "PG_UNICODE_FAST";
else
pg_fatal("invalid locale name \"%s\" for builtin provider",
datlocale);
if (locale_provider == COLLPROVIDER_BUILTIN)
{
- if (strcmp(datlocale, "C.UTF-8") == 0 && encodingid != PG_UTF8)
+ if ((strcmp(datlocale, "C.UTF-8") == 0 ||
+ strcmp(datlocale, "PG_UNICODE_FAST") == 0) &&
+ encodingid != PG_UTF8)
pg_fatal("builtin provider locale \"%s\" requires encoding \"%s\"",
datlocale, "UTF-8");
}
*/
/* yyyymmddN */
-#define CATALOG_VERSION_NO 202501162
+#define CATALOG_VERSION_NO 202501171
#endif
descr => 'sorts by Unicode code point; Unicode and POSIX character semantics',
collname => 'pg_c_utf8', collprovider => 'b', collencoding => '6',
colllocale => 'C.UTF-8', collversion => '1' },
+{ oid => '9535', descr => 'sorts by Unicode code point; Unicode character semantics',
+ collname => 'pg_unicode_fast', collprovider => 'b', collencoding => '6',
+ colllocale => 'PG_UNICODE_FAST', collversion => '1' },
]
struct
{
const char *locale;
+ bool casemap_full;
} builtin;
locale_t lt;
#ifdef USE_ICU
t
(1 row)
+--
+-- Test PG_UNICODE_FAST
+--
+CREATE COLLATION regress_pg_unicode_fast (
+ provider = builtin, locale = 'unicode'); -- fails
+ERROR: invalid locale name "unicode" for builtin provider
+CREATE COLLATION regress_pg_unicode_fast (
+ provider = builtin, locale = 'PG_UNICODE_FAST');
+CREATE TABLE test_pg_unicode_fast (
+ t TEXT COLLATE PG_UNICODE_FAST
+);
+INSERT INTO test_pg_unicode_fast VALUES
+ ('abc DEF 123abc'),
+ ('ábc sßs ßss DÉF'),
+ ('DŽxxDŽ džxxDž Džxxdž'),
+ ('ȺȺȺ'),
+ ('ⱥⱥⱥ'),
+ ('ⱥȺ');
+SELECT
+ t, lower(t), initcap(t), upper(t),
+ length(convert_to(t, 'UTF8')) AS t_bytes,
+ length(convert_to(lower(t), 'UTF8')) AS lower_t_bytes,
+ length(convert_to(initcap(t), 'UTF8')) AS initcap_t_bytes,
+ length(convert_to(upper(t), 'UTF8')) AS upper_t_bytes
+ FROM test_pg_unicode_fast;
+ t | lower | initcap | upper | t_bytes | lower_t_bytes | initcap_t_bytes | upper_t_bytes
+-----------------+-----------------+------------------+-------------------+---------+---------------+-----------------+---------------
+ abc DEF 123abc | abc def 123abc | Abc Def 123abc | ABC DEF 123ABC | 14 | 14 | 14 | 14
+ ábc sßs ßss DÉF | ábc sßs ßss déf | Ábc Sßs Ssss Déf | ÁBC SSSS SSSS DÉF | 19 | 19 | 19 | 19
+ DŽxxDŽ džxxDž Džxxdž | džxxdž džxxdž džxxdž | Džxxdž Džxxdž Džxxdž | DŽXXDŽ DŽXXDŽ DŽXXDŽ | 20 | 20 | 20 | 20
+ ȺȺȺ | ⱥⱥⱥ | Ⱥⱥⱥ | ȺȺȺ | 6 | 9 | 8 | 6
+ ⱥⱥⱥ | ⱥⱥⱥ | Ⱥⱥⱥ | ȺȺȺ | 9 | 9 | 8 | 6
+ ⱥȺ | ⱥⱥ | Ⱥⱥ | ȺȺ | 5 | 6 | 5 | 4
+(6 rows)
+
+DROP TABLE test_pg_unicode_fast;
+-- test Final_Sigma
+SELECT lower('ΑΣ' COLLATE PG_UNICODE_FAST); -- 0391 03A3
+ lower
+-------
+ ας
+(1 row)
+
+SELECT lower('ΑΣ0' COLLATE PG_UNICODE_FAST); -- 0391 03A3 0030
+ lower
+-------
+ ας0
+(1 row)
+
+SELECT lower('ἈΣ̓' COLLATE PG_UNICODE_FAST); -- 0391 0343 03A3 0343
+ lower
+-------
+ ἀς̓
+(1 row)
+
+SELECT lower('ᾼΣͅ' COLLATE PG_UNICODE_FAST); -- 0391 0345 03A3 0345
+ lower
+-------
+ ᾳςͅ
+(1 row)
+
+-- test !Final_Sigma
+SELECT lower('Σ' COLLATE PG_UNICODE_FAST); -- 03A3
+ lower
+-------
+ σ
+(1 row)
+
+SELECT lower('0Σ' COLLATE PG_UNICODE_FAST); -- 0030 03A3
+ lower
+-------
+ 0σ
+(1 row)
+
+SELECT lower('ΑΣΑ' COLLATE PG_UNICODE_FAST); -- 0391 03A3 0391
+ lower
+-------
+ ασα
+(1 row)
+
+SELECT lower('ἈΣ̓Α' COLLATE PG_UNICODE_FAST); -- 0391 0343 03A3 0343 0391
+ lower
+-------
+ ἀσ̓α
+(1 row)
+
+SELECT lower('ᾼΣͅΑ' COLLATE PG_UNICODE_FAST); -- 0391 0345 03A3 0345 0391
+ lower
+-------
+ ᾳσͅα
+(1 row)
+
+-- properties
+SELECT 'xyz' ~ '[[:alnum:]]' COLLATE PG_UNICODE_FAST;
+ ?column?
+----------
+ t
+(1 row)
+
+SELECT 'xyz' !~ '[[:upper:]]' COLLATE PG_UNICODE_FAST;
+ ?column?
+----------
+ t
+(1 row)
+
+SELECT '@' !~ '[[:alnum:]]' COLLATE PG_UNICODE_FAST;
+ ?column?
+----------
+ t
+(1 row)
+
+SELECT '=' !~ '[[:punct:]]' COLLATE PG_UNICODE_FAST; -- symbols are not punctuation
+ ?column?
+----------
+ t
+(1 row)
+
+SELECT 'a8a' ~ '[[:digit:]]' COLLATE PG_UNICODE_FAST;
+ ?column?
+----------
+ t
+(1 row)
+
+SELECT '൧' ~ '\d' COLLATE PG_UNICODE_FAST;
+ ?column?
+----------
+ t
+(1 row)
+
+-- case mapping
+SELECT 'xYz' ~* 'XyZ' COLLATE PG_UNICODE_FAST;
+ ?column?
+----------
+ t
+(1 row)
+
+SELECT 'xAb' ~* '[W-Y]' COLLATE PG_UNICODE_FAST;
+ ?column?
+----------
+ t
+(1 row)
+
+SELECT 'xAb' !~* '[c-d]' COLLATE PG_UNICODE_FAST;
+ ?column?
+----------
+ t
+(1 row)
+
+SELECT 'Δ' ~* '[γ-λ]' COLLATE PG_UNICODE_FAST;
+ ?column?
+----------
+ t
+(1 row)
+
+SELECT 'δ' ~* '[Γ-Λ]' COLLATE PG_UNICODE_FAST; -- same as above with cases reversed
+ ?column?
+----------
+ t
+(1 row)
+
SELECT 'xAb' !~* '[c-d]' COLLATE PG_C_UTF8;
SELECT 'Δ' ~* '[γ-λ]' COLLATE PG_C_UTF8;
SELECT 'δ' ~* '[Γ-Λ]' COLLATE PG_C_UTF8; -- same as above with cases reversed
+
+--
+-- Test PG_UNICODE_FAST
+--
+
+CREATE COLLATION regress_pg_unicode_fast (
+ provider = builtin, locale = 'unicode'); -- fails
+CREATE COLLATION regress_pg_unicode_fast (
+ provider = builtin, locale = 'PG_UNICODE_FAST');
+
+CREATE TABLE test_pg_unicode_fast (
+ t TEXT COLLATE PG_UNICODE_FAST
+);
+INSERT INTO test_pg_unicode_fast VALUES
+ ('abc DEF 123abc'),
+ ('ábc sßs ßss DÉF'),
+ ('DŽxxDŽ džxxDž Džxxdž'),
+ ('ȺȺȺ'),
+ ('ⱥⱥⱥ'),
+ ('ⱥȺ');
+
+SELECT
+ t, lower(t), initcap(t), upper(t),
+ length(convert_to(t, 'UTF8')) AS t_bytes,
+ length(convert_to(lower(t), 'UTF8')) AS lower_t_bytes,
+ length(convert_to(initcap(t), 'UTF8')) AS initcap_t_bytes,
+ length(convert_to(upper(t), 'UTF8')) AS upper_t_bytes
+ FROM test_pg_unicode_fast;
+
+DROP TABLE test_pg_unicode_fast;
+
+-- test Final_Sigma
+SELECT lower('ΑΣ' COLLATE PG_UNICODE_FAST); -- 0391 03A3
+SELECT lower('ΑΣ0' COLLATE PG_UNICODE_FAST); -- 0391 03A3 0030
+SELECT lower('ἈΣ̓' COLLATE PG_UNICODE_FAST); -- 0391 0343 03A3 0343
+SELECT lower('ᾼΣͅ' COLLATE PG_UNICODE_FAST); -- 0391 0345 03A3 0345
+
+-- test !Final_Sigma
+SELECT lower('Σ' COLLATE PG_UNICODE_FAST); -- 03A3
+SELECT lower('0Σ' COLLATE PG_UNICODE_FAST); -- 0030 03A3
+SELECT lower('ΑΣΑ' COLLATE PG_UNICODE_FAST); -- 0391 03A3 0391
+SELECT lower('ἈΣ̓Α' COLLATE PG_UNICODE_FAST); -- 0391 0343 03A3 0343 0391
+SELECT lower('ᾼΣͅΑ' COLLATE PG_UNICODE_FAST); -- 0391 0345 03A3 0345 0391
+
+-- properties
+
+SELECT 'xyz' ~ '[[:alnum:]]' COLLATE PG_UNICODE_FAST;
+SELECT 'xyz' !~ '[[:upper:]]' COLLATE PG_UNICODE_FAST;
+SELECT '@' !~ '[[:alnum:]]' COLLATE PG_UNICODE_FAST;
+SELECT '=' !~ '[[:punct:]]' COLLATE PG_UNICODE_FAST; -- symbols are not punctuation
+SELECT 'a8a' ~ '[[:digit:]]' COLLATE PG_UNICODE_FAST;
+SELECT '൧' ~ '\d' COLLATE PG_UNICODE_FAST;
+
+-- case mapping
+
+SELECT 'xYz' ~* 'XyZ' COLLATE PG_UNICODE_FAST;
+SELECT 'xAb' ~* '[W-Y]' COLLATE PG_UNICODE_FAST;
+SELECT 'xAb' !~* '[c-d]' COLLATE PG_UNICODE_FAST;
+SELECT 'Δ' ~* '[γ-λ]' COLLATE PG_UNICODE_FAST;
+SELECT 'δ' ~* '[Γ-Λ]' COLLATE PG_UNICODE_FAST; -- same as above with cases reversed