Commit c6587319 authored by Alexandre Julliard

unicode: Add support for high Unicode planes in decomposition tables.

parent 148f564d
...@@ -6257,17 +6257,19 @@ static void test_NormalizeString(void) ...@@ -6257,17 +6257,19 @@ static void test_NormalizeString(void)
ok( dstlen == (i < 2 ? 15 : 64), "%d: wrong len %d\n", i, dstlen ); ok( dstlen == (i < 2 ? 15 : 64), "%d: wrong len %d\n", i, dstlen );
SetLastError( 0xdeadbeef ); SetLastError( 0xdeadbeef );
dstlen = pNormalizeString( norm_forms[i], L"AB\xd800Z", -1, dst, ARRAY_SIZE(dst) ); dstlen = pNormalizeString( norm_forms[i], L"AB\xd800Z", -1, dst, ARRAY_SIZE(dst) );
todo_wine ok( dstlen == -3, "%d: wrong len %d\n", i, dstlen ); ok( dstlen == -3, "%d: wrong len %d\n", i, dstlen );
todo_wine ok( GetLastError() == ERROR_NO_UNICODE_TRANSLATION, "%d: wrong error %d\n", i, GetLastError() ); ok( GetLastError() == ERROR_NO_UNICODE_TRANSLATION, "%d: wrong error %d\n", i, GetLastError() );
dstlen = pNormalizeString( norm_forms[i], L"ABCD\xdc12Z", -1, NULL, 0 ); dstlen = pNormalizeString( norm_forms[i], L"ABCD\xdc12Z", -1, NULL, 0 );
ok( dstlen == (i < 2 ? 21 : 64), "%d: wrong len %d\n", i, dstlen ); ok( dstlen == (i < 2 ? 21 : 64), "%d: wrong len %d\n", i, dstlen );
SetLastError( 0xdeadbeef ); SetLastError( 0xdeadbeef );
dstlen = pNormalizeString( norm_forms[i], L"ABCD\xdc12Z", -1, dst, ARRAY_SIZE(dst) ); dstlen = pNormalizeString( norm_forms[i], L"ABCD\xdc12Z", -1, dst, ARRAY_SIZE(dst) );
todo_wine ok( dstlen == -4, "%d: wrong len %d\n", i, dstlen ); ok( dstlen == -4, "%d: wrong len %d\n", i, dstlen );
todo_wine ok( GetLastError() == ERROR_NO_UNICODE_TRANSLATION, "%d: wrong error %d\n", i, GetLastError() ); ok( GetLastError() == ERROR_NO_UNICODE_TRANSLATION, "%d: wrong error %d\n", i, GetLastError() );
SetLastError( 0xdeadbeef ); SetLastError( 0xdeadbeef );
dstlen = pNormalizeString( norm_forms[i], L"ABCD\xdc12Z", -1, dst, 2 ); dstlen = pNormalizeString( norm_forms[i], L"ABCD\xdc12Z", -1, dst, 2 );
todo_wine ok( dstlen == (i < 2 ? -18 : -74), "%d: wrong len %d\n", i, dstlen ); todo_wine
ok( dstlen == (i < 2 ? -18 : -74), "%d: wrong len %d\n", i, dstlen );
todo_wine_if (i == 0 || i == 2)
ok( GetLastError() == ERROR_INSUFFICIENT_BUFFER, "%d: wrong error %d\n", i, GetLastError() ); ok( GetLastError() == ERROR_INSUFFICIENT_BUFFER, "%d: wrong error %d\n", i, GetLastError() );
if (pRtlNormalizeString) if (pRtlNormalizeString)
{ {
...@@ -6277,17 +6279,18 @@ static void test_NormalizeString(void) ...@@ -6277,17 +6279,18 @@ static void test_NormalizeString(void)
ok( dstlen == (i < 2 ? 15 : 64), "%d: wrong len %d\n", i, dstlen ); ok( dstlen == (i < 2 ? 15 : 64), "%d: wrong len %d\n", i, dstlen );
dstlen = ARRAY_SIZE(dst); dstlen = ARRAY_SIZE(dst);
status = pRtlNormalizeString( norm_forms[i], L"AB\xd800Z", -1, dst, &dstlen ); status = pRtlNormalizeString( norm_forms[i], L"AB\xd800Z", -1, dst, &dstlen );
todo_wine ok( status == STATUS_NO_UNICODE_TRANSLATION, "%d: failed %x\n", i, status ); ok( status == STATUS_NO_UNICODE_TRANSLATION, "%d: failed %x\n", i, status );
todo_wine ok( dstlen == 3, "%d: wrong len %d\n", i, dstlen ); ok( dstlen == 3, "%d: wrong len %d\n", i, dstlen );
dstlen = 1; dstlen = 1;
status = pRtlNormalizeString( norm_forms[i], L"AB\xd800Z", -1, dst, &dstlen ); status = pRtlNormalizeString( norm_forms[i], L"AB\xd800Z", -1, dst, &dstlen );
todo_wine_if( i == 0 || i == 2)
ok( status == STATUS_BUFFER_TOO_SMALL, "%d: failed %x\n", i, status ); ok( status == STATUS_BUFFER_TOO_SMALL, "%d: failed %x\n", i, status );
todo_wine_if (i != 3) todo_wine_if( i != 3)
ok( dstlen == (i < 2 ? 14 : 73), "%d: wrong len %d\n", i, dstlen ); ok( dstlen == (i < 2 ? 14 : 73), "%d: wrong len %d\n", i, dstlen );
dstlen = 2; dstlen = 2;
status = pRtlNormalizeString( norm_forms[i], L"AB\xd800Z", -1, dst, &dstlen ); status = pRtlNormalizeString( norm_forms[i], L"AB\xd800Z", -1, dst, &dstlen );
todo_wine ok( status == STATUS_NO_UNICODE_TRANSLATION, "%d: failed %x\n", i, status ); ok( status == STATUS_NO_UNICODE_TRANSLATION, "%d: failed %x\n", i, status );
todo_wine ok( dstlen == 3, "%d: wrong len %d\n", i, dstlen ); ok( dstlen == 3, "%d: wrong len %d\n", i, dstlen );
} }
} }
......
This source diff could not be displayed because it is too large. You can view the blob instead.
This source diff could not be displayed because it is too large. You can view the blob instead.
...@@ -155,14 +155,14 @@ static WCHAR casemap_ascii( WCHAR ch ) ...@@ -155,14 +155,14 @@ static WCHAR casemap_ascii( WCHAR ch )
} }
/* Look up the decomposition of ch in a multi-level index table.
 *
 * Side-by-side diff: on each line the left copy is the previous version and
 * the right copy is the new one (ch widened from WCHAR to unsigned int).
 *
 * table[ch >> 8] selects a sub-index, ((ch >> 4) & 0xf) picks an entry in it,
 * and (ch & 0xf) selects the final slot.  That slot and the one after it hold
 * the start and end offsets of the decomposition data inside the same table.
 *
 * Returns a pointer to the data and stores its length in *len.  When the
 * range is empty, returns NULL and sets *len to 1 (left copy) or to
 * 1 + (ch >= 0x10000) (right copy), i.e. 2 for values at or above 0x10000.
 */
static const WCHAR *get_decomposition( const unsigned short *table, WCHAR ch, unsigned int *len ) static const WCHAR *get_decomposition( const unsigned short *table, unsigned int ch, unsigned int *len )
{ {
unsigned short offset = table[table[ch >> 8] + ((ch >> 4) & 0xf)] + (ch & 0xf); unsigned short offset = table[table[ch >> 8] + ((ch >> 4) & 0xf)] + (ch & 0xf);
unsigned short start = table[offset]; unsigned short start = table[offset];
unsigned short end = table[offset + 1]; unsigned short end = table[offset + 1];
/* a non-empty [start, end) range means the table has an entry for ch */
if ((*len = end - start)) return table + start; if ((*len = end - start)) return table + start;
*len = 1; *len = 1 + (ch >= 0x10000);
return NULL; return NULL;
} }
...@@ -174,13 +174,13 @@ static BYTE get_combining_class( unsigned int c ) ...@@ -174,13 +174,13 @@ static BYTE get_combining_class( unsigned int c )
} }
/* Returns TRUE when get_combining_class( c ) is zero.
 * Side-by-side diff: the right (new) copy takes an unsigned int instead of a WCHAR.
 */
static BOOL is_starter( WCHAR c ) static BOOL is_starter( unsigned int c )
{ {
return !get_combining_class( c ); return !get_combining_class( c );
} }
static BOOL reorderable_pair( WCHAR c1, WCHAR c2 ) static BOOL reorderable_pair( unsigned int c1, unsigned int c2 )
{ {
BYTE ccc1, ccc2; BYTE ccc1, ccc2;
...@@ -191,23 +191,52 @@ static BOOL reorderable_pair( WCHAR c1, WCHAR c2 ) ...@@ -191,23 +191,52 @@ static BOOL reorderable_pair( WCHAR c1, WCHAR c2 )
return ccc2 && (ccc1 > ccc2); return ccc2 && (ccc1 > ccc2);
} }
/* Decode the code point starting at src[0] from UTF-16.
 *
 * src     input string; src[0] is always read
 * srclen  number of WCHARs available at src
 * ch      receives the decoded value on success, left untouched on failure
 *
 * Returns the number of WCHARs consumed (1 or 2), or 0 when src[0] is a low
 * surrogate, or a high surrogate that is not followed by a low surrogate
 * within srclen.
 */
static int get_utf16( const WCHAR *src, unsigned int srclen, unsigned int *ch )
{
    const WCHAR first = src[0];

    /* a low surrogate cannot start a character */
    if (IS_LOW_SURROGATE( first )) return 0;

    if (!IS_HIGH_SURROGATE( first ))
    {
        /* ordinary single-unit character */
        *ch = first;
        return 1;
    }

    /* high surrogate: a low surrogate must follow */
    if (srclen < 2 || !IS_LOW_SURROGATE( src[1] )) return 0;

    *ch = 0x10000 + ((first & 0x3ff) << 10) + (src[1] & 0x3ff);
    return 2;
}
/* Store ch at dst in UTF-16.
 *
 * Values below 0x10000 are written as the single unit dst[0]; larger values
 * are written as a surrogate pair in dst[0] and dst[1].  The caller must
 * provide room for the units written.
 */
static void put_utf16( WCHAR *dst, unsigned int ch )
{
    if (ch < 0x10000)
    {
        dst[0] = ch;
        return;
    }

    /* split the 20-bit offset above 0x10000 into two 10-bit halves */
    ch -= 0x10000;
    dst[0] = 0xd800 | (ch >> 10);
    dst[1] = 0xdc00 | (ch & 0x3ff);
}
static void canonical_order_substring( WCHAR *str, unsigned int len ) static void canonical_order_substring( WCHAR *str, unsigned int len )
{ {
unsigned int i; unsigned int i, ch1, ch2, len1, len2;
BOOL swapped; BOOL swapped;
do do
{ {
swapped = FALSE; swapped = FALSE;
for (i = 0; i < len - 1; i++) for (i = 0; i < len - 1; i += len1)
{ {
if (reorderable_pair( str[i], str[i + 1] )) if (!(len1 = get_utf16( str + i, len - i, &ch1 ))) break;
if (i + len1 >= len) break;
if (!(len2 = get_utf16( str + i + len1, len - i - len1, &ch2 ))) break;
if (reorderable_pair( ch1, ch2 ))
{ {
WCHAR tmp = str[i]; WCHAR tmp[2];
str[i] = str[i + 1]; memcpy( tmp, str + i, len1 * sizeof(WCHAR) );
str[i + 1] = tmp; memcpy( str + i, str + i + len1, len2 * sizeof(WCHAR) );
memcpy( str + i + len2, tmp, len1 * sizeof(WCHAR) );
swapped = TRUE; swapped = TRUE;
i += len2 - len1;
} }
} }
} while (swapped); } while (swapped);
...@@ -224,38 +253,43 @@ static void canonical_order_substring( WCHAR *str, unsigned int len ) ...@@ -224,38 +253,43 @@ static void canonical_order_substring( WCHAR *str, unsigned int len )
*/ */
/* Walk str and call canonical_order_substring() on each run between starters
 * that is longer than one unit.
 * Side-by-side diff: on each line the left copy is the previous version and
 * the right copy is the new one.
 */
static void canonical_order_string( WCHAR *str, unsigned int len ) static void canonical_order_string( WCHAR *str, unsigned int len )
{ {
unsigned int i, next = 0; unsigned int ch, i, r, next = 0;
/* left copy steps one WCHAR at a time; right copy steps by the unit count r returned by get_utf16() */
for (i = 1; i <= len; i++) for (i = 0; i < len; i += r)
{ {
/* right copy only: stop as soon as get_utf16() returns 0 */
if (i == len || is_starter( str[i] )) if (!(r = get_utf16( str + i, len - i, &ch ))) return;
if (i && is_starter( ch ))
{ {
if (i > next + 1) /* at least two successive non-starters */ if (i > next + 1) /* at least two successive non-starters */
canonical_order_substring( str + next, i - next ); canonical_order_substring( str + next, i - next );
next = i + 1; next = i + r;
} }
} }
/* right copy only: handle the run still pending when the loop reaches len */
if (i > next + 1) canonical_order_substring( str + next, i - next );
} }
static NTSTATUS decompose_string( int compat, const WCHAR *src, int src_len, WCHAR *dst, int *dst_len ) static NTSTATUS decompose_string( int compat, const WCHAR *src, int src_len, WCHAR *dst, int *dst_len )
{ {
const unsigned short *table = compat ? nfkd_table : nfd_table; const unsigned short *table = compat ? nfkd_table : nfd_table;
int src_pos, dst_pos = 0; int src_pos, dst_pos;
unsigned int decomp_len; unsigned int ch, len, decomp_len;
const WCHAR *decomp; const WCHAR *decomp;
for (src_pos = 0; src_pos < src_len; src_pos++) for (src_pos = dst_pos = 0; src_pos < src_len; src_pos += len, dst_pos += decomp_len)
{ {
if (dst_pos == *dst_len) break; if (!(len = get_utf16( src + src_pos, src_len - src_pos, &ch )) ||
if ((decomp = get_decomposition( table, src[src_pos], &decomp_len ))) (ch >= 0xfdd0 && ch <= 0xfdef) || ((ch & 0xffff) >= 0xfffe))
{ {
if (dst_pos + decomp_len > *dst_len) break; *dst_len = src_pos + IS_HIGH_SURROGATE( src[src_pos] );
memcpy( dst + dst_pos, decomp, decomp_len * sizeof(WCHAR) ); return STATUS_NO_UNICODE_TRANSLATION;
} }
else dst[dst_pos] = src[src_pos]; decomp = get_decomposition( table, ch, &decomp_len );
dst_pos += decomp_len; if (dst_pos + decomp_len > *dst_len) break;
if (decomp) memcpy( dst + dst_pos, decomp, decomp_len * sizeof(WCHAR) );
else put_utf16( dst + dst_pos, ch );
} }
if (src_pos < src_len) if (src_pos < src_len)
{ {
*dst_len += (src_len - src_pos) * (compat ? 18 : 3); *dst_len += (src_len - src_pos) * (compat ? 18 : 3);
...@@ -1554,21 +1588,6 @@ NTSTATUS WINAPI RtlUTF8ToUnicodeN( WCHAR *dst, DWORD dstlen, DWORD *reslen, cons ...@@ -1554,21 +1588,6 @@ NTSTATUS WINAPI RtlUTF8ToUnicodeN( WCHAR *dst, DWORD dstlen, DWORD *reslen, cons
} }
/* Return the value of the character at src[0], combining a surrogate pair
 * into a single value at or above 0x10000.  Returns 0 when src[0] is a
 * surrogate that does not begin a well-formed pair within srclen. */
static inline unsigned int get_surrogate_value( const WCHAR *src, unsigned int srclen )
{
    const WCHAR lead = src[0];

    /* not a surrogate at all: the unit is the value */
    if (lead < 0xd800 || lead > 0xdfff) return lead;

    if (lead > 0xdbff) return 0;  /* invalid high surrogate */
    if (srclen <= 1) return 0;    /* missing low surrogate */
    if (src[1] < 0xdc00 || src[1] > 0xdfff) return 0;  /* invalid low surrogate */

    return 0x10000 + ((lead & 0x3ff) << 10) + (src[1] & 0x3ff);
}
/************************************************************************** /**************************************************************************
* RtlUnicodeToUTF8N (NTDLL.@) * RtlUnicodeToUTF8N (NTDLL.@)
*/ */
...@@ -1592,7 +1611,7 @@ NTSTATUS WINAPI RtlUnicodeToUTF8N( char *dst, DWORD dstlen, DWORD *reslen, const ...@@ -1592,7 +1611,7 @@ NTSTATUS WINAPI RtlUnicodeToUTF8N( char *dst, DWORD dstlen, DWORD *reslen, const
else if (*src < 0x800) len += 2; /* 0x80-0x7ff: 2 bytes */ else if (*src < 0x800) len += 2; /* 0x80-0x7ff: 2 bytes */
else else
{ {
if (!(val = get_surrogate_value( src, srclen ))) if (!get_utf16( src, srclen, &val ))
{ {
val = 0xfffd; val = 0xfffd;
status = STATUS_SOME_NOT_MAPPED; status = STATUS_SOME_NOT_MAPPED;
...@@ -1629,7 +1648,7 @@ NTSTATUS WINAPI RtlUnicodeToUTF8N( char *dst, DWORD dstlen, DWORD *reslen, const ...@@ -1629,7 +1648,7 @@ NTSTATUS WINAPI RtlUnicodeToUTF8N( char *dst, DWORD dstlen, DWORD *reslen, const
dst += 2; dst += 2;
continue; continue;
} }
if (!(val = get_surrogate_value( src, srclen ))) if (!get_utf16( src, srclen, &val ))
{ {
val = 0xfffd; val = 0xfffd;
status = STATUS_SOME_NOT_MAPPED; status = STATUS_SOME_NOT_MAPPED;
......
...@@ -480,7 +480,7 @@ sub build_decompositions(@) ...@@ -480,7 +480,7 @@ sub build_decompositions(@)
my @src = @_; my @src = @_;
my @dst; my @dst;
for (my $i = 0; $i < 65536; $i++) for (my $i = 0; $i < @src; $i++)
{ {
next unless defined $src[$i]; next unless defined $src[$i];
my @decomp = get_decomposition( $i, \@src ); my @decomp = get_decomposition( $i, \@src );
...@@ -2092,10 +2092,13 @@ sub dump_decompositions($@) ...@@ -2092,10 +2092,13 @@ sub dump_decompositions($@)
# first determine all the 16-char subsets that contain something # first determine all the 16-char subsets that contain something
my @filled = (0) x 4096; my $level1 = ($MAX_CHAR + 1) / 16;
my $level2 = $level1 / 16;
my @filled = (0) x $level1;
my $pos = 16; # for the null subset my $pos = 16; # for the null subset
my $data_total = 0; my $data_total = 0;
for (my $i = 0; $i < 65536; $i++) for (my $i = 0; $i <= $MAX_CHAR; $i++)
{ {
next unless defined $decomp[$i]; next unless defined $decomp[$i];
if ($filled[$i >> 4] == 0) if ($filled[$i >> 4] == 0)
...@@ -2109,9 +2112,9 @@ sub dump_decompositions($@) ...@@ -2109,9 +2112,9 @@ sub dump_decompositions($@)
# now count the 256-char subsets that contain something # now count the 256-char subsets that contain something
my @filled_idx = (256) x 256; my @filled_idx = ($level2) x $level2;
$pos = 256 + 16; $pos = $level2 + 16;
for (my $i = 0; $i < 4096; $i++) for (my $i = 0; $i < $level1; $i++)
{ {
next unless $filled[$i]; next unless $filled[$i];
$filled_idx[$i >> 4] = $pos; $filled_idx[$i >> 4] = $pos;
...@@ -2123,7 +2126,7 @@ sub dump_decompositions($@) ...@@ -2123,7 +2126,7 @@ sub dump_decompositions($@)
# add the index offsets to the subsets positions # add the index offsets to the subsets positions
for (my $i = 0; $i < 4096; $i++) for (my $i = 0; $i < $level1; $i++)
{ {
next unless $filled[$i]; next unless $filled[$i];
$filled[$i] += $null_offset; $filled[$i] += $null_offset;
...@@ -2138,9 +2141,9 @@ sub dump_decompositions($@) ...@@ -2138,9 +2141,9 @@ sub dump_decompositions($@)
# dump the second-level indexes # dump the second-level indexes
for (my $i = 0; $i < 256; $i++) for (my $i = 0; $i < $level2; $i++)
{ {
next unless ($filled_idx[$i] > 256); next unless ($filled_idx[$i] > $level2);
my @table = @filled[($i<<4)..($i<<4)+15]; my @table = @filled[($i<<4)..($i<<4)+15];
for (my $j = 0; $j < 16; $j++) { $table[$j] ||= $null_offset; } for (my $j = 0; $j < 16; $j++) { $table[$j] ||= $null_offset; }
printf OUTPUT ",\n /* sub-index %02x */\n", $i; printf OUTPUT ",\n /* sub-index %02x */\n", $i;
...@@ -2155,7 +2158,7 @@ sub dump_decompositions($@) ...@@ -2155,7 +2158,7 @@ sub dump_decompositions($@)
$pos = $total; $pos = $total;
my @data; my @data;
for (my $i = 0; $i < 4096; $i++) for (my $i = 0; $i < $level1; $i++)
{ {
next unless $filled[$i]; next unless $filled[$i];
my @table = (0) x (16); my @table = (0) x (16);
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment