Commit c6587319 authored by Alexandre Julliard

unicode: Add support for high Unicode planes in decomposition tables.

parent 148f564d
......@@ -6257,17 +6257,19 @@ static void test_NormalizeString(void)
ok( dstlen == (i < 2 ? 15 : 64), "%d: wrong len %d\n", i, dstlen );
SetLastError( 0xdeadbeef );
dstlen = pNormalizeString( norm_forms[i], L"AB\xd800Z", -1, dst, ARRAY_SIZE(dst) );
todo_wine ok( dstlen == -3, "%d: wrong len %d\n", i, dstlen );
todo_wine ok( GetLastError() == ERROR_NO_UNICODE_TRANSLATION, "%d: wrong error %d\n", i, GetLastError() );
ok( dstlen == -3, "%d: wrong len %d\n", i, dstlen );
ok( GetLastError() == ERROR_NO_UNICODE_TRANSLATION, "%d: wrong error %d\n", i, GetLastError() );
dstlen = pNormalizeString( norm_forms[i], L"ABCD\xdc12Z", -1, NULL, 0 );
ok( dstlen == (i < 2 ? 21 : 64), "%d: wrong len %d\n", i, dstlen );
SetLastError( 0xdeadbeef );
dstlen = pNormalizeString( norm_forms[i], L"ABCD\xdc12Z", -1, dst, ARRAY_SIZE(dst) );
todo_wine ok( dstlen == -4, "%d: wrong len %d\n", i, dstlen );
todo_wine ok( GetLastError() == ERROR_NO_UNICODE_TRANSLATION, "%d: wrong error %d\n", i, GetLastError() );
ok( dstlen == -4, "%d: wrong len %d\n", i, dstlen );
ok( GetLastError() == ERROR_NO_UNICODE_TRANSLATION, "%d: wrong error %d\n", i, GetLastError() );
SetLastError( 0xdeadbeef );
dstlen = pNormalizeString( norm_forms[i], L"ABCD\xdc12Z", -1, dst, 2 );
todo_wine ok( dstlen == (i < 2 ? -18 : -74), "%d: wrong len %d\n", i, dstlen );
todo_wine
ok( dstlen == (i < 2 ? -18 : -74), "%d: wrong len %d\n", i, dstlen );
todo_wine_if (i == 0 || i == 2)
ok( GetLastError() == ERROR_INSUFFICIENT_BUFFER, "%d: wrong error %d\n", i, GetLastError() );
if (pRtlNormalizeString)
{
......@@ -6277,17 +6279,18 @@ static void test_NormalizeString(void)
ok( dstlen == (i < 2 ? 15 : 64), "%d: wrong len %d\n", i, dstlen );
dstlen = ARRAY_SIZE(dst);
status = pRtlNormalizeString( norm_forms[i], L"AB\xd800Z", -1, dst, &dstlen );
todo_wine ok( status == STATUS_NO_UNICODE_TRANSLATION, "%d: failed %x\n", i, status );
todo_wine ok( dstlen == 3, "%d: wrong len %d\n", i, dstlen );
ok( status == STATUS_NO_UNICODE_TRANSLATION, "%d: failed %x\n", i, status );
ok( dstlen == 3, "%d: wrong len %d\n", i, dstlen );
dstlen = 1;
status = pRtlNormalizeString( norm_forms[i], L"AB\xd800Z", -1, dst, &dstlen );
todo_wine_if( i == 0 || i == 2)
ok( status == STATUS_BUFFER_TOO_SMALL, "%d: failed %x\n", i, status );
todo_wine_if (i != 3)
todo_wine_if( i != 3)
ok( dstlen == (i < 2 ? 14 : 73), "%d: wrong len %d\n", i, dstlen );
dstlen = 2;
status = pRtlNormalizeString( norm_forms[i], L"AB\xd800Z", -1, dst, &dstlen );
todo_wine ok( status == STATUS_NO_UNICODE_TRANSLATION, "%d: failed %x\n", i, status );
todo_wine ok( dstlen == 3, "%d: wrong len %d\n", i, dstlen );
ok( status == STATUS_NO_UNICODE_TRANSLATION, "%d: failed %x\n", i, status );
ok( dstlen == 3, "%d: wrong len %d\n", i, dstlen );
}
}
......
This source diff could not be displayed because it is too large. You can view the blob instead.
This source diff could not be displayed because it is too large. You can view the blob instead.
......@@ -155,14 +155,14 @@ static WCHAR casemap_ascii( WCHAR ch )
}
static const WCHAR *get_decomposition( const unsigned short *table, WCHAR ch, unsigned int *len )
static const WCHAR *get_decomposition( const unsigned short *table, unsigned int ch, unsigned int *len )
{
unsigned short offset = table[table[ch >> 8] + ((ch >> 4) & 0xf)] + (ch & 0xf);
unsigned short start = table[offset];
unsigned short end = table[offset + 1];
if ((*len = end - start)) return table + start;
*len = 1;
*len = 1 + (ch >= 0x10000);
return NULL;
}
......@@ -174,13 +174,13 @@ static BYTE get_combining_class( unsigned int c )
}
static BOOL is_starter( WCHAR c )
static BOOL is_starter( unsigned int c )
{
return !get_combining_class( c );
}
static BOOL reorderable_pair( WCHAR c1, WCHAR c2 )
static BOOL reorderable_pair( unsigned int c1, unsigned int c2 )
{
BYTE ccc1, ccc2;
......@@ -191,23 +191,52 @@ static BOOL reorderable_pair( WCHAR c1, WCHAR c2 )
return ccc2 && (ccc1 > ccc2);
}
/* Decode one code point from the start of a UTF-16 buffer.
 *
 * src    - buffer to read from; src[0] is always read
 * srclen - number of WCHARs available at src
 * ch     - receives the decoded code point; written only on success
 *
 * Returns the number of WCHARs consumed (1 or 2), or 0 if the buffer
 * starts with an unpaired surrogate.
 */
static int get_utf16( const WCHAR *src, unsigned int srclen, unsigned int *ch )
{
    const WCHAR lead = src[0];

    if (!IS_HIGH_SURROGATE( lead ))
    {
        /* a low surrogate with no preceding high surrogate is invalid */
        if (IS_LOW_SURROGATE( lead )) return 0;
        *ch = lead;
        return 1;
    }

    /* high surrogate: it must be followed by a low surrogate */
    if (srclen < 2 || !IS_LOW_SURROGATE( src[1] )) return 0;

    *ch = 0x10000 + ((lead & 0x3ff) << 10) + (src[1] & 0x3ff);
    return 2;
}
/* Store a code point at dst as UTF-16.
 *
 * Values below 0x10000 are written as a single WCHAR; anything else is
 * written as a surrogate pair in dst[0] and dst[1]. No bounds or range
 * checking is done here, so the caller must provide enough room.
 */
static void put_utf16( WCHAR *dst, unsigned int ch )
{
    unsigned int offset;

    if (ch < 0x10000)
    {
        dst[0] = ch;
        return;
    }

    offset = ch - 0x10000;
    dst[0] = 0xd800 | (offset >> 10);    /* high surrogate: upper bits of the offset */
    dst[1] = 0xdc00 | (offset & 0x3ff);  /* low surrogate: lower 10 bits */
}
static void canonical_order_substring( WCHAR *str, unsigned int len )
{
unsigned int i;
unsigned int i, ch1, ch2, len1, len2;
BOOL swapped;
do
{
swapped = FALSE;
for (i = 0; i < len - 1; i++)
for (i = 0; i < len - 1; i += len1)
{
if (reorderable_pair( str[i], str[i + 1] ))
if (!(len1 = get_utf16( str + i, len - i, &ch1 ))) break;
if (i + len1 >= len) break;
if (!(len2 = get_utf16( str + i + len1, len - i - len1, &ch2 ))) break;
if (reorderable_pair( ch1, ch2 ))
{
WCHAR tmp = str[i];
str[i] = str[i + 1];
str[i + 1] = tmp;
WCHAR tmp[2];
memcpy( tmp, str + i, len1 * sizeof(WCHAR) );
memcpy( str + i, str + i + len1, len2 * sizeof(WCHAR) );
memcpy( str + i + len2, tmp, len1 * sizeof(WCHAR) );
swapped = TRUE;
i += len2 - len1;
}
}
} while (swapped);
......@@ -224,38 +253,43 @@ static void canonical_order_substring( WCHAR *str, unsigned int len )
*/
static void canonical_order_string( WCHAR *str, unsigned int len )
{
unsigned int i, next = 0;
unsigned int ch, i, r, next = 0;
for (i = 1; i <= len; i++)
for (i = 0; i < len; i += r)
{
if (i == len || is_starter( str[i] ))
if (!(r = get_utf16( str + i, len - i, &ch ))) return;
if (i && is_starter( ch ))
{
if (i > next + 1) /* at least two successive non-starters */
canonical_order_substring( str + next, i - next );
next = i + 1;
next = i + r;
}
}
if (i > next + 1) canonical_order_substring( str + next, i - next );
}
static NTSTATUS decompose_string( int compat, const WCHAR *src, int src_len, WCHAR *dst, int *dst_len )
{
const unsigned short *table = compat ? nfkd_table : nfd_table;
int src_pos, dst_pos = 0;
unsigned int decomp_len;
int src_pos, dst_pos;
unsigned int ch, len, decomp_len;
const WCHAR *decomp;
for (src_pos = 0; src_pos < src_len; src_pos++)
for (src_pos = dst_pos = 0; src_pos < src_len; src_pos += len, dst_pos += decomp_len)
{
if (dst_pos == *dst_len) break;
if ((decomp = get_decomposition( table, src[src_pos], &decomp_len )))
if (!(len = get_utf16( src + src_pos, src_len - src_pos, &ch )) ||
(ch >= 0xfdd0 && ch <= 0xfdef) || ((ch & 0xffff) >= 0xfffe))
{
if (dst_pos + decomp_len > *dst_len) break;
memcpy( dst + dst_pos, decomp, decomp_len * sizeof(WCHAR) );
*dst_len = src_pos + IS_HIGH_SURROGATE( src[src_pos] );
return STATUS_NO_UNICODE_TRANSLATION;
}
else dst[dst_pos] = src[src_pos];
dst_pos += decomp_len;
decomp = get_decomposition( table, ch, &decomp_len );
if (dst_pos + decomp_len > *dst_len) break;
if (decomp) memcpy( dst + dst_pos, decomp, decomp_len * sizeof(WCHAR) );
else put_utf16( dst + dst_pos, ch );
}
if (src_pos < src_len)
{
*dst_len += (src_len - src_pos) * (compat ? 18 : 3);
......@@ -1554,21 +1588,6 @@ NTSTATUS WINAPI RtlUTF8ToUnicodeN( WCHAR *dst, DWORD dstlen, DWORD *reslen, cons
}
/* Return the value of the next character in src, combining a surrogate
 * pair into a single code point.
 *
 * Returns 0 when src starts with an invalid surrogate sequence. A src[0]
 * that is itself 0 is also returned as 0.
 */
static inline unsigned int get_surrogate_value( const WCHAR *src, unsigned int srclen )
{
    /* not in the surrogate range: the unit is the value */
    if (src[0] < 0xd800 || src[0] > 0xdfff) return src[0];

    if (src[0] > 0xdbff) return 0;                     /* invalid high surrogate */
    if (srclen <= 1) return 0;                         /* missing low surrogate */
    if (src[1] < 0xdc00 || src[1] > 0xdfff) return 0;  /* invalid low surrogate */

    return 0x10000 + ((src[0] & 0x3ff) << 10) + (src[1] & 0x3ff);
}
/**************************************************************************
* RtlUnicodeToUTF8N (NTDLL.@)
*/
......@@ -1592,7 +1611,7 @@ NTSTATUS WINAPI RtlUnicodeToUTF8N( char *dst, DWORD dstlen, DWORD *reslen, const
else if (*src < 0x800) len += 2; /* 0x80-0x7ff: 2 bytes */
else
{
if (!(val = get_surrogate_value( src, srclen )))
if (!get_utf16( src, srclen, &val ))
{
val = 0xfffd;
status = STATUS_SOME_NOT_MAPPED;
......@@ -1629,7 +1648,7 @@ NTSTATUS WINAPI RtlUnicodeToUTF8N( char *dst, DWORD dstlen, DWORD *reslen, const
dst += 2;
continue;
}
if (!(val = get_surrogate_value( src, srclen )))
if (!get_utf16( src, srclen, &val ))
{
val = 0xfffd;
status = STATUS_SOME_NOT_MAPPED;
......
......@@ -480,7 +480,7 @@ sub build_decompositions(@)
my @src = @_;
my @dst;
for (my $i = 0; $i < 65536; $i++)
for (my $i = 0; $i < @src; $i++)
{
next unless defined $src[$i];
my @decomp = get_decomposition( $i, \@src );
......@@ -2092,10 +2092,13 @@ sub dump_decompositions($@)
# first determine all the 16-char subsets that contain something
my @filled = (0) x 4096;
my $level1 = ($MAX_CHAR + 1) / 16;
my $level2 = $level1 / 16;
my @filled = (0) x $level1;
my $pos = 16; # for the null subset
my $data_total = 0;
for (my $i = 0; $i < 65536; $i++)
for (my $i = 0; $i <= $MAX_CHAR; $i++)
{
next unless defined $decomp[$i];
if ($filled[$i >> 4] == 0)
......@@ -2109,9 +2112,9 @@ sub dump_decompositions($@)
# now count the 256-char subsets that contain something
my @filled_idx = (256) x 256;
$pos = 256 + 16;
for (my $i = 0; $i < 4096; $i++)
my @filled_idx = ($level2) x $level2;
$pos = $level2 + 16;
for (my $i = 0; $i < $level1; $i++)
{
next unless $filled[$i];
$filled_idx[$i >> 4] = $pos;
......@@ -2123,7 +2126,7 @@ sub dump_decompositions($@)
# add the index offsets to the subsets positions
for (my $i = 0; $i < 4096; $i++)
for (my $i = 0; $i < $level1; $i++)
{
next unless $filled[$i];
$filled[$i] += $null_offset;
......@@ -2138,9 +2141,9 @@ sub dump_decompositions($@)
# dump the second-level indexes
for (my $i = 0; $i < 256; $i++)
for (my $i = 0; $i < $level2; $i++)
{
next unless ($filled_idx[$i] > 256);
next unless ($filled_idx[$i] > $level2);
my @table = @filled[($i<<4)..($i<<4)+15];
for (my $j = 0; $j < 16; $j++) { $table[$j] ||= $null_offset; }
printf OUTPUT ",\n /* sub-index %02x */\n", $i;
......@@ -2155,7 +2158,7 @@ sub dump_decompositions($@)
$pos = $total;
my @data;
for (my $i = 0; $i < 4096; $i++)
for (my $i = 0; $i < $level1; $i++)
{
next unless $filled[$i];
my @table = (0) x (16);
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment