unicode: Add support for high Unicode planes in decomposition tables.

Signed-off-by: Alexandre Julliard <julliard@winehq.org>

unicode: Add support for high Unicode planes in decomposition tables.
c6587319 · Alexandre Julliard · 148f564d
Commit c6587319 authored Feb 17, 2020 by Alexandre Julliard
5 changed files
--- a/dlls/kernel32/tests/locale.c
+++ b/dlls/kernel32/tests/locale.c
@@ -6257,17 +6257,19 @@ static void test_NormalizeString(void)
        ok( dstlen == (i < 2 ? 15 : 64), "%d: wrong len %d\n", i, dstlen );
        SetLastError( 0xdeadbeef );
        dstlen = pNormalizeString( norm_forms[i], L"AB\xd800Z", -1, dst, ARRAY_SIZE(dst) );
-        todo_wine ok( dstlen == -3, "%d: wrong len %d\n", i, dstlen );
+        ok( dstlen == -3, "%d: wrong len %d\n", i, dstlen );
-        todo_wine ok( GetLastError() == ERROR_NO_UNICODE_TRANSLATION, "%d: wrong error %d\n", i, GetLastError() );
+        ok( GetLastError() == ERROR_NO_UNICODE_TRANSLATION, "%d: wrong error %d\n", i, GetLastError() );
        dstlen = pNormalizeString( norm_forms[i], L"ABCD\xdc12Z", -1, NULL, 0 );
        ok( dstlen == (i < 2 ? 21 : 64), "%d: wrong len %d\n", i, dstlen );
        SetLastError( 0xdeadbeef );
        dstlen = pNormalizeString( norm_forms[i], L"ABCD\xdc12Z", -1, dst, ARRAY_SIZE(dst) );
-        todo_wine ok( dstlen == -4, "%d: wrong len %d\n", i, dstlen );
+        ok( dstlen == -4, "%d: wrong len %d\n", i, dstlen );
-        todo_wine ok( GetLastError() == ERROR_NO_UNICODE_TRANSLATION, "%d: wrong error %d\n", i, GetLastError() );
+        ok( GetLastError() == ERROR_NO_UNICODE_TRANSLATION, "%d: wrong error %d\n", i, GetLastError() );
        SetLastError( 0xdeadbeef );
        dstlen = pNormalizeString( norm_forms[i], L"ABCD\xdc12Z", -1, dst, 2 );
-        todo_wine ok( dstlen == (i < 2 ? -18 : -74), "%d: wrong len %d\n", i, dstlen );
+        todo_wine
+        ok( dstlen == (i < 2 ? -18 : -74), "%d: wrong len %d\n", i, dstlen );
+        todo_wine_if (i == 0 || i == 2)
        ok( GetLastError() == ERROR_INSUFFICIENT_BUFFER, "%d: wrong error %d\n", i, GetLastError() );
        if (pRtlNormalizeString)
        {
@@ -6277,17 +6279,18 @@ static void test_NormalizeString(void)
            ok( dstlen == (i < 2 ? 15 : 64), "%d: wrong len %d\n", i, dstlen );
            dstlen = ARRAY_SIZE(dst);
            status = pRtlNormalizeString( norm_forms[i], L"AB\xd800Z", -1, dst, &dstlen );
-            todo_wine ok( status == STATUS_NO_UNICODE_TRANSLATION, "%d: failed %x\n", i, status );
+            ok( status == STATUS_NO_UNICODE_TRANSLATION, "%d: failed %x\n", i, status );
-            todo_wine ok( dstlen == 3, "%d: wrong len %d\n", i, dstlen );
+            ok( dstlen == 3, "%d: wrong len %d\n", i, dstlen );
            dstlen = 1;
            status = pRtlNormalizeString( norm_forms[i], L"AB\xd800Z", -1, dst, &dstlen );
+            todo_wine_if( i == 0 || i == 2)
            ok( status == STATUS_BUFFER_TOO_SMALL, "%d: failed %x\n", i, status );
-            todo_wine_if (i != 3)
+            todo_wine_if( i != 3)
            ok( dstlen == (i < 2 ? 14 : 73), "%d: wrong len %d\n", i, dstlen );
            dstlen = 2;
            status = pRtlNormalizeString( norm_forms[i], L"AB\xd800Z", -1, dst, &dstlen );
-            todo_wine ok( status == STATUS_NO_UNICODE_TRANSLATION, "%d: failed %x\n", i, status );
+            ok( status == STATUS_NO_UNICODE_TRANSLATION, "%d: failed %x\n", i, status );
-            todo_wine ok( dstlen == 3, "%d: wrong len %d\n", i, dstlen );
+            ok( dstlen == 3, "%d: wrong len %d\n", i, dstlen );
        }
    }

--- a/dlls/kernelbase/decompose.c
+++ b/dlls/kernelbase/decompose.c
--- a/dlls/ntdll/decompose.c
+++ b/dlls/ntdll/decompose.c
--- a/dlls/ntdll/locale.c
+++ b/dlls/ntdll/locale.c
@@ -155,14 +155,14 @@ static WCHAR casemap_ascii( WCHAR ch )
 }
-static const WCHAR *get_decomposition( const unsigned short *table, WCHAR ch, unsigned int *len )
+static const WCHAR *get_decomposition( const unsigned short *table, unsigned int ch, unsigned int *len )
 {
    unsigned short offset = table[table[ch >> 8] + ((ch >> 4) & 0xf)] + (ch & 0xf);
    unsigned short start = table[offset];
    unsigned short end = table[offset + 1];
    if ((*len = end - start)) return table + start;
-    *len = 1;
+    *len = 1 + (ch >= 0x10000);
    return NULL;
 }
@@ -174,13 +174,13 @@ static BYTE get_combining_class( unsigned int c )
 }
-static BOOL is_starter( WCHAR c )
+static BOOL is_starter( unsigned int c )
 {
    return !get_combining_class( c );
 }
-static BOOL reorderable_pair( WCHAR c1, WCHAR c2 )
+static BOOL reorderable_pair( unsigned int c1, unsigned int c2 )
 {
    BYTE ccc1, ccc2;
@@ -191,23 +191,52 @@ static BOOL reorderable_pair( WCHAR c1, WCHAR c2 )
    return ccc2 && (ccc1 > ccc2);
 }
+static int get_utf16( const WCHAR *src, unsigned int srclen, unsigned int *ch )
+{
+    if (IS_HIGH_SURROGATE( src[0] ))
+    {
+        if (srclen <= 1) return 0;
+        if (!IS_LOW_SURROGATE( src[1] )) return 0;
+        *ch = 0x10000 + ((src[0] & 0x3ff) << 10) + (src[1] & 0x3ff);
+        return 2;
+    }
+    if (IS_LOW_SURROGATE( src[0] )) return 0;
+    *ch = src[0];
+    return 1;
+}
+static void put_utf16( WCHAR *dst, unsigned int ch )
+{
+    if (ch >= 0x10000)
+    {
+        ch -= 0x10000;
+        dst[0] = 0xd800 | (ch >> 10);
+        dst[1] = 0xdc00 | (ch & 0x3ff);
+    }
+    else dst[0] = ch;
+}
 static void canonical_order_substring( WCHAR *str, unsigned int len )
 {
-    unsigned int i;
+    unsigned int i, ch1, ch2, len1, len2;
    BOOL swapped;
    do
    {
        swapped = FALSE;
-        for (i = 0; i < len - 1; i++)
+        for (i = 0; i < len - 1; i += len1)
        {
-            if (reorderable_pair( str[i], str[i + 1] ))
+            if (!(len1 = get_utf16( str + i, len - i, &ch1 ))) break;
+            if (i + len1 >= len) break;
+            if (!(len2 = get_utf16( str + i + len1, len - i - len1, &ch2 ))) break;
+            if (reorderable_pair( ch1, ch2 ))
            {
-                WCHAR tmp = str[i];
+                WCHAR tmp[2];
-                str[i] = str[i + 1];
+                memcpy( tmp, str + i, len1 * sizeof(WCHAR) );
-                str[i + 1] = tmp;
+                memcpy( str + i, str + i + len1, len2 * sizeof(WCHAR) );
+                memcpy( str + i + len2, tmp, len1 * sizeof(WCHAR) );
                swapped = TRUE;
+                i += len2 - len1;
            }
        }
    } while (swapped);
@@ -224,38 +253,43 @@ static void canonical_order_substring( WCHAR *str, unsigned int len )
 */
 static void canonical_order_string( WCHAR *str, unsigned int len )
 {
-    unsigned int i, next = 0;
+    unsigned int ch, i, r, next = 0;
-    for (i = 1; i <= len; i++)
+    for (i = 0; i < len; i += r)
    {
-        if (i == len || is_starter( str[i] ))
+        if (!(r = get_utf16( str + i, len - i, &ch ))) return;
+        if (i && is_starter( ch ))
        {
            if (i > next + 1) /* at least two successive non-starters */
                canonical_order_substring( str + next, i - next );
-            next = i + 1;
+            next = i + r;
        }
    }
+    if (i > next + 1) canonical_order_substring( str + next, i - next );
 }
 static NTSTATUS decompose_string( int compat, const WCHAR *src, int src_len, WCHAR *dst, int *dst_len )
 {
    const unsigned short *table = compat ? nfkd_table : nfd_table;
-    int src_pos, dst_pos = 0;
+    int src_pos, dst_pos;
-    unsigned int decomp_len;
+    unsigned int ch, len, decomp_len;
    const WCHAR *decomp;
-    for (src_pos = 0; src_pos < src_len; src_pos++)
+    for (src_pos = dst_pos = 0; src_pos < src_len; src_pos += len, dst_pos += decomp_len)
    {
-        if (dst_pos == *dst_len) break;
+        if (!(len = get_utf16( src + src_pos, src_len - src_pos, &ch )) ||
-        if ((decomp = get_decomposition( table, src[src_pos], &decomp_len )))
+            (ch >= 0xfdd0 && ch <= 0xfdef) || ((ch & 0xffff) >= 0xfffe))
        {
-            if (dst_pos + decomp_len > *dst_len) break;
+            *dst_len = src_pos + IS_HIGH_SURROGATE( src[src_pos] );
-            memcpy( dst + dst_pos, decomp, decomp_len * sizeof(WCHAR) );
+            return STATUS_NO_UNICODE_TRANSLATION;
        }
-        else dst[dst_pos] = src[src_pos];
+        decomp = get_decomposition( table, ch, &decomp_len );
-        dst_pos += decomp_len;
+        if (dst_pos + decomp_len > *dst_len) break;
+        if (decomp) memcpy( dst + dst_pos, decomp, decomp_len * sizeof(WCHAR) );
+        else put_utf16( dst + dst_pos, ch );
    }
    if (src_pos < src_len)
    {
        *dst_len += (src_len - src_pos) * (compat ? 18 : 3);
@@ -1554,21 +1588,6 @@ NTSTATUS WINAPI RtlUTF8ToUnicodeN( WCHAR *dst, DWORD dstlen, DWORD *reslen, cons
 }
-/* get the next char value taking surrogates into account */
-static inline unsigned int get_surrogate_value( const WCHAR *src, unsigned int srclen )
-{
-    if (src[0] >= 0xd800 && src[0] <= 0xdfff)  /* surrogate pair */
-    {
-        if (src[0] > 0xdbff || /* invalid high surrogate */
-            srclen <= 1 ||     /* missing low surrogate */
-            src[1] < 0xdc00 || src[1] > 0xdfff) /* invalid low surrogate */
-            return 0;
-        return 0x10000 + ((src[0] & 0x3ff) << 10) + (src[1] & 0x3ff);
-    }
-    return src[0];
-}
 /**************************************************************************
 *	RtlUnicodeToUTF8N   (NTDLL.@)
 */
@@ -1592,7 +1611,7 @@ NTSTATUS WINAPI RtlUnicodeToUTF8N( char *dst, DWORD dstlen, DWORD *reslen, const
            else if (*src < 0x800) len += 2;  /* 0x80-0x7ff: 2 bytes */
            else
            {
-                if (!(val = get_surrogate_value( src, srclen )))
+                if (!get_utf16( src, srclen, &val ))
                {
                    val = 0xfffd;
                    status = STATUS_SOME_NOT_MAPPED;
@@ -1629,7 +1648,7 @@ NTSTATUS WINAPI RtlUnicodeToUTF8N( char *dst, DWORD dstlen, DWORD *reslen, const
            dst += 2;
            continue;
        }
-        if (!(val = get_surrogate_value( src, srclen )))
+        if (!get_utf16( src, srclen, &val ))
        {
            val = 0xfffd;
            status = STATUS_SOME_NOT_MAPPED;

--- a/tools/make_unicode
+++ b/tools/make_unicode
@@ -480,7 +480,7 @@ sub build_decompositions(@)
    my @src = @_;
    my @dst;
-    for (my $i = 0; $i < 65536; $i++)
+    for (my $i = 0; $i < @src; $i++)
    {
        next unless defined $src[$i];
        my @decomp = get_decomposition( $i, \@src );
@@ -2092,10 +2092,13 @@ sub dump_decompositions($@)
    # first determine all the 16-char subsets that contain something
-    my @filled = (0) x 4096;
+    my $level1 = ($MAX_CHAR + 1) / 16;
+    my $level2 = $level1 / 16;
+    my @filled = (0) x $level1;
    my $pos = 16;  # for the null subset
    my $data_total = 0;
-    for (my $i = 0; $i < 65536; $i++)
+    for (my $i = 0; $i <= $MAX_CHAR; $i++)
    {
        next unless defined $decomp[$i];
        if ($filled[$i >> 4] == 0)
@@ -2109,9 +2112,9 @@ sub dump_decompositions($@)
    # now count the 256-char subsets that contain something
-    my @filled_idx = (256) x 256;
+    my @filled_idx = ($level2) x $level2;
-    $pos = 256 + 16;
+    $pos = $level2 + 16;
-    for (my $i = 0; $i < 4096; $i++)
+    for (my $i = 0; $i < $level1; $i++)
    {
        next unless $filled[$i];
        $filled_idx[$i >> 4] = $pos;
@@ -2123,7 +2126,7 @@ sub dump_decompositions($@)
    # add the index offsets to the subsets positions
-    for (my $i = 0; $i < 4096; $i++)
+    for (my $i = 0; $i < $level1; $i++)
    {
        next unless $filled[$i];
        $filled[$i] += $null_offset;
@@ -2138,9 +2141,9 @@ sub dump_decompositions($@)
    # dump the second-level indexes
-    for (my $i = 0; $i < 256; $i++)
+    for (my $i = 0; $i < $level2; $i++)
    {
-        next unless ($filled_idx[$i] > 256);
+        next unless ($filled_idx[$i] > $level2);
        my @table = @filled[($i<<4)..($i<<4)+15];
        for (my $j = 0; $j < 16; $j++) { $table[$j] ||= $null_offset; }
        printf OUTPUT ",\n    /* sub-index %02x */\n", $i;
@@ -2155,7 +2158,7 @@ sub dump_decompositions($@)
    $pos = $total;
    my @data;
-    for (my $i = 0; $i < 4096; $i++)
+    for (my $i = 0; $i < $level1; $i++)
    {
        next unless $filled[$i];
        my @table = (0) x (16);