Commit e709cdba authored by Alexandre Julliard

Added support for composite Unicode characters in MultiByteToWideChar and WideCharToMultiByte.
parent 441f8745
......@@ -254,7 +254,6 @@ INT WINAPI MultiByteToWideChar( UINT page, DWORD flags, LPCSTR src, INT srclen,
if (srclen == -1) srclen = strlen(src) + 1;
if (flags & MB_COMPOSITE) FIXME("MB_COMPOSITE not supported\n");
if (flags & MB_USEGLYPHCHARS) FIXME("MB_USEGLYPHCHARS not supported\n");
switch(page)
......@@ -330,8 +329,6 @@ INT WINAPI WideCharToMultiByte( UINT page, DWORD flags, LPCWSTR src, INT srclen,
if (srclen == -1) srclen = strlenW(src) + 1;
/* if (flags & WC_COMPOSITECHECK) FIXME( "WC_COMPOSITECHECK (%lx) not supported\n", flags );*/
switch(page)
{
case CP_UTF7:
......
......@@ -70,6 +70,7 @@ CODEPAGES = \
C_SRCS = \
casemap.c \
compose.c \
cptable.c \
mbtowc.c \
string.c \
......
......@@ -166,6 +166,7 @@ $DEF_CHAR = ord '?';
READ_DEFAULTS();
DUMP_CASE_MAPPINGS();
DUMP_COMPOSE_TABLES();
DUMP_CTYPE_TABLES();
foreach $file (@allfiles) { HANDLE_FILE( @$file ); }
......@@ -185,6 +186,8 @@ sub READ_DEFAULTS
@toupper_table = ();
@category_table = ();
@direction_table = ();
@decomp_table = ();
@compose_table = ();
# first setup a few default mappings
......@@ -285,6 +288,12 @@ sub READ_DEFAULTS
# decomposition contains only char values without prefix -> use first char
$dst = hex $1;
$category_table[$src] |= $category_table[$dst];
# store decomposition if it contains two chars
if ($decomp =~ /^([0-9a-fA-F]+)\s+([0-9a-fA-F]+)$/)
{
$decomp_table[$src] = [ hex $1, hex $2 ];
push @compose_table, [ hex $1, hex $2, $src ];
}
}
else
{
......@@ -465,7 +474,7 @@ sub DUMP_SBCS_TABLE
next unless defined $uni2cp[$i];
$filled[$i >> 8] = 1;
$subtables++;
$i = ($i & ~255) + 256;
$i |= 255;
}
# output all the subtables into a single array
......@@ -572,7 +581,7 @@ sub DUMP_DBCS_TABLE
next unless defined $uni2cp[$i];
$filled[$i >> 8] = 1;
$subtables++;
$i = ($i & ~255) + 256;
$i |= 255;
}
# output all the subtables into a single array
......@@ -669,7 +678,7 @@ sub DUMP_CASE_TABLE
next unless defined $table[$i];
$filled[$i >> 8] = $pos;
$pos += 256;
$i = ($i & ~255) + 256;
$i |= 255;
}
for ($i = 0; $i < 65536; $i++)
{
......@@ -737,6 +746,144 @@ sub DUMP_CTYPE_TABLES
close OUTPUT;
}
################################################################
# dump the char composition tables
#
# Writes compose.c, which contains two generated C arrays:
#
#   unicode_compose_table   - built from @compose_table, whose entries are
#                             [ first char, second char, composed char ].
#                             The array starts with (second char, offset)
#                             pairs in increasing second-char order, closed by
#                             a (0, end position) terminator; offsets are
#                             counted in pairs of WCHARs.  After that come,
#                             for every second char, the (first char,
#                             composed char) pairs sorted by first char.
#   unicode_decompose_table - built from @decomp_table, indexed by char and
#                             holding [ first char, second char ].  It is a
#                             three-level table: a 256-entry index (one entry
#                             per high byte), 16-entry sub-indexes, and
#                             16-char subsets of two WCHARs per char.  Unused
#                             slots point at a shared null sub-index / null
#                             mapping filled with zeroes.
#
# Takes no arguments; dies if compose.c cannot be created or closed.
sub DUMP_COMPOSE_TABLES
{
    open OUTPUT,">compose.c" or die "Cannot create compose.c";
    printf "Building compose.c\n";
    printf OUTPUT "/* Unicode char composition */\n";
    printf OUTPUT "/* Automatically generated; DO NOT EDIT!! */\n\n";
    printf OUTPUT "#include \"wine/unicode.h\"\n\n";

    ######### composition table

    # group the [ first char, composed char ] pairs by second char
    my @by_second = ();
    foreach my $entry (@compose_table)
    {
        my ($first, $second, $composed) = @$entry;
        push @{$by_second[$second]}, [ $first, $composed ];
    }

    # count how many different second chars we have
    my $count = 0;
    for (my $i = 0; $i < 65536; $i++)
    {
        next unless defined $by_second[$i];
        $count++;
    }

    # build the table of second chars and offsets; the offsets are in
    # pairs of WCHARs and start right after the index and its terminator
    my @index = ();
    my $pos = $count + 1;
    for (my $i = 0; $i < 65536; $i++)
    {
        next unless defined $by_second[$i];
        push @index, $i, $pos;
        $pos += @{$by_second[$i]};
    }

    # terminator with last position
    push @index, 0, $pos;
    printf OUTPUT "const WCHAR unicode_compose_table[0x%x] =\n{\n", 2*$pos;
    printf OUTPUT " /* second chars + offsets */\n%s", DUMP_ARRAY( "0x%04x", 0, @index );

    # build the table of first chars and mappings
    for (my $i = 0; $i < 65536; $i++)
    {
        next unless defined $by_second[$i];
        my @pairs = ();
        my @list = sort { $a->[0] <=> $b->[0] } @{$by_second[$i]};
        for (my $j = 0; $j <= $#list; $j++)
        {
            push @pairs, $list[$j][0], $list[$j][1];
        }
        printf OUTPUT ",\n /* 0x%04x */\n%s", $i, DUMP_ARRAY( "0x%04x", 0, @pairs );
    }
    printf OUTPUT "\n};\n\nconst unsigned int unicode_compose_table_size = %d;\n\n", $count;

    ######### decomposition table

    # first determine all the 16-char subsets that contain something;
    # each subset takes 16*2 WCHARs and position 0 is the null subset
    my @subset_pos = (0) x 4096;
    my $subset_end = 16*2;  # for the null subset
    for (my $i = 0; $i < 65536; $i++)
    {
        next unless defined $decomp_table[$i];
        $subset_pos[$i >> 4] = $subset_end;
        $subset_end += 16*2;
        $i |= 15;  # skip to the end of this 16-char subset
    }
    my $total = $subset_end;

    # now count the 256-char subsets that contain something; 256 is the
    # position of the null sub-index that directly follows the main index
    my @filled_idx = (256) x 256;
    my $idx_end = 256 + 16;
    for (my $i = 0; $i < 4096; $i++)
    {
        next unless $subset_pos[$i];
        $filled_idx[$i >> 4] = $idx_end;
        $idx_end += 16;
        $i |= 15;  # skip to the end of this 256-char subset
    }
    my $null_offset = $idx_end;  # null mapping
    $total += $idx_end;

    # add the index offsets to the subsets positions
    for (my $i = 0; $i < 4096; $i++)
    {
        next unless $subset_pos[$i];
        $subset_pos[$i] += $null_offset;
    }

    # dump the main index
    printf OUTPUT "const WCHAR unicode_decompose_table[%d] =\n", $total;
    printf OUTPUT "{\n /* index */\n";
    printf OUTPUT "%s", DUMP_ARRAY( "0x%04x", 0, @filled_idx );
    printf OUTPUT ",\n /* null sub-index */\n%s", DUMP_ARRAY( "0x%04x", 0, ($null_offset) x 16 );

    # dump the second-level indexes
    for (my $i = 0; $i < 256; $i++)
    {
        next unless ($filled_idx[$i] > 256);
        my @sub_index = @subset_pos[($i<<4)..($i<<4)+15];
        # empty subsets are redirected to the null mapping
        for (my $j = 0; $j < 16; $j++) { $sub_index[$j] ||= $null_offset; }
        printf OUTPUT ",\n /* sub-index %02x */\n", $i;
        printf OUTPUT "%s", DUMP_ARRAY( "0x%04x", 0, @sub_index );
    }

    # dump the 16-char subsets
    printf OUTPUT ",\n /* null mapping */\n";
    printf OUTPUT "%s", DUMP_ARRAY( "0x%04x", 0, (0) x 32 );
    for (my $i = 0; $i < 4096; $i++)
    {
        next unless $subset_pos[$i];
        my @subset = (0) x 32;
        for (my $j = 0; $j < 16; $j++)
        {
            my $decomp = $decomp_table[($i << 4) + $j];
            if (defined $decomp)
            {
                $subset[2 * $j] = $decomp->[0];
                $subset[2 * $j + 1] = $decomp->[1];
            }
        }
        printf OUTPUT ",\n /* 0x%03x0 .. 0x%03xf */\n", $i, $i;
        printf OUTPUT "%s", DUMP_ARRAY( "0x%04x", 0, @subset );
    }
    printf OUTPUT "\n};\n";

    # buffered write errors only show up when the handle is closed
    close OUTPUT or die "Cannot close compose.c";
}
################################################################
# read an input file and generate the corresponding .c file
sub HANDLE_FILE
......
......@@ -9,6 +9,23 @@
#include "winnls.h"
#include "wine/unicode.h"
/* Get the decomposition of a Unicode char.
 *
 * src is stored in dst[0], then looked up in unicode_decompose_table:
 * the high byte selects a sub-index, the next 4 bits select a 16-char
 * subset, and the low 4 bits select a pair of WCHARs inside that subset.
 * A zero first WCHAR means src has no decomposition.
 *
 * Returns the number of chars stored in dst, or 0 if dstlen is too small
 * to hold the full decomposition.
 */
static int get_decomposition( WCHAR src, WCHAR *dst, unsigned int dstlen )
{
    extern const WCHAR unicode_decompose_table[];
    const WCHAR *pair;
    int count;

    *dst = src;
    pair = unicode_decompose_table + unicode_decompose_table[src >> 8];
    pair = unicode_decompose_table + pair[(src >> 4) & 0x0f] + 2 * (src & 0x0f);

    /* no decomposition: the char stands for itself */
    if (pair[0] == 0) return 1;

    /* need room for at least two chars */
    if (dstlen <= 1) return 0;

    /* apply the decomposition recursively to the first char,
     * keeping one slot free for the second char */
    count = get_decomposition( pair[0], dst, dstlen - 1 );
    if (count == 0) return 0;

    dst[count] = pair[1];
    return count + 1;
}
/* check src string for invalid chars; return non-zero if invalid char found */
static inline int check_invalid_chars_sbcs( const struct sbcs_table *table,
const unsigned char *src, unsigned int srclen )
......@@ -70,6 +87,33 @@ static inline int mbstowcs_sbcs( const struct sbcs_table *table,
}
}
/* mbstowcs for single-byte code page with char decomposition.
 *
 * With dstlen == 0 nothing is written and the number of WCHARs needed for
 * the decomposed string is returned. Otherwise each source byte is mapped
 * through table->cp2uni, decomposed into dst, and the number of WCHARs
 * written is returned, or -1 if dst is too small.
 */
static int mbstowcs_sbcs_decompose( const struct sbcs_table *table,
                                    const unsigned char *src, unsigned int srclen,
                                    WCHAR *dst, unsigned int dstlen )
{
    const WCHAR * const cp2uni = table->cp2uni;
    unsigned int remaining;

    if (dstlen == 0)  /* compute length */
    {
        WCHAR scratch[4];  /* no decomposition is larger than 4 chars */
        unsigned int total = 0;

        while (srclen)
        {
            total += get_decomposition( cp2uni[*src], scratch, 4 );
            src++;
            srclen--;
        }
        return total;
    }

    remaining = dstlen;
    while (srclen && remaining)
    {
        int written = get_decomposition( cp2uni[*src], dst, remaining );

        if (written == 0) break;  /* this char does not fit any more */
        remaining -= written;
        dst += written;
        src++;
        srclen--;
    }

    /* leftover input means we ran out of room in dst */
    if (srclen != 0) return -1;  /* overflow */
    return dstlen - remaining;
}
/* query necessary dst length for src string */
static inline int get_length_dbcs( const struct dbcs_table *table,
const unsigned char *src, unsigned int srclen )
......@@ -122,7 +166,9 @@ static inline int mbstowcs_dbcs( const struct dbcs_table *table,
{
const WCHAR * const cp2uni = table->cp2uni;
const unsigned char * const cp2uni_lb = table->cp2uni_leadbytes;
int len;
unsigned int len;
if (!dstlen) return get_length_dbcs( table, src, srclen );
for (len = dstlen; srclen && len; len--, srclen--, src++, dst++)
{
......@@ -140,6 +186,54 @@ static inline int mbstowcs_dbcs( const struct dbcs_table *table,
}
/* mbstowcs for double-byte code page with character decomposition.
 *
 * With dstlen == 0 nothing is written and the number of WCHARs needed for
 * the decomposed string is returned. Otherwise each source char (one byte,
 * or a lead byte plus a trail byte) is mapped through table->cp2uni,
 * decomposed into dst, and the number of WCHARs written is returned, or -1
 * if dst is too small. A lead byte at the very end of src is ignored.
 */
static int mbstowcs_dbcs_decompose( const struct dbcs_table *table,
                                    const unsigned char *src, unsigned int srclen,
                                    WCHAR *dst, unsigned int dstlen )
{
    const WCHAR * const cp2uni = table->cp2uni;
    const unsigned char * const cp2uni_lb = table->cp2uni_leadbytes;
    unsigned int remaining;
    WCHAR ch;

    if (dstlen == 0)  /* compute length */
    {
        WCHAR scratch[4];  /* no decomposition is larger than 4 chars */
        unsigned int total = 0;

        while (srclen)
        {
            unsigned char lead = cp2uni_lb[*src];

            if (lead)
            {
                if (--srclen == 0) break;  /* partial char, ignore it */
                src++;
                ch = cp2uni[(lead << 8) + *src];
            }
            else ch = cp2uni[*src];

            total += get_decomposition( ch, scratch, 4 );
            src++;
            srclen--;
        }
        return total;
    }

    remaining = dstlen;
    while (srclen && remaining)
    {
        unsigned char lead = cp2uni_lb[*src];
        int written;

        if (lead)
        {
            if (--srclen == 0) break;  /* partial char, ignore it */
            src++;
            ch = cp2uni[(lead << 8) + *src];
        }
        else ch = cp2uni[*src];

        written = get_decomposition( ch, dst, remaining );
        if (written == 0) break;  /* this char does not fit any more */
        dst += written;
        remaining -= written;
        src++;
        srclen--;
    }

    /* leftover input means we ran out of room in dst */
    if (srclen != 0) return -1;  /* overflow */
    return dstlen - remaining;
}
/* return -1 on dst buffer overflow, -2 on invalid input char */
int cp_mbstowcs( const union cptable *table, int flags,
const char *src, int srclen,
......@@ -151,8 +245,12 @@ int cp_mbstowcs( const union cptable *table, int flags,
{
if (check_invalid_chars_sbcs( &table->sbcs, src, srclen )) return -2;
}
if (!dstlen) return srclen;
return mbstowcs_sbcs( &table->sbcs, src, srclen, dst, dstlen );
if (!(flags & MB_COMPOSITE))
{
if (!dstlen) return srclen;
return mbstowcs_sbcs( &table->sbcs, src, srclen, dst, dstlen );
}
return mbstowcs_sbcs_decompose( &table->sbcs, src, srclen, dst, dstlen );
}
else /* mbcs */
{
......@@ -160,7 +258,9 @@ int cp_mbstowcs( const union cptable *table, int flags,
{
if (check_invalid_chars_dbcs( &table->dbcs, src, srclen )) return -2;
}
if (!dstlen) return get_length_dbcs( &table->dbcs, src, srclen );
return mbstowcs_dbcs( &table->dbcs, src, srclen, dst, dstlen );
if (!(flags & MB_COMPOSITE))
return mbstowcs_dbcs( &table->dbcs, src, srclen, dst, dstlen );
else
return mbstowcs_dbcs_decompose( &table->dbcs, src, srclen, dst, dstlen );
}
}
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment