Commit c2f67149 authored by Alexandre Julliard

wmc: Directly implement UTF-8 conversions.

parent 88abd7cb
...@@ -222,13 +222,11 @@ int is_valid_codepage(int id) ...@@ -222,13 +222,11 @@ int is_valid_codepage(int id)
int wmc_mbstowcs( int codepage, int flags, const char *src, int srclen, WCHAR *dst, int dstlen ) int wmc_mbstowcs( int codepage, int flags, const char *src, int srclen, WCHAR *dst, int dstlen )
{ {
if (codepage == CP_UTF8) return wine_utf8_mbstowcs( flags, src, srclen, dst, dstlen );
return wine_cp_mbstowcs( wine_cp_get_table( codepage ), flags, src, srclen, dst, dstlen ); return wine_cp_mbstowcs( wine_cp_get_table( codepage ), flags, src, srclen, dst, dstlen );
} }
int wmc_wcstombs( int codepage, int flags, const WCHAR *src, int srclen, char *dst, int dstlen ) int wmc_wcstombs( int codepage, int flags, const WCHAR *src, int srclen, char *dst, int dstlen )
{ {
if (codepage == CP_UTF8) return wine_utf8_wcstombs( flags, src, srclen, dst, dstlen );
return wine_cp_wcstombs( wine_cp_get_table( codepage ), flags, src, srclen, dst, dstlen, NULL, NULL ); return wine_cp_wcstombs( wine_cp_get_table( codepage ), flags, src, srclen, dst, dstlen, NULL, NULL );
} }
......
...@@ -198,9 +198,18 @@ try_again: ...@@ -198,9 +198,18 @@ try_again:
xyyerror(err_fatalread); xyyerror(err_fatalread);
else if(!cptr) else if(!cptr)
return 0; return 0;
n = wmc_mbstowcs(codepage, 0, xlatebuffer, strlen(xlatebuffer)+1, inputbuffer, INPUTBUFFER_SIZE); if (codepage == CP_UTF8)
if(n < 0) {
internal_error(__FILE__, __LINE__, "Could not translate to unicode (%d)\n", n); WCHAR *buf = utf8_to_unicode( xlatebuffer, strlen(xlatebuffer), &n );
memcpy( inputbuffer, buf, (n + 1) * sizeof(WCHAR) );
free( buf );
}
else
{
n = wmc_mbstowcs(codepage, 0, xlatebuffer, strlen(xlatebuffer)+1, inputbuffer, INPUTBUFFER_SIZE);
if(n < 0)
internal_error(__FILE__, __LINE__, "Could not translate to unicode (%d)\n", n);
}
if(n <= 1) if(n <= 1)
goto try_again; /* Should not happen */ goto try_again; /* Should not happen */
n--; /* Strip added conversion '\0' from input length */ n--; /* Strip added conversion '\0' from input length */
......
...@@ -404,14 +404,6 @@ static char *get_message_context( char **msgid ) ...@@ -404,14 +404,6 @@ static char *get_message_context( char **msgid )
#ifdef HAVE_LIBGETTEXTPO #ifdef HAVE_LIBGETTEXTPO
/* Convert the text of a message to UTF-8.
 *
 * Passes msg->msg (msg->len characters) to wmc_wcstombs() with CP_UTF8 and
 * returns the result in a buffer obtained from xmalloc(), sized at four
 * bytes per input character plus one for the terminating NUL that is
 * appended here.
 */
static char *convert_string_utf8( const lanmsg_t *msg )
{
    const int max_bytes = msg->len * 4;
    char *utf8 = xmalloc( max_bytes + 1 );
    int written = wmc_wcstombs( CP_UTF8, 0, msg->msg, msg->len, utf8, max_bytes );

    utf8[written] = 0;
    return utf8;
}
static po_message_t find_message( po_file_t po, const char *msgid, const char *msgctxt, static po_message_t find_message( po_file_t po, const char *msgid, const char *msgctxt,
po_message_iterator_t *iterator ) po_message_iterator_t *iterator )
{ {
...@@ -467,7 +459,8 @@ static void add_po_string( po_file_t po, const lanmsg_t *msgid, const lanmsg_t * ...@@ -467,7 +459,8 @@ static void add_po_string( po_file_t po, const lanmsg_t *msgid, const lanmsg_t *
if (msgstr) if (msgstr)
{ {
str_buffer = str = convert_string_utf8( msgstr ); int len;
str_buffer = str = unicode_to_utf8( msgstr->msg, msgstr->len, &len );
if (is_english( msgstr->lan )) get_message_context( &str ); if (is_english( msgstr->lan )) get_message_context( &str );
} }
if (!(msg = find_message( po, id, context, &iterator ))) if (!(msg = find_message( po, id, context, &iterator )))
...@@ -644,7 +637,6 @@ static lanmsg_t *translate_string( lanmsg_t *str, int lang, int *found ) ...@@ -644,7 +637,6 @@ static lanmsg_t *translate_string( lanmsg_t *str, int lang, int *found )
{ {
lanmsg_t *new; lanmsg_t *new;
const char *transl; const char *transl;
int res;
char *buffer, *msgid, *context; char *buffer, *msgid, *context;
if (str->len <= 1 || !(buffer = convert_msgid_ascii( str, 0 ))) return str; if (str->len <= 1 || !(buffer = convert_msgid_ascii( str, 0 ))) return str;
...@@ -658,11 +650,7 @@ static lanmsg_t *translate_string( lanmsg_t *str, int lang, int *found ) ...@@ -658,11 +650,7 @@ static lanmsg_t *translate_string( lanmsg_t *str, int lang, int *found )
new->cp = 0; /* FIXME */ new->cp = 0; /* FIXME */
new->file = str->file; new->file = str->file;
new->line = str->line; new->line = str->line;
new->len = wmc_mbstowcs( CP_UTF8, 0, transl, strlen(transl) + 1, NULL, 0 ); new->msg = utf8_to_unicode( transl, strlen(transl) + 1, &new->len );
new->msg = xmalloc( new->len * sizeof(WCHAR) );
res = wmc_mbstowcs( CP_UTF8, MB_ERR_INVALID_CHARS, transl, strlen(transl) + 1, new->msg, new->len );
if (res == -2)
error( "Invalid utf-8 character in string '%s'\n", transl );
free( buffer ); free( buffer );
return new; return new;
} }
......
...@@ -272,6 +272,127 @@ int unistrcmp(const WCHAR *s1, const WCHAR *s2) ...@@ -272,6 +272,127 @@ int unistrcmp(const WCHAR *s1, const WCHAR *s2)
return *s1 - *s2; return *s1 - *s2;
} }
/*******************************************************************
 *	utf8_to_unicode
 *
 * Decode a UTF-8 byte string into a newly allocated UTF-16 string.
 *
 * PARAMS
 *  src     [I] UTF-8 input; need not be NUL-terminated
 *  srclen  [I] number of bytes to read from src
 *  dstlen  [O] receives the number of WCHARs written, not counting the
 *              terminating 0 that is always appended
 *
 * RETURNS
 *  A buffer obtained from xmalloc() holding the converted text.
 *
 * NOTES
 *  Malformed input (stray continuation bytes, truncated or overlong
 *  sequences, encoded surrogates, values above 0x10ffff) is not an
 *  error: each offending sequence is replaced by U+FFFD. Code points
 *  above 0xffff are written as a surrogate pair.
 */
WCHAR *utf8_to_unicode( const char *src, int srclen, int *dstlen )
{
    /* number of continuation bytes that must follow each byte 0x80-0xff;
     * 0 marks bytes that cannot start a sequence */
    static const char utf8_length[128] =
    {
        0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* 0x80-0x8f */
        0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* 0x90-0x9f */
        0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* 0xa0-0xaf */
        0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* 0xb0-0xbf */
        0,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1, /* 0xc0-0xcf */
        1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, /* 0xd0-0xdf */
        2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, /* 0xe0-0xef */
        3,3,3,3,3,0,0,0,0,0,0,0,0,0,0,0  /* 0xf0-0xff */
    };
    /* payload bits of the lead byte, indexed by continuation count */
    static const unsigned char utf8_mask[4] = { 0x7f, 0x1f, 0x0f, 0x07 };

    const char *srcend = src + srclen;
    int len, res;
    WCHAR *ret, *dst;

    /* one WCHAR per input byte is always enough: only 4-byte sequences
     * produce two WCHARs, and they consume four bytes */
    dst = ret = xmalloc( (srclen + 1) * sizeof(WCHAR) );
    while (src < srcend)
    {
        unsigned char ch = *src++;
        if (ch < 0x80)  /* special fast case for 7-bit ASCII */
        {
            *dst++ = ch;
            continue;
        }
        len = utf8_length[ch - 0x80];
        /* compare against the remaining byte count instead of computing
         * src + len, which could point more than one past the buffer end */
        if (len && len <= srcend - src)
        {
            res = ch & utf8_mask[len];
            /* every 'break' below rejects the sequence and falls out to the
             * U+FFFD store; continuation bytes accepted so far stay consumed */
            switch (len)
            {
            case 3:
                if ((ch = *src ^ 0x80) >= 0x40) break;  /* not a continuation byte */
                res = (res << 6) | ch;
                src++;
                if (res < 0x10) break;  /* overlong: value would be below 0x10000 */
                /* fall through */
            case 2:
                if ((ch = *src ^ 0x80) >= 0x40) break;  /* not a continuation byte */
                res = (res << 6) | ch;
                if (res >= 0x110000 >> 6) break;  /* beyond 0x10ffff */
                src++;
                if (res < 0x20) break;  /* overlong: value would be below 0x800 */
                if (res >= 0xd800 >> 6 && res <= 0xdfff >> 6) break;  /* encoded surrogate */
                /* fall through */
            case 1:
                if ((ch = *src ^ 0x80) >= 0x40) break;  /* not a continuation byte */
                res = (res << 6) | ch;
                src++;
                if (res < 0x80) break;  /* overlong: fits in a single byte */
                if (res <= 0xffff) *dst++ = res;
                else
                {
                    /* split into a high/low surrogate pair */
                    res -= 0x10000;
                    *dst++ = 0xd800 | (res >> 10);
                    *dst++ = 0xdc00 | (res & 0x3ff);
                }
                continue;
            }
        }
        *dst++ = 0xfffd;
    }
    *dst = 0;
    *dstlen = dst - ret;
    return ret;
}
/*******************************************************************
 *	unicode_to_utf8
 *
 * Encode srclen UTF-16 units from src as UTF-8 in a buffer obtained
 * from xmalloc(). The output is always 0-terminated and *dstlen
 * receives the number of bytes written, not counting that terminator.
 * A valid high/low surrogate pair becomes one 4-byte sequence; a
 * surrogate without its partner is encoded as U+FFFD.
 */
char *unicode_to_utf8( const WCHAR *src, int srclen, int *dstlen )
{
    /* at most three bytes per input unit, plus the terminator */
    char *out = xmalloc( srclen * 3 + 1 );
    char *p = out;

    while (srclen)
    {
        unsigned int cp = *src++;
        srclen--;

        if (cp < 0x80)
        {
            /* 0x00-0x7f: 1 byte */
            *p++ = cp;
        }
        else if (cp < 0x800)
        {
            /* 0x80-0x7ff: 2 bytes */
            *p++ = 0xc0 | (cp >> 6);
            *p++ = 0x80 | (cp & 0x3f);
        }
        else if (cp >= 0xd800 && cp <= 0xdbff && srclen > 0 &&
                 *src >= 0xdc00 && *src <= 0xdfff)
        {
            /* 0x10000-0x10ffff: 4 bytes, built from both surrogates */
            cp = 0x10000 + ((cp & 0x3ff) << 10) + (*src & 0x3ff);
            src++;
            srclen--;
            *p++ = 0xf0 | (cp >> 18);
            *p++ = 0x80 | ((cp >> 12) & 0x3f);
            *p++ = 0x80 | ((cp >> 6) & 0x3f);
            *p++ = 0x80 | (cp & 0x3f);
        }
        else
        {
            /* 0x800-0xffff: 3 bytes */
            if (cp >= 0xd800 && cp <= 0xdfff) cp = 0xfffd;  /* invalid surrogate pair */
            *p++ = 0xe0 | (cp >> 12);
            *p++ = 0x80 | ((cp >> 6) & 0x3f);
            *p++ = 0x80 | (cp & 0x3f);
        }
    }

    *p = 0;
    *dstlen = p - out;
    return out;
}
/******************************************************************* /*******************************************************************
* buffer management * buffer management
* *
......
...@@ -49,6 +49,8 @@ WCHAR *unistrcpy(WCHAR *dst, const WCHAR *src); ...@@ -49,6 +49,8 @@ WCHAR *unistrcpy(WCHAR *dst, const WCHAR *src);
int unistrlen(const WCHAR *s); int unistrlen(const WCHAR *s);
int unistricmp(const WCHAR *s1, const WCHAR *s2); int unistricmp(const WCHAR *s1, const WCHAR *s2);
int unistrcmp(const WCHAR *s1, const WCHAR *s2); int unistrcmp(const WCHAR *s1, const WCHAR *s2);
WCHAR *utf8_to_unicode( const char *src, int srclen, int *dstlen );
char *unicode_to_utf8( const WCHAR *src, int srclen, int *dstlen );
/* buffer management */ /* buffer management */
......
...@@ -94,17 +94,13 @@ static const char str_header[] = ...@@ -94,17 +94,13 @@ static const char str_header[] =
"\n" "\n"
; ;
static char *dup_u2c(int cp, const WCHAR *uc) static char *dup_u2c(const WCHAR *uc)
{ {
int len; int i;
char *cptr; char *cptr = xmalloc( unistrlen(uc)+1 );
if (!cp) cp = CP_UTF8; for (i = 0; *uc; i++, uc++) cptr[i] = (*uc <= 0xff) ? *uc : '_';
len = wmc_wcstombs(cp, 0, uc, unistrlen(uc)+1, NULL, 0); cptr[i] = 0;
cptr = xmalloc(len);
len = wmc_wcstombs(cp, 0, uc, unistrlen(uc)+1, cptr, len);
if (len < 0)
internal_error(__FILE__, __LINE__, "Buffer overflow? code %d\n", len);
return cptr; return cptr;
} }
...@@ -183,7 +179,7 @@ void write_h_file(const char *fname) ...@@ -183,7 +179,7 @@ void write_h_file(const char *fname)
{ {
if(ttab[i].type == tok_severity && ttab[i].alias) if(ttab[i].type == tok_severity && ttab[i].alias)
{ {
cptr = dup_u2c(WMC_DEFAULT_CODEPAGE, ttab[i].alias); cptr = dup_u2c(ttab[i].alias);
fprintf(fp, "#define %s\t0x%x\n", cptr, ttab[i].token); fprintf(fp, "#define %s\t0x%x\n", cptr, ttab[i].token);
free(cptr); free(cptr);
} }
...@@ -195,7 +191,7 @@ void write_h_file(const char *fname) ...@@ -195,7 +191,7 @@ void write_h_file(const char *fname)
{ {
if(ttab[i].type == tok_facility && ttab[i].alias) if(ttab[i].type == tok_facility && ttab[i].alias)
{ {
cptr = dup_u2c(WMC_DEFAULT_CODEPAGE, ttab[i].alias); cptr = dup_u2c(ttab[i].alias);
fprintf(fp, "#define %s\t0x%x\n", cptr, ttab[i].token); fprintf(fp, "#define %s\t0x%x\n", cptr, ttab[i].token);
free(cptr); free(cptr);
} }
...@@ -209,7 +205,7 @@ void write_h_file(const char *fname) ...@@ -209,7 +205,7 @@ void write_h_file(const char *fname)
switch(ndp->type) switch(ndp->type)
{ {
case nd_comment: case nd_comment:
cptr = dup_u2c(WMC_DEFAULT_CODEPAGE, ndp->u.comment+1); cptr = dup_u2c(ndp->u.comment+1);
killnl(cptr, 0); killnl(cptr, 0);
killcomment(cptr); killcomment(cptr);
if(*cptr) if(*cptr)
...@@ -237,14 +233,14 @@ void write_h_file(const char *fname) ...@@ -237,14 +233,14 @@ void write_h_file(const char *fname)
fprintf(fp, "\n"); fprintf(fp, "\n");
} }
fprintf(fp, "/* MessageId : 0x%08x */\n", ndp->u.msg->realid); fprintf(fp, "/* MessageId : 0x%08x */\n", ndp->u.msg->realid);
cptr = dup_u2c(ndp->u.msg->msgs[idx_en]->cp, ndp->u.msg->msgs[idx_en]->msg); cptr = dup_u2c(ndp->u.msg->msgs[idx_en]->msg);
killnl(cptr, 0); killnl(cptr, 0);
killcomment(cptr); killcomment(cptr);
fprintf(fp, "/* Approximate msg: %s */\n", cptr); fprintf(fp, "/* Approximate msg: %s */\n", cptr);
free(cptr); free(cptr);
cptr = dup_u2c(WMC_DEFAULT_CODEPAGE, ndp->u.msg->sym); cptr = dup_u2c(ndp->u.msg->sym);
if(ndp->u.msg->cast) if(ndp->u.msg->cast)
cast = dup_u2c(WMC_DEFAULT_CODEPAGE, ndp->u.msg->cast); cast = dup_u2c(ndp->u.msg->cast);
else else
cast = NULL; cast = NULL;
switch(ndp->u.msg->base) switch(ndp->u.msg->base)
...@@ -299,7 +295,7 @@ static void write_rcbin(FILE *fp) ...@@ -299,7 +295,7 @@ static void write_rcbin(FILE *fp)
if(ttab[i].type == tok_language && ttab[i].token == lbp->lan) if(ttab[i].type == tok_language && ttab[i].token == lbp->lan)
{ {
if(ttab[i].alias) if(ttab[i].alias)
cptr = dup_u2c(WMC_DEFAULT_CODEPAGE, ttab[i].alias); cptr = dup_u2c(ttab[i].alias);
break; break;
} }
} }
...@@ -317,7 +313,7 @@ static char *make_string(WCHAR *uc, int len, int codepage) ...@@ -317,7 +313,7 @@ static char *make_string(WCHAR *uc, int len, int codepage)
int i; int i;
int b; int b;
if(!codepage) if (!codepage || codepage == CP_UTF8)
{ {
*cptr++ = ' '; *cptr++ = ' ';
*cptr++ = 'L'; *cptr++ = 'L';
...@@ -379,8 +375,10 @@ static char *make_string(WCHAR *uc, int len, int codepage) ...@@ -379,8 +375,10 @@ static char *make_string(WCHAR *uc, int len, int codepage)
else else
{ {
char *tmp, *cc; char *tmp, *cc;
int unilen = unistrlen(uc) + 1;
cc = tmp = dup_u2c(codepage, uc); cc = tmp = xmalloc( unilen * 2 );
wmc_wcstombs( codepage, 0, uc, unilen, cptr, unilen * 2 );
*cptr++ = ' '; *cptr++ = ' ';
*cptr++ = '"'; *cptr++ = '"';
for(i = b = 0; i < len; i++, cc++) for(i = b = 0; i < len; i++, cc++)
...@@ -539,7 +537,7 @@ void write_bin_files(void) ...@@ -539,7 +537,7 @@ void write_bin_files(void)
{ {
if (ttab[i].type == tok_language && ttab[i].token == lbp->lan) if (ttab[i].type == tok_language && ttab[i].token == lbp->lan)
{ {
if (ttab[i].alias) cptr = dup_u2c(WMC_DEFAULT_CODEPAGE, ttab[i].alias); if (ttab[i].alias) cptr = dup_u2c(ttab[i].alias);
break; break;
} }
} }
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment