Commit b7b44224 authored by Alexandre Julliard

wmc: Reimplement input format detection to correctly handle Unicode BOM.

parent c2bd9dea
...@@ -160,14 +160,13 @@ void set_codepage(int cp) ...@@ -160,14 +160,13 @@ void set_codepage(int cp)
/* /*
* Input functions * Input functions
*/ */
#define INPUTBUFFER_SIZE 2048 /* Must be larger than 4 and approx. large enough to hold a line */
static int nungetstack = 0; static int nungetstack = 0;
static int allocungetstack = 0; static int allocungetstack = 0;
static char *ungetstack = NULL; static char *ungetstack = NULL;
static int ninputbuffer = 0; static int ninputbuffer = 0;
static WCHAR *inputbuffer = NULL; static WCHAR inputbuffer[INPUTBUFFER_SIZE];
static char *xlatebuffer = NULL;
#define INPUTBUFFER_SIZE 2048 /* Must be larger than 4 and approx. large enough to hold a line */
/* /*
* Fill the input buffer with *one* line of input. * Fill the input buffer with *one* line of input.
...@@ -179,141 +178,74 @@ static char *xlatebuffer = NULL; ...@@ -179,141 +178,74 @@ static char *xlatebuffer = NULL;
*/ */
static int fill_inputbuffer(void) static int fill_inputbuffer(void)
{ {
int n; static enum input_mode { INPUT_UNKNOWN, INPUT_ASCII, INPUT_UTF8, INPUT_UNICODE } mode;
static const char err_fatalread[] = "Fatal: reading input failed"; static int swapped;
static int endian = -1; static unsigned char utf8_bom[3] = { 0xef, 0xbb, 0xbf };
WCHAR *wbuf;
if(!inputbuffer) int i, pos = 0, len = 0;
{ char buffer[INPUTBUFFER_SIZE];
inputbuffer = xmalloc(INPUTBUFFER_SIZE*sizeof(WCHAR));
xlatebuffer = xmalloc(INPUTBUFFER_SIZE); if (mode == INPUT_UNKNOWN)
} {
len = fread( buffer, 1, 8, yyin );
try_again: wbuf = (WCHAR *)buffer;
if(!unicodein) if (len >= 3 && !memcmp( buffer, utf8_bom, 3 ))
{ {
char *cptr; mode = INPUT_UTF8;
cptr = fgets(xlatebuffer, INPUTBUFFER_SIZE, yyin); memmove( buffer, buffer + 3, len - 3 );
if(!cptr && ferror(yyin)) len -= 3;
xyyerror(err_fatalread); }
else if(!cptr) else if (len == 8)
return 0; {
if (codepage == CP_UTF8) if (wbuf[0] == 0xfeff || wbuf[0] == 0xfffe)
{ {
WCHAR *buf = utf8_to_unicode( xlatebuffer, strlen(xlatebuffer), &n ); mode = INPUT_UNICODE;
memcpy( inputbuffer, buf, (n + 1) * sizeof(WCHAR) ); pos = 1;
free( buf ); swapped = (wbuf[0] == 0xfffe);
} }
else else if (!((wbuf[0] | wbuf[1] | wbuf[2] | wbuf[3]) & 0xff00))
{ {
n = wmc_mbstowcs(codepage, 0, xlatebuffer, strlen(xlatebuffer)+1, inputbuffer, INPUTBUFFER_SIZE); mode = INPUT_UNICODE;
if(n < 0) }
internal_error(__FILE__, __LINE__, "Could not translate to unicode (%d)\n", n); else if (!((wbuf[0] | wbuf[1] | wbuf[2] | wbuf[3]) & 0x00ff))
} {
if(n <= 1) mode = INPUT_UNICODE;
goto try_again; /* Should not happen */ swapped = 1;
n--; /* Strip added conversion '\0' from input length */ }
/* }
* FIXME:
* Detect UTF-8 in the first time we read some bytes by if (mode == INPUT_UNICODE)
* checking the special sequence "FE..." or something like {
* that. I need to check www.unicode.org for details. len = 4 - pos;
*/ memcpy( inputbuffer, wbuf + pos, len * sizeof(WCHAR) );
} }
else else if (mode == INPUT_UNKNOWN) mode = unicodein ? INPUT_UTF8 : INPUT_ASCII;
{ }
if(endian == -1)
{ switch (mode)
n = fread(inputbuffer, 1, 8, yyin); {
if(n != 8) case INPUT_ASCII:
{ if (!fgets( buffer + len, sizeof(buffer) - len, yyin )) break;
if(!n && ferror(yyin)) ninputbuffer = wmc_mbstowcs( codepage, 0, buffer, strlen(buffer), inputbuffer, INPUTBUFFER_SIZE );
xyyerror(err_fatalread); if (ninputbuffer < 0) internal_error(__FILE__, __LINE__, "Could not translate to unicode\n");
else return 1;
xyyerror("Fatal: file too short to determine byteorder (should never happen)\n"); case INPUT_UTF8:
} if (!fgets( buffer + len, sizeof(buffer) - len, yyin )) break;
if(isisochar(inputbuffer[0]) && wbuf = utf8_to_unicode( buffer, strlen(buffer), &ninputbuffer );
isisochar(inputbuffer[1]) && memcpy( inputbuffer, wbuf, ninputbuffer * sizeof(WCHAR) );
isisochar(inputbuffer[2]) && free( wbuf );
isisochar(inputbuffer[3])) return 1;
{ case INPUT_UNICODE:
#ifdef WORDS_BIGENDIAN len += fread( inputbuffer + len, sizeof(WCHAR), INPUTBUFFER_SIZE - len, yyin );
endian = WMC_BO_BIG; if (!len) break;
#else if (swapped) for (i = 0; i < len; i++) inputbuffer[i] = BYTESWAP_WORD( inputbuffer[i] );
endian = WMC_BO_LITTLE; ninputbuffer = len;
#endif return 1;
} case INPUT_UNKNOWN:
else if(isisochar(BYTESWAP_WORD(inputbuffer[0])) && break;
isisochar(BYTESWAP_WORD(inputbuffer[1])) && }
isisochar(BYTESWAP_WORD(inputbuffer[2])) && if (ferror(yyin)) xyyerror( "Fatal: reading input failed\n" );
isisochar(BYTESWAP_WORD(inputbuffer[3]))) return 0;
{
#ifdef WORDS_BIGENDIAN
endian = WMC_BO_LITTLE;
#else
endian = WMC_BO_BIG;
#endif
}
else
xyyerror("Fatal: cannot determine file's byteorder\n");
/* FIXME:
* Determine the file-endian with the leader-bytes
* "FF FE..."; can't remember the exact sequence.
*/
n /= 2;
#ifdef WORDS_BIGENDIAN
if(endian == WMC_BO_LITTLE)
#else
if(endian == WMC_BO_BIG)
#endif
{
inputbuffer[0] = BYTESWAP_WORD(inputbuffer[0]);
inputbuffer[1] = BYTESWAP_WORD(inputbuffer[1]);
inputbuffer[2] = BYTESWAP_WORD(inputbuffer[2]);
inputbuffer[3] = BYTESWAP_WORD(inputbuffer[3]);
}
}
else
{
int i;
n = 0;
for(i = 0; i < INPUTBUFFER_SIZE; i++)
{
int t;
t = fread(&inputbuffer[i], 2, 1, yyin);
if(!t && ferror(yyin))
xyyerror(err_fatalread);
else if(!t && n)
break;
n++;
#ifdef WORDS_BIGENDIAN
if(endian == WMC_BO_LITTLE)
#else
if(endian == WMC_BO_BIG)
#endif
{
if((inputbuffer[i] = BYTESWAP_WORD(inputbuffer[i])) == '\n')
break;
}
else
{
if(inputbuffer[i] == '\n')
break;
}
}
}
}
if(!n)
{
mcy_warning("Re-read line (input was or converted to zilch)\n");
goto try_again; /* Should not happen, but could be due to stdin reading and a signal */
}
ninputbuffer += n;
return 1;
} }
static int get_unichar(void) static int get_unichar(void)
...@@ -332,7 +264,7 @@ static int get_unichar(void) ...@@ -332,7 +264,7 @@ static int get_unichar(void)
} }
ninputbuffer--; ninputbuffer--;
return (int)(*b++ & 0xffff); return *b++;
} }
static void unget_unichar(int ch) static void unget_unichar(int ch)
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment