Commit 4ebd9d85 authored by Marcus Meissner's avatar Marcus Meissner Committed by Alexandre Julliard

Speed up the 8->16 and 8->32 copy loop hotspots by using a faster generic routine and some inline assembly for i386 (cleanups by Ove).
parent eb2e77fd
...@@ -3404,15 +3404,39 @@ static HRESULT WINAPI DGA_IDirectDrawImpl_SetDisplayMode( ...@@ -3404,15 +3404,39 @@ static HRESULT WINAPI DGA_IDirectDrawImpl_SetDisplayMode(
static void pixel_convert_16_to_8(void *src, void *dst, DWORD width, DWORD height, LONG pitch, IDirectDrawPaletteImpl* palette) { static void pixel_convert_16_to_8(void *src, void *dst, DWORD width, DWORD height, LONG pitch, IDirectDrawPaletteImpl* palette) {
unsigned char *c_src = (unsigned char *) src; unsigned char *c_src = (unsigned char *) src;
unsigned short *c_dst = (unsigned short *) dst; unsigned short *c_dst = (unsigned short *) dst;
int x, y; int y;
if (palette != NULL) { if (palette != NULL) {
unsigned short *pal = (unsigned short *) palette->screen_palents; const unsigned short * pal = (unsigned short *) palette->screen_palents;
for (y = 0; y < height; y++) { for (y = height; y--; ) {
for (x = 0; x < width; x++) { #ifdef __i386__
c_dst[x + y * width] = pal[c_src[x + y * pitch]]; /* gcc generates slightly inefficient code for the copy / lookup,
} * it generates one excess memory access (to pal) per pixel. Since
* we know that pal is not modified by the memory write we can
* put it into a register and reduce the number of memory accesses
* from 4 to 3 pp. There are two xor eax,eax to avoid pipeline stalls.
* (This is not guaranteed to be the fastest method.)
*/
__asm__ __volatile__(
"xor %%eax,%%eax\n"
"1:\n"
" lodsb\n"
" movw (%%edx,%%eax,2),%%ax\n"
" stosw\n"
" xor %%eax,%%eax\n"
" loop 1b\n"
: "=S" (c_src), "=D" (c_dst)
: "S" (c_src), "D" (c_dst) , "c" (width), "d" (pal)
: "eax", "cc", "memory"
);
c_src+=(pitch-width);
#else
unsigned char * srclineend = c_src+width;
while (c_src < srclineend)
*c_dst++ = pal[*c_src++];
c_src+=(pitch-width);
#endif
} }
} else { } else {
WARN(ddraw, "No palette set...\n"); WARN(ddraw, "No palette set...\n");
...@@ -3444,15 +3468,33 @@ static void palette_convert_15_to_8(LPPALETTEENTRY palent, void *screen_palette, ...@@ -3444,15 +3468,33 @@ static void palette_convert_15_to_8(LPPALETTEENTRY palent, void *screen_palette,
static void pixel_convert_32_to_8(void *src, void *dst, DWORD width, DWORD height, LONG pitch, IDirectDrawPaletteImpl* palette) { static void pixel_convert_32_to_8(void *src, void *dst, DWORD width, DWORD height, LONG pitch, IDirectDrawPaletteImpl* palette) {
unsigned char *c_src = (unsigned char *) src; unsigned char *c_src = (unsigned char *) src;
unsigned int *c_dst = (unsigned int *) dst; unsigned int *c_dst = (unsigned int *) dst;
int x, y; int y;
if (palette != NULL) { if (palette != NULL) {
unsigned int *pal = (unsigned int *) palette->screen_palents; const unsigned int *pal = (unsigned int *) palette->screen_palents;
for (y = 0; y < height; y++) { for (y = height; y--; ) {
for (x = 0; x < width; x++) { #ifdef __i386__
c_dst[x + y * width] = pal[c_src[x + y * pitch]]; /* See comment in pixel_convert_16_to_8 */
} __asm__ __volatile__(
"xor %%eax,%%eax\n"
"1:\n"
" lodsb\n"
" movl (%%edx,%%eax,4),%%eax\n"
" stosl\n"
" xor %%eax,%%eax\n"
" loop 1b\n"
: "=S" (c_src), "=D" (c_dst)
: "S" (c_src), "D" (c_dst) , "c" (width), "d" (pal)
: "eax", "cc", "memory"
);
c_src+=(pitch-width);
#else
unsigned char * srclineend = c_src+width;
while (c_src < srclineend )
*c_dst++ = pal[*c_src++];
c_src+=(pitch-width);
#endif
} }
} else { } else {
WARN(ddraw, "No palette set...\n"); WARN(ddraw, "No palette set...\n");
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment