/*
 * Pixel and vertex shaders implementation using ARB_vertex_program
 * and ARB_fragment_program GL extensions.
 *
 * Copyright 2002-2003 Jason Edmeades
 * Copyright 2002-2003 Raphael Junqueira
 * Copyright 2004 Christian Costa
 * Copyright 2005 Oliver Stieber
 * Copyright 2006 Ivan Gyurdiev
 * Copyright 2006 Jason Green
 * Copyright 2006 Henri Verbeet
 * Copyright 2007-2008 Stefan Dösinger for CodeWeavers
 * Copyright 2009 Henri Verbeet for CodeWeavers
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA
 */

#include "config.h"

#include <math.h>
#include <stdio.h>

#include "wined3d_private.h"

/* Debug channels used in this file: d3d_shader is the default one, the
 * others are addressed explicitly (e.g. TRACE_(d3d_constants)). */
WINE_DEFAULT_DEBUG_CHANNEL(d3d_shader);
WINE_DECLARE_DEBUG_CHANNEL(d3d_constants);
WINE_DECLARE_DEBUG_CHANNEL(d3d_caps);
WINE_DECLARE_DEBUG_CHANNEL(d3d);

/* Return the next line of *ptr and advance *ptr past it, or NULL once the
 * input is exhausted. The line break is overwritten with a terminator, so
 * the source string is modified in place. */
static char *get_line(char **ptr)
{
    char *start = *ptr;
    char *eol = strchr(start, '\n');

    if (eol)
    {
        /* Cut the line off at the line break and continue right after it. */
        *eol = '\0';
        *ptr = eol + 1;
        return start;
    }

    /* No line break left: hand out the remaining text, if there is any. */
    if (*start == '\0')
        return NULL;

    *ptr = start + strlen(start);
    return start;
}

static void shader_arb_dump_program_source(const char *source)
{
62
    ULONG source_size;
63 64 65 66 67 68
    char *ptr, *line, *tmp;

    source_size = strlen(source) + 1;
    tmp = HeapAlloc(GetProcessHeap(), 0, source_size);
    if (!tmp)
    {
69
        ERR("Failed to allocate %u bytes for shader source.\n", source_size);
70 71 72 73 74 75 76 77 78 79 80
        return;
    }
    memcpy(tmp, source, source_size);

    ptr = tmp;
    while ((line = get_line(&ptr))) FIXME("    %s\n", line);
    FIXME("\n");

    HeapFree(GetProcessHeap(), 0, tmp);
}

/* Helper values that generated shader code can refer to; see
 * arb_get_helper_value() for the operand string each one maps to. */
enum arb_helper_value
{
    ARB_ZERO,
    ARB_ONE,
    ARB_TWO,
    ARB_0001,
    ARB_EPS,

    ARB_VS_REL_OFFSET
};

static const char *arb_get_helper_value(enum wined3d_shader_type shader, enum arb_helper_value value)
{
    if (shader == WINED3D_SHADER_TYPE_GEOMETRY)
    {
        ERR("Geometry shaders are unsupported\n");
        return "bad";
    }

    if (shader == WINED3D_SHADER_TYPE_PIXEL)
    {
        switch (value)
        {
104 105
            case ARB_ZERO: return "ps_helper_const.x";
            case ARB_ONE: return "ps_helper_const.y";
106
            case ARB_TWO: return "coefmul.x";
107
            case ARB_0001: return "ps_helper_const.xxxy";
108
            case ARB_EPS: return "ps_helper_const.z";
109 110 111 112 113 114 115
            default: break;
        }
    }
    else
    {
        switch (value)
        {
116
            case ARB_ZERO: return "helper_const.x";
117
            case ARB_ONE: return "helper_const.y";
118
            case ARB_TWO: return "helper_const.z";
119
            case ARB_EPS: return "helper_const.w";
120
            case ARB_0001: return "helper_const.xxxy";
121
            case ARB_VS_REL_OFFSET: return "rel_addr_const.y";
122 123 124 125 126 127 128 129 130 131
        }
    }
    FIXME("Unmanaged %s shader helper constant requested: %u\n",
          shader == WINED3D_SHADER_TYPE_PIXEL ? "pixel" : "vertex", value);
    switch (value)
    {
        case ARB_ZERO: return "0.0";
        case ARB_ONE: return "1.0";
        case ARB_TWO: return "2.0";
        case ARB_0001: return "{0.0, 0.0, 0.0, 1.0}";
132
        case ARB_EPS: return "1e-8";
133 134 135 136
        default: return "bad";
    }
}

137
static inline BOOL ffp_clip_emul(const struct wined3d_state *state)
138
{
139
    return state->lowest_disabled_stage < 7;
140 141
}

142
/* ARB_program_shader private data */
143

144
struct control_frame
145
{
146
    struct                          list entry;
147 148 149 150 151 152 153
    enum
    {
        IF,
        IFC,
        LOOP,
        REP
    } type;
154 155
    BOOL                            muting;
    BOOL                            outer_loop;
156 157
    union
    {
158 159 160
        unsigned int                loop;
        unsigned int                ifc;
    } no;
161
    struct wined3d_shader_loop_control loop_control;
162
    BOOL                            had_else;
163 164
};

165 166 167 168 169 170 171 172 173 174
struct arb_ps_np2fixup_info
{
    struct ps_np2fixup_info         super;
    /* For ARB we need a offset value:
     * With both GLSL and ARB mode the NP2 fixup information (the texture dimensions) are stored in a
     * consecutive way (GLSL uses a uniform array). Since ARB doesn't know the notion of a "standalone"
     * array we need an offset to the index inside the program local parameter array. */
    UINT                            offset;
};

175 176 177
struct arb_ps_compile_args
{
    struct ps_compile_args          super;
178 179
    WORD                            bools;
    WORD                            clip;  /* only a boolean, use a WORD for alignment */
180
    unsigned char                   loop_ctrl[MAX_CONST_I][3];
181 182 183 184 185 186 187 188 189 190 191
};

/* Ties a texture unit to the program local parameter that receives one of
 * its texture stage values (see shader_arb_ps_local_constants()). */
struct stb_const_desc
{
    /* Texture unit whose stage state is read. */
    unsigned char texunit;
    /* Program local parameter index, or WINED3D_CONST_NUM_UNUSED. */
    UINT const_num;
};

struct arb_ps_compiled_shader
{
    struct arb_ps_compile_args      args;
192
    struct arb_ps_np2fixup_info     np2fixup_info;
193 194 195
    struct stb_const_desc           bumpenvmatconst[MAX_TEXTURES];
    struct stb_const_desc           luminanceconst[MAX_TEXTURES];
    UINT                            int_consts[MAX_CONST_I];
196
    GLuint                          prgId;
197
    UINT                            ycorrection;
198 199
    unsigned char                   numbumpenvmatconsts;
    char                            num_int_consts;
200 201 202 203 204
};

struct arb_vs_compile_args
{
    struct vs_compile_args          super;
205 206 207 208 209
    union
    {
        struct
        {
            WORD                    bools;
210 211
            unsigned char           clip_texcoord;
            unsigned char           clipplane_mask;
212 213
        }                           boolclip;
        DWORD                       boolclip_compare;
214
    } clip;
215
    DWORD                           ps_signature;
216 217
    union
    {
218 219 220
        unsigned char               samplers[4];
        DWORD                       samplers_compare;
    } vertex;
221
    unsigned char                   loop_ctrl[MAX_CONST_I][3];
222 223 224 225 226 227 228 229
};

/* One compiled variant of a vertex shader together with the locations of the
 * program local parameters that shader_arb_vs_local_constants() fills in. */
struct arb_vs_compiled_shader
{
    struct arb_vs_compile_args      args;
    GLuint                          prgId;
    /* Local parameter per integer constant, or WINED3D_CONST_NUM_UNUSED. */
    UINT                            int_consts[MAX_CONST_I];
    /* Zero if no integer constant needs loading. */
    char                            num_int_consts;
    char                            need_color_unclamp;
    /* Local parameter that receives the position fixup. */
    UINT                            pos_fixup;
};

234 235 236 237 238 239 240 241
struct recorded_instruction
{
    struct wined3d_shader_instruction ins;
    struct list entry;
};

struct shader_arb_ctx_priv
{
242
    char addr_reg[20];
243 244
    enum
    {
245 246 247 248 249 250 251
        /* plain GL_ARB_vertex_program or GL_ARB_fragment_program */
        ARB,
        /* GL_NV_vertex_progam2_option or GL_NV_fragment_program_option */
        NV2,
        /* GL_NV_vertex_program3 or GL_NV_fragment_program2 */
        NV3
    } target_version;
252

253 254
    const struct arb_vs_compile_args    *cur_vs_args;
    const struct arb_ps_compile_args    *cur_ps_args;
255
    const struct arb_ps_compiled_shader *compiled_fprog;
256
    const struct arb_vs_compiled_shader *compiled_vprog;
257
    struct arb_ps_np2fixup_info         *cur_np2fixup_info;
258 259 260 261
    struct list                         control_frames;
    struct list                         record;
    BOOL                                recording;
    BOOL                                muted;
262
    unsigned int                        num_loops, loop_depth, num_ifcs;
263
    int                                 aL;
264

265
    unsigned int                        vs_clipplanes;
266 267
    BOOL                                footer_written;
    BOOL                                in_main_func;
268

269 270 271 272 273 274 275 276 277 278 279 280 281 282
    /* For 3.0 vertex shaders */
    const char                          *vs_output[MAX_REG_OUTPUT];
    /* For 2.x and earlier vertex shaders */
    const char                          *texcrd_output[8], *color_output[2], *fog_output;

    /* 3.0 pshader input for compatibility with fixed function */
    const char                          *ps_input[MAX_REG_INPUT];
};

/* A pixel shader input signature stored as a red-black tree node.
 * NOTE(review): presumably kept in shader_arb_priv.signature_tree with
 * 'idx' as its assigned number - confirm. */
struct ps_signature
{
    struct wined3d_shader_signature_element *sig;
    DWORD                               idx;
    struct wine_rb_entry                entry;
};

285 286 287
struct arb_pshader_private {
    struct arb_ps_compiled_shader   *gl_shaders;
    UINT                            num_gl_shaders, shader_array_size;
288
    DWORD                           input_signature_idx;
289
    DWORD                           clipplane_emulation;
290
    BOOL                            clamp_consts;
291 292
};

293 294 295
struct arb_vshader_private {
    struct arb_vs_compiled_shader   *gl_shaders;
    UINT                            num_gl_shaders, shader_array_size;
296
    UINT rel_offset;
297 298
};

299 300 301 302 303 304 305
struct shader_arb_priv
{
    GLuint                  current_vprogram_id;
    GLuint                  current_fprogram_id;
    const struct arb_ps_compiled_shader *compiled_fprog;
    const struct arb_vs_compiled_shader *compiled_vprog;
    GLuint                  depth_blt_vprogram_id;
306 307
    GLuint                  depth_blt_fprogram_id_full[tex_type_count];
    GLuint                  depth_blt_fprogram_id_masked[tex_type_count];
308 309
    BOOL                    use_arbfp_fixed_func;
    struct wine_rb_tree     fragment_shaders;
310
    BOOL                    last_ps_const_clamped;
311
    BOOL                    last_vs_color_unclamp;
312 313 314

    struct wine_rb_tree     signature_tree;
    DWORD ps_sig_number;
315 316 317 318

    unsigned int highest_dirty_ps_const, highest_dirty_vs_const;
    char *vshader_const_dirty, *pshader_const_dirty;
    const struct wined3d_context *last_context;
319 320
};

321
/* GL locking for state handlers is done by the caller. */
322
static BOOL need_rel_addr_const(const struct arb_vshader_private *shader_data,
323
        const struct wined3d_shader_reg_maps *reg_maps, const struct wined3d_gl_info *gl_info)
324
{
325
    if (shader_data->rel_offset) return TRUE;
326
    if (!reg_maps->usesmova) return FALSE;
327 328 329 330 331 332 333 334 335 336
    return !gl_info->supported[NV_VERTEX_PROGRAM2_OPTION];
}

/* Returns TRUE if result.clip from GL_NV_vertex_program2 should be used and FALSE otherwise */
static inline BOOL use_nv_clip(const struct wined3d_gl_info *gl_info)
{
    return gl_info->supported[NV_VERTEX_PROGRAM2_OPTION]
            && !(gl_info->quirks & WINED3D_QUIRK_NV_CLIP_BROKEN);
}

337
static BOOL need_helper_const(const struct arb_vshader_private *shader_data,
338
        const struct wined3d_shader_reg_maps *reg_maps, const struct wined3d_gl_info *gl_info)
339
{
340
    if (need_rel_addr_const(shader_data, reg_maps, gl_info)) return TRUE;
341 342 343 344
    if (!gl_info->supported[NV_VERTEX_PROGRAM]) return TRUE; /* Need to init colors. */
    if (gl_info->quirks & WINED3D_QUIRK_ARB_VS_OFFSET_LIMIT) return TRUE; /* Load the immval offset. */
    if (gl_info->quirks & WINED3D_QUIRK_SET_TEXCOORD_W) return TRUE; /* Have to init texcoords. */
    if (!use_nv_clip(gl_info)) return TRUE; /* Init the clip texcoord */
345
    if (reg_maps->usesnrm) return TRUE; /* 0.0 */
346
    if (reg_maps->usespow) return TRUE; /* EPS, 0.0 and 1.0 */
347
    if (reg_maps->fog) return TRUE; /* Clamping fog coord, 0.0 and 1.0 */
348 349 350
    return FALSE;
}

/* Returns the number of vertex program constants the backend reserves for
 * itself: between 1 and 3. */
static unsigned int reserved_vs_const(const struct arb_vshader_private *shader_data,
        const struct wined3d_shader_reg_maps *reg_maps, const struct wined3d_gl_info *gl_info)
{
    unsigned int ret = 1;
    /* We use one PARAM for the pos fixup, and in some cases one to load
     * some immediate values into the shader. */
    if (need_helper_const(shader_data, reg_maps, gl_info)) ++ret;
    if (need_rel_addr_const(shader_data, reg_maps, gl_info)) ++ret;
    return ret;
}

362
/* Loads floating point constants into the currently set ARB_vertex/fragment_program.
363
 * When constant_list == NULL, it will load all the constants.
364
 *
365 366 367
 * @target_type should be either GL_VERTEX_PROGRAM_ARB (for vertex shaders)
 *  or GL_FRAGMENT_PROGRAM_ARB (for pixel shaders)
 */
368
/* GL locking is done by the caller */
369 370 371
static unsigned int shader_arb_load_constantsF(const struct wined3d_shader *shader,
        const struct wined3d_gl_info *gl_info, GLuint target_type, unsigned int max_constants,
        const float *constants, char *dirty_consts)
372
{
373
    struct wined3d_shader_lconst *lconst;
374
    DWORD i, j;
375
    unsigned int ret;
376

377 378
    if (TRACE_ON(d3d_constants))
    {
379 380 381
        for(i = 0; i < max_constants; i++) {
            if(!dirty_consts[i]) continue;
            TRACE_(d3d_constants)("Loading constants %i: %f, %f, %f, %f\n", i,
382 383
                        constants[i * 4 + 0], constants[i * 4 + 1],
                        constants[i * 4 + 2], constants[i * 4 + 3]);
384 385
        }
    }
386 387 388

    i = 0;

389
    /* In 1.X pixel shaders constants are implicitly clamped in the range [-1;1] */
390
    if (target_type == GL_FRAGMENT_PROGRAM_ARB && shader->reg_maps.shader_version.major == 1)
391
    {
392
        float lcl_const[4];
393 394 395 396
        /* ps 1.x supports only 8 constants, clamp only those. When switching between 1.x and higher
         * shaders, the first 8 constants are marked dirty for reload
         */
        for(; i < min(8, max_constants); i++) {
397 398 399 400
            if(!dirty_consts[i]) continue;
            dirty_consts[i] = 0;

            j = 4 * i;
401 402
            if (constants[j + 0] > 1.0f) lcl_const[0] = 1.0f;
            else if (constants[j + 0] < -1.0f) lcl_const[0] = -1.0f;
403 404
            else lcl_const[0] = constants[j + 0];

405 406
            if (constants[j + 1] > 1.0f) lcl_const[1] = 1.0f;
            else if (constants[j + 1] < -1.0f) lcl_const[1] = -1.0f;
407 408
            else lcl_const[1] = constants[j + 1];

409 410
            if (constants[j + 2] > 1.0f) lcl_const[2] = 1.0f;
            else if (constants[j + 2] < -1.0f) lcl_const[2] = -1.0f;
411 412
            else lcl_const[2] = constants[j + 2];

413 414
            if (constants[j + 3] > 1.0f) lcl_const[3] = 1.0f;
            else if (constants[j + 3] < -1.0f) lcl_const[3] = -1.0f;
415 416 417
            else lcl_const[3] = constants[j + 3];

            GL_EXTCALL(glProgramEnvParameter4fvARB(target_type, i, lcl_const));
418
        }
419

420 421 422 423 424 425 426 427 428
        /* If further constants are dirty, reload them without clamping.
         *
         * The alternative is not to touch them, but then we cannot reset the dirty constant count
         * to zero. That's bad for apps that only use PS 1.x shaders, because in that case the code
         * above would always re-check the first 8 constants since max_constant remains at the init
         * value
         */
    }

429 430
    if (gl_info->supported[EXT_GPU_PROGRAM_PARAMETERS])
    {
431 432 433 434 435 436 437 438 439 440 441 442 443
        /* TODO: Benchmark if we're better of with finding the dirty constants ourselves,
         * or just reloading *all* constants at once
         *
        GL_EXTCALL(glProgramEnvParameters4fvEXT(target_type, i, max_constants, constants + (i * 4)));
         */
        for(; i < max_constants; i++) {
            if(!dirty_consts[i]) continue;

            /* Find the next block of dirty constants */
            dirty_consts[i] = 0;
            j = i;
            for(i++; (i < max_constants) && dirty_consts[i]; i++) {
                dirty_consts[i] = 0;
444
            }
445 446 447 448 449 450 451 452

            GL_EXTCALL(glProgramEnvParameters4fvEXT(target_type, j, i - j, constants + (j * 4)));
        }
    } else {
        for(; i < max_constants; i++) {
            if(dirty_consts[i]) {
                dirty_consts[i] = 0;
                GL_EXTCALL(glProgramEnvParameter4fvARB(target_type, i, constants + (i * 4)));
453
            }
454
        }
455 456
    }
    checkGLcall("glProgramEnvParameter4fvARB()");
457 458

    /* Load immediate constants */
459 460 461 462
    if (shader->load_local_constsF)
    {
        if (TRACE_ON(d3d_shader))
        {
463
            LIST_FOR_EACH_ENTRY(lconst, &shader->constantsF, struct wined3d_shader_lconst, entry)
464
            {
465 466 467 468 469 470 471
                GLfloat* values = (GLfloat*)lconst->value;
                TRACE_(d3d_constants)("Loading local constants %i: %f, %f, %f, %f\n", lconst->idx,
                        values[0], values[1], values[2], values[3]);
            }
        }
        /* Immediate constants are clamped for 1.X shaders at loading times */
        ret = 0;
472
        LIST_FOR_EACH_ENTRY(lconst, &shader->constantsF, struct wined3d_shader_lconst, entry)
473
        {
474
            dirty_consts[lconst->idx] = 1; /* Dirtify so the non-immediate constant overwrites it next time */
475
            ret = max(ret, lconst->idx + 1);
476
            GL_EXTCALL(glProgramEnvParameter4fvARB(target_type, lconst->idx, (GLfloat*)lconst->value));
477
        }
478 479 480 481
        checkGLcall("glProgramEnvParameter4fvARB()");
        return ret; /* The loaded immediate constants need reloading for the next shader */
    } else {
        return 0; /* No constants are dirty now */
482
    }
483 484
}

485 486 487
/**
 * Loads the texture dimensions for NP2 fixup into the currently set ARB_[vertex/fragment]_programs.
 */
488 489 490 491
static void shader_arb_load_np2fixup_constants(void *shader_priv,
        const struct wined3d_gl_info *gl_info, const struct wined3d_state *state)
{
    const struct shader_arb_priv * priv = shader_priv;
492

493 494
    /* NP2 texcoord fixup is (currently) only done for pixelshaders. */
    if (!use_ps(state)) return;
495 496 497 498 499 500 501

    if (priv->compiled_fprog && priv->compiled_fprog->np2fixup_info.super.active) {
        const struct arb_ps_np2fixup_info* const fixup = &priv->compiled_fprog->np2fixup_info;
        UINT i;
        WORD active = fixup->super.active;
        GLfloat np2fixup_constants[4 * MAX_FRAGMENT_SAMPLERS];

502 503
        for (i = 0; active; active >>= 1, ++i)
        {
504
            const struct wined3d_texture *tex = state->textures[i];
505
            const unsigned char idx = fixup->super.idx[i];
506
            GLfloat *tex_dim = &np2fixup_constants[(idx >> 1) * 4];
507 508 509 510 511 512 513 514

            if (!(active & 1)) continue;

            if (!tex) {
                FIXME("Nonexistent texture is flagged for NP2 texcoord fixup\n");
                continue;
            }

515 516 517 518 519 520 521 522 523
            if (idx % 2)
            {
                tex_dim[2] = tex->pow2_matrix[0];
                tex_dim[3] = tex->pow2_matrix[5];
            }
            else
            {
                tex_dim[0] = tex->pow2_matrix[0];
                tex_dim[1] = tex->pow2_matrix[5];
524 525 526 527 528 529 530 531
            }
        }

        for (i = 0; i < fixup->super.num_consts; ++i) {
            GL_EXTCALL(glProgramEnvParameter4fvARB(GL_FRAGMENT_PROGRAM_ARB,
                                                   fixup->offset + i, &np2fixup_constants[i * 4]));
        }
    }
532 533
}

534
/* GL locking is done by the caller. */
535
static void shader_arb_ps_local_constants(const struct arb_ps_compiled_shader *gl_shader,
536
        const struct wined3d_context *context, const struct wined3d_state *state, UINT rt_height)
537
{
538
    const struct wined3d_gl_info *gl_info = context->gl_info;
539 540
    unsigned char i;

541
    for(i = 0; i < gl_shader->numbumpenvmatconsts; i++)
542
    {
543 544
        int texunit = gl_shader->bumpenvmatconst[i].texunit;

545
        /* The state manager takes care that this function is always called if the bump env matrix changes */
546
        const float *data = (const float *)&state->texture_states[texunit][WINED3D_TSS_BUMPENV_MAT00];
547 548
        GL_EXTCALL(glProgramLocalParameter4fvARB(GL_FRAGMENT_PROGRAM_ARB,
                gl_shader->bumpenvmatconst[i].const_num, data));
549

550
        if (gl_shader->luminanceconst[i].const_num != WINED3D_CONST_NUM_UNUSED)
551
        {
552
            /* WINED3D_TSS_BUMPENVLSCALE and WINED3D_TSS_BUMPENVLOFFSET are next to each other.
553 554
             * point gl to the scale, and load 4 floats. x = scale, y = offset, z and w are junk, we
             * don't care about them. The pointers are valid for sure because the stateblock is bigger.
555
             * (they're WINED3D_TSS_TEXTURETRANSFORMFLAGS and WINED3D_TSS_ADDRESSW, so most likely 0 or NaN
556
            */
557
            const float *scale = (const float *)&state->texture_states[texunit][WINED3D_TSS_BUMPENV_LSCALE];
558 559
            GL_EXTCALL(glProgramLocalParameter4fvARB(GL_FRAGMENT_PROGRAM_ARB,
                    gl_shader->luminanceconst[i].const_num, scale));
560 561
        }
    }
562
    checkGLcall("Load bumpmap consts");
563

564 565 566 567 568 569 570 571
    if(gl_shader->ycorrection != WINED3D_CONST_NUM_UNUSED)
    {
        /* ycorrection.x: Backbuffer height(onscreen) or 0(offscreen).
        * ycorrection.y: -1.0(onscreen), 1.0(offscreen)
        * ycorrection.z: 1.0
        * ycorrection.w: 0.0
        */
        float val[4];
572
        val[0] = context->render_offscreen ? 0.0f : (float) rt_height;
573
        val[1] = context->render_offscreen ? 1.0f : -1.0f;
574 575
        val[2] = 1.0f;
        val[3] = 0.0f;
576
        GL_EXTCALL(glProgramLocalParameter4fvARB(GL_FRAGMENT_PROGRAM_ARB, gl_shader->ycorrection, val));
577
        checkGLcall("y correction loading");
578 579
    }

580
    if (!gl_shader->num_int_consts) return;
581 582 583 584 585 586

    for(i = 0; i < MAX_CONST_I; i++)
    {
        if(gl_shader->int_consts[i] != WINED3D_CONST_NUM_UNUSED)
        {
            float val[4];
587 588 589
            val[0] = (float)state->ps_consts_i[4 * i];
            val[1] = (float)state->ps_consts_i[4 * i + 1];
            val[2] = (float)state->ps_consts_i[4 * i + 2];
590
            val[3] = -1.0f;
591 592 593 594

            GL_EXTCALL(glProgramLocalParameter4fvARB(GL_FRAGMENT_PROGRAM_ARB, gl_shader->int_consts[i], val));
        }
    }
595
    checkGLcall("Load ps int consts");
596 597
}

598
/* GL locking is done by the caller. */
599 600
static void shader_arb_vs_local_constants(const struct arb_vs_compiled_shader *gl_shader,
        const struct wined3d_context *context, const struct wined3d_state *state)
601
{
602
    const struct wined3d_gl_info *gl_info = context->gl_info;
603
    float position_fixup[4];
604
    unsigned char i;
605

606
    /* Upload the position fixup */
607 608
    shader_get_position_fixup(context, state, position_fixup);
    GL_EXTCALL(glProgramLocalParameter4fvARB(GL_VERTEX_PROGRAM_ARB, gl_shader->pos_fixup, position_fixup));
609

610
    if (!gl_shader->num_int_consts) return;
611 612 613 614 615 616

    for(i = 0; i < MAX_CONST_I; i++)
    {
        if(gl_shader->int_consts[i] != WINED3D_CONST_NUM_UNUSED)
        {
            float val[4];
617 618 619
            val[0] = (float)state->vs_consts_i[4 * i];
            val[1] = (float)state->vs_consts_i[4 * i + 1];
            val[2] = (float)state->vs_consts_i[4 * i + 2];
620
            val[3] = -1.0f;
621 622 623 624

            GL_EXTCALL(glProgramLocalParameter4fvARB(GL_VERTEX_PROGRAM_ARB, gl_shader->int_consts[i], val));
        }
    }
625
    checkGLcall("Load vs int consts");
626
}
627

628 629
/**
 * Loads the app-supplied constants into the currently set ARB_[vertex/fragment]_programs.
630 631
 *
 * We only support float constants in ARB at the moment, so don't
632 633
 * worry about the Integers or Booleans
 */
634
/* GL locking is done by the caller (state handler) */
635 636
static void shader_arb_load_constants(const struct wined3d_context *context, char usePixelShader, char useVertexShader)
{
637
    struct wined3d_device *device = context->swapchain->device;
638
    const struct wined3d_state *state = &device->stateBlock->state;
639
    const struct wined3d_gl_info *gl_info = context->gl_info;
640
    struct shader_arb_priv *priv = device->shader_priv;
641

642 643 644 645 646 647 648 649 650 651 652 653 654
    if (context != priv->last_context)
    {
        memset(priv->vshader_const_dirty, 1,
                sizeof(*priv->vshader_const_dirty) * device->d3d_vshader_constantF);
        priv->highest_dirty_vs_const = device->d3d_vshader_constantF;

        memset(priv->pshader_const_dirty, 1,
                sizeof(*priv->pshader_const_dirty) * device->d3d_pshader_constantF);
        priv->highest_dirty_ps_const = device->d3d_pshader_constantF;

        priv->last_context = context;
    }

655 656
    if (useVertexShader)
    {
657
        struct wined3d_shader *vshader = state->vertex_shader;
658
        const struct arb_vs_compiled_shader *gl_shader = priv->compiled_vprog;
659 660

        /* Load DirectX 9 float constants for vertex shader */
661 662
        priv->highest_dirty_vs_const = shader_arb_load_constantsF(vshader, gl_info, GL_VERTEX_PROGRAM_ARB,
                priv->highest_dirty_vs_const, state->vs_consts_f, priv->vshader_const_dirty);
663
        shader_arb_vs_local_constants(gl_shader, context, state);
664 665
    }

666 667
    if (usePixelShader)
    {
668
        struct wined3d_shader *pshader = state->pixel_shader;
669
        const struct arb_ps_compiled_shader *gl_shader = priv->compiled_fprog;
670
        UINT rt_height = state->fb->render_targets[0]->resource.height;
671

672
        /* Load DirectX 9 float constants for pixel shader */
673 674
        priv->highest_dirty_ps_const = shader_arb_load_constantsF(pshader, gl_info, GL_FRAGMENT_PROGRAM_ARB,
                priv->highest_dirty_ps_const, state->ps_consts_f, priv->pshader_const_dirty);
675
        shader_arb_ps_local_constants(gl_shader, context, state, rt_height);
676 677 678
    }
}

679
static void shader_arb_update_float_vertex_constants(struct wined3d_device *device, UINT start, UINT count)
680
{
681
    struct wined3d_context *context = context_get_current();
682
    struct shader_arb_priv *priv = device->shader_priv;
683 684 685

    /* We don't want shader constant dirtification to be an O(contexts), so just dirtify the active
     * context. On a context switch the old context will be fully dirtified */
686
    if (!context || context->swapchain->device != device) return;
687

688 689
    memset(priv->vshader_const_dirty + start, 1, sizeof(*priv->vshader_const_dirty) * count);
    priv->highest_dirty_vs_const = max(priv->highest_dirty_vs_const, start + count);
690 691
}

692
static void shader_arb_update_float_pixel_constants(struct wined3d_device *device, UINT start, UINT count)
693
{
694
    struct wined3d_context *context = context_get_current();
695
    struct shader_arb_priv *priv = device->shader_priv;
696 697 698

    /* We don't want shader constant dirtification to be an O(contexts), so just dirtify the active
     * context. On a context switch the old context will be fully dirtified */
699
    if (!context || context->swapchain->device != device) return;
700

701 702
    memset(priv->pshader_const_dirty + start, 1, sizeof(*priv->pshader_const_dirty) * count);
    priv->highest_dirty_ps_const = max(priv->highest_dirty_ps_const, start + count);
703 704
}

705
static DWORD *local_const_mapping(const struct wined3d_shader *shader)
706
{
707
    const struct wined3d_shader_lconst *lconst;
708 709 710
    DWORD *ret;
    DWORD idx = 0;

711 712
    if (shader->load_local_constsF || list_empty(&shader->constantsF))
        return NULL;
713

714 715 716
    ret = HeapAlloc(GetProcessHeap(), 0, sizeof(DWORD) * shader->limits.constant_float);
    if (!ret)
    {
717 718 719 720
        ERR("Out of memory\n");
        return NULL;
    }

721
    LIST_FOR_EACH_ENTRY(lconst, &shader->constantsF, struct wined3d_shader_lconst, entry)
722
    {
723 724 725 726 727
        ret[lconst->idx] = idx++;
    }
    return ret;
}

728
/* Generate the variable & register declarations for the ARB_vertex_program output target */
729
static DWORD shader_generate_arb_declarations(const struct wined3d_shader *shader,
730
        const struct wined3d_shader_reg_maps *reg_maps, struct wined3d_shader_buffer *buffer,
731 732
        const struct wined3d_gl_info *gl_info, const DWORD *lconst_map,
        DWORD *num_clipplanes, const struct shader_arb_ctx_priv *ctx)
733
{
734
    DWORD i, next_local = 0;
735
    char pshader = shader_is_pshader_version(reg_maps->shader_version.type);
736
    const struct wined3d_shader_lconst *lconst;
737
    unsigned max_constantsF;
738
    DWORD map;
739

740 741 742 743 744 745 746
    /* In pixel shaders, all private constants are program local, we don't need anything
     * from program.env. Thus we can advertise the full set of constants in pixel shaders.
     * If we need a private constant the GL implementation will squeeze it in somewhere
     *
     * With vertex shaders we need the posFixup and on some GL implementations 4 helper
     * immediate values. The posFixup is loaded using program.env for now, so always
     * subtract one from the number of constants. If the shader uses indirect addressing,
747
     * account for the helper const too because we have to declare all available d3d constants
748 749
     * and don't know which are actually used.
     */
750 751
    if (pshader)
    {
752
        max_constantsF = gl_info->limits.arb_ps_native_constants;
753 754 755
        /* 24 is the minimum MAX_PROGRAM_ENV_PARAMETERS_ARB value. */
        if (max_constantsF < 24)
            max_constantsF = gl_info->limits.arb_ps_float_constants;
756 757 758
    }
    else
    {
759
        const struct arb_vshader_private *shader_data = shader->backend_data;
760 761 762 763 764 765 766
        max_constantsF = gl_info->limits.arb_vs_native_constants;
        /* 96 is the minimum MAX_PROGRAM_ENV_PARAMETERS_ARB value.
         * Also prevents max_constantsF from becoming less than 0 and
         * wrapping . */
        if (max_constantsF < 96)
            max_constantsF = gl_info->limits.arb_vs_float_constants;

767 768
        if (reg_maps->usesrelconstF)
        {
769
            DWORD highest_constf = 0, clip_limit;
770

771
            max_constantsF -= reserved_vs_const(shader_data, reg_maps, gl_info);
772
            max_constantsF -= count_bits(reg_maps->integer_constants);
773

774
            for (i = 0; i < shader->limits.constant_float; ++i)
775
            {
776 777 778 779
                DWORD idx = i >> 5;
                DWORD shift = i & 0x1f;
                if(reg_maps->constf[idx] & (1 << shift)) highest_constf = i;
            }
780

781 782
            if(use_nv_clip(gl_info) && ctx->target_version >= NV2)
            {
783 784 785 786
                if(ctx->cur_vs_args->super.clip_enabled)
                    clip_limit = gl_info->limits.clipplanes;
                else
                    clip_limit = 0;
787 788 789
            }
            else
            {
790
                unsigned int mask = ctx->cur_vs_args->clip.boolclip.clipplane_mask;
791 792
                clip_limit = min(count_bits(mask), 4);
            }
793 794 795 796
            *num_clipplanes = min(clip_limit, max_constantsF - highest_constf - 1);
            max_constantsF -= *num_clipplanes;
            if(*num_clipplanes < clip_limit)
            {
797
                WARN("Only %u clipplanes out of %u enabled\n", *num_clipplanes, gl_info->limits.clipplanes);
798
            }
799 800 801
        }
        else
        {
802 803
            if (ctx->target_version >= NV2) *num_clipplanes = gl_info->limits.clipplanes;
            else *num_clipplanes = min(gl_info->limits.clipplanes, 4);
804 805
        }
    }
806

807 808 809
    for (i = 0, map = reg_maps->temporary; map; map >>= 1, ++i)
    {
        if (map & 1) shader_addline(buffer, "TEMP R%u;\n", i);
810 811
    }

812 813 814
    for (i = 0, map = reg_maps->address; map; map >>= 1, ++i)
    {
        if (map & 1) shader_addline(buffer, "ADDRESS A%u;\n", i);
815 816
    }

817 818 819 820 821
    if (pshader && reg_maps->shader_version.major == 1 && reg_maps->shader_version.minor <= 3)
    {
        for (i = 0, map = reg_maps->texcoord; map; map >>= 1, ++i)
        {
            if (map & 1) shader_addline(buffer, "TEMP T%u;\n", i);
822
        }
823 824
    }

825 826 827
    /* Load local constants using the program-local space,
     * this avoids reloading them each time the shader is used
     */
828 829
    if (lconst_map)
    {
830
        LIST_FOR_EACH_ENTRY(lconst, &shader->constantsF, struct wined3d_shader_lconst, entry)
831
        {
832
            shader_addline(buffer, "PARAM C%u = program.local[%u];\n", lconst->idx,
833
                           lconst_map[lconst->idx]);
834
            next_local = max(next_local, lconst_map[lconst->idx] + 1);
835 836 837
        }
    }

838 839 840 841 842
    /* After subtracting privately used constants from the hardware limit(they are loaded as
     * local constants), make sure the shader doesn't violate the env constant limit
     */
    if(pshader)
    {
843
        max_constantsF = min(max_constantsF, gl_info->limits.arb_ps_float_constants);
844 845 846
    }
    else
    {
847
        max_constantsF = min(max_constantsF, gl_info->limits.arb_vs_float_constants);
848 849 850
    }

    /* Avoid declaring more constants than needed */
851
    max_constantsF = min(max_constantsF, shader->limits.constant_float);
852

853 854 855 856 857 858
    /* we use the array-based constants array if the local constants are marked for loading,
     * because then we use indirect addressing, or when the local constant list is empty,
     * because then we don't know if we're using indirect addressing or not. If we're hardcoding
     * local constants do not declare the loaded constants as an array because ARB compilers usually
     * do not optimize unused constants away
     */
859 860
    if (reg_maps->usesrelconstF)
    {
861 862 863 864 865
        /* Need to PARAM the environment parameters (constants) so we can use relative addressing */
        shader_addline(buffer, "PARAM C[%d] = { program.env[0..%d] };\n",
                    max_constantsF, max_constantsF - 1);
    } else {
        for(i = 0; i < max_constantsF; i++) {
866 867 868
            DWORD idx, mask;
            idx = i >> 5;
            mask = 1 << (i & 0x1f);
869
            if (!shader_constant_is_local(shader, i) && (reg_maps->constf[idx] & mask))
870
            {
871 872 873 874
                shader_addline(buffer, "PARAM C%d = program.env[%d];\n",i, i);
            }
        }
    }
875

876
    return next_local;
877 878
}

879
/* Operand names indexed by a 4-bit shift value. Values 1-4 select a
 * component of "coefmul" (x2 .. x16), values 12-15 select a component of
 * "coefdiv" (d16 .. d2); every other value maps to the "dummy" placeholder. */
static const char * const shift_tab[] =
{
    /*  0: no shift */ "dummy",
    /*  1: x2       */ "coefmul.x",
    /*  2: x4       */ "coefmul.y",
    /*  3: x8       */ "coefmul.z",
    /*  4: x16      */ "coefmul.w",
    /*  5: x32      */ "dummy",
    /*  6: x64      */ "dummy",
    /*  7: x128     */ "dummy",
    /*  8: d256     */ "dummy",
    /*  9: d128     */ "dummy",
    /* 10: d64      */ "dummy",
    /* 11: d32      */ "dummy",
    /* 12: d16      */ "coefdiv.w",
    /* 13: d8       */ "coefdiv.z",
    /* 14: d4       */ "coefdiv.y",
    /* 15: d2       */ "coefdiv.x"
};

898
static void shader_arb_get_write_mask(const struct wined3d_shader_instruction *ins,
899
        const struct wined3d_shader_dst_param *dst, char *write_mask)
900
{
901
    char *ptr = write_mask;
902

903
    if (dst->write_mask != WINED3DSP_WRITEMASK_ALL)
904
    {
905
        *ptr++ = '.';
906 907 908 909
        if (dst->write_mask & WINED3DSP_WRITEMASK_0) *ptr++ = 'x';
        if (dst->write_mask & WINED3DSP_WRITEMASK_1) *ptr++ = 'y';
        if (dst->write_mask & WINED3DSP_WRITEMASK_2) *ptr++ = 'z';
        if (dst->write_mask & WINED3DSP_WRITEMASK_3) *ptr++ = 'w';
910 911
    }

912
    *ptr = '\0';
913 914
}

915 916
/* Format the source swizzle of 'param' as an ARB swizzle suffix.
 *
 * The swizzle field holds four 2-bit component selectors, laid out as
 * wwzzyyxx. 'swizzle_str' receives an empty string for the default swizzle
 * (unless 'fixup' is set), a '.' plus a single component when all four
 * selectors are equal, or a '.' plus four components otherwise.
 *
 * When 'fixup' is set the x and z components are exchanged: registers of
 * type WINED3DDECLTYPE_D3DCOLOR are stored as "bgra" but addressed as
 * "rgba". */
static void shader_arb_get_swizzle(const struct wined3d_shader_src_param *param, BOOL fixup, char *swizzle_str)
{
    const char *component_names = fixup ? "zyxw" : "xyzw";
    DWORD bits = param->swizzle;
    DWORD selector[4];
    char *out = swizzle_str;
    unsigned int c;

    for (c = 0; c < 4; ++c)
        selector[c] = (bits >> (2 * c)) & 0x03;

    /* Default swizzle and no fixup requested: nothing to emit. */
    if (bits == WINED3DSP_NOSWIZZLE && !fixup)
    {
        *out = '\0';
        return;
    }

    *out++ = '.';
    *out++ = component_names[selector[0]];

    /* A replicated selector is written once, anything else in full. */
    if (selector[0] != selector[1] || selector[0] != selector[2] || selector[0] != selector[3])
    {
        for (c = 1; c < 4; ++c)
            *out++ = component_names[selector[c]];
    }

    *out = '\0';
}

948 949 950
static void shader_arb_request_a0(const struct wined3d_shader_instruction *ins, const char *src)
{
    struct shader_arb_ctx_priv *priv = ins->ctx->backend_data;
951
    struct wined3d_shader_buffer *buffer = ins->ctx->buffer;
952

953
    if (!strcmp(priv->addr_reg, src)) return;
954 955 956 957 958

    strcpy(priv->addr_reg, src);
    shader_addline(buffer, "ARL A0.x, %s;\n", src);
}

959 960 961 962
/* Forward declaration: shader_arb_get_register_name() below calls this to
 * resolve relative addressing operands, while the definition, which in turn
 * calls shader_arb_get_register_name(), appears later in the file. */
static void shader_arb_get_src_param(const struct wined3d_shader_instruction *ins,
        const struct wined3d_shader_src_param *src, unsigned int tmpreg, char *outregstr);

/* Translate the D3D shader register 'reg' into the name used for it in the
 * generated ARB program and store that name in 'register_name'.
 *
 * *is_color is set to TRUE only for vertex shader inputs that are flagged in
 * the current vertex shader arguments' swizzle_map, and to FALSE otherwise.
 *
 * Relative addressing is resolved through shader_arb_get_src_param(), which
 * may emit instructions of its own; on the plain ARB target a relative
 * constant access may additionally emit an ARL through
 * shader_arb_request_a0().
 *
 * 'register_name' is always written, also for registers that are not
 * recognized; in that case a placeholder name is stored and a FIXME/ERR is
 * logged. */
static void shader_arb_get_register_name(const struct wined3d_shader_instruction *ins,
        const struct wined3d_shader_register *reg, char *register_name, BOOL *is_color)
{
    /* oPos, oFog and oPts in D3D */
    static const char * const rastout_reg_names[] = {"TMP_OUT", "TMP_FOGCOORD", "result.pointsize"};
    const struct wined3d_shader *shader = ins->ctx->shader;
    const struct wined3d_shader_reg_maps *reg_maps = ins->ctx->reg_maps;
    BOOL pshader = shader_is_pshader_version(reg_maps->shader_version.type);
    struct shader_arb_ctx_priv *ctx = ins->ctx->backend_data;

    *is_color = FALSE;

    switch (reg->type)
    {
        case WINED3DSPR_TEMP:
            sprintf(register_name, "R%u", reg->idx);
            break;

        case WINED3DSPR_INPUT:
            if (pshader)
            {
                if (reg_maps->shader_version.major < 3)
                {
                    if (!reg->idx) strcpy(register_name, "fragment.color.primary");
                    else strcpy(register_name, "fragment.color.secondary");
                }
                else
                {
                    if(reg->rel_addr)
                    {
                        char rel_reg[50];
                        shader_arb_get_src_param(ins, reg->rel_addr, 0, rel_reg);

                        if (!strcmp(rel_reg, "**aL_emul**"))
                        {
                            /* Emulated loop counter: the index is known at generation time. */
                            DWORD idx = ctx->aL + reg->idx;
                            if(idx < MAX_REG_INPUT)
                            {
                                strcpy(register_name, ctx->ps_input[idx]);
                            }
                            else
                            {
                                ERR("Pixel shader input register out of bounds: %u\n", idx);
                                sprintf(register_name, "out_of_bounds_%u", idx);
                            }
                        }
                        else if (reg_maps->input_registers & 0x0300)
                        {
                            /* There are two ways basically:
                             *
                             * 1) Use the unrolling code that is used for loop emulation and unroll the loop.
                             *    That means trouble if the loop also contains a breakc or if the control values
                             *    aren't local constants.
                             * 2) Generate an if block that checks if aL.y < 8, == 8 or == 9 and selects the
                             *    source dynamically. The trouble is that we cannot simply read aL.y because it
                             *    is an ADDRESS register. We could however push it, load .zw with a value and use
                             *    ADAC to load the condition code register and pop it again afterwards
                             */
                            FIXME("Relative input register addressing with more than 8 registers\n");

                            /* This is better than nothing for now */
                            sprintf(register_name, "fragment.texcoord[%s + %u]", rel_reg, reg->idx);
                        }
                        else if(ctx->cur_ps_args->super.vp_mode != vertexshader)
                        {
                            /* This is problematic because we'd have to consult the ctx->ps_input strings
                             * for where to find the varying. Some may be "0.0", others can be texcoords or
                             * colors. This needs either a pipeline replacement to make the vertex shader feed
                             * proper varyings, or loop unrolling
                             *
                             * For now use the texcoords and hope for the best
                             */
                            FIXME("Non-vertex shader varying input with indirect addressing\n");
                            sprintf(register_name, "fragment.texcoord[%s + %u]", rel_reg, reg->idx);
                        }
                        else
                        {
                            /* D3D supports indirect addressing only with aL in loop registers. The loop instruction
                             * pulls GL_NV_fragment_program2 in
                             */
                            sprintf(register_name, "fragment.texcoord[%s + %u]", rel_reg, reg->idx);
                        }
                    }
                    else
                    {
                        if(reg->idx < MAX_REG_INPUT)
                        {
                            strcpy(register_name, ctx->ps_input[reg->idx]);
                        }
                        else
                        {
                            ERR("Pixel shader input register out of bounds: %u\n", reg->idx);
                            sprintf(register_name, "out_of_bounds_%u", reg->idx);
                        }
                    }
                }
            }
            else
            {
                if (ctx->cur_vs_args->super.swizzle_map & (1u << reg->idx)) *is_color = TRUE;
                sprintf(register_name, "vertex.attrib[%u]", reg->idx);
            }
            break;

        case WINED3DSPR_CONST:
            if (!pshader && reg->rel_addr)
            {
                const struct arb_vshader_private *shader_data = shader->backend_data;
                UINT rel_offset = shader_data->rel_offset;
                BOOL aL = FALSE;
                char rel_reg[50];

                if (reg_maps->shader_version.major < 2)
                {
                    sprintf(rel_reg, "A0.x");
                }
                else
                {
                    shader_arb_get_src_param(ins, reg->rel_addr, 0, rel_reg);
                    if(ctx->target_version == ARB)
                    {
                        if (!strcmp(rel_reg, "**aL_emul**"))
                        {
                            aL = TRUE;
                        }
                        else
                        {
                            /* Load the operand into A0.x, then index through A0.x. */
                            shader_arb_request_a0(ins, rel_reg);
                            sprintf(rel_reg, "A0.x");
                        }
                    }
                }

                if(aL)
                    sprintf(register_name, "C[%u]", ctx->aL + reg->idx);
                else if (reg->idx >= rel_offset)
                    sprintf(register_name, "C[%s + %u]", rel_reg, reg->idx - rel_offset);
                else
                    sprintf(register_name, "C[%s - %u]", rel_reg, rel_offset - reg->idx);
            }
            else
            {
                if (reg_maps->usesrelconstF)
                    sprintf(register_name, "C[%u]", reg->idx);
                else
                    sprintf(register_name, "C%u", reg->idx);
            }
            break;

        case WINED3DSPR_TEXTURE: /* case WINED3DSPR_ADDR: */
            if (pshader)
            {
                if (reg_maps->shader_version.major == 1
                        && reg_maps->shader_version.minor <= 3)
                {
                    /* In ps <= 1.3, Tx is a temporary register as destination to all instructions,
                     * and as source to most instructions. For some instructions it is the texcoord
                     * input. Those instructions know about the special use
                     */
                    sprintf(register_name, "T%u", reg->idx);
                }
                else
                {
                    /* in ps 1.4 and 2.x Tx is always a (read-only) varying */
                    sprintf(register_name, "fragment.texcoord[%u]", reg->idx);
                }
            }
            else
            {
                if (reg_maps->shader_version.major == 1 || ctx->target_version >= NV2)
                {
                    sprintf(register_name, "A%u", reg->idx);
                }
                else
                {
                    sprintf(register_name, "A%u_SHADOW", reg->idx);
                }
            }
            break;

        case WINED3DSPR_COLOROUT:
            if (ctx->cur_ps_args->super.srgb_correction && !reg->idx)
            {
                strcpy(register_name, "TMP_COLOR");
            }
            else
            {
                if(ctx->cur_ps_args->super.srgb_correction) FIXME("sRGB correction on higher render targets\n");
                if (reg_maps->rt_mask > 1)
                {
                    sprintf(register_name, "result.color[%u]", reg->idx);
                }
                else
                {
                    strcpy(register_name, "result.color");
                }
            }
            break;

        case WINED3DSPR_RASTOUT:
            if (reg->idx == 1)
            {
                sprintf(register_name, "%s", ctx->fog_output);
            }
            else if (reg->idx < sizeof(rastout_reg_names) / sizeof(*rastout_reg_names))
            {
                sprintf(register_name, "%s", rastout_reg_names[reg->idx]);
            }
            else
            {
                /* Do not read past the end of the name table. */
                FIXME("Unhandled rastout register index %u\n", reg->idx);
                sprintf(register_name, "unrecognized_register[%u]", reg->idx);
            }
            break;

        case WINED3DSPR_DEPTHOUT:
            strcpy(register_name, "result.depth");
            break;

        case WINED3DSPR_ATTROUT:
        /* case WINED3DSPR_OUTPUT: */
            if (pshader) sprintf(register_name, "oD[%u]", reg->idx);
            else strcpy(register_name, ctx->color_output[reg->idx]);
            break;

        case WINED3DSPR_TEXCRDOUT:
            if (pshader)
            {
                sprintf(register_name, "oT[%u]", reg->idx);
            }
            else
            {
                if (reg_maps->shader_version.major < 3)
                {
                    strcpy(register_name, ctx->texcrd_output[reg->idx]);
                }
                else
                {
                    strcpy(register_name, ctx->vs_output[reg->idx]);
                }
            }
            break;

        case WINED3DSPR_LOOP:
            if(ctx->target_version >= NV2)
            {
                /* Pshader has an implicitly declared loop index counter A0.x that cannot be renamed */
                if(pshader) sprintf(register_name, "A0.x");
                else sprintf(register_name, "aL.y");
            }
            else
            {
                /* Unfortunately this code cannot return the value of ctx->aL here. An immediate value
                 * would be valid, but if aL is used for indexing(its only use), there's likely an offset,
                 * thus the result would be something like C[15 + 30], which is not valid in the ARB program
                 * grammar. So return a marker for the emulated aL and intercept it in constant and varying
                 * indexing
                 */
                sprintf(register_name, "**aL_emul**");
            }

            break;

        case WINED3DSPR_CONSTINT:
            sprintf(register_name, "I%u", reg->idx);
            break;

        case WINED3DSPR_MISCTYPE:
            if (!reg->idx)
            {
                sprintf(register_name, "vpos");
            }
            else if(reg->idx == 1)
            {
                sprintf(register_name, "fragment.facing.x");
            }
            else
            {
                FIXME("Unknown MISCTYPE register index %u\n", reg->idx);
                /* Callers copy the name unconditionally, so never leave the
                 * buffer unwritten. */
                sprintf(register_name, "unrecognized_register[%u]", reg->idx);
            }
            break;

        default:
            FIXME("Unhandled register type %#x[%u]\n", reg->type, reg->idx);
            sprintf(register_name, "unrecognized_register[%u]", reg->idx);
            break;
    }
}

1231
static void shader_arb_get_dst_param(const struct wined3d_shader_instruction *ins,
1232 1233 1234 1235 1236 1237
        const struct wined3d_shader_dst_param *wined3d_dst, char *str)
{
    char register_name[255];
    char write_mask[6];
    BOOL is_color;

1238
    shader_arb_get_register_name(ins, &wined3d_dst->reg, register_name, &is_color);
1239
    strcpy(str, register_name);
1240

1241
    shader_arb_get_write_mask(ins, wined3d_dst, write_mask);
1242
    strcat(str, write_mask);
1243 1244
}

1245
static const char *shader_arb_get_fixup_swizzle(enum fixup_channel_source channel_source)
1246
{
1247 1248 1249 1250 1251 1252 1253 1254
    switch(channel_source)
    {
        case CHANNEL_SOURCE_ZERO: return "0";
        case CHANNEL_SOURCE_ONE: return "1";
        case CHANNEL_SOURCE_X: return "x";
        case CHANNEL_SOURCE_Y: return "y";
        case CHANNEL_SOURCE_Z: return "z";
        case CHANNEL_SOURCE_W: return "w";
1255
        default:
1256 1257
            FIXME("Unhandled channel source %#x\n", channel_source);
            return "undefined";
1258
    }
1259 1260
}

1261 1262
/* Emit the instructions that apply the color fixup 'fixup' to register 'reg'.
 *
 * Two steps are generated, each only if it affects a component that is also
 * set in 'dst_mask':
 *  - a SWZ that re-sources the components whose source is not the identity;
 *  - a MAD computing reg * two - one on the components flagged for sign fixup.
 *
 * 'one' and 'two' are the operand strings passed in by the caller for the
 * constants used in the MAD. Complex fixups are not supported; they are
 * logged and nothing is emitted. */
static void gen_color_correction(struct wined3d_shader_buffer *buffer, const char *reg,
        DWORD dst_mask, const char *one, const char *two, struct color_fixup_desc fixup)
{
    static const DWORD component_bits[4] =
    {
        WINED3DSP_WRITEMASK_0, WINED3DSP_WRITEMASK_1, WINED3DSP_WRITEMASK_2, WINED3DSP_WRITEMASK_3,
    };
    static const char component_names[] = "xyzw";
    DWORD swizzle_mask = 0, sign_mask = 0;
    unsigned int c;

    if (is_complex_fixup(fixup))
    {
        enum complex_fixup complex_fixup = get_complex_fixup(fixup);
        FIXME("Complex fixup (%#x) not supported\n", complex_fixup);
        return;
    }

    /* Components that do not simply read themselves need the SWZ. */
    if (fixup.x_source != CHANNEL_SOURCE_X) swizzle_mask |= WINED3DSP_WRITEMASK_0;
    if (fixup.y_source != CHANNEL_SOURCE_Y) swizzle_mask |= WINED3DSP_WRITEMASK_1;
    if (fixup.z_source != CHANNEL_SOURCE_Z) swizzle_mask |= WINED3DSP_WRITEMASK_2;
    if (fixup.w_source != CHANNEL_SOURCE_W) swizzle_mask |= WINED3DSP_WRITEMASK_3;

    if (swizzle_mask & dst_mask)
    {
        shader_addline(buffer, "SWZ %s, %s, %s, %s, %s, %s;\n", reg, reg,
                shader_arb_get_fixup_swizzle(fixup.x_source), shader_arb_get_fixup_swizzle(fixup.y_source),
                shader_arb_get_fixup_swizzle(fixup.z_source), shader_arb_get_fixup_swizzle(fixup.w_source));
    }

    if (fixup.x_sign_fixup) sign_mask |= WINED3DSP_WRITEMASK_0;
    if (fixup.y_sign_fixup) sign_mask |= WINED3DSP_WRITEMASK_1;
    if (fixup.z_sign_fixup) sign_mask |= WINED3DSP_WRITEMASK_2;
    if (fixup.w_sign_fixup) sign_mask |= WINED3DSP_WRITEMASK_3;
    sign_mask &= dst_mask;

    if (sign_mask)
    {
        char mask_suffix[6];
        char *out = mask_suffix;

        /* A full mask is written without an explicit suffix. */
        if (sign_mask != WINED3DSP_WRITEMASK_ALL)
        {
            *out++ = '.';
            for (c = 0; c < 4; ++c)
            {
                if (sign_mask & component_bits[c])
                    *out++ = component_names[c];
            }
        }
        *out = '\0';

        shader_addline(buffer, "MAD %s%s, %s, %s, -%s;\n", reg, mask_suffix, reg, two, one);
    }
}
1312

1313 1314 1315
static const char *shader_arb_get_modifier(const struct wined3d_shader_instruction *ins)
{
    DWORD mod;
1316
    struct shader_arb_ctx_priv *priv = ins->ctx->backend_data;
1317 1318 1319
    if (!ins->dst_count) return "";

    mod = ins->dst[0].modifiers;
1320 1321 1322 1323

    /* Silently ignore PARTIALPRECISION if its not supported */
    if(priv->target_version == ARB) mod &= ~WINED3DSPDM_PARTIALPRECISION;

1324 1325 1326 1327 1328
    if(mod & WINED3DSPDM_MSAMPCENTROID)
    {
        FIXME("Unhandled modifier WINED3DSPDM_MSAMPCENTROID\n");
        mod &= ~WINED3DSPDM_MSAMPCENTROID;
    }
1329 1330

    switch(mod)
1331
    {
1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346
        case WINED3DSPDM_SATURATE | WINED3DSPDM_PARTIALPRECISION:
            return "H_SAT";

        case WINED3DSPDM_SATURATE:
            return "_SAT";

        case WINED3DSPDM_PARTIALPRECISION:
            return "H";

        case 0:
            return "";

        default:
            FIXME("Unknown modifiers 0x%08x\n", mod);
            return "";
1347 1348 1349
    }
}

1350 1351 1352 1353 1354
/* Flag bits for the 'flags' argument of shader_hw_sample(). They select the
 * sampling instruction that is emitted; when several are set, the checks are
 * made in the order TEX_DERIV, TEX_LOD, TEX_BIAS, TEX_PROJ. With no flag set
 * a plain TEX is emitted. */
/* Projected sampling, emitted as TXP. */
#define TEX_PROJ        0x1
/* Biased sampling, emitted as TXB. */
#define TEX_BIAS        0x2
/* Sampling with explicit lod, emitted as TXL. */
#define TEX_LOD         0x4
/* Sampling with custom derivatives (dsx/dsy), emitted as TXD. */
#define TEX_DERIV       0x10

1355
/* Emit a texture sampling instruction.
 *
 * ins         - instruction being translated; supplies the output buffer,
 *               register maps, backend context and destination modifiers.
 * sampler_idx - sampler index as used by the shader. For vertex shaders it is
 *               offset by MAX_FRAGMENT_SAMPLERS and then mapped through the
 *               current vertex shader arguments.
 * dst_str     - destination operand string.
 * coord_reg   - texture coordinate operand string.
 * flags       - combination of TEX_PROJ, TEX_BIAS, TEX_LOD and TEX_DERIV.
 * dsx, dsy    - derivative operands, only used with TEX_DERIV.
 *
 * Depending on 'flags' a TXD, TXL, TXB, TXP or TEX instruction is written.
 * For 2D pixel shader samplers with an active NP2 fixup and no flags, the
 * coordinates are first scaled into TA. For pixel shaders the sampled value
 * is afterwards run through gen_color_correction(). */
static void shader_hw_sample(const struct wined3d_shader_instruction *ins, DWORD sampler_idx,
        const char *dst_str, const char *coord_reg, WORD flags, const char *dsx, const char *dsy)
{
    struct shader_arb_ctx_priv *backend = ins->ctx->backend_data;
    struct wined3d_shader_buffer *out = ins->ctx->buffer;
    struct wined3d_device *device = ins->ctx->shader->device;
    DWORD sampler_type = ins->ctx->reg_maps->sampler_type[sampler_idx];
    BOOL pshader = shader_is_pshader_version(ins->ctx->reg_maps->shader_version.type);
    const struct wined3d_texture *bound_texture;
    const char *target_name;
    const char *sat_suffix;
    BOOL scale_np2 = FALSE;

    /* D3D vertex shader sampler IDs are vertex samplers(0-3), not global d3d samplers */
    if (!pshader)
        sampler_idx += MAX_FRAGMENT_SAMPLERS;

    switch (sampler_type)
    {
        case WINED3DSTT_1D:
            target_name = "1D";
            break;

        case WINED3DSTT_2D:
            bound_texture = device->stateBlock->state.textures[sampler_idx];
            target_name = (bound_texture && bound_texture->target == GL_TEXTURE_RECTANGLE_ARB) ? "RECT" : "2D";

            if (pshader && (backend->cur_np2fixup_info->super.active & (1 << sampler_idx)))
            {
                if (flags) FIXME("Only ordinary sampling from NP2 textures is supported.\n");
                else scale_np2 = TRUE;
            }
            break;

        case WINED3DSTT_VOLUME:
            target_name = "3D";
            break;

        case WINED3DSTT_CUBE:
            target_name = "CUBE";
            break;

        default:
            ERR("Unexpected texture type %d\n", sampler_type);
            target_name = "";
            break;
    }

    /* TEX, TXL, TXD and TXP do not support the "H" modifier,
     * so don't use shader_arb_get_modifier
     */
    sat_suffix = (ins->dst[0].modifiers & WINED3DSPDM_SATURATE) ? "_SAT" : "";

    /* Fragment samplers always have identity mapping; only indices beyond
     * them are translated. */
    if (sampler_idx >= MAX_FRAGMENT_SAMPLERS)
        sampler_idx = backend->cur_vs_args->vertex.samplers[sampler_idx - MAX_FRAGMENT_SAMPLERS];

    if (flags & TEX_DERIV)
    {
        if (flags & TEX_PROJ) FIXME("Projected texture sampling with custom derivatives\n");
        if (flags & TEX_BIAS) FIXME("Biased texture sampling with custom derivatives\n");
        shader_addline(out, "TXD%s %s, %s, %s, %s, texture[%u], %s;\n", sat_suffix, dst_str, coord_reg,
                       dsx, dsy,sampler_idx, target_name);
    }
    else if (flags & TEX_LOD)
    {
        if (flags & TEX_PROJ) FIXME("Projected texture sampling with explicit lod\n");
        if (flags & TEX_BIAS) FIXME("Biased texture sampling with explicit lod\n");
        shader_addline(out, "TXL%s %s, %s, texture[%u], %s;\n", sat_suffix, dst_str, coord_reg,
                       sampler_idx, target_name);
    }
    else if (flags & TEX_BIAS)
    {
        /* Shouldn't be possible, but let's check for it */
        if (flags & TEX_PROJ) FIXME("Biased and Projected texture sampling\n");
        /* TXB takes the 4th component of the source vector automatically, as d3d. Nothing more to do */
        shader_addline(out, "TXB%s %s, %s, texture[%u], %s;\n", sat_suffix, dst_str, coord_reg,
                       sampler_idx, target_name);
    }
    else if (flags & TEX_PROJ)
    {
        shader_addline(out, "TXP%s %s, %s, texture[%u], %s;\n", sat_suffix, dst_str, coord_reg,
                       sampler_idx, target_name);
    }
    else if (scale_np2)
    {
        /* Two fixup factors are packed per np2fixup constant: an even index
         * uses .xyzw, an odd one .zwxy. */
        const unsigned char fixup_idx = backend->cur_np2fixup_info->super.idx[sampler_idx];

        shader_addline(out, "MUL TA, np2fixup[%u].%s, %s;\n", fixup_idx >> 1,
                       (fixup_idx % 2) ? "zwxy" : "xyzw", coord_reg);
        shader_addline(out, "TEX%s %s, TA, texture[%u], %s;\n", sat_suffix, dst_str, sampler_idx, target_name);
    }
    else
    {
        shader_addline(out, "TEX%s %s, %s, texture[%u], %s;\n", sat_suffix, dst_str, coord_reg,
                       sampler_idx, target_name);
    }

    if (pshader)
    {
        gen_color_correction(out, dst_str, ins->dst[0].write_mask,
                arb_get_helper_value(WINED3D_SHADER_TYPE_PIXEL, ARB_ONE),
                arb_get_helper_value(WINED3D_SHADER_TYPE_PIXEL, ARB_TWO),
                backend->cur_ps_args->super.color_fixup[sampler_idx]);
    }
}

1468
static void shader_arb_get_src_param(const struct wined3d_shader_instruction *ins,
1469 1470
        const struct wined3d_shader_src_param *src, unsigned int tmpreg, char *outregstr)
{
1471
    /* Generate a line that does the input modifier computation and return the input register to use */
1472
    BOOL is_color = FALSE;
1473 1474 1475
    char regstr[256];
    char swzstr[20];
    int insert_line;
1476
    struct wined3d_shader_buffer *buffer = ins->ctx->buffer;
1477
    struct shader_arb_ctx_priv *ctx = ins->ctx->backend_data;
1478 1479
    const char *one = arb_get_helper_value(ins->ctx->reg_maps->shader_version.type, ARB_ONE);
    const char *two = arb_get_helper_value(ins->ctx->reg_maps->shader_version.type, ARB_TWO);
1480 1481 1482 1483 1484

    /* Assume a new line will be added */
    insert_line = 1;

    /* Get register name */
1485
    shader_arb_get_register_name(ins, &src->reg, regstr, &is_color);
1486
    shader_arb_get_swizzle(src, is_color, swzstr);
1487

1488 1489
    switch (src->modifiers)
    {
1490
    case WINED3DSPSM_NONE:
1491 1492 1493
        sprintf(outregstr, "%s%s", regstr, swzstr);
        insert_line = 0;
        break;
1494
    case WINED3DSPSM_NEG:
1495 1496 1497
        sprintf(outregstr, "-%s%s", regstr, swzstr);
        insert_line = 0;
        break;
1498
    case WINED3DSPSM_BIAS:
1499 1500
        shader_addline(buffer, "ADD T%c, %s, -coefdiv.x;\n", 'A' + tmpreg, regstr);
        break;
1501
    case WINED3DSPSM_BIASNEG:
1502 1503
        shader_addline(buffer, "ADD T%c, -%s, coefdiv.x;\n", 'A' + tmpreg, regstr);
        break;
1504
    case WINED3DSPSM_SIGN:
1505
        shader_addline(buffer, "MAD T%c, %s, %s, -%s;\n", 'A' + tmpreg, regstr, two, one);
1506
        break;
1507
    case WINED3DSPSM_SIGNNEG:
1508
        shader_addline(buffer, "MAD T%c, %s, -%s, %s;\n", 'A' + tmpreg, regstr, two, one);
1509
        break;
1510
    case WINED3DSPSM_COMP:
1511
        shader_addline(buffer, "SUB T%c, %s, %s;\n", 'A' + tmpreg, one, regstr);
1512
        break;
1513
    case WINED3DSPSM_X2:
1514 1515
        shader_addline(buffer, "ADD T%c, %s, %s;\n", 'A' + tmpreg, regstr, regstr);
        break;
1516
    case WINED3DSPSM_X2NEG:
1517 1518
        shader_addline(buffer, "ADD T%c, -%s, -%s;\n", 'A' + tmpreg, regstr, regstr);
        break;
1519
    case WINED3DSPSM_DZ:
1520 1521 1522
        shader_addline(buffer, "RCP T%c, %s.z;\n", 'A' + tmpreg, regstr);
        shader_addline(buffer, "MUL T%c, %s, T%c;\n", 'A' + tmpreg, regstr, 'A' + tmpreg);
        break;
1523
    case WINED3DSPSM_DW:
1524 1525
        shader_addline(buffer, "RCP T%c, %s.w;\n", 'A' + tmpreg, regstr);
        shader_addline(buffer, "MUL T%c, %s, T%c;\n", 'A' + tmpreg, regstr, 'A' + tmpreg);
1526 1527
        break;
    case WINED3DSPSM_ABS:
1528 1529 1530 1531 1532 1533
        if(ctx->target_version >= NV2) {
            sprintf(outregstr, "|%s%s|", regstr, swzstr);
            insert_line = 0;
        } else {
            shader_addline(buffer, "ABS T%c, %s;\n", 'A' + tmpreg, regstr);
        }
1534 1535
        break;
    case WINED3DSPSM_ABSNEG:
1536 1537 1538 1539 1540 1541
        if(ctx->target_version >= NV2) {
            sprintf(outregstr, "-|%s%s|", regstr, swzstr);
        } else {
            shader_addline(buffer, "ABS T%c, %s;\n", 'A' + tmpreg, regstr);
            sprintf(outregstr, "-T%c%s", 'A' + tmpreg, swzstr);
        }
1542
        insert_line = 0;
1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553
        break;
    default:
        sprintf(outregstr, "%s%s", regstr, swzstr);
        insert_line = 0;
    }

    /* Return modified or original register, with swizzle */
    if (insert_line)
        sprintf(outregstr, "T%c%s", 'A' + tmpreg, swzstr);
}

1554
static void pshader_hw_bem(const struct wined3d_shader_instruction *ins)
1555
{
1556
    const struct wined3d_shader_dst_param *dst = &ins->dst[0];
1557
    struct wined3d_shader_buffer *buffer = ins->ctx->buffer;
1558 1559
    char dst_name[50];
    char src_name[2][50];
1560
    DWORD sampler_code = dst->reg.idx;
1561

1562
    shader_arb_get_dst_param(ins, dst, dst_name);
1563

1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574
    /* Sampling the perturbation map in Tsrc was done already, including the signedness correction if needed
     *
     * Keep in mind that src_name[1] can be "TB" and src_name[0] can be "TA" because modifiers like _x2 are valid
     * with bem. So delay loading the first parameter until after the perturbation calculation which needs two
     * temps is done.
     */
    shader_arb_get_src_param(ins, &ins->src[1], 1, src_name[1]);
    shader_addline(buffer, "SWZ TA, bumpenvmat%d, x, z, 0, 0;\n", sampler_code);
    shader_addline(buffer, "DP3 TC.r, TA, %s;\n", src_name[1]);
    shader_addline(buffer, "SWZ TA, bumpenvmat%d, y, w, 0, 0;\n", sampler_code);
    shader_addline(buffer, "DP3 TC.g, TA, %s;\n", src_name[1]);
1575

1576 1577
    shader_arb_get_src_param(ins, &ins->src[0], 0, src_name[0]);
    shader_addline(buffer, "ADD %s, %s, TC;\n", dst_name, src_name[0]);
1578 1579
}

1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602
/* Return the source modifier that yields the negated value of what 'mod' yields.
 *
 * Modifiers that have a negated counterpart are swapped with it. COMP, DZ and DW have
 * none: they are returned unchanged and *extra_char is set to '-' so the caller can
 * negate the operand itself. In every other case *extra_char is a space.
 * Unknown modifiers are reported with a FIXME and returned unchanged. */
static DWORD negate_modifiers(DWORD mod, char *extra_char)
{
    DWORD negated;

    *extra_char = ' ';

    switch (mod)
    {
        case WINED3DSPSM_NONE:
            negated = WINED3DSPSM_NEG;
            break;
        case WINED3DSPSM_NEG:
            negated = WINED3DSPSM_NONE;
            break;
        case WINED3DSPSM_BIAS:
            negated = WINED3DSPSM_BIASNEG;
            break;
        case WINED3DSPSM_BIASNEG:
            negated = WINED3DSPSM_BIAS;
            break;
        case WINED3DSPSM_SIGN:
            negated = WINED3DSPSM_SIGNNEG;
            break;
        case WINED3DSPSM_SIGNNEG:
            negated = WINED3DSPSM_SIGN;
            break;
        case WINED3DSPSM_X2:
            negated = WINED3DSPSM_X2NEG;
            break;
        case WINED3DSPSM_X2NEG:
            negated = WINED3DSPSM_X2;
            break;
        case WINED3DSPSM_ABS:
            negated = WINED3DSPSM_ABSNEG;
            break;
        case WINED3DSPSM_ABSNEG:
            negated = WINED3DSPSM_ABS;
            break;

        /* No negated variant exists for these; the caller has to prepend the sign. */
        case WINED3DSPSM_COMP:
        case WINED3DSPSM_DZ:
        case WINED3DSPSM_DW:
            *extra_char = '-';
            negated = mod;
            break;

        default:
            FIXME("Unknown modifier %u\n", mod);
            negated = mod;
            break;
    }

    return negated;
}

1603
static void pshader_hw_cnd(const struct wined3d_shader_instruction *ins)
1604
{
1605
    const struct wined3d_shader_dst_param *dst = &ins->dst[0];
1606
    struct wined3d_shader_buffer *buffer = ins->ctx->buffer;
1607 1608
    char dst_name[50];
    char src_name[3][50];
1609 1610
    DWORD shader_version = WINED3D_SHADER_VERSION(ins->ctx->reg_maps->shader_version.major,
            ins->ctx->reg_maps->shader_version.minor);
1611

1612
    shader_arb_get_dst_param(ins, dst, dst_name);
1613
    shader_arb_get_src_param(ins, &ins->src[1], 1, src_name[1]);
1614

1615
    /* The coissue flag changes the semantic of the cnd instruction in <= 1.3 shaders */
1616
    if (shader_version <= WINED3D_SHADER_VERSION(1, 3) && ins->coissue)
1617
    {
1618
        shader_addline(buffer, "MOV%s %s, %s;\n", shader_arb_get_modifier(ins), dst_name, src_name[1]);
1619 1620 1621
    }
    else
    {
1622 1623 1624 1625 1626 1627 1628
        struct wined3d_shader_src_param src0_copy = ins->src[0];
        char extra_neg;

        /* src0 may have a negate srcmod set, so we can't blindly add "-" to the name */
        src0_copy.modifiers = negate_modifiers(src0_copy.modifiers, &extra_neg);

        shader_arb_get_src_param(ins, &src0_copy, 0, src_name[0]);
1629
        shader_arb_get_src_param(ins, &ins->src[2], 2, src_name[2]);
1630
        shader_addline(buffer, "ADD TA, %c%s, coefdiv.x;\n", extra_neg, src_name[0]);
1631 1632
        shader_addline(buffer, "CMP%s %s, TA, %s, %s;\n", shader_arb_get_modifier(ins),
                dst_name, src_name[1], src_name[2]);
1633
    }
1634 1635
}

1636
static void pshader_hw_cmp(const struct wined3d_shader_instruction *ins)
1637
{
1638
    const struct wined3d_shader_dst_param *dst = &ins->dst[0];
1639
    struct wined3d_shader_buffer *buffer = ins->ctx->buffer;
1640 1641 1642
    char dst_name[50];
    char src_name[3][50];

1643
    shader_arb_get_dst_param(ins, dst, dst_name);
1644 1645

    /* Generate input register names (with modifiers) */
1646 1647 1648
    shader_arb_get_src_param(ins, &ins->src[0], 0, src_name[0]);
    shader_arb_get_src_param(ins, &ins->src[1], 1, src_name[1]);
    shader_arb_get_src_param(ins, &ins->src[2], 2, src_name[2]);
1649

1650 1651
    shader_addline(buffer, "CMP%s %s, %s, %s, %s;\n", shader_arb_get_modifier(ins),
            dst_name, src_name[0], src_name[2], src_name[1]);
1652 1653
}

1654 1655
/** Process the WINED3DSIO_DP2ADD instruction in ARB.
 * dst = dot2(src0, src1) + src2 */
1656
static void pshader_hw_dp2add(const struct wined3d_shader_instruction *ins)
1657
{
1658
    const struct wined3d_shader_dst_param *dst = &ins->dst[0];
1659
    struct wined3d_shader_buffer *buffer = ins->ctx->buffer;
1660 1661
    char dst_name[50];
    char src_name[3][50];
1662
    struct shader_arb_ctx_priv *ctx = ins->ctx->backend_data;
1663

1664
    shader_arb_get_dst_param(ins, dst, dst_name);
1665 1666
    shader_arb_get_src_param(ins, &ins->src[0], 0, src_name[0]);
    shader_arb_get_src_param(ins, &ins->src[2], 2, src_name[2]);
1667

1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706
    if(ctx->target_version >= NV3)
    {
        /* GL_NV_fragment_program2 has a 1:1 matching instruction */
        shader_arb_get_src_param(ins, &ins->src[1], 1, src_name[1]);
        shader_addline(buffer, "DP2A%s %s, %s, %s, %s;\n", shader_arb_get_modifier(ins),
                       dst_name, src_name[0], src_name[1], src_name[2]);
    }
    else if(ctx->target_version >= NV2)
    {
        /* dst.x = src2.?, src0.x, src1.x + src0.y * src1.y
         * dst.y = src2.?, src0.x, src1.z + src0.y * src1.w
         * dst.z = src2.?, src0.x, src1.x + src0.y * src1.y
         * dst.z = src2.?, src0.x, src1.z + src0.y * src1.w
         *
         * Make sure that src1.zw = src1.xy, then we get a classic dp2add
         *
         * .xyxy and other swizzles that we could get with this are not valid in
         * plain ARBfp, but luckily the NV extension grammar lifts this limitation.
         */
        struct wined3d_shader_src_param tmp_param = ins->src[1];
        DWORD swizzle = tmp_param.swizzle & 0xf; /* Selects .xy */
        tmp_param.swizzle = swizzle | (swizzle << 4); /* Creates .xyxy */

        shader_arb_get_src_param(ins, &tmp_param, 1, src_name[1]);

        shader_addline(buffer, "X2D%s %s, %s, %s, %s;\n", shader_arb_get_modifier(ins),
                       dst_name, src_name[2], src_name[0], src_name[1]);
    }
    else
    {
        shader_arb_get_src_param(ins, &ins->src[1], 1, src_name[1]);
        /* Emulate a DP2 with a DP3 and 0.0. Don't use the dest as temp register, it could be src[1] or src[2]
        * src_name[0] can be TA, but TA is a private temp for modifiers, so it is save to overwrite
        */
        shader_addline(buffer, "MOV TA, %s;\n", src_name[0]);
        shader_addline(buffer, "MOV TA.z, 0.0;\n");
        shader_addline(buffer, "DP3 TA, TA, %s;\n", src_name[1]);
        shader_addline(buffer, "ADD%s %s, TA, %s;\n", shader_arb_get_modifier(ins), dst_name, src_name[2]);
    }
1707 1708
}

1709
/* Map the opcode 1-to-1 to the GL code */
1710
static void shader_hw_map2gl(const struct wined3d_shader_instruction *ins)
1711
{
1712
    struct wined3d_shader_buffer *buffer = ins->ctx->buffer;
1713
    const char *instruction;
1714
    char arguments[256], dst_str[50];
1715
    unsigned int i;
1716
    const struct wined3d_shader_dst_param *dst = &ins->dst[0];
1717

1718
    switch (ins->handler_idx)
1719
    {
1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736
        case WINED3DSIH_ABS: instruction = "ABS"; break;
        case WINED3DSIH_ADD: instruction = "ADD"; break;
        case WINED3DSIH_CRS: instruction = "XPD"; break;
        case WINED3DSIH_DP3: instruction = "DP3"; break;
        case WINED3DSIH_DP4: instruction = "DP4"; break;
        case WINED3DSIH_DST: instruction = "DST"; break;
        case WINED3DSIH_FRC: instruction = "FRC"; break;
        case WINED3DSIH_LIT: instruction = "LIT"; break;
        case WINED3DSIH_LRP: instruction = "LRP"; break;
        case WINED3DSIH_MAD: instruction = "MAD"; break;
        case WINED3DSIH_MAX: instruction = "MAX"; break;
        case WINED3DSIH_MIN: instruction = "MIN"; break;
        case WINED3DSIH_MOV: instruction = "MOV"; break;
        case WINED3DSIH_MUL: instruction = "MUL"; break;
        case WINED3DSIH_SGE: instruction = "SGE"; break;
        case WINED3DSIH_SLT: instruction = "SLT"; break;
        case WINED3DSIH_SUB: instruction = "SUB"; break;
1737
        case WINED3DSIH_MOVA:instruction = "ARR"; break;
1738
        case WINED3DSIH_DSX: instruction = "DDX"; break;
1739
        default: instruction = "";
1740
            FIXME("Unhandled opcode %#x\n", ins->handler_idx);
1741 1742 1743
            break;
    }

1744 1745 1746 1747 1748 1749 1750 1751 1752
    /* Note that shader_arb_add_dst_param() adds spaces. */
    arguments[0] = '\0';
    shader_arb_get_dst_param(ins, dst, dst_str);
    for (i = 0; i < ins->src_count; ++i)
    {
        char operand[100];
        strcat(arguments, ", ");
        shader_arb_get_src_param(ins, &ins->src[i], i, operand);
        strcat(arguments, operand);
1753
    }
1754
    shader_addline(buffer, "%s%s %s%s;\n", instruction, shader_arb_get_modifier(ins), dst_str, arguments);
1755 1756 1757 1758
}

static void shader_hw_nop(const struct wined3d_shader_instruction *ins)
{
1759
    struct wined3d_shader_buffer *buffer = ins->ctx->buffer;
1760
    shader_addline(buffer, "NOP;\n");
1761 1762
}

1763
static void shader_hw_mov(const struct wined3d_shader_instruction *ins)
1764
{
1765
    const struct wined3d_shader *shader = ins->ctx->shader;
1766
    const struct wined3d_shader_reg_maps *reg_maps = ins->ctx->reg_maps;
1767
    BOOL pshader = shader_is_pshader_version(reg_maps->shader_version.type);
1768
    struct shader_arb_ctx_priv *ctx = ins->ctx->backend_data;
1769 1770 1771
    const char *zero = arb_get_helper_value(reg_maps->shader_version.type, ARB_ZERO);
    const char *one = arb_get_helper_value(reg_maps->shader_version.type, ARB_ONE);
    const char *two = arb_get_helper_value(reg_maps->shader_version.type, ARB_TWO);
1772

1773
    struct wined3d_shader_buffer *buffer = ins->ctx->buffer;
1774
    char src0_param[256];
1775

1776 1777
    if (ins->handler_idx == WINED3DSIH_MOVA)
    {
1778
        const struct arb_vshader_private *shader_data = shader->backend_data;
1779
        char write_mask[6];
1780
        const char *offset = arb_get_helper_value(WINED3D_SHADER_TYPE_VERTEX, ARB_VS_REL_OFFSET);
1781

1782 1783 1784 1785
        if(ctx->target_version >= NV2) {
            shader_hw_map2gl(ins);
            return;
        }
1786
        shader_arb_get_src_param(ins, &ins->src[0], 0, src0_param);
1787
        shader_arb_get_write_mask(ins, &ins->dst[0], write_mask);
1788

1789 1790 1791 1792 1793 1794 1795
        /* This implements the mova formula used in GLSL. The first two instructions
         * prepare the sign() part. Note that it is fine to have my_sign(0.0) = 1.0
         * in this case:
         * mova A0.x, 0.0
         *
         * A0.x = arl(floor(abs(0.0) + 0.5) * 1.0) = floor(0.5) = 0.0 since arl does a floor
         *
1796 1797
         * The ARL is performed when A0 is used - the requested component is read from A0_SHADOW into
         * A0.x. We can use the overwritten component of A0_shadow as temporary storage for the sign.
1798
         */
1799 1800
        shader_addline(buffer, "SGE A0_SHADOW%s, %s, %s;\n", write_mask, src0_param, zero);
        shader_addline(buffer, "MAD A0_SHADOW%s, A0_SHADOW, %s, -%s;\n", write_mask, two, one);
1801 1802

        shader_addline(buffer, "ABS TA%s, %s;\n", write_mask, src0_param);
1803
        shader_addline(buffer, "ADD TA%s, TA, rel_addr_const.x;\n", write_mask);
1804
        shader_addline(buffer, "FLR TA%s, TA;\n", write_mask);
1805
        if (shader_data->rel_offset)
1806
        {
1807
            shader_addline(buffer, "ADD TA%s, TA, %s;\n", write_mask, offset);
1808
        }
1809 1810 1811
        shader_addline(buffer, "MUL A0_SHADOW%s, TA, A0_SHADOW;\n", write_mask);

        ((struct shader_arb_ctx_priv *)ins->ctx->backend_data)->addr_reg[0] = '\0';
1812 1813 1814
    }
    else if (reg_maps->shader_version.major == 1
          && !shader_is_pshader_version(reg_maps->shader_version.type)
1815 1816
          && ins->dst[0].reg.type == WINED3DSPR_ADDR)
    {
1817
        const struct arb_vshader_private *shader_data = shader->backend_data;
1818
        src0_param[0] = '\0';
1819 1820

        if (shader_data->rel_offset)
1821
        {
1822
            const char *offset = arb_get_helper_value(WINED3D_SHADER_TYPE_VERTEX, ARB_VS_REL_OFFSET);
1823
            shader_arb_get_src_param(ins, &ins->src[0], 0, src0_param);
1824
            shader_addline(buffer, "ADD TA.x, %s, %s;\n", src0_param, offset);
1825
            shader_addline(buffer, "ARL A0.x, TA.x;\n");
1826 1827 1828 1829 1830
        }
        else
        {
            /* Apple's ARB_vertex_program implementation does not accept an ARL source argument
             * with more than one component. Thus replicate the first source argument over all
1831
             * 4 components. For example, .xyzw -> .x (or better: .xxxx), .zwxy -> .z, etc) */
1832
            struct wined3d_shader_src_param tmp_src = ins->src[0];
1833
            tmp_src.swizzle = (tmp_src.swizzle & 0x3) * 0x55;
1834
            shader_arb_get_src_param(ins, &tmp_src, 0, src0_param);
1835 1836 1837
            shader_addline(buffer, "ARL A0.x, %s;\n", src0_param);
        }
    }
1838
    else if (ins->dst[0].reg.type == WINED3DSPR_COLOROUT && !ins->dst[0].reg.idx && pshader)
1839
    {
1840
        if (ctx->cur_ps_args->super.srgb_correction && shader->u.ps.color0_mov)
1841 1842 1843 1844 1845 1846
        {
            shader_addline(buffer, "#mov handled in srgb write code\n");
            return;
        }
        shader_hw_map2gl(ins);
    }
1847 1848
    else
    {
1849
        shader_hw_map2gl(ins);
1850 1851 1852
    }
}

1853
static void pshader_hw_texkill(const struct wined3d_shader_instruction *ins)
1854
{
1855
    const struct wined3d_shader_dst_param *dst = &ins->dst[0];
1856
    struct wined3d_shader_buffer *buffer = ins->ctx->buffer;
1857 1858 1859
    char reg_dest[40];

    /* No swizzles are allowed in d3d's texkill. PS 1.x ignores the 4th component as documented,
1860
     * but >= 2.0 honors it (undocumented, but tested by the d3d9 testsuite)
1861
     */
1862
    shader_arb_get_dst_param(ins, dst, reg_dest);
1863

1864
    if (ins->ctx->reg_maps->shader_version.major >= 2)
1865
    {
1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887
        const char *kilsrc = "TA";
        BOOL is_color;

        shader_arb_get_register_name(ins, &dst->reg, reg_dest, &is_color);
        if(dst->write_mask == WINED3DSP_WRITEMASK_ALL)
        {
            kilsrc = reg_dest;
        }
        else
        {
            /* Sigh. KIL doesn't support swizzles/writemasks. KIL passes a writemask, but ".xy" for example
             * is not valid as a swizzle in ARB (needs ".xyyy"). Use SWZ to load the register properly, and set
             * masked out components to 0(won't kill)
             */
            char x = '0', y = '0', z = '0', w = '0';
            if(dst->write_mask & WINED3DSP_WRITEMASK_0) x = 'x';
            if(dst->write_mask & WINED3DSP_WRITEMASK_1) y = 'y';
            if(dst->write_mask & WINED3DSP_WRITEMASK_2) z = 'z';
            if(dst->write_mask & WINED3DSP_WRITEMASK_3) w = 'w';
            shader_addline(buffer, "SWZ TA, %s, %c, %c, %c, %c;\n", reg_dest, x, y, z, w);
        }
        shader_addline(buffer, "KIL %s;\n", kilsrc);
1888 1889 1890
    } else {
        /* ARB fp doesn't like swizzles on the parameter of the KIL instruction. To mask the 4th component,
         * copy the register into our general purpose TMP variable, overwrite .w and pass TMP to KIL
1891 1892 1893
         *
         * ps_1_3 shaders use the texcoord incarnation of the Tx register. ps_1_4 shaders can use the same,
         * or pass in any temporary register(in shader phase 2)
1894
         */
1895 1896 1897 1898 1899
        if(ins->ctx->reg_maps->shader_version.minor <= 3) {
            sprintf(reg_dest, "fragment.texcoord[%u]", dst->reg.idx);
        } else {
            shader_arb_get_dst_param(ins, dst, reg_dest);
        }
1900 1901
        shader_addline(buffer, "SWZ TA, %s, x, y, z, 1;\n", reg_dest);
        shader_addline(buffer, "KIL TA;\n");
1902 1903 1904
    }
}

1905
static void pshader_hw_tex(const struct wined3d_shader_instruction *ins)
1906
{
1907
    struct shader_arb_ctx_priv *priv = ins->ctx->backend_data;
1908
    const struct wined3d_shader_dst_param *dst = &ins->dst[0];
1909 1910
    DWORD shader_version = WINED3D_SHADER_VERSION(ins->ctx->reg_maps->shader_version.major,
            ins->ctx->reg_maps->shader_version.minor);
1911
    struct wined3d_shader_src_param src;
1912 1913 1914 1915

    char reg_dest[40];
    char reg_coord[40];
    DWORD reg_sampler_code;
1916
    WORD myflags = 0;
1917
    BOOL swizzle_coord = FALSE;
1918 1919

    /* All versions have a destination register */
1920
    shader_arb_get_dst_param(ins, dst, reg_dest);
1921

1922 1923 1924 1925 1926 1927 1928 1929
    /* 1.0-1.4: Use destination register number as texture code.
       2.0+: Use provided sampler number as texure code. */
    if (shader_version < WINED3D_SHADER_VERSION(2,0))
        reg_sampler_code = dst->reg.idx;
    else
        reg_sampler_code = ins->src[1].reg.idx;

    /* 1.0-1.3: Use the texcoord varying.
1930
       1.4+: Use provided coordinate source register. */
1931
    if (shader_version < WINED3D_SHADER_VERSION(1,4))
1932
        sprintf(reg_coord, "fragment.texcoord[%u]", reg_sampler_code);
1933 1934 1935 1936 1937 1938 1939
    else {
        /* TEX is the only instruction that can handle DW and DZ natively */
        src = ins->src[0];
        if(src.modifiers == WINED3DSPSM_DW) src.modifiers = WINED3DSPSM_NONE;
        if(src.modifiers == WINED3DSPSM_DZ) src.modifiers = WINED3DSPSM_NONE;
        shader_arb_get_src_param(ins, &src, 0, reg_coord);
    }
1940 1941

    /* projection flag:
1942
     * 1.1, 1.2, 1.3: Use WINED3D_TSS_TEXTURETRANSFORMFLAGS
1943 1944 1945 1946 1947 1948
     * 1.4: Use WINED3DSPSM_DZ or WINED3DSPSM_DW on src[0]
     * 2.0+: Use WINED3DSI_TEXLD_PROJECT on the opcode
     */
    if (shader_version < WINED3D_SHADER_VERSION(1,4))
    {
        DWORD flags = 0;
1949
        if (reg_sampler_code < MAX_TEXTURES)
1950 1951
            flags = priv->cur_ps_args->super.tex_transform >> reg_sampler_code * WINED3D_PSARGS_TEXTRANSFORM_SHIFT;
        if (flags & WINED3D_PSARGS_PROJECTED)
1952
        {
1953
            myflags |= TEX_PROJ;
1954 1955 1956
            if ((flags & ~WINED3D_PSARGS_PROJECTED) == WINED3D_TTFF_COUNT3)
                swizzle_coord = TRUE;
        }
1957 1958 1959
    }
    else if (shader_version < WINED3D_SHADER_VERSION(2,0))
    {
1960 1961 1962
        enum wined3d_shader_src_modifier src_mod = ins->src[0].modifiers;
        if (src_mod == WINED3DSPSM_DZ)
        {
1963
            swizzle_coord = TRUE;
1964
            myflags |= TEX_PROJ;
1965
        } else if(src_mod == WINED3DSPSM_DW) {
1966
            myflags |= TEX_PROJ;
1967 1968
        }
    } else {
1969 1970
        if (ins->flags & WINED3DSI_TEXLD_PROJECT) myflags |= TEX_PROJ;
        if (ins->flags & WINED3DSI_TEXLD_BIAS) myflags |= TEX_BIAS;
1971
    }
1972 1973 1974 1975 1976 1977 1978 1979 1980

    if (swizzle_coord)
    {
        /* TXP cannot handle DZ natively, so move the z coordinate to .w.
         * reg_coord is a read-only varying register, so we need a temp reg */
        shader_addline(ins->ctx->buffer, "SWZ TA, %s, x, y, z, z;\n", reg_coord);
        strcpy(reg_coord, "TA");
    }

1981
    shader_hw_sample(ins, reg_sampler_code, reg_dest, reg_coord, myflags, NULL, NULL);
1982 1983
}

1984
static void pshader_hw_texcoord(const struct wined3d_shader_instruction *ins)
1985
{
1986
    const struct wined3d_shader_dst_param *dst = &ins->dst[0];
1987
    struct wined3d_shader_buffer *buffer = ins->ctx->buffer;
1988 1989
    DWORD shader_version = WINED3D_SHADER_VERSION(ins->ctx->reg_maps->shader_version.major,
            ins->ctx->reg_maps->shader_version.minor);
1990
    char dst_str[50];
1991

1992
    if (shader_version < WINED3D_SHADER_VERSION(1,4))
1993
    {
1994
        DWORD reg = dst->reg.idx;
1995 1996 1997

        shader_arb_get_dst_param(ins, &ins->dst[0], dst_str);
        shader_addline(buffer, "MOV_SAT %s, fragment.texcoord[%u];\n", dst_str, reg);
1998
    } else {
1999 2000
        char reg_src[40];

2001
        shader_arb_get_src_param(ins, &ins->src[0], 0, reg_src);
2002 2003
        shader_arb_get_dst_param(ins, &ins->dst[0], dst_str);
        shader_addline(buffer, "MOV %s, %s;\n", dst_str, reg_src);
2004 2005 2006
   }
}

2007
static void pshader_hw_texreg2ar(const struct wined3d_shader_instruction *ins)
2008
{
2009
     struct wined3d_shader_buffer *buffer = ins->ctx->buffer;
2010
     DWORD flags = 0;
2011

2012
     DWORD reg1 = ins->dst[0].reg.idx;
2013
     char dst_str[50];
2014
     char src_str[50];
2015

2016
     /* Note that texreg2ar treats Tx as a temporary register, not as a varying */
2017
     shader_arb_get_dst_param(ins, &ins->dst[0], dst_str);
2018
     shader_arb_get_src_param(ins, &ins->src[0], 0, src_str);
2019 2020 2021
     /* Move .x first in case src_str is "TA" */
     shader_addline(buffer, "MOV TA.y, %s.x;\n", src_str);
     shader_addline(buffer, "MOV TA.x, %s.w;\n", src_str);
2022 2023 2024 2025 2026 2027
     if (reg1 < MAX_TEXTURES)
     {
         struct shader_arb_ctx_priv *priv = ins->ctx->backend_data;
         flags = priv->cur_ps_args->super.tex_transform >> reg1 * WINED3D_PSARGS_TEXTRANSFORM_SHIFT;
     }
     shader_hw_sample(ins, reg1, dst_str, "TA", flags & WINED3D_PSARGS_PROJECTED ? TEX_PROJ : 0, NULL, NULL);
2028 2029
}

2030
static void pshader_hw_texreg2gb(const struct wined3d_shader_instruction *ins)
2031
{
2032
     struct wined3d_shader_buffer *buffer = ins->ctx->buffer;
2033

2034
     DWORD reg1 = ins->dst[0].reg.idx;
2035
     char dst_str[50];
2036
     char src_str[50];
2037

2038
     /* Note that texreg2gb treats Tx as a temporary register, not as a varying */
2039
     shader_arb_get_dst_param(ins, &ins->dst[0], dst_str);
2040
     shader_arb_get_src_param(ins, &ins->src[0], 0, src_str);
2041 2042
     shader_addline(buffer, "MOV TA.x, %s.y;\n", src_str);
     shader_addline(buffer, "MOV TA.y, %s.z;\n", src_str);
2043
     shader_hw_sample(ins, reg1, dst_str, "TA", 0, NULL, NULL);
2044 2045
}

2046
static void pshader_hw_texreg2rgb(const struct wined3d_shader_instruction *ins)
2047
{
2048
    DWORD reg1 = ins->dst[0].reg.idx;
2049
    char dst_str[50];
2050 2051
    char src_str[50];

2052
    /* Note that texreg2rg treats Tx as a temporary register, not as a varying */
2053
    shader_arb_get_dst_param(ins, &ins->dst[0], dst_str);
2054
    shader_arb_get_src_param(ins, &ins->src[0], 0, src_str);
2055
    shader_hw_sample(ins, reg1, dst_str, src_str, 0, NULL, NULL);
2056 2057
}

2058
static void pshader_hw_texbem(const struct wined3d_shader_instruction *ins)
2059
{
2060
    struct shader_arb_ctx_priv *priv = ins->ctx->backend_data;
2061
    const struct wined3d_shader_dst_param *dst = &ins->dst[0];
2062
    struct wined3d_shader_buffer *buffer = ins->ctx->buffer;
2063
    char reg_coord[40], dst_reg[50], src_reg[50];
2064 2065
    DWORD reg_dest_code;

2066 2067 2068
    /* All versions have a destination register. The Tx where the texture coordinates come
     * from is the varying incarnation of the texture register
     */
2069
    reg_dest_code = dst->reg.idx;
2070
    shader_arb_get_dst_param(ins, &ins->dst[0], dst_reg);
2071
    shader_arb_get_src_param(ins, &ins->src[0], 0, src_reg);
2072
    sprintf(reg_coord, "fragment.texcoord[%u]", reg_dest_code);
2073

2074 2075
    /* Sampling the perturbation map in Tsrc was done already, including the signedness correction if needed
     * The Tx in which the perturbation map is stored is the tempreg incarnation of the texture register
2076 2077 2078 2079 2080 2081 2082
     *
     * GL_NV_fragment_program_option could handle this in one instruction via X2D:
     * X2D TA.xy, fragment.texcoord, T%u, bumpenvmat%u.xzyw
     *
     * However, the NV extensions are never enabled for <= 2.0 shaders because of the performance penalty that
     * comes with it, and texbem is an 1.x only instruction. No 1.x instruction forces us to enable the NV
     * extension.
2083
     */
2084
    shader_addline(buffer, "SWZ TB, bumpenvmat%d, x, z, 0, 0;\n", reg_dest_code);
2085
    shader_addline(buffer, "DP3 TA.x, TB, %s;\n", src_reg);
2086
    shader_addline(buffer, "SWZ TB, bumpenvmat%d, y, w, 0, 0;\n", reg_dest_code);
2087
    shader_addline(buffer, "DP3 TA.y, TB, %s;\n", src_reg);
2088

2089 2090 2091
    /* with projective textures, texbem only divides the static texture coord, not the displacement,
     * so we can't let the GL handle this.
     */
2092 2093
    if ((priv->cur_ps_args->super.tex_transform >> reg_dest_code * WINED3D_PSARGS_TEXTRANSFORM_SHIFT)
            & WINED3D_PSARGS_PROJECTED)
2094
    {
2095 2096 2097
        shader_addline(buffer, "RCP TB.w, %s.w;\n", reg_coord);
        shader_addline(buffer, "MUL TB.xy, %s, TB.w;\n", reg_coord);
        shader_addline(buffer, "ADD TA.xy, TA, TB;\n");
2098
    } else {
2099
        shader_addline(buffer, "ADD TA.xy, TA, %s;\n", reg_coord);
2100
    }
2101

2102
    shader_hw_sample(ins, reg_dest_code, dst_reg, "TA", 0, NULL, NULL);
2103

2104 2105
    if (ins->handler_idx == WINED3DSIH_TEXBEML)
    {
2106 2107 2108
        /* No src swizzles are allowed, so this is ok */
        shader_addline(buffer, "MAD TA, %s.z, luminance%d.x, luminance%d.y;\n",
                       src_reg, reg_dest_code, reg_dest_code);
2109
        shader_addline(buffer, "MUL %s, %s, TA;\n", dst_reg, dst_reg);
2110
    }
2111 2112
}

2113
static void pshader_hw_texm3x2pad(const struct wined3d_shader_instruction *ins)
2114
{
2115
    DWORD reg = ins->dst[0].reg.idx;
2116
    struct wined3d_shader_buffer *buffer = ins->ctx->buffer;
2117 2118 2119
    char src0_name[50], dst_name[50];
    BOOL is_color;
    struct wined3d_shader_register tmp_reg = ins->dst[0].reg;
2120

2121
    shader_arb_get_src_param(ins, &ins->src[0], 0, src0_name);
2122 2123 2124
    /* The next instruction will be a texm3x2tex or texm3x2depth that writes to the uninitialized
     * T<reg+1> register. Use this register to store the calculated vector
     */
2125 2126 2127
    tmp_reg.idx = reg + 1;
    shader_arb_get_register_name(ins, &tmp_reg, dst_name, &is_color);
    shader_addline(buffer, "DP3 %s.x, fragment.texcoord[%u], %s;\n", dst_name, reg, src0_name);
2128 2129
}

2130
static void pshader_hw_texm3x2tex(const struct wined3d_shader_instruction *ins)
2131
{
2132
    struct shader_arb_ctx_priv *priv = ins->ctx->backend_data;
2133
    DWORD flags;
2134
    DWORD reg = ins->dst[0].reg.idx;
2135
    struct wined3d_shader_buffer *buffer = ins->ctx->buffer;
2136
    char dst_str[50];
2137
    char src0_name[50];
2138
    char dst_reg[50];
2139
    BOOL is_color;
2140 2141

    /* We know that we're writing to the uninitialized T<reg> register, so use it for temporary storage */
2142
    shader_arb_get_register_name(ins, &ins->dst[0].reg, dst_reg, &is_color);
2143

2144
    shader_arb_get_dst_param(ins, &ins->dst[0], dst_str);
2145
    shader_arb_get_src_param(ins, &ins->src[0], 0, src0_name);
2146
    shader_addline(buffer, "DP3 %s.y, fragment.texcoord[%u], %s;\n", dst_reg, reg, src0_name);
2147
    flags = reg < MAX_TEXTURES ? priv->cur_ps_args->super.tex_transform >> reg * WINED3D_PSARGS_TEXTRANSFORM_SHIFT : 0;
2148
    shader_hw_sample(ins, reg, dst_str, dst_reg, flags & WINED3D_PSARGS_PROJECTED ? TEX_PROJ : 0, NULL, NULL);
2149 2150
}

2151
static void pshader_hw_texm3x3pad(const struct wined3d_shader_instruction *ins)
2152
{
2153
    struct wined3d_shader_tex_mx *tex_mx = ins->ctx->tex_mx;
2154
    DWORD reg = ins->dst[0].reg.idx;
2155
    struct wined3d_shader_buffer *buffer = ins->ctx->buffer;
2156 2157 2158
    char src0_name[50], dst_name[50];
    struct wined3d_shader_register tmp_reg = ins->dst[0].reg;
    BOOL is_color;
2159 2160 2161 2162 2163

    /* There are always 2 texm3x3pad instructions followed by one texm3x3[tex,vspec, ...] instruction, with
     * incrementing ins->dst[0].register_idx numbers. So the pad instruction already knows the final destination
     * register, and this register is uninitialized(otherwise the assembler complains that it is 'redeclared')
     */
2164
    tmp_reg.idx = reg + 2 - tex_mx->current_row;
2165
    shader_arb_get_register_name(ins, &tmp_reg, dst_name, &is_color);
2166

2167
    shader_arb_get_src_param(ins, &ins->src[0], 0, src0_name);
2168
    shader_addline(buffer, "DP3 %s.%c, fragment.texcoord[%u], %s;\n",
2169 2170
                   dst_name, 'x' + tex_mx->current_row, reg, src0_name);
    tex_mx->texcoord_w[tex_mx->current_row++] = reg;
2171 2172
}

2173
static void pshader_hw_texm3x3tex(const struct wined3d_shader_instruction *ins)
2174
{
2175
    struct shader_arb_ctx_priv *priv = ins->ctx->backend_data;
2176
    struct wined3d_shader_tex_mx *tex_mx = ins->ctx->tex_mx;
2177
    DWORD flags;
2178
    DWORD reg = ins->dst[0].reg.idx;
2179
    struct wined3d_shader_buffer *buffer = ins->ctx->buffer;
2180
    char dst_str[50];
2181 2182
    char src0_name[50], dst_name[50];
    BOOL is_color;
2183

2184
    shader_arb_get_register_name(ins, &ins->dst[0].reg, dst_name, &is_color);
2185
    shader_arb_get_src_param(ins, &ins->src[0], 0, src0_name);
2186
    shader_addline(buffer, "DP3 %s.z, fragment.texcoord[%u], %s;\n", dst_name, reg, src0_name);
2187

2188
    /* Sample the texture using the calculated coordinates */
2189
    shader_arb_get_dst_param(ins, &ins->dst[0], dst_str);
2190
    flags = reg < MAX_TEXTURES ? priv->cur_ps_args->super.tex_transform >> reg * WINED3D_PSARGS_TEXTRANSFORM_SHIFT : 0;
2191
    shader_hw_sample(ins, reg, dst_str, dst_name, flags & WINED3D_PSARGS_PROJECTED ? TEX_PROJ : 0, NULL, NULL);
2192
    tex_mx->current_row = 0;
2193 2194
}

2195
static void pshader_hw_texm3x3vspec(const struct wined3d_shader_instruction *ins)
2196
{
2197
    struct shader_arb_ctx_priv *priv = ins->ctx->backend_data;
2198
    struct wined3d_shader_tex_mx *tex_mx = ins->ctx->tex_mx;
2199
    DWORD flags;
2200
    DWORD reg = ins->dst[0].reg.idx;
2201
    struct wined3d_shader_buffer *buffer = ins->ctx->buffer;
2202
    char dst_str[50];
2203
    char src0_name[50];
2204
    char dst_reg[50];
2205
    BOOL is_color;
2206

2207 2208 2209
    /* Get the dst reg without writemask strings. We know this register is uninitialized, so we can use all
     * components for temporary data storage
     */
2210
    shader_arb_get_register_name(ins, &ins->dst[0].reg, dst_reg, &is_color);
2211
    shader_arb_get_src_param(ins, &ins->src[0], 0, src0_name);
2212
    shader_addline(buffer, "DP3 %s.z, fragment.texcoord[%u], %s;\n", dst_reg, reg, src0_name);
2213 2214

    /* Construct the eye-ray vector from w coordinates */
2215 2216
    shader_addline(buffer, "MOV TB.x, fragment.texcoord[%u].w;\n", tex_mx->texcoord_w[0]);
    shader_addline(buffer, "MOV TB.y, fragment.texcoord[%u].w;\n", tex_mx->texcoord_w[1]);
2217
    shader_addline(buffer, "MOV TB.z, fragment.texcoord[%u].w;\n", reg);
2218

2219 2220
    /* Calculate reflection vector
     */
2221
    shader_addline(buffer, "DP3 %s.w, %s, TB;\n", dst_reg, dst_reg);
2222
    /* The .w is ignored when sampling, so I can use TB.w to calculate dot(N, N) */
2223
    shader_addline(buffer, "DP3 TB.w, %s, %s;\n", dst_reg, dst_reg);
2224
    shader_addline(buffer, "RCP TB.w, TB.w;\n");
2225 2226 2227
    shader_addline(buffer, "MUL %s.w, %s.w, TB.w;\n", dst_reg, dst_reg);
    shader_addline(buffer, "MUL %s, %s.w, %s;\n", dst_reg, dst_reg, dst_reg);
    shader_addline(buffer, "MAD %s, coefmul.x, %s, -TB;\n", dst_reg, dst_reg);
2228

2229
    /* Sample the texture using the calculated coordinates */
2230
    shader_arb_get_dst_param(ins, &ins->dst[0], dst_str);
2231
    flags = reg < MAX_TEXTURES ? priv->cur_ps_args->super.tex_transform >> reg * WINED3D_PSARGS_TEXTRANSFORM_SHIFT : 0;
2232
    shader_hw_sample(ins, reg, dst_str, dst_reg, flags & WINED3D_PSARGS_PROJECTED ? TEX_PROJ : 0, NULL, NULL);
2233
    tex_mx->current_row = 0;
2234 2235
}

2236
static void pshader_hw_texm3x3spec(const struct wined3d_shader_instruction *ins)
2237
{
2238
    struct shader_arb_ctx_priv *priv = ins->ctx->backend_data;
2239
    struct wined3d_shader_tex_mx *tex_mx = ins->ctx->tex_mx;
2240
    DWORD flags;
2241
    DWORD reg = ins->dst[0].reg.idx;
2242
    struct wined3d_shader_buffer *buffer = ins->ctx->buffer;
2243
    char dst_str[50];
2244
    char src0_name[50];
2245
    char src1_name[50];
2246
    char dst_reg[50];
2247
    BOOL is_color;
2248

2249 2250
    shader_arb_get_src_param(ins, &ins->src[0], 0, src0_name);
    shader_arb_get_src_param(ins, &ins->src[0], 1, src1_name);
2251 2252
    shader_arb_get_register_name(ins, &ins->dst[0].reg, dst_reg, &is_color);
    /* Note: dst_reg.xy is input here, generated by two texm3x3pad instructions */
2253
    shader_addline(buffer, "DP3 %s.z, fragment.texcoord[%u], %s;\n", dst_reg, reg, src0_name);
2254

2255 2256
    /* Calculate reflection vector.
     *
2257 2258 2259
     *                   dot(N, E)
     * dst_reg.xyz = 2 * --------- * N - E
     *                   dot(N, N)
2260 2261 2262
     *
     * Which normalizes the normal vector
     */
2263 2264
    shader_addline(buffer, "DP3 %s.w, %s, %s;\n", dst_reg, dst_reg, src1_name);
    shader_addline(buffer, "DP3 TC.w, %s, %s;\n", dst_reg, dst_reg);
2265
    shader_addline(buffer, "RCP TC.w, TC.w;\n");
2266 2267 2268
    shader_addline(buffer, "MUL %s.w, %s.w, TC.w;\n", dst_reg, dst_reg);
    shader_addline(buffer, "MUL %s, %s.w, %s;\n", dst_reg, dst_reg, dst_reg);
    shader_addline(buffer, "MAD %s, coefmul.x, %s, -%s;\n", dst_reg, dst_reg, src1_name);
2269

2270
    /* Sample the texture using the calculated coordinates */
2271
    shader_arb_get_dst_param(ins, &ins->dst[0], dst_str);
2272
    flags = reg < MAX_TEXTURES ? priv->cur_ps_args->super.tex_transform >> reg * WINED3D_PSARGS_TEXTRANSFORM_SHIFT : 0;
2273
    shader_hw_sample(ins, reg, dst_str, dst_reg, flags & WINED3D_PSARGS_PROJECTED ? TEX_PROJ : 0, NULL, NULL);
2274
    tex_mx->current_row = 0;
2275 2276
}

2277
static void pshader_hw_texdepth(const struct wined3d_shader_instruction *ins)
2278
{
2279
    const struct wined3d_shader_dst_param *dst = &ins->dst[0];
2280
    struct wined3d_shader_buffer *buffer = ins->ctx->buffer;
2281
    char dst_name[50];
2282 2283
    const char *zero = arb_get_helper_value(ins->ctx->reg_maps->shader_version.type, ARB_ZERO);
    const char *one = arb_get_helper_value(ins->ctx->reg_maps->shader_version.type, ARB_ONE);
2284 2285

    /* texdepth has an implicit destination, the fragment depth value. It's only parameter,
2286 2287
     * which is essentially an input, is the destination register because it is the first
     * parameter. According to the msdn, this must be register r5, but let's keep it more flexible
2288
     * here(writemasks/swizzles are not valid on texdepth)
2289
     */
2290
    shader_arb_get_dst_param(ins, dst, dst_name);
2291 2292 2293 2294

    /* According to the msdn, the source register(must be r5) is unusable after
     * the texdepth instruction, so we're free to modify it
     */
2295
    shader_addline(buffer, "MIN %s.y, %s.y, %s;\n", dst_name, dst_name, one);
2296 2297 2298 2299 2300

    /* How to deal with the special case dst_name.g == 0? if r != 0, then
     * the r * (1 / 0) will give infinity, which is clamped to 1.0, the correct
     * result. But if r = 0.0, then 0 * inf = 0, which is incorrect.
     */
2301
    shader_addline(buffer, "RCP %s.y, %s.y;\n", dst_name, dst_name);
2302
    shader_addline(buffer, "MUL TA.x, %s.x, %s.y;\n", dst_name, dst_name);
2303 2304
    shader_addline(buffer, "MIN TA.x, TA.x, %s;\n", one);
    shader_addline(buffer, "MAX result.depth, TA.x, %s;\n", zero);
2305 2306
}

2307 2308 2309
/** Process the WINED3DSIO_TEXDP3TEX instruction in ARB:
 * Take a 3-component dot product of the TexCoord[dstreg] and src,
 * then perform a 1D texture lookup from stage dstregnum, place into dst. */
2310
static void pshader_hw_texdp3tex(const struct wined3d_shader_instruction *ins)
2311
{
2312
    struct wined3d_shader_buffer *buffer = ins->ctx->buffer;
2313
    DWORD sampler_idx = ins->dst[0].reg.idx;
2314
    char src0[50];
2315
    char dst_str[50];
2316

2317
    shader_arb_get_src_param(ins, &ins->src[0], 0, src0);
2318 2319
    shader_addline(buffer, "MOV TB, 0.0;\n");
    shader_addline(buffer, "DP3 TB.x, fragment.texcoord[%u], %s;\n", sampler_idx, src0);
2320

2321
    shader_arb_get_dst_param(ins, &ins->dst[0], dst_str);
2322
    shader_hw_sample(ins, sampler_idx, dst_str, "TB", 0 /* Only one coord, can't be projected */, NULL, NULL);
2323 2324 2325 2326
}

/** Process the WINED3DSIO_TEXDP3 instruction in ARB:
 * Take a 3-component dot product of the TexCoord[dstreg] and src. */
2327
static void pshader_hw_texdp3(const struct wined3d_shader_instruction *ins)
2328
{
2329
    const struct wined3d_shader_dst_param *dst = &ins->dst[0];
2330 2331
    char src0[50];
    char dst_str[50];
2332
    struct wined3d_shader_buffer *buffer = ins->ctx->buffer;
2333 2334

    /* Handle output register */
2335
    shader_arb_get_dst_param(ins, dst, dst_str);
2336
    shader_arb_get_src_param(ins, &ins->src[0], 0, src0);
2337
    shader_addline(buffer, "DP3 %s, fragment.texcoord[%u], %s;\n", dst_str, dst->reg.idx, src0);
2338 2339 2340 2341
}

/** Process the WINED3DSIO_TEXM3X3 instruction in ARB
 * Perform the 3rd row of a 3x3 matrix multiply */
2342
static void pshader_hw_texm3x3(const struct wined3d_shader_instruction *ins)
2343
{
2344
    const struct wined3d_shader_dst_param *dst = &ins->dst[0];
2345
    struct wined3d_shader_buffer *buffer = ins->ctx->buffer;
2346
    char dst_str[50], dst_name[50];
2347
    char src0[50];
2348
    BOOL is_color;
2349

2350
    shader_arb_get_dst_param(ins, dst, dst_str);
2351
    shader_arb_get_src_param(ins, &ins->src[0], 0, src0);
2352 2353 2354
    shader_arb_get_register_name(ins, &ins->dst[0].reg, dst_name, &is_color);
    shader_addline(buffer, "DP3 %s.z, fragment.texcoord[%u], %s;\n", dst_name, dst->reg.idx, src0);
    shader_addline(buffer, "MOV %s, %s;\n", dst_str, dst_name);
2355 2356 2357 2358 2359 2360 2361
}

/** Process the WINED3DSIO_TEXM3X2DEPTH instruction in ARB:
 * Last row of a 3x2 matrix multiply, use the result to calculate the depth:
 * Calculate tmp0.y = TexCoord[dstreg] . src.xyz;  (tmp0.x has already been calculated)
 * depth = (tmp0.y == 0.0) ? 1.0 : tmp0.x / tmp0.y
 */
2362
static void pshader_hw_texm3x2depth(const struct wined3d_shader_instruction *ins)
2363
{
2364
    struct wined3d_shader_buffer *buffer = ins->ctx->buffer;
2365 2366 2367
    const struct wined3d_shader_dst_param *dst = &ins->dst[0];
    char src0[50], dst_name[50];
    BOOL is_color;
2368 2369
    const char *zero = arb_get_helper_value(ins->ctx->reg_maps->shader_version.type, ARB_ZERO);
    const char *one = arb_get_helper_value(ins->ctx->reg_maps->shader_version.type, ARB_ONE);
2370

2371
    shader_arb_get_src_param(ins, &ins->src[0], 0, src0);
2372 2373
    shader_arb_get_register_name(ins, &ins->dst[0].reg, dst_name, &is_color);
    shader_addline(buffer, "DP3 %s.y, fragment.texcoord[%u], %s;\n", dst_name, dst->reg.idx, src0);
2374 2375 2376 2377 2378

    /* How to deal with the special case dst_name.g == 0? if r != 0, then
     * the r * (1 / 0) will give infinity, which is clamped to 1.0, the correct
     * result. But if r = 0.0, then 0 * inf = 0, which is incorrect.
     */
2379 2380
    shader_addline(buffer, "RCP %s.y, %s.y;\n", dst_name, dst_name);
    shader_addline(buffer, "MUL %s.x, %s.x, %s.y;\n", dst_name, dst_name, dst_name);
2381 2382
    shader_addline(buffer, "MIN %s.x, %s.x, %s;\n", dst_name, dst_name, one);
    shader_addline(buffer, "MAX result.depth, %s.x, %s;\n", dst_name, zero);
2383 2384
}

2385
/** Handles transforming all WINED3DSIO_M?x? opcodes for
2386
    Vertex/Pixel shaders to ARB_vertex_program codes */
2387
static void shader_hw_mnxn(const struct wined3d_shader_instruction *ins)
2388
{
2389 2390
    int i;
    int nComponents = 0;
2391 2392
    struct wined3d_shader_dst_param tmp_dst = {{0}};
    struct wined3d_shader_src_param tmp_src[2] = {{{0}}};
2393
    struct wined3d_shader_instruction tmp_ins;
2394

2395
    memset(&tmp_ins, 0, sizeof(tmp_ins));
2396

2397
    /* Set constants for the temporary argument */
2398
    tmp_ins.ctx = ins->ctx;
2399
    tmp_ins.dst_count = 1;
2400
    tmp_ins.dst = &tmp_dst;
2401
    tmp_ins.src_count = 2;
2402
    tmp_ins.src = tmp_src;
2403

2404
    switch(ins->handler_idx)
2405
    {
2406 2407
        case WINED3DSIH_M4x4:
            nComponents = 4;
2408
            tmp_ins.handler_idx = WINED3DSIH_DP4;
2409 2410 2411
            break;
        case WINED3DSIH_M4x3:
            nComponents = 3;
2412
            tmp_ins.handler_idx = WINED3DSIH_DP4;
2413 2414 2415
            break;
        case WINED3DSIH_M3x4:
            nComponents = 4;
2416
            tmp_ins.handler_idx = WINED3DSIH_DP3;
2417 2418 2419
            break;
        case WINED3DSIH_M3x3:
            nComponents = 3;
2420
            tmp_ins.handler_idx = WINED3DSIH_DP3;
2421 2422 2423
            break;
        case WINED3DSIH_M3x2:
            nComponents = 2;
2424
            tmp_ins.handler_idx = WINED3DSIH_DP3;
2425 2426
            break;
        default:
2427
            FIXME("Unhandled opcode %#x\n", ins->handler_idx);
2428
            break;
2429 2430
    }

2431
    tmp_dst = ins->dst[0];
2432 2433
    tmp_src[0] = ins->src[0];
    tmp_src[1] = ins->src[1];
2434
    for (i = 0; i < nComponents; i++) {
2435
        tmp_dst.write_mask = WINED3DSP_WRITEMASK_0 << i;
2436
        shader_hw_map2gl(&tmp_ins);
2437
        ++tmp_src[1].reg.idx;
2438 2439 2440
    }
}

2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457
/* Emit an ARB RCP instruction for the D3D rcp instruction. */
static void shader_hw_rcp(const struct wined3d_shader_instruction *ins)
{
    struct wined3d_shader_buffer *buffer = ins->ctx->buffer;
    char dst_str[50];
    char src_str[50];

    shader_arb_get_dst_param(ins, &ins->dst[0], dst_str);
    shader_arb_get_src_param(ins, &ins->src[0], 0, src_str);

    /* The Dx sdk says .x is used if no swizzle is given, but our test shows
     * that .w is used, so select .w explicitly. */
    if (ins->src[0].swizzle == WINED3DSP_NOSWIZZLE)
        strcat(src_str, ".w");

    shader_addline(buffer, "RCP%s %s, %s;\n", shader_arb_get_modifier(ins), dst_str, src_str);
}

2461
static void shader_hw_scalar_op(const struct wined3d_shader_instruction *ins)
2462
{
2463
    struct wined3d_shader_buffer *buffer = ins->ctx->buffer;
2464
    const char *instruction;
2465

2466
    char dst[50];
2467
    char src[50];
2468

2469
    switch(ins->handler_idx)
2470
    {
2471 2472 2473 2474
        case WINED3DSIH_RSQ:  instruction = "RSQ"; break;
        case WINED3DSIH_RCP:  instruction = "RCP"; break;
        case WINED3DSIH_EXP:  instruction = "EX2"; break;
        case WINED3DSIH_EXPP: instruction = "EXP"; break;
2475
        default: instruction = "";
2476
            FIXME("Unhandled opcode %#x\n", ins->handler_idx);
2477 2478 2479
            break;
    }

2480
    shader_arb_get_dst_param(ins, &ins->dst[0], dst); /* Destination */
2481
    shader_arb_get_src_param(ins, &ins->src[0], 0, src);
2482 2483
    if (ins->src[0].swizzle == WINED3DSP_NOSWIZZLE)
    {
2484 2485 2486
        /* Dx sdk says .x is used if no swizzle is given, but our test shows that
         * .w is used
         */
2487
        strcat(src, ".w");
2488 2489
    }

2490
    shader_addline(buffer, "%s%s %s, %s;\n", instruction, shader_arb_get_modifier(ins), dst, src);
2491 2492
}

2493
static void shader_hw_nrm(const struct wined3d_shader_instruction *ins)
2494
{
2495
    struct wined3d_shader_buffer *buffer = ins->ctx->buffer;
2496 2497
    char dst_name[50];
    char src_name[50];
2498 2499
    struct shader_arb_ctx_priv *priv = ins->ctx->backend_data;
    BOOL pshader = shader_is_pshader_version(ins->ctx->reg_maps->shader_version.type);
2500
    const char *zero = arb_get_helper_value(ins->ctx->reg_maps->shader_version.type, ARB_ZERO);
2501

2502
    shader_arb_get_dst_param(ins, &ins->dst[0], dst_name);
2503
    shader_arb_get_src_param(ins, &ins->src[0], 1 /* Use TB */, src_name);
2504

2505 2506
    /* In D3D, NRM of a vector with length zero returns zero. Catch this situation, as
     * otherwise NRM or RSQ would return NaN */
2507 2508
    if(pshader && priv->target_version >= NV3)
    {
2509 2510 2511 2512 2513
        /* GL_NV_fragment_program2's NRM needs protection against length zero vectors too
         *
         * TODO: Find out if DP3+NRM+MOV is really faster than DP3+RSQ+MUL
         */
        shader_addline(buffer, "DP3C TA, %s, %s;\n", src_name, src_name);
2514
        shader_addline(buffer, "NRM%s %s, %s;\n", shader_arb_get_modifier(ins), dst_name, src_name);
2515 2516 2517 2518 2519 2520 2521 2522
        shader_addline(buffer, "MOV %s (EQ), %s;\n", dst_name, zero);
    }
    else if(priv->target_version >= NV2)
    {
        shader_addline(buffer, "DP3C TA.x, %s, %s;\n", src_name, src_name);
        shader_addline(buffer, "RSQ TA.x (NE), TA.x;\n");
        shader_addline(buffer, "MUL%s %s, %s, TA.x;\n", shader_arb_get_modifier(ins), dst_name,
                       src_name);
2523 2524 2525
    }
    else
    {
2526 2527 2528 2529 2530 2531 2532 2533 2534 2535
        const char *one = arb_get_helper_value(ins->ctx->reg_maps->shader_version.type, ARB_ONE);

        shader_addline(buffer, "DP3 TA.x, %s, %s;\n", src_name, src_name);
        /* Pass any non-zero value to RSQ if the input vector has a length of zero. The
         * RSQ result doesn't matter, as long as multiplying it by 0 returns 0.
         */
        shader_addline(buffer, "SGE TA.y, -TA.x, %s;\n", zero);
        shader_addline(buffer, "MAD TA.x, %s, TA.y, TA.x;\n", one);

        shader_addline(buffer, "RSQ TA.x, TA.x;\n");
2536
        /* dst.w = src[0].w * 1 / (src.x^2 + src.y^2 + src.z^2)^(1/2) according to msdn*/
2537
        shader_addline(buffer, "MUL%s %s, %s, TA.x;\n", shader_arb_get_modifier(ins), dst_name,
2538 2539
                    src_name);
    }
2540 2541
}

2542 2543
/* Emit code for the D3D lrp instruction: dst = src0 * (src1 - src2) + src2.
 * Pixel shaders are passed on to shader_hw_map2gl(); other shaders get a SUB + MAD pair. */
static void shader_hw_lrp(const struct wined3d_shader_instruction *ins)
{
    struct wined3d_shader_buffer *buffer = ins->ctx->buffer;
    char dst_str[50];
    char factor[50], first[50], second[50];

    /* ARB_fragment_program has a convenient LRP instruction */
    if (shader_is_pshader_version(ins->ctx->reg_maps->shader_version.type))
    {
        shader_hw_map2gl(ins);
        return;
    }

    shader_arb_get_dst_param(ins, &ins->dst[0], dst_str);
    shader_arb_get_src_param(ins, &ins->src[0], 0, factor);
    shader_arb_get_src_param(ins, &ins->src[1], 1, first);
    shader_arb_get_src_param(ins, &ins->src[2], 2, second);

    /* TA = src1 - src2, dst = src0 * TA + src2 */
    shader_addline(buffer, "SUB TA, %s, %s;\n", first, second);
    shader_addline(buffer, "MAD%s %s, %s, TA, %s;\n", shader_arb_get_modifier(ins),
                   dst_str, factor, second);
}

2564
static void shader_hw_sincos(const struct wined3d_shader_instruction *ins)
2565
{
2566
    /* This instruction exists in ARB, but the d3d instruction takes two extra parameters which
2567
     * must contain fixed constants. So we need a separate function to filter those constants and
2568 2569
     * can't use map2gl
     */
2570
    struct wined3d_shader_buffer *buffer = ins->ctx->buffer;
2571 2572
    struct shader_arb_ctx_priv *priv = ins->ctx->backend_data;
    const struct wined3d_shader_dst_param *dst = &ins->dst[0];
2573
    char dst_name[50];
2574 2575
    char src_name0[50], src_name1[50], src_name2[50];
    BOOL is_color;
2576

2577 2578 2579
    shader_arb_get_src_param(ins, &ins->src[0], 0, src_name0);
    if(shader_is_pshader_version(ins->ctx->reg_maps->shader_version.type)) {
        shader_arb_get_dst_param(ins, &ins->dst[0], dst_name);
2580 2581 2582 2583 2584 2585 2586 2587
        /* No modifiers are supported on SCS */
        shader_addline(buffer, "SCS %s, %s;\n", dst_name, src_name0);

        if(ins->dst[0].modifiers & WINED3DSPDM_SATURATE)
        {
            shader_arb_get_register_name(ins, &dst->reg, src_name0, &is_color);
            shader_addline(buffer, "MOV_SAT %s, %s;\n", dst_name, src_name0);
        }
2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668
    } else if(priv->target_version >= NV2) {
        shader_arb_get_register_name(ins, &dst->reg, dst_name, &is_color);

        /* Sincos writemask must be .x, .y or .xy */
        if(dst->write_mask & WINED3DSP_WRITEMASK_0)
            shader_addline(buffer, "COS%s %s.x, %s;\n", shader_arb_get_modifier(ins), dst_name, src_name0);
        if(dst->write_mask & WINED3DSP_WRITEMASK_1)
            shader_addline(buffer, "SIN%s %s.y, %s;\n", shader_arb_get_modifier(ins), dst_name, src_name0);
    } else {
        /* Approximate sine and cosine with a taylor series, as per math textbook. The application passes 8
         * helper constants(D3DSINCOSCONST1 and D3DSINCOSCONST2) in src1 and src2.
         *
         * sin(x) = x - x^3/3! + x^5/5! - x^7/7! + ...
         * cos(x) = 1 - x^2/2! + x^4/4! - x^6/6! + ...
         *
         * The constants we get are:
         *
         *  +1   +1,     -1     -1     +1      +1      -1       -1
         *      ---- ,  ---- , ---- , ----- , ----- , ----- , ------
         *      1!*2    2!*4   3!*8   4!*16   5!*32   6!*64   7!*128
         *
         * If used with x^2, x^3, x^4 etc they calculate sin(x/2) and cos(x/2):
         *
         * (x/2)^2 = x^2 / 4
         * (x/2)^3 = x^3 / 8
         * (x/2)^4 = x^4 / 16
         * (x/2)^5 = x^5 / 32
         * etc
         *
         * To get the final result:
         * sin(x) = 2 * sin(x/2) * cos(x/2)
         * cos(x) = cos(x/2)^2 - sin(x/2)^2
         * (from sin(x+y) and cos(x+y) rules)
         *
         * As per MSDN, dst.z is undefined after the operation, and so is
         * dst.x and dst.y if they're masked out by the writemask. Ie
         * sincos dst.y, src1, c0, c1
         * returns the sine in dst.y. dst.x and dst.z are undefined, dst.w is not touched. The assembler
         * vsa.exe also stops with an error if the dest register is the same register as the source
         * register. This means we can use dest.xyz as temporary storage. The assembler vsa.exe output also
         * indicates that sincos consumes 8 instruction slots in vs_2_0(and, strangely, in vs_3_0).
         */
        shader_arb_get_src_param(ins, &ins->src[1], 1, src_name1);
        shader_arb_get_src_param(ins, &ins->src[2], 2, src_name2);
        shader_arb_get_register_name(ins, &dst->reg, dst_name, &is_color);

        shader_addline(buffer, "MUL %s.x, %s, %s;\n", dst_name, src_name0, src_name0);  /* x ^ 2 */
        shader_addline(buffer, "MUL TA.y, %s.x, %s;\n", dst_name, src_name0);           /* x ^ 3 */
        shader_addline(buffer, "MUL %s.y, TA.y, %s;\n", dst_name, src_name0);           /* x ^ 4 */
        shader_addline(buffer, "MUL TA.z, %s.y, %s;\n", dst_name, src_name0);           /* x ^ 5 */
        shader_addline(buffer, "MUL %s.z, TA.z, %s;\n", dst_name, src_name0);           /* x ^ 6 */
        shader_addline(buffer, "MUL TA.w, %s.z, %s;\n", dst_name, src_name0);           /* x ^ 7 */

        /* sin(x/2)
         *
         * Unfortunately we don't get the constants in a DP4-capable form. Is there a way to
         * properly merge that with MULs in the code above?
         * The swizzles .yz and xw however fit into the .yzxw swizzle added to ps_2_0. Maybe
         * we can merge the sine and cosine MAD rows to calculate them together.
         */
        shader_addline(buffer, "MUL TA.x, %s, %s.w;\n", src_name0, src_name2); /* x^1, +1/(1!*2) */
        shader_addline(buffer, "MAD TA.x, TA.y, %s.x, TA.x;\n", src_name2); /* -1/(3!*8) */
        shader_addline(buffer, "MAD TA.x, TA.z, %s.w, TA.x;\n", src_name1); /* +1/(5!*32) */
        shader_addline(buffer, "MAD TA.x, TA.w, %s.x, TA.x;\n", src_name1); /* -1/(7!*128) */

        /* cos(x/2) */
        shader_addline(buffer, "MAD TA.y, %s.x, %s.y, %s.z;\n", dst_name, src_name2, src_name2); /* -1/(2!*4), +1.0 */
        shader_addline(buffer, "MAD TA.y, %s.y, %s.z, TA.y;\n", dst_name, src_name1); /* +1/(4!*16) */
        shader_addline(buffer, "MAD TA.y, %s.z, %s.y, TA.y;\n", dst_name, src_name1); /* -1/(6!*64) */

        if(dst->write_mask & WINED3DSP_WRITEMASK_0) {
            /* cos x */
            shader_addline(buffer, "MUL TA.z, TA.y, TA.y;\n");
            shader_addline(buffer, "MAD %s.x, -TA.x, TA.x, TA.z;\n", dst_name);
        }
        if(dst->write_mask & WINED3DSP_WRITEMASK_1) {
            /* sin x */
            shader_addline(buffer, "MUL %s.y, TA.x, TA.y;\n", dst_name);
            shader_addline(buffer, "ADD %s.y, %s.y, %s.y;\n", dst_name, dst_name, dst_name);
        }
    }
2669 2670
}

2671 2672
static void shader_hw_sgn(const struct wined3d_shader_instruction *ins)
{
2673
    struct wined3d_shader_buffer *buffer = ins->ctx->buffer;
2674 2675 2676 2677
    char dst_name[50];
    char src_name[50];
    struct shader_arb_ctx_priv *ctx = ins->ctx->backend_data;

2678 2679 2680
    shader_arb_get_dst_param(ins, &ins->dst[0], dst_name);
    shader_arb_get_src_param(ins, &ins->src[0], 0, src_name);

2681
    /* SGN is only valid in vertex shaders */
2682 2683
    if(ctx->target_version >= NV2) {
        shader_addline(buffer, "SSG%s %s, %s;\n", shader_arb_get_modifier(ins), dst_name, src_name);
2684 2685 2686 2687 2688 2689 2690 2691 2692
        return;
    }

    /* If SRC > 0.0, -SRC < SRC = TRUE, otherwise false.
     * if SRC < 0.0,  SRC < -SRC = TRUE. If neither is true, src = 0.0
     */
    if(ins->dst[0].modifiers & WINED3DSPDM_SATURATE) {
        shader_addline(buffer, "SLT %s, -%s, %s;\n", dst_name, src_name, src_name);
    } else {
Stefan Dösinger's avatar
Stefan Dösinger committed
2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710
        /* src contains TA? Write to the dest first. This won't overwrite our destination.
         * Then use TA, and calculate the final result
         *
         * Not reading from TA? Store the first result in TA to avoid overwriting the
         * destination if src reg = dst reg
         */
        if(strstr(src_name, "TA"))
        {
            shader_addline(buffer, "SLT %s,  %s, -%s;\n", dst_name, src_name, src_name);
            shader_addline(buffer, "SLT TA, -%s, %s;\n", src_name, src_name);
            shader_addline(buffer, "ADD %s, %s, -TA;\n", dst_name, dst_name);
        }
        else
        {
            shader_addline(buffer, "SLT TA, -%s, %s;\n", src_name, src_name);
            shader_addline(buffer, "SLT %s,  %s, -%s;\n", dst_name, src_name, src_name);
            shader_addline(buffer, "ADD %s, TA, -%s;\n", dst_name, dst_name);
        }
2711 2712 2713
    }
}

2714 2715
/* Emit code for the D3D dsy instruction: a DDY followed by a multiplication of the
 * result with ycorrection.y. */
static void shader_hw_dsy(const struct wined3d_shader_instruction *ins)
{
    struct wined3d_shader_buffer *buffer = ins->ctx->buffer;
    char masked_dst[50];
    char src_str[50];
    char reg_name[50];
    BOOL is_color;

    shader_arb_get_dst_param(ins, &ins->dst[0], masked_dst);
    shader_arb_get_src_param(ins, &ins->src[0], 0, src_str);
    shader_arb_get_register_name(ins, &ins->dst[0].reg, reg_name, &is_color);

    shader_addline(buffer, "DDY %s, %s;\n", masked_dst, src_str);
    /* The instruction modifier is applied on the final MUL only */
    shader_addline(buffer, "MUL%s %s, %s, ycorrection.y;\n", shader_arb_get_modifier(ins), masked_dst, reg_name);
}

2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753
/* Returns the source modifier to use when the absolute value of a source with modifier
 * "mod" is wanted.
 *
 * Plain, negated and abs modifiers are replaced by WINED3DSPSM_ABS. Every other known
 * modifier is returned unchanged and *need_abs is set to TRUE, telling the caller to
 * take the absolute value itself. Unknown modifiers are reported and returned
 * unchanged with *need_abs set to FALSE. */
static DWORD abs_modifier(DWORD mod, BOOL *need_abs)
{
    *need_abs = FALSE;

    switch (mod)
    {
        case WINED3DSPSM_NONE:
        case WINED3DSPSM_NEG:
        case WINED3DSPSM_ABS:
        case WINED3DSPSM_ABSNEG:
            return WINED3DSPSM_ABS;

        case WINED3DSPSM_BIAS:
        case WINED3DSPSM_BIASNEG:
        case WINED3DSPSM_SIGN:
        case WINED3DSPSM_SIGNNEG:
        case WINED3DSPSM_COMP:
        case WINED3DSPSM_X2:
        case WINED3DSPSM_X2NEG:
        case WINED3DSPSM_DZ:
        case WINED3DSPSM_DW:
            *need_abs = TRUE;
            return mod;

        default:
            break;
    }

    FIXME("Unknown modifier %u\n", mod);
    return mod;
}

2754
static void shader_hw_log(const struct wined3d_shader_instruction *ins)
2755
{
2756
    struct wined3d_shader_buffer *buffer = ins->ctx->buffer;
2757
    char src0[50], dst[50];
2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770
    struct wined3d_shader_src_param src0_copy = ins->src[0];
    BOOL need_abs = FALSE;
    const char *instr;

    switch(ins->handler_idx)
    {
        case WINED3DSIH_LOG:  instr = "LG2"; break;
        case WINED3DSIH_LOGP: instr = "LOG"; break;
        default:
            ERR("Unexpected instruction %d\n", ins->handler_idx);
            return;
    }

2771
    /* LOG and LOGP operate on the absolute value of the input */
2772 2773 2774 2775 2776 2777 2778 2779
    src0_copy.modifiers = abs_modifier(src0_copy.modifiers, &need_abs);

    shader_arb_get_dst_param(ins, &ins->dst[0], dst);
    shader_arb_get_src_param(ins, &src0_copy, 0, src0);

    if(need_abs)
    {
        shader_addline(buffer, "ABS TA, %s;\n", src0);
2780
        shader_addline(buffer, "%s%s %s, TA;\n", instr, shader_arb_get_modifier(ins), dst);
2781
    }
2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793
    else
    {
        shader_addline(buffer, "%s%s %s, %s;\n", instr, shader_arb_get_modifier(ins), dst, src0);
    }
}

static void shader_hw_pow(const struct wined3d_shader_instruction *ins)
{
    struct wined3d_shader_buffer *buffer = ins->ctx->buffer;
    char src0[50], src1[50], dst[50];
    struct wined3d_shader_src_param src0_copy = ins->src[0];
    BOOL need_abs = FALSE;
2794 2795
    struct shader_arb_ctx_priv *priv = ins->ctx->backend_data;
    const char *one = arb_get_helper_value(ins->ctx->reg_maps->shader_version.type, ARB_ONE);
2796 2797 2798 2799 2800 2801 2802 2803 2804 2805

    /* POW operates on the absolute value of the input */
    src0_copy.modifiers = abs_modifier(src0_copy.modifiers, &need_abs);

    shader_arb_get_dst_param(ins, &ins->dst[0], dst);
    shader_arb_get_src_param(ins, &src0_copy, 0, src0);
    shader_arb_get_src_param(ins, &ins->src[1], 1, src1);

    if (need_abs)
        shader_addline(buffer, "ABS TA.x, %s;\n", src0);
2806 2807 2808 2809 2810 2811 2812 2813
    else
        shader_addline(buffer, "MOV TA.x, %s;\n", src0);

    if (priv->target_version >= NV2)
    {
        shader_addline(buffer, "MOVC TA.y, %s;\n", src1);
        shader_addline(buffer, "POW%s %s, TA.x, TA.y;\n", shader_arb_get_modifier(ins), dst);
        shader_addline(buffer, "MOV %s (EQ.y), %s;\n", dst, one);
2814 2815 2816
    }
    else
    {
2817 2818 2819 2820 2821 2822 2823 2824 2825 2826
        const char *zero = arb_get_helper_value(ins->ctx->reg_maps->shader_version.type, ARB_ZERO);
        const char *flt_eps = arb_get_helper_value(ins->ctx->reg_maps->shader_version.type, ARB_EPS);

        shader_addline(buffer, "ABS TA.y, %s;\n", src1);
        shader_addline(buffer, "SGE TA.y, -TA.y, %s;\n", zero);
        /* Possibly add flt_eps to avoid getting float special values */
        shader_addline(buffer, "MAD TA.z, TA.y, %s, %s;\n", flt_eps, src1);
        shader_addline(buffer, "POW%s TA.x, TA.x, TA.z;\n", shader_arb_get_modifier(ins));
        shader_addline(buffer, "MAD TA.x, -TA.x, TA.y, TA.x;\n");
        shader_addline(buffer, "MAD %s, TA.y, %s, TA.x;\n", dst, one);
2827 2828 2829
    }
}

2830 2831
/* Opens a loop. Pixel shaders get a plain LOOP instruction; vertex shaders get
 * an address-register based loop built from ARLC / BRA and a pair of labels
 * numbered after the control frame at the head of priv->control_frames. */
static void shader_hw_loop(const struct wined3d_shader_instruction *ins)
{
    struct wined3d_shader_buffer *buffer = ins->ctx->buffer;
    BOOL is_vs = shader_is_vshader_version(ins->ctx->reg_maps->shader_version.type);
    struct shader_arb_ctx_priv *priv;
    struct control_frame *frame;
    char loop_ctrl[50];

    /* src0 is aL, so the loop control parameter is taken from src[1] */
    shader_arb_get_src_param(ins, &ins->src[1], 0, loop_ctrl);

    if (!is_vs)
    {
        shader_addline(buffer, "LOOP %s;\n", loop_ctrl);
        return;
    }

    priv = ins->ctx->backend_data;
    frame = LIST_ENTRY(list_head(&priv->control_frames), struct control_frame, entry);

    /* Nested loop: save the loop register before it is overwritten */
    if (priv->loop_depth > 1)
        shader_addline(buffer, "PUSHA aL;\n");

    /* The constant loader makes sure to load -1 into iX.w */
    shader_addline(buffer, "ARLC aL, %s.xywz;\n", loop_ctrl);
    shader_addline(buffer, "BRA loop_%u_end (LE.x);\n", frame->no.loop);
    shader_addline(buffer, "loop_%u_start:\n", frame->no.loop);
}

/* Opens a rep block. Pixel shaders get a plain REP instruction; vertex shaders
 * get the same ARLC / BRA / label sequence that is used for loops, numbered
 * after the control frame at the head of priv->control_frames. */
static void shader_hw_rep(const struct wined3d_shader_instruction *ins)
{
    struct wined3d_shader_buffer *buffer = ins->ctx->buffer;
    BOOL is_vs = shader_is_vshader_version(ins->ctx->reg_maps->shader_version.type);
    struct shader_arb_ctx_priv *priv;
    struct control_frame *frame;
    char rep_count[50];

    shader_arb_get_src_param(ins, &ins->src[0], 0, rep_count);

    if (!is_vs)
    {
        shader_addline(buffer, "REP %s;\n", rep_count);
        return;
    }

    priv = ins->ctx->backend_data;
    frame = LIST_ENTRY(list_head(&priv->control_frames), struct control_frame, entry);

    /* Nested loop: save the loop register before it is overwritten */
    if (priv->loop_depth > 1)
        shader_addline(buffer, "PUSHA aL;\n");

    /* The constant loader makes sure to load -1 into iX.w */
    shader_addline(buffer, "ARLC aL, %s.xywz;\n", rep_count);
    shader_addline(buffer, "BRA loop_%u_end (LE.x);\n", frame->no.loop);
    shader_addline(buffer, "loop_%u_start:\n", frame->no.loop);
}

/* Closes a loop. Pixel shaders get ENDLOOP; vertex shaders update aL with
 * ARAC, branch back to the start label while the condition holds, place the
 * end label and restore aL when this was a nested loop. */
static void shader_hw_endloop(const struct wined3d_shader_instruction *ins)
{
    struct wined3d_shader_buffer *buffer = ins->ctx->buffer;
    struct shader_arb_ctx_priv *priv;
    struct control_frame *frame;

    if (!shader_is_vshader_version(ins->ctx->reg_maps->shader_version.type))
    {
        shader_addline(buffer, "ENDLOOP;\n");
        return;
    }

    priv = ins->ctx->backend_data;
    frame = LIST_ENTRY(list_head(&priv->control_frames), struct control_frame, entry);

    shader_addline(buffer, "ARAC aL.xy, aL;\n");
    shader_addline(buffer, "BRA loop_%u_start (GT.x);\n", frame->no.loop);
    shader_addline(buffer, "loop_%u_end:\n", frame->no.loop);

    /* Counterpart of the PUSHA emitted when the nested loop was opened */
    if (priv->loop_depth > 1)
        shader_addline(buffer, "POPA aL;\n");
}

/* Closes a rep block. Pixel shaders get ENDREP; vertex shaders update aL with
 * ARAC, branch back to the start label while the condition holds, place the
 * end label and restore aL when this was a nested loop. */
static void shader_hw_endrep(const struct wined3d_shader_instruction *ins)
{
    struct wined3d_shader_buffer *buffer = ins->ctx->buffer;
    struct shader_arb_ctx_priv *priv;
    struct control_frame *frame;

    if (!shader_is_vshader_version(ins->ctx->reg_maps->shader_version.type))
    {
        shader_addline(buffer, "ENDREP;\n");
        return;
    }

    priv = ins->ctx->backend_data;
    frame = LIST_ENTRY(list_head(&priv->control_frames), struct control_frame, entry);

    shader_addline(buffer, "ARAC aL.xy, aL;\n");
    shader_addline(buffer, "BRA loop_%u_start (GT.x);\n", frame->no.loop);
    shader_addline(buffer, "loop_%u_end:\n", frame->no.loop);

    /* Counterpart of the PUSHA emitted when the nested loop was opened */
    if (priv->loop_depth > 1)
        shader_addline(buffer, "POPA aL;\n");
}

2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943
/* Walks priv->control_frames from the head and returns the first frame whose
 * type is LOOP or REP. Logs an error and returns NULL when there is none. */
static const struct control_frame *find_last_loop(const struct shader_arb_ctx_priv *priv)
{
    struct control_frame *frame;

    LIST_FOR_EACH_ENTRY(frame, &priv->control_frames, struct control_frame, entry)
    {
        /* Skip every frame that is neither a loop nor a rep */
        if (frame->type != LOOP && frame->type != REP)
            continue;

        return frame;
    }

    ERR("Could not find loop for break\n");
    return NULL;
}

static void shader_hw_break(const struct wined3d_shader_instruction *ins)
{
2944
    struct wined3d_shader_buffer *buffer = ins->ctx->buffer;
2945 2946 2947 2948 2949
    const struct control_frame *control_frame = find_last_loop(ins->ctx->backend_data);
    BOOL vshader = shader_is_vshader_version(ins->ctx->reg_maps->shader_version.type);

    if(vshader)
    {
2950
        shader_addline(buffer, "BRA loop_%u_end;\n", control_frame->no.loop);
2951 2952 2953 2954 2955 2956 2957
    }
    else
    {
        shader_addline(buffer, "BRK;\n");
    }
}

2958
static const char *get_compare(enum wined3d_shader_rel_op op)
2959
{
2960
    switch (op)
2961
    {
2962 2963 2964 2965 2966 2967
        case WINED3D_SHADER_REL_OP_GT: return "GT";
        case WINED3D_SHADER_REL_OP_EQ: return "EQ";
        case WINED3D_SHADER_REL_OP_GE: return "GE";
        case WINED3D_SHADER_REL_OP_LT: return "LT";
        case WINED3D_SHADER_REL_OP_NE: return "NE";
        case WINED3D_SHADER_REL_OP_LE: return "LE";
2968
        default:
2969
            FIXME("Unrecognized operator %#x.\n", op);
2970 2971 2972 2973
            return "(\?\?)";
    }
}

2974
static enum wined3d_shader_rel_op invert_compare(enum wined3d_shader_rel_op op)
2975
{
2976
    switch (op)
2977
    {
2978 2979 2980 2981 2982 2983
        case WINED3D_SHADER_REL_OP_GT: return WINED3D_SHADER_REL_OP_LE;
        case WINED3D_SHADER_REL_OP_EQ: return WINED3D_SHADER_REL_OP_NE;
        case WINED3D_SHADER_REL_OP_GE: return WINED3D_SHADER_REL_OP_LT;
        case WINED3D_SHADER_REL_OP_LT: return WINED3D_SHADER_REL_OP_GE;
        case WINED3D_SHADER_REL_OP_NE: return WINED3D_SHADER_REL_OP_EQ;
        case WINED3D_SHADER_REL_OP_LE: return WINED3D_SHADER_REL_OP_GT;
2984
        default:
2985
            FIXME("Unrecognized operator %#x.\n", op);
2986 2987 2988 2989
            return -1;
    }
}

2990 2991
static void shader_hw_breakc(const struct wined3d_shader_instruction *ins)
{
2992
    struct wined3d_shader_buffer *buffer = ins->ctx->buffer;
2993
    BOOL vshader = shader_is_vshader_version(ins->ctx->reg_maps->shader_version.type);
2994 2995 2996
    const struct control_frame *control_frame = find_last_loop(ins->ctx->backend_data);
    char src_name0[50];
    char src_name1[50];
2997
    const char *comp = get_compare(ins->flags);
2998 2999 3000 3001 3002 3003 3004 3005 3006 3007

    shader_arb_get_src_param(ins, &ins->src[0], 0, src_name0);
    shader_arb_get_src_param(ins, &ins->src[1], 1, src_name1);

    if(vshader)
    {
        /* SUBC CC, src0, src1" works only in pixel shaders, so use TA to throw
         * away the subtraction result
         */
        shader_addline(buffer, "SUBC TA, %s, %s;\n", src_name0, src_name1);
3008
        shader_addline(buffer, "BRA loop_%u_end (%s.x);\n", control_frame->no.loop, comp);
3009 3010 3011
    }
    else
    {
3012
        shader_addline(buffer, "SUBC TA, %s, %s;\n", src_name0, src_name1);
3013 3014 3015 3016
        shader_addline(buffer, "BRK (%s.x);\n", comp);
    }
}

3017 3018
static void shader_hw_ifc(const struct wined3d_shader_instruction *ins)
{
3019
    struct wined3d_shader_buffer *buffer = ins->ctx->buffer;
3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032
    struct shader_arb_ctx_priv *priv = ins->ctx->backend_data;
    struct list *e = list_head(&priv->control_frames);
    struct control_frame *control_frame = LIST_ENTRY(e, struct control_frame, entry);
    const char *comp;
    char src_name0[50];
    char src_name1[50];
    BOOL vshader = shader_is_vshader_version(ins->ctx->reg_maps->shader_version.type);

    shader_arb_get_src_param(ins, &ins->src[0], 0, src_name0);
    shader_arb_get_src_param(ins, &ins->src[1], 1, src_name1);

    if(vshader)
    {
3033 3034
        /* Invert the flag. We jump to the else label if the condition is NOT true */
        comp = get_compare(invert_compare(ins->flags));
3035
        shader_addline(buffer, "SUBC TA, %s, %s;\n", src_name0, src_name1);
3036
        shader_addline(buffer, "BRA ifc_%u_else (%s.x);\n", control_frame->no.ifc, comp);
3037 3038 3039
    }
    else
    {
3040
        comp = get_compare(ins->flags);
3041
        shader_addline(buffer, "SUBC TA, %s, %s;\n", src_name0, src_name1);
3042 3043 3044 3045 3046 3047
        shader_addline(buffer, "IF %s.x;\n", comp);
    }
}

static void shader_hw_else(const struct wined3d_shader_instruction *ins)
{
3048
    struct wined3d_shader_buffer *buffer = ins->ctx->buffer;
3049 3050 3051 3052 3053 3054 3055
    struct shader_arb_ctx_priv *priv = ins->ctx->backend_data;
    struct list *e = list_head(&priv->control_frames);
    struct control_frame *control_frame = LIST_ENTRY(e, struct control_frame, entry);
    BOOL vshader = shader_is_vshader_version(ins->ctx->reg_maps->shader_version.type);

    if(vshader)
    {
3056 3057
        shader_addline(buffer, "BRA ifc_%u_endif;\n", control_frame->no.ifc);
        shader_addline(buffer, "ifc_%u_else:\n", control_frame->no.ifc);
3058 3059 3060 3061 3062 3063 3064 3065 3066 3067
        control_frame->had_else = TRUE;
    }
    else
    {
        shader_addline(buffer, "ELSE;\n");
    }
}

static void shader_hw_endif(const struct wined3d_shader_instruction *ins)
{
3068
    struct wined3d_shader_buffer *buffer = ins->ctx->buffer;
3069 3070 3071 3072 3073 3074 3075 3076 3077
    struct shader_arb_ctx_priv *priv = ins->ctx->backend_data;
    struct list *e = list_head(&priv->control_frames);
    struct control_frame *control_frame = LIST_ENTRY(e, struct control_frame, entry);
    BOOL vshader = shader_is_vshader_version(ins->ctx->reg_maps->shader_version.type);

    if(vshader)
    {
        if(control_frame->had_else)
        {
3078
            shader_addline(buffer, "ifc_%u_endif:\n", control_frame->no.ifc);
3079 3080 3081 3082
        }
        else
        {
            shader_addline(buffer, "#No else branch. else is endif\n");
3083
            shader_addline(buffer, "ifc_%u_else:\n", control_frame->no.ifc);
3084 3085 3086 3087 3088 3089 3090 3091
        }
    }
    else
    {
        shader_addline(buffer, "ENDIF;\n");
    }
}

3092 3093 3094 3095 3096
static void shader_hw_texldd(const struct wined3d_shader_instruction *ins)
{
    DWORD sampler_idx = ins->src[1].reg.idx;
    char reg_dest[40];
    char reg_src[3][40];
3097
    WORD flags = TEX_DERIV;
3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109

    shader_arb_get_dst_param(ins, &ins->dst[0], reg_dest);
    shader_arb_get_src_param(ins, &ins->src[0], 0, reg_src[0]);
    shader_arb_get_src_param(ins, &ins->src[2], 1, reg_src[1]);
    shader_arb_get_src_param(ins, &ins->src[3], 2, reg_src[2]);

    if (ins->flags & WINED3DSI_TEXLD_PROJECT) flags |= TEX_PROJ;
    if (ins->flags & WINED3DSI_TEXLD_BIAS) flags |= TEX_BIAS;

    shader_hw_sample(ins, sampler_idx, reg_dest, reg_src[0], flags, reg_src[1], reg_src[2]);
}

3110 3111 3112 3113 3114
static void shader_hw_texldl(const struct wined3d_shader_instruction *ins)
{
    DWORD sampler_idx = ins->src[1].reg.idx;
    char reg_dest[40];
    char reg_coord[40];
3115
    WORD flags = TEX_LOD;
3116 3117 3118 3119 3120 3121 3122 3123 3124 3125

    shader_arb_get_dst_param(ins, &ins->dst[0], reg_dest);
    shader_arb_get_src_param(ins, &ins->src[0], 0, reg_coord);

    if (ins->flags & WINED3DSI_TEXLD_PROJECT) flags |= TEX_PROJ;
    if (ins->flags & WINED3DSI_TEXLD_BIAS) flags |= TEX_BIAS;

    shader_hw_sample(ins, sampler_idx, reg_dest, reg_coord, flags, NULL, NULL);
}

3126 3127
static void shader_hw_label(const struct wined3d_shader_instruction *ins)
{
3128
    struct wined3d_shader_buffer *buffer = ins->ctx->buffer;
3129 3130
    struct shader_arb_ctx_priv *priv = ins->ctx->backend_data;

3131
    priv->in_main_func = FALSE;
3132 3133 3134 3135 3136 3137 3138 3139
    /* Call instructions activate the NV extensions, not labels and rets. If there is an uncalled
     * subroutine, don't generate a label that will make GL complain
     */
    if(priv->target_version == ARB) return;

    shader_addline(buffer, "l%u:\n", ins->src[0].reg.idx);
}

3140 3141
static void vshader_add_footer(struct shader_arb_ctx_priv *priv_ctx,
        const struct arb_vshader_private *shader_data, const struct arb_vs_compile_args *args,
3142
        const struct wined3d_shader_reg_maps *reg_maps, const struct wined3d_gl_info *gl_info,
3143
        struct wined3d_shader_buffer *buffer)
3144 3145 3146 3147 3148 3149 3150 3151
{
    unsigned int i;

    /* The D3DRS_FOGTABLEMODE render state defines if the shader-generated fog coord is used
     * or if the fragment depth is used. If the fragment depth is used(FOGTABLEMODE != NONE),
     * the fog frag coord is thrown away. If the fog frag coord is used, but not written by
     * the shader, it is set to 0.0(fully fogged, since start = 1.0, end = 0.0)
     */
3152
    if (args->super.fog_src == VS_FOG_Z)
3153
    {
3154
        shader_addline(buffer, "MOV result.fogcoord, TMP_OUT.z;\n");
3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172
    }
    else
    {
        if (!reg_maps->fog)
        {
            /* posFixup.x is always 1.0, so we can safely use it */
            shader_addline(buffer, "ADD result.fogcoord, posFixup.x, -posFixup.x;\n");
        }
        else
        {
            /* Clamp fogcoord */
            const char *zero = arb_get_helper_value(reg_maps->shader_version.type, ARB_ZERO);
            const char *one = arb_get_helper_value(reg_maps->shader_version.type, ARB_ONE);

            shader_addline(buffer, "MIN TMP_FOGCOORD.x, TMP_FOGCOORD.x, %s;\n", one);
            shader_addline(buffer, "MAX result.fogcoord.x, TMP_FOGCOORD.x, %s;\n", zero);
        }
    }
3173

3174 3175
    /* Clipplanes are always stored without y inversion */
    if (use_nv_clip(gl_info) && priv_ctx->target_version >= NV2)
3176
    {
3177
        if (args->super.clip_enabled)
3178
        {
3179
            for (i = 0; i < priv_ctx->vs_clipplanes; i++)
3180 3181 3182
            {
                shader_addline(buffer, "DP4 result.clip[%u].x, TMP_OUT, state.clip[%u].plane;\n", i, i);
            }
3183 3184
        }
    }
3185
    else if (args->clip.boolclip.clip_texcoord)
3186 3187 3188
    {
        unsigned int cur_clip = 0;
        char component[4] = {'x', 'y', 'z', 'w'};
3189
        const char *zero = arb_get_helper_value(WINED3D_SHADER_TYPE_VERTEX, ARB_ZERO);
3190

3191
        for (i = 0; i < gl_info->limits.clipplanes; ++i)
3192
        {
3193
            if (args->clip.boolclip.clipplane_mask & (1 << i))
3194 3195 3196 3197 3198
            {
                shader_addline(buffer, "DP4 TA.%c, TMP_OUT, state.clip[%u].plane;\n",
                               component[cur_clip++], i);
            }
        }
3199
        switch (cur_clip)
3200 3201
        {
            case 0:
3202
                shader_addline(buffer, "MOV TA, %s;\n", zero);
3203 3204
                break;
            case 1:
3205
                shader_addline(buffer, "MOV TA.yzw, %s;\n", zero);
3206 3207
                break;
            case 2:
3208
                shader_addline(buffer, "MOV TA.zw, %s;\n", zero);
3209 3210
                break;
            case 3:
3211
                shader_addline(buffer, "MOV TA.w, %s;\n", zero);
3212 3213 3214
                break;
        }
        shader_addline(buffer, "MOV result.texcoord[%u], TA;\n",
3215
                       args->clip.boolclip.clip_texcoord - 1);
3216 3217
    }

3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228
    /* Write the final position.
     *
     * OpenGL coordinates specify the center of the pixel while d3d coords specify
     * the corner. The offsets are stored in z and w in posFixup. posFixup.y contains
     * 1.0 or -1.0 to turn the rendering upside down for offscreen rendering. PosFixup.x
     * contains 1.0 to allow a mad, but arb vs swizzles are too restricted for that.
     */
    shader_addline(buffer, "MUL TA, posFixup, TMP_OUT.w;\n");
    shader_addline(buffer, "ADD TMP_OUT.x, TMP_OUT.x, TA.z;\n");
    shader_addline(buffer, "MAD TMP_OUT.y, TMP_OUT.y, posFixup.y, TA.w;\n");

3229 3230 3231
    /* Z coord [0;1]->[-1;1] mapping, see comment in transform_projection in state.c
     * and the glsl equivalent
     */
3232
    if (need_helper_const(shader_data, reg_maps, gl_info))
3233
    {
3234 3235
        const char *two = arb_get_helper_value(WINED3D_SHADER_TYPE_VERTEX, ARB_TWO);
        shader_addline(buffer, "MAD TMP_OUT.z, TMP_OUT.z, %s, -TMP_OUT.w;\n", two);
3236 3237 3238
    }
    else
    {
3239 3240 3241 3242 3243 3244 3245 3246 3247
        shader_addline(buffer, "ADD TMP_OUT.z, TMP_OUT.z, TMP_OUT.z;\n");
        shader_addline(buffer, "ADD TMP_OUT.z, TMP_OUT.z, -TMP_OUT.w;\n");
    }

    shader_addline(buffer, "MOV result.position, TMP_OUT;\n");

    priv_ctx->footer_written = TRUE;
}

3248 3249
static void shader_hw_ret(const struct wined3d_shader_instruction *ins)
{
3250
    struct wined3d_shader_buffer *buffer = ins->ctx->buffer;
3251
    struct shader_arb_ctx_priv *priv = ins->ctx->backend_data;
3252
    const struct wined3d_shader *shader = ins->ctx->shader;
3253
    BOOL vshader = shader_is_vshader_version(ins->ctx->reg_maps->shader_version.type);
3254 3255 3256

    if(priv->target_version == ARB) return;

3257 3258
    if(vshader)
    {
3259
        if (priv->in_main_func) vshader_add_footer(priv, shader->backend_data,
3260
                priv->cur_vs_args, ins->ctx->reg_maps, ins->ctx->gl_info, buffer);
3261 3262
    }

3263 3264 3265 3266 3267
    shader_addline(buffer, "RET;\n");
}

static void shader_hw_call(const struct wined3d_shader_instruction *ins)
{
3268
    struct wined3d_shader_buffer *buffer = ins->ctx->buffer;
3269 3270 3271
    shader_addline(buffer, "CAL l%u;\n", ins->src[0].reg.idx);
}

Henri Verbeet's avatar
Henri Verbeet committed
3272
/* GL locking is done by the caller */
3273
static GLuint create_arb_blt_vertex_program(const struct wined3d_gl_info *gl_info)
3274
{
3275
    GLuint program_id = 0;
3276 3277
    GLint pos;

3278 3279 3280 3281 3282
    const char *blt_vprogram =
        "!!ARBvp1.0\n"
        "PARAM c[1] = { { 1, 0.5 } };\n"
        "MOV result.position, vertex.position;\n"
        "MOV result.color, c[0].x;\n"
3283
        "MOV result.texcoord[0], vertex.texcoord[0];\n"
3284 3285 3286 3287
        "END\n";

    GL_EXTCALL(glGenProgramsARB(1, &program_id));
    GL_EXTCALL(glBindProgramARB(GL_VERTEX_PROGRAM_ARB, program_id));
3288 3289 3290
    GL_EXTCALL(glProgramStringARB(GL_VERTEX_PROGRAM_ARB, GL_PROGRAM_FORMAT_ASCII_ARB,
            strlen(blt_vprogram), blt_vprogram));
    checkGLcall("glProgramStringARB()");
3291

3292 3293 3294
    glGetIntegerv(GL_PROGRAM_ERROR_POSITION_ARB, &pos);
    if (pos != -1)
    {
3295
        FIXME("Vertex program error at position %d: %s\n\n", pos,
3296
            debugstr_a((const char *)glGetString(GL_PROGRAM_ERROR_STRING_ARB)));
3297
        shader_arb_dump_program_source(blt_vprogram);
3298
    }
3299 3300 3301 3302 3303 3304 3305 3306
    else
    {
        GLint native;

        GL_EXTCALL(glGetProgramivARB(GL_FRAGMENT_PROGRAM_ARB, GL_PROGRAM_UNDER_NATIVE_LIMITS_ARB, &native));
        checkGLcall("glGetProgramivARB()");
        if (!native) WARN("Program exceeds native resource limits.\n");
    }
3307 3308 3309 3310

    return program_id;
}

3311
/* GL locking is done by the caller */
3312 3313
static GLuint create_arb_blt_fragment_program(const struct wined3d_gl_info *gl_info,
        enum tex_types tex_type, BOOL masked)
3314
{
3315
    GLuint program_id = 0;
3316
    const char *fprogram;
3317 3318
    GLint pos;

3319
    static const char * const blt_fprograms_full[tex_type_count] =
3320 3321 3322 3323
    {
        /* tex_1d */
        NULL,
        /* tex_2d */
3324 3325 3326 3327
        "!!ARBfp1.0\n"
        "TEMP R0;\n"
        "TEX R0.x, fragment.texcoord[0], texture[0], 2D;\n"
        "MOV result.depth.z, R0.x;\n"
3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344
        "END\n",
        /* tex_3d */
        NULL,
        /* tex_cube */
        "!!ARBfp1.0\n"
        "TEMP R0;\n"
        "TEX R0.x, fragment.texcoord[0], texture[0], CUBE;\n"
        "MOV result.depth.z, R0.x;\n"
        "END\n",
        /* tex_rect */
        "!!ARBfp1.0\n"
        "TEMP R0;\n"
        "TEX R0.x, fragment.texcoord[0], texture[0], RECT;\n"
        "MOV result.depth.z, R0.x;\n"
        "END\n",
    };

3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384
    static const char * const blt_fprograms_masked[tex_type_count] =
    {
        /* tex_1d */
        NULL,
        /* tex_2d */
        "!!ARBfp1.0\n"
        "PARAM mask = program.local[0];\n"
        "TEMP R0;\n"
        "SLT R0.xy, fragment.position, mask.zwzw;\n"
        "MUL R0.x, R0.x, R0.y;\n"
        "KIL -R0.x;\n"
        "TEX R0.x, fragment.texcoord[0], texture[0], 2D;\n"
        "MOV result.depth.z, R0.x;\n"
        "END\n",
        /* tex_3d */
        NULL,
        /* tex_cube */
        "!!ARBfp1.0\n"
        "PARAM mask = program.local[0];\n"
        "TEMP R0;\n"
        "SLT R0.xy, fragment.position, mask.zwzw;\n"
        "MUL R0.x, R0.x, R0.y;\n"
        "KIL -R0.x;\n"
        "TEX R0.x, fragment.texcoord[0], texture[0], CUBE;\n"
        "MOV result.depth.z, R0.x;\n"
        "END\n",
        /* tex_rect */
        "!!ARBfp1.0\n"
        "PARAM mask = program.local[0];\n"
        "TEMP R0;\n"
        "SLT R0.xy, fragment.position, mask.zwzw;\n"
        "MUL R0.x, R0.x, R0.y;\n"
        "KIL -R0.x;\n"
        "TEX R0.x, fragment.texcoord[0], texture[0], RECT;\n"
        "MOV result.depth.z, R0.x;\n"
        "END\n",
    };

    fprogram = masked ? blt_fprograms_masked[tex_type] : blt_fprograms_full[tex_type];
    if (!fprogram)
3385
    {
3386
        FIXME("tex_type %#x not supported, falling back to tex_2d\n", tex_type);
3387
        tex_type = tex_2d;
3388
        fprogram = masked ? blt_fprograms_masked[tex_type] : blt_fprograms_full[tex_type];
3389
    }
3390 3391 3392

    GL_EXTCALL(glGenProgramsARB(1, &program_id));
    GL_EXTCALL(glBindProgramARB(GL_FRAGMENT_PROGRAM_ARB, program_id));
3393
    GL_EXTCALL(glProgramStringARB(GL_FRAGMENT_PROGRAM_ARB, GL_PROGRAM_FORMAT_ASCII_ARB, strlen(fprogram), fprogram));
3394
    checkGLcall("glProgramStringARB()");
3395

3396 3397 3398
    glGetIntegerv(GL_PROGRAM_ERROR_POSITION_ARB, &pos);
    if (pos != -1)
    {
3399
        FIXME("Fragment program error at position %d: %s\n\n", pos,
3400
            debugstr_a((const char *)glGetString(GL_PROGRAM_ERROR_STRING_ARB)));
3401
        shader_arb_dump_program_source(fprogram);
3402
    }
3403 3404 3405 3406 3407 3408 3409 3410
    else
    {
        GLint native;

        GL_EXTCALL(glGetProgramivARB(GL_FRAGMENT_PROGRAM_ARB, GL_PROGRAM_UNDER_NATIVE_LIMITS_ARB, &native));
        checkGLcall("glGetProgramivARB()");
        if (!native) WARN("Program exceeds native resource limits.\n");
    }
3411 3412 3413 3414

    return program_id;
}

3415 3416 3417
/* Appends sRGB write correction for the xyz components of fragcolor.
 * See GLX_EXT_framebuffer_sRGB.
 *
 * With condcode set, condition codes select between the two branches of the
 * transfer function and only tmp1 is used as a scratch register. Without it,
 * both branches are computed into tmp1 / tmp2 and blended with the 1.0 / 0.0
 * masks held in tmp3 / tmp4. No explicit [0.0;1.0] clamping is emitted; that
 * is not needed because it is done implicitly. */
static void arbfp_add_sRGB_correction(struct wined3d_shader_buffer *buffer, const char *fragcolor,
        const char *tmp1, const char *tmp2, const char *tmp3, const char *tmp4, BOOL condcode)
{
    if (!condcode)
    {
        /* Calculate the > 0.0031308 case */
        shader_addline(buffer, "POW %s.x, %s.x, srgb_consts1.z;\n", tmp1, fragcolor);
        shader_addline(buffer, "POW %s.y, %s.y, srgb_consts1.z;\n", tmp1, fragcolor);
        shader_addline(buffer, "POW %s.z, %s.z, srgb_consts1.z;\n", tmp1, fragcolor);
        shader_addline(buffer, "MUL %s, %s, srgb_consts1.w;\n", tmp1, tmp1);
        shader_addline(buffer, "SUB %s, %s, srgb_consts2.x;\n", tmp1, tmp1);
        /* Calculate the < case */
        shader_addline(buffer, "MUL %s, srgb_consts1.x, %s;\n", tmp2, fragcolor);
        /* Get 1.0 / 0.0 masks for > 0.0031308 and < 0.0031308 */
        shader_addline(buffer, "SLT %s, srgb_consts1.y, %s;\n", tmp3, fragcolor);
        shader_addline(buffer, "SGE %s, srgb_consts1.y, %s;\n", tmp4, fragcolor);
        /* Store the components > 0.0031308 in the destination */
        shader_addline(buffer, "MUL %s.xyz, %s, %s;\n", fragcolor, tmp1, tmp3);
        /* Add the components that are < 0.0031308 */
        shader_addline(buffer, "MAD %s.xyz, %s, %s, %s;\n", fragcolor, tmp2, tmp4, fragcolor);
        /* Move everything into result.color at once. Nvidia hardware cannot handle partial
         * result.color writes(.rgb first, then .a), or handle overwriting already written
         * components. The assembler uses a temporary register in this case, which is usually
         * not allocated from one of our registers that were used earlier.
         */
        return;
    }

    /* Sigh. MOVC CC doesn't work, so use one of the temps as dummy dest */
    shader_addline(buffer, "SUBC %s, %s.x, srgb_consts1.y;\n", tmp1, fragcolor);
    /* Calculate the > 0.0031308 case */
    shader_addline(buffer, "POW %s.x (GE), %s.x, srgb_consts1.z;\n", fragcolor, fragcolor);
    shader_addline(buffer, "POW %s.y (GE), %s.y, srgb_consts1.z;\n", fragcolor, fragcolor);
    shader_addline(buffer, "POW %s.z (GE), %s.z, srgb_consts1.z;\n", fragcolor, fragcolor);
    shader_addline(buffer, "MUL %s.xyz (GE), %s, srgb_consts1.w;\n", fragcolor, fragcolor);
    shader_addline(buffer, "SUB %s.xyz (GE), %s, srgb_consts2.x;\n", fragcolor, fragcolor);
    /* Calculate the < case */
    shader_addline(buffer, "MUL %s.xyz (LT), srgb_consts1.x, %s;\n", fragcolor, fragcolor);
}

3459
/* Searches the shader's local integer constants (shader->constantsI) for the
 * one with index idx and returns a pointer to its value, or NULL if the
 * shader defines no such constant. */
static const DWORD *find_loop_control_values(const struct wined3d_shader *shader, DWORD idx)
{
    const struct wined3d_shader_lconst *lconst;

    LIST_FOR_EACH_ENTRY(lconst, &shader->constantsI, struct wined3d_shader_lconst, entry)
    {
        if (lconst->idx != idx)
            continue;

        return lconst->value;
    }

    return NULL;
}

3473
static void init_ps_input(const struct wined3d_shader *shader,
3474
        const struct arb_ps_compile_args *args, struct shader_arb_ctx_priv *priv)
3475
{
3476
    static const char * const texcoords[8] =
3477 3478 3479 3480 3481
    {
        "fragment.texcoord[0]", "fragment.texcoord[1]", "fragment.texcoord[2]", "fragment.texcoord[3]",
        "fragment.texcoord[4]", "fragment.texcoord[5]", "fragment.texcoord[6]", "fragment.texcoord[7]"
    };
    unsigned int i;
3482
    const struct wined3d_shader_signature_element *sig = shader->input_signature;
3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501
    const char *semantic_name;
    DWORD semantic_idx;

    switch(args->super.vp_mode)
    {
        case pretransformed:
        case fixedfunction:
            /* The pixelshader has to collect the varyings on its own. In any case properly load
             * color0 and color1. In the case of pretransformed vertices also load texcoords. Set
             * other attribs to 0.0.
             *
             * For fixedfunction this behavior is correct, according to the tests. For pretransformed
             * we'd either need a replacement shader that can load other attribs like BINORMAL, or
             * load the texcoord attrib pointers to match the pixel shader signature
             */
            for(i = 0; i < MAX_REG_INPUT; i++)
            {
                semantic_name = sig[i].semantic_name;
                semantic_idx = sig[i].semantic_idx;
3502
                if (!semantic_name) continue;
3503

3504
                if (shader_match_semantic(semantic_name, WINED3D_DECL_USAGE_COLOR))
3505
                {
3506
                    if (!semantic_idx) priv->ps_input[i] = "fragment.color.primary";
3507 3508 3509 3510 3511 3512 3513
                    else if(semantic_idx == 1) priv->ps_input[i] = "fragment.color.secondary";
                    else priv->ps_input[i] = "0.0";
                }
                else if(args->super.vp_mode == fixedfunction)
                {
                    priv->ps_input[i] = "0.0";
                }
3514
                else if(shader_match_semantic(semantic_name, WINED3D_DECL_USAGE_TEXCOORD))
3515 3516 3517 3518
                {
                    if(semantic_idx < 8) priv->ps_input[i] = texcoords[semantic_idx];
                    else priv->ps_input[i] = "0.0";
                }
3519
                else if(shader_match_semantic(semantic_name, WINED3D_DECL_USAGE_FOG))
3520
                {
3521
                    if (!semantic_idx) priv->ps_input[i] = "fragment.fogcoord";
3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546
                    else priv->ps_input[i] = "0.0";
                }
                else
                {
                    priv->ps_input[i] = "0.0";
                }

                TRACE("v%u, semantic %s%u is %s\n", i, semantic_name, semantic_idx, priv->ps_input[i]);
            }
            break;

        case vertexshader:
            /* That one is easy. The vertex shaders provide v0-v7 in fragment.texcoord and v8 and v9 in
             * fragment.color
             */
            for(i = 0; i < 8; i++)
            {
                priv->ps_input[i] = texcoords[i];
            }
            priv->ps_input[8] = "fragment.color.primary";
            priv->ps_input[9] = "fragment.color.secondary";
            break;
    }
}

3547
/* GL locking is done by the caller */
3548
static GLuint shader_arb_generate_pshader(const struct wined3d_shader *shader,
3549
        const struct wined3d_gl_info *gl_info, struct wined3d_shader_buffer *buffer,
3550
        const struct arb_ps_compile_args *args, struct arb_ps_compiled_shader *compiled)
3551
{
3552
    const struct wined3d_shader_reg_maps *reg_maps = &shader->reg_maps;
3553
    const struct wined3d_shader_lconst *lconst;
3554
    const DWORD *function = shader->function;
3555
    GLuint retval;
3556
    char fragcolor[16];
3557
    DWORD *lconst_map = local_const_mapping(shader), next_local;
3558
    struct shader_arb_ctx_priv priv_ctx;
3559
    BOOL dcl_td = FALSE;
3560
    BOOL want_nv_prog = FALSE;
3561
    struct arb_pshader_private *shader_priv = shader->backend_data;
3562
    GLint errPos;
3563
    DWORD map;
3564

3565
    char srgbtmp[4][4];
3566 3567
    unsigned int i, found = 0;

3568 3569 3570
    for (i = 0, map = reg_maps->temporary; map; map >>= 1, ++i)
    {
        if (!(map & 1)
3571
                || (shader->u.ps.color0_mov && i == shader->u.ps.color0_reg)
3572
                || (reg_maps->shader_version.major < 2 && !i))
3573
            continue;
3574

3575 3576 3577
        sprintf(srgbtmp[found], "R%u", i);
        ++found;
        if (found == 4) break;
3578 3579 3580 3581 3582 3583 3584
    }

    switch(found) {
        case 0:
            sprintf(srgbtmp[0], "TA");
            sprintf(srgbtmp[1], "TB");
            sprintf(srgbtmp[2], "TC");
3585 3586
            sprintf(srgbtmp[3], "TD");
            dcl_td = TRUE;
3587 3588 3589 3590
            break;
        case 1:
            sprintf(srgbtmp[1], "TA");
            sprintf(srgbtmp[2], "TB");
3591
            sprintf(srgbtmp[3], "TC");
3592 3593 3594
            break;
        case 2:
            sprintf(srgbtmp[2], "TA");
3595 3596 3597 3598
            sprintf(srgbtmp[3], "TB");
            break;
        case 3:
            sprintf(srgbtmp[3], "TA");
3599
            break;
3600 3601
        case 4:
            break;
3602
    }
3603 3604

    /*  Create the hw ARB shader */
3605
    memset(&priv_ctx, 0, sizeof(priv_ctx));
3606
    priv_ctx.cur_ps_args = args;
3607
    priv_ctx.compiled_fprog = compiled;
3608
    priv_ctx.cur_np2fixup_info = &compiled->np2fixup_info;
3609
    init_ps_input(shader, args, &priv_ctx);
3610
    list_init(&priv_ctx.control_frames);
3611

3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622
    /* Avoid enabling NV_fragment_program* if we do not need it.
     *
     * Enabling GL_NV_fragment_program_option causes the driver to occupy a temporary register,
     * and it slows down the shader execution noticeably(about 5%). Usually our instruction emulation
     * is faster than what we gain from using higher native instructions. There are some things though
     * that cannot be emulated. In that case enable the extensions.
     * If the extension is enabled, instruction handlers that support both ways will use it.
     *
     * Testing shows no performance difference between OPTION NV_fragment_program2 and NV_fragment_program.
     * So enable the best we can get.
     */
3623
    if(reg_maps->usesdsx || reg_maps->usesdsy || reg_maps->loop_depth > 0 || reg_maps->usestexldd ||
3624
       reg_maps->usestexldl || reg_maps->usesfacing || reg_maps->usesifc || reg_maps->usescall)
3625 3626 3627 3628
    {
        want_nv_prog = TRUE;
    }

3629
    shader_addline(buffer, "!!ARBfp1.0\n");
3630 3631
    if (want_nv_prog && gl_info->supported[NV_FRAGMENT_PROGRAM2])
    {
3632 3633
        shader_addline(buffer, "OPTION NV_fragment_program2;\n");
        priv_ctx.target_version = NV3;
3634 3635 3636
    }
    else if (want_nv_prog && gl_info->supported[NV_FRAGMENT_PROGRAM_OPTION])
    {
3637 3638 3639
        shader_addline(buffer, "OPTION NV_fragment_program;\n");
        priv_ctx.target_version = NV2;
    } else {
3640 3641 3642 3643 3644 3645 3646 3647
        if(want_nv_prog)
        {
            /* This is an error - either we're advertising the wrong shader version, or aren't enforcing some
             * limits properly
             */
            ERR("The shader requires instructions that are not available in plain GL_ARB_fragment_program\n");
            ERR("Try GLSL\n");
        }
3648 3649
        priv_ctx.target_version = ARB;
    }
3650

3651
    if (reg_maps->rt_mask > 1)
3652 3653 3654 3655
    {
        shader_addline(buffer, "OPTION ARB_draw_buffers;\n");
    }

3656 3657
    if (reg_maps->shader_version.major < 3)
    {
3658
        switch(args->super.fog) {
3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672
            case FOG_OFF:
                break;
            case FOG_LINEAR:
                shader_addline(buffer, "OPTION ARB_fog_linear;\n");
                break;
            case FOG_EXP:
                shader_addline(buffer, "OPTION ARB_fog_exp;\n");
                break;
            case FOG_EXP2:
                shader_addline(buffer, "OPTION ARB_fog_exp2;\n");
                break;
        }
    }

3673 3674 3675 3676
    /* For now always declare the temps. At least the Nvidia assembler optimizes completely
     * unused temps away(but occupies them for the whole shader if they're used once). Always
     * declaring them avoids tricky bookkeeping work
     */
3677 3678 3679
    shader_addline(buffer, "TEMP TA;\n");      /* Used for modifiers */
    shader_addline(buffer, "TEMP TB;\n");      /* Used for modifiers */
    shader_addline(buffer, "TEMP TC;\n");      /* Used for modifiers */
3680
    if(dcl_td) shader_addline(buffer, "TEMP TD;\n"); /* Used for sRGB writing */
3681 3682
    shader_addline(buffer, "PARAM coefdiv = { 0.5, 0.25, 0.125, 0.0625 };\n");
    shader_addline(buffer, "PARAM coefmul = { 2, 4, 8, 16 };\n");
3683
    shader_addline(buffer, "PARAM ps_helper_const = { 0.0, 1.0, %1.10f, 0.0 };\n", eps);
3684

3685 3686
    if (reg_maps->shader_version.major < 2)
    {
3687
        strcpy(fragcolor, "R0");
3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698
    }
    else
    {
        if (args->super.srgb_correction)
        {
            if (shader->u.ps.color0_mov)
            {
                sprintf(fragcolor, "R%u", shader->u.ps.color0_reg);
            }
            else
            {
3699 3700 3701
                shader_addline(buffer, "TEMP TMP_COLOR;\n");
                strcpy(fragcolor, "TMP_COLOR");
            }
3702
        } else {
3703
            strcpy(fragcolor, "result.color");
3704
        }
3705 3706
    }

3707
    if(args->super.srgb_correction) {
3708 3709 3710 3711 3712 3713
        shader_addline(buffer, "PARAM srgb_consts1 = {%f, %f, %f, %f};\n",
                       srgb_mul_low, srgb_cmp, srgb_pow, srgb_mul_high);
        shader_addline(buffer, "PARAM srgb_consts2 = {%f, %f, %f, %f};\n",
                       srgb_sub_high, 0.0, 0.0, 0.0);
    }

3714
    /* Base Declarations */
3715 3716
    next_local = shader_generate_arb_declarations(shader, reg_maps,
            buffer, gl_info, lconst_map, NULL, &priv_ctx);
3717

3718 3719
    for (i = 0, map = reg_maps->bumpmat; map; map >>= 1, ++i)
    {
3720 3721
        unsigned char bump_const;

3722
        if (!(map & 1)) continue;
3723

3724 3725 3726 3727 3728
        bump_const = compiled->numbumpenvmatconsts;
        compiled->bumpenvmatconst[bump_const].const_num = WINED3D_CONST_NUM_UNUSED;
        compiled->bumpenvmatconst[bump_const].texunit = i;
        compiled->luminanceconst[bump_const].const_num = WINED3D_CONST_NUM_UNUSED;
        compiled->luminanceconst[bump_const].texunit = i;
3729 3730 3731 3732 3733 3734 3735 3736 3737 3738

        /* We can fit the constants into the constant limit for sure because texbem, texbeml, bem and beml are only supported
         * in 1.x shaders, and GL_ARB_fragment_program has a constant limit of 24 constants. So in the worst case we're loading
         * 8 shader constants, 8 bump matrices and 8 luminance parameters and are perfectly fine. (No NP2 fixup on bumpmapped
         * textures due to conditional NP2 restrictions)
         *
         * Use local constants to load the bump env parameters, not program.env. This avoids collisions with d3d constants of
         * shaders in newer shader models. Since the bump env parameters have to share their space with NP2 fixup constants,
         * their location is shader dependent anyway and they cannot be loaded globally.
         */
3739
        compiled->bumpenvmatconst[bump_const].const_num = next_local++;
3740
        shader_addline(buffer, "PARAM bumpenvmat%d = program.local[%d];\n",
3741 3742
                       i, compiled->bumpenvmatconst[bump_const].const_num);
        compiled->numbumpenvmatconsts = bump_const + 1;
3743

3744
        if (!(reg_maps->luminanceparams & (1 << i))) continue;
3745

3746
        compiled->luminanceconst[bump_const].const_num = next_local++;
3747
        shader_addline(buffer, "PARAM luminance%d = program.local[%d];\n",
3748
                       i, compiled->luminanceconst[bump_const].const_num);
3749
    }
3750

3751 3752 3753 3754 3755
    for(i = 0; i < MAX_CONST_I; i++)
    {
        compiled->int_consts[i] = WINED3D_CONST_NUM_UNUSED;
        if (reg_maps->integer_constants & (1 << i) && priv_ctx.target_version >= NV2)
        {
3756
            const DWORD *control_values = find_loop_control_values(shader, i);
3757 3758 3759 3760 3761 3762 3763 3764 3765 3766 3767 3768 3769 3770 3771

            if(control_values)
            {
                shader_addline(buffer, "PARAM I%u = {%u, %u, %u, -1};\n", i,
                                control_values[0], control_values[1], control_values[2]);
            }
            else
            {
                compiled->int_consts[i] = next_local;
                compiled->num_int_consts++;
                shader_addline(buffer, "PARAM I%u = program.local[%u];\n", i, next_local++);
            }
        }
    }

3772 3773 3774 3775
    if(reg_maps->vpos || reg_maps->usesdsy)
    {
        compiled->ycorrection = next_local;
        shader_addline(buffer, "PARAM ycorrection = program.local[%u];\n", next_local++);
3776 3777 3778 3779 3780 3781 3782 3783 3784 3785 3786 3787

        if(reg_maps->vpos)
        {
            shader_addline(buffer, "TEMP vpos;\n");
            /* ycorrection.x: Backbuffer height(onscreen) or 0(offscreen).
             * ycorrection.y: -1.0(onscreen), 1.0(offscreen)
             * ycorrection.z: 1.0
             * ycorrection.w: 0.0
             */
            shader_addline(buffer, "MAD vpos, fragment.position, ycorrection.zyww, ycorrection.wxww;\n");
            shader_addline(buffer, "FLR vpos.xy, vpos;\n");
        }
3788 3789 3790 3791 3792 3793
    }
    else
    {
        compiled->ycorrection = WINED3D_CONST_NUM_UNUSED;
    }

3794 3795 3796 3797 3798 3799 3800
    /* Load constants to fixup NP2 texcoords if there are still free constants left:
     * Constants (texture dimensions) for the NP2 fixup are loaded as local program parameters. This will consume
     * at most 8 (MAX_FRAGMENT_SAMPLERS / 2) parameters, which is highly unlikely, since the application had to
     * use 16 NP2 textures at the same time. In case that we run out of constants the fixup is simply not
     * applied / activated. This will probably result in wrong rendering of the texture, but will save us from
     * shader compilation errors and the subsequent errors when drawing with this shader. */
    if (priv_ctx.cur_ps_args->super.np2_fixup) {
3801
        unsigned char cur_fixup_sampler = 0;
3802 3803 3804

        struct arb_ps_np2fixup_info* const fixup = priv_ctx.cur_np2fixup_info;
        const WORD map = priv_ctx.cur_ps_args->super.np2_fixup;
3805
        const UINT max_lconsts = gl_info->limits.arb_ps_local_constants;
3806 3807 3808 3809 3810 3811 3812

        fixup->offset = next_local;
        fixup->super.active = 0;

        for (i = 0; i < MAX_FRAGMENT_SAMPLERS; ++i) {
            if (!(map & (1 << i))) continue;

3813
            if (fixup->offset + (cur_fixup_sampler >> 1) < max_lconsts) {
3814
                fixup->super.active |= (1 << i);
3815
                fixup->super.idx[i] = cur_fixup_sampler++;
3816 3817 3818 3819 3820 3821 3822
            } else {
                FIXME("No free constant found to load NP2 fixup data into shader. "
                      "Sampling from this texture will probably look wrong.\n");
                break;
            }
        }

3823
        fixup->super.num_consts = (cur_fixup_sampler + 1) >> 1;
3824 3825 3826 3827 3828 3829
        if (fixup->super.num_consts) {
            shader_addline(buffer, "PARAM np2fixup[%u] = { program.env[%u..%u] };\n",
                           fixup->super.num_consts, fixup->offset, fixup->super.num_consts + fixup->offset - 1);
        }
    }

3830
    if (shader_priv->clipplane_emulation != ~0U && args->clip)
3831
    {
3832
        shader_addline(buffer, "KIL fragment.texcoord[%u];\n", shader_priv->clipplane_emulation);
3833 3834
    }

3835
    /* Base Shader Body */
3836
    shader_generate_main(shader, buffer, reg_maps, function, &priv_ctx);
3837

3838
    if(args->super.srgb_correction) {
3839 3840
        arbfp_add_sRGB_correction(buffer, fragcolor, srgbtmp[0], srgbtmp[1], srgbtmp[2], srgbtmp[3],
                                  priv_ctx.target_version >= NV2);
3841 3842 3843
    }

    if(strcmp(fragcolor, "result.color")) {
3844
        shader_addline(buffer, "MOV result.color, %s;\n", fragcolor);
3845 3846 3847 3848
    }
    shader_addline(buffer, "END\n");

    /* TODO: change to resource.glObjectHandle or something like that */
3849
    GL_EXTCALL(glGenProgramsARB(1, &retval));
3850

3851 3852
    TRACE("Creating a hw pixel shader, prg=%d\n", retval);
    GL_EXTCALL(glBindProgramARB(GL_FRAGMENT_PROGRAM_ARB, retval));
3853

3854
    TRACE("Created hw pixel shader, prg=%d\n", retval);
3855 3856 3857
    /* Create the program and check for errors */
    GL_EXTCALL(glProgramStringARB(GL_FRAGMENT_PROGRAM_ARB, GL_PROGRAM_FORMAT_ASCII_ARB,
               buffer->bsize, buffer->buffer));
3858
    checkGLcall("glProgramStringARB()");
3859

3860 3861 3862
    glGetIntegerv(GL_PROGRAM_ERROR_POSITION_ARB, &errPos);
    if (errPos != -1)
    {
3863
        FIXME("HW PixelShader Error at position %d: %s\n\n",
3864
              errPos, debugstr_a((const char *)glGetString(GL_PROGRAM_ERROR_STRING_ARB)));
3865
        shader_arb_dump_program_source(buffer->buffer);
3866
        retval = 0;
3867
    }
3868 3869 3870 3871 3872 3873 3874 3875
    else
    {
        GLint native;

        GL_EXTCALL(glGetProgramivARB(GL_FRAGMENT_PROGRAM_ARB, GL_PROGRAM_UNDER_NATIVE_LIMITS_ARB, &native));
        checkGLcall("glGetProgramivARB()");
        if (!native) WARN("Program exceeds native resource limits.\n");
    }
3876 3877

    /* Load immediate constants */
3878 3879
    if (lconst_map)
    {
3880
        LIST_FOR_EACH_ENTRY(lconst, &shader->constantsF, struct wined3d_shader_lconst, entry)
3881
        {
3882
            const float *value = (const float *)lconst->value;
3883
            GL_EXTCALL(glProgramLocalParameter4fvARB(GL_FRAGMENT_PROGRAM_ARB, lconst_map[lconst->idx], value));
3884 3885
            checkGLcall("glProgramLocalParameter4fvARB");
        }
3886
        HeapFree(GetProcessHeap(), 0, lconst_map);
3887
    }
3888 3889

    return retval;
3890 3891
}

3892 3893 3894 3895 3896 3897 3898
/* Order two pixel shader input signatures, each an array of MAX_REG_INPUT
 * elements. Elements are compared slot by slot; the first difference decides
 * the result. Returns 0 if the signatures are equal, otherwise a negative or
 * positive value. */
static int compare_sig(const struct wined3d_shader_signature_element *sig1, const struct wined3d_shader_signature_element *sig2)
{
    unsigned int slot;

    for (slot = 0; slot < MAX_REG_INPUT; ++slot)
    {
        const struct wined3d_shader_signature_element *lhs = &sig1[slot];
        const struct wined3d_shader_signature_element *rhs = &sig2[slot];
        int name_order;

        if (lhs->semantic_name && rhs->semantic_name)
        {
            name_order = strcmp(lhs->semantic_name, rhs->semantic_name);
            if (name_order)
                return name_order;

            if (lhs->semantic_idx != rhs->semantic_idx)
                return lhs->semantic_idx < rhs->semantic_idx ? -1 : 1;
            if (lhs->sysval_semantic != rhs->sysval_semantic)
                return lhs->sysval_semantic < rhs->sysval_semantic ? -1 : 1;
            if (lhs->component_type != rhs->component_type)
                return lhs->component_type < rhs->component_type ? -1 : 1;
            if (lhs->register_idx != rhs->register_idx)
                return lhs->register_idx < rhs->register_idx ? -1 : 1;
            if (lhs->mask != rhs->mask)
                return lhs->mask < rhs->mask ? -1 : 1;
        }
        else if (lhs->semantic_name != rhs->semantic_name)
        {
            /* Exactly one of the elements does not exist (NULL name). Order
             * by the pointer values, the string contents are not looked at. */
            return lhs->semantic_name < rhs->semantic_name ? -1 : 1;
        }
        /* Both names NULL: the slot is unused in both signatures. */
    }

    return 0;
}

/* Create a heap copy of a MAX_REG_INPUT element input signature. Slots
 * without a semantic name stay zeroed in the copy; for all other slots the
 * element is copied and its semantic name string is duplicated, so the copy
 * does not reference the strings of the source signature. */
static struct wined3d_shader_signature_element *clone_sig(const struct wined3d_shader_signature_element *sig)
{
    struct wined3d_shader_signature_element *copy;
    unsigned int slot;

    copy = HeapAlloc(GetProcessHeap(), HEAP_ZERO_MEMORY, sizeof(*copy) * MAX_REG_INPUT);

    for (slot = 0; slot < MAX_REG_INPUT; ++slot)
    {
        const char *src_name = sig[slot].semantic_name;
        size_t name_size;
        char *name_copy;

        if (!src_name)
            continue;

        /* Duplicate the semantic string, including the terminator. */
        name_size = strlen(src_name) + 1;
        name_copy = HeapAlloc(GetProcessHeap(), 0, name_size);
        memcpy(name_copy, src_name, name_size);

        copy[slot] = sig[slot];
        copy[slot].semantic_name = name_copy;
    }

    return copy;
}

static DWORD find_input_signature(struct shader_arb_priv *priv, const struct wined3d_shader_signature_element *sig)
{
    struct wine_rb_entry *entry = wine_rb_get(&priv->signature_tree, sig);
    struct ps_signature *found_sig;

3941
    if (entry)
3942 3943 3944 3945 3946 3947 3948 3949 3950 3951 3952 3953 3954 3955 3956 3957
    {
        found_sig = WINE_RB_ENTRY_VALUE(entry, struct ps_signature, entry);
        TRACE("Found existing signature %u\n", found_sig->idx);
        return found_sig->idx;
    }
    found_sig = HeapAlloc(GetProcessHeap(), HEAP_ZERO_MEMORY, sizeof(*sig));
    found_sig->sig = clone_sig(sig);
    found_sig->idx = priv->ps_sig_number++;
    TRACE("New signature stored and assigned number %u\n", found_sig->idx);
    if(wine_rb_put(&priv->signature_tree, sig, &found_sig->entry) == -1)
    {
        ERR("Failed to insert program entry.\n");
    }
    return found_sig->idx;
}

3958
/* Set up the names of the GL output registers a vertex shader writes to.
 *
 * shader:   The vertex shader being compiled.
 * sig_num:  Input signature index of the pixel shader the vertex shader is
 *           linked with, or ~0 if the pixel shader reads the builtin varyings.
 * priv_ctx: Receives the output names in texcrd_output, color_output,
 *           fog_output and vs_output.
 * compiled: need_color_unclamp is set if a declared varying ends up in one of
 *           the result.color registers.
 *
 * Outputs that are not read by the pixel shader are directed to the "TA"
 * temporary, i.e. they are thrown away. */
static void init_output_registers(const struct wined3d_shader *shader, DWORD sig_num,
        struct shader_arb_ctx_priv *priv_ctx, struct arb_vs_compiled_shader *compiled)
{
    unsigned int i, j;
    static const char * const texcoords[8] =
    {
        "result.texcoord[0]", "result.texcoord[1]", "result.texcoord[2]", "result.texcoord[3]",
        "result.texcoord[4]", "result.texcoord[5]", "result.texcoord[6]", "result.texcoord[7]"
    };
    struct wined3d_device *device = shader->device;
    const struct wined3d_shader_signature_element *sig;
    const char *semantic_name;
    const char *target;
    DWORD semantic_idx, reg_idx;

    /* Write generic input varyings 0 to 7 to result.texcoord[], varying 8 to result.color.primary
     * and varying 9 to result.color.secondary. Entries past the initialized ones are NULL.
     */
    static const char * const decl_idx_to_string[MAX_REG_INPUT] =
    {
        "result.texcoord[0]", "result.texcoord[1]", "result.texcoord[2]", "result.texcoord[3]",
        "result.texcoord[4]", "result.texcoord[5]", "result.texcoord[6]", "result.texcoord[7]",
        "result.color.primary", "result.color.secondary"
    };

    if(sig_num == ~0)
    {
        TRACE("Pixel shader uses builtin varyings\n");
        /* Map builtins to builtins */
        for(i = 0; i < 8; i++)
        {
            priv_ctx->texcrd_output[i] = texcoords[i];
        }
        priv_ctx->color_output[0] = "result.color.primary";
        priv_ctx->color_output[1] = "result.color.secondary";
        priv_ctx->fog_output = "TMP_FOGCOORD";

        /* Map declared regs to builtins. Use "TA" to /dev/null unread output */
        for (i = 0; i < (sizeof(shader->output_signature) / sizeof(*shader->output_signature)); ++i)
        {
            semantic_name = shader->output_signature[i].semantic_name;
            if (!semantic_name) continue;

            if (shader_match_semantic(semantic_name, WINED3D_DECL_USAGE_POSITION))
            {
                TRACE("o%u is TMP_OUT\n", i);
                if (!shader->output_signature[i].semantic_idx) priv_ctx->vs_output[i] = "TMP_OUT";
                else priv_ctx->vs_output[i] = "TA";
            }
            else if (shader_match_semantic(semantic_name, WINED3D_DECL_USAGE_PSIZE))
            {
                TRACE("o%u is result.pointsize\n", i);
                if (!shader->output_signature[i].semantic_idx) priv_ctx->vs_output[i] = "result.pointsize";
                else priv_ctx->vs_output[i] = "TA";
            }
            else if (shader_match_semantic(semantic_name, WINED3D_DECL_USAGE_COLOR))
            {
                TRACE("o%u is result.color.?, idx %u\n", i, shader->output_signature[i].semantic_idx);
                if (!shader->output_signature[i].semantic_idx)
                    priv_ctx->vs_output[i] = "result.color.primary";
                else if (shader->output_signature[i].semantic_idx == 1)
                    priv_ctx->vs_output[i] = "result.color.secondary";
                else priv_ctx->vs_output[i] = "TA";
            }
            else if (shader_match_semantic(semantic_name, WINED3D_DECL_USAGE_TEXCOORD))
            {
                if (shader->output_signature[i].semantic_idx >= 8) priv_ctx->vs_output[i] = "TA";
                else priv_ctx->vs_output[i] = texcoords[shader->output_signature[i].semantic_idx];
                /* Trace only after the index was range checked, texcoords[]
                 * has just 8 entries. */
                TRACE("o%u is %s\n", i, priv_ctx->vs_output[i]);
            }
            else if (shader_match_semantic(semantic_name, WINED3D_DECL_USAGE_FOG))
            {
                TRACE("o%u is result.fogcoord\n", i);
                if (shader->output_signature[i].semantic_idx > 0) priv_ctx->vs_output[i] = "TA";
                else priv_ctx->vs_output[i] = "result.fogcoord";
            }
            else
            {
                priv_ctx->vs_output[i] = "TA";
            }
        }
        return;
    }

    /* Instead of searching for the signature in the signature list, read the one from the current pixel shader.
     * It may not be the shader the signature came from, but it is the same signature and faster to find.
     */
    sig = device->stateBlock->state.pixel_shader->input_signature;
    TRACE("Pixel shader uses declared varyings\n");

    /* Map builtin to declared. /dev/null the results by default to the TA temp reg */
    for(i = 0; i < 8; i++)
    {
        priv_ctx->texcrd_output[i] = "TA";
    }
    priv_ctx->color_output[0] = "TA";
    priv_ctx->color_output[1] = "TA";
    priv_ctx->fog_output = "TA";

    for(i = 0; i < MAX_REG_INPUT; i++)
    {
        semantic_name = sig[i].semantic_name;
        semantic_idx = sig[i].semantic_idx;
        reg_idx = sig[i].register_idx;
        if (!semantic_name) continue;

        /* Only the first entries of decl_idx_to_string[] name a varying.
         * Skip registers without one instead of handing NULL to strcmp()
         * or reading past the end of the table. */
        target = reg_idx < MAX_REG_INPUT ? decl_idx_to_string[reg_idx] : NULL;
        if (!target)
        {
            WARN("No varying available for pixel shader input register %u, ignoring.\n", reg_idx);
            continue;
        }

        /* If a declared input register is not written by builtin arguments, don't write to it.
         * GL_NV_vertex_program makes sure the input defaults to 0.0, which is correct with D3D
         *
         * Don't care about POSITION and PSIZE here - this is a builtin vertex shader, position goes
         * to TMP_OUT in any case
         */
        if (shader_match_semantic(semantic_name, WINED3D_DECL_USAGE_TEXCOORD))
        {
            if (semantic_idx < 8)
                priv_ctx->texcrd_output[semantic_idx] = target;
        }
        else if (shader_match_semantic(semantic_name, WINED3D_DECL_USAGE_COLOR))
        {
            if (semantic_idx < 2)
                priv_ctx->color_output[semantic_idx] = target;
        }
        else if(shader_match_semantic(semantic_name, WINED3D_DECL_USAGE_FOG))
        {
            if (!semantic_idx)
                priv_ctx->fog_output = target;
        }
        else
        {
            continue;
        }

        if (!strcmp(target, "result.color.primary")
                || !strcmp(target, "result.color.secondary"))
        {
            compiled->need_color_unclamp = TRUE;
        }
    }

    /* Map declared to declared */
    for (i = 0; i < (sizeof(shader->output_signature) / sizeof(*shader->output_signature)); ++i)
    {
        /* Write unread output to TA to throw them away */
        priv_ctx->vs_output[i] = "TA";
        semantic_name = shader->output_signature[i].semantic_name;
        if (!semantic_name) continue;

        if (shader_match_semantic(semantic_name, WINED3D_DECL_USAGE_POSITION)
                && !shader->output_signature[i].semantic_idx)
        {
            priv_ctx->vs_output[i] = "TMP_OUT";
            continue;
        }
        else if (shader_match_semantic(semantic_name, WINED3D_DECL_USAGE_PSIZE)
                && !shader->output_signature[i].semantic_idx)
        {
            priv_ctx->vs_output[i] = "result.pointsize";
            continue;
        }

        /* Find the pixel shader input with the same semantic and route the
         * output to the varying assigned to that input register. */
        for(j = 0; j < MAX_REG_INPUT; j++)
        {
            if (!sig[j].semantic_name) continue;

            if (!strcmp(sig[j].semantic_name, semantic_name)
                    && sig[j].semantic_idx == shader->output_signature[i].semantic_idx)
            {
                reg_idx = sig[j].register_idx;
                target = reg_idx < MAX_REG_INPUT ? decl_idx_to_string[reg_idx] : NULL;
                if (!target)
                {
                    WARN("No varying available for pixel shader input register %u, ignoring.\n", reg_idx);
                    continue;
                }

                priv_ctx->vs_output[i] = target;

                if (!strcmp(priv_ctx->vs_output[i], "result.color.primary")
                        || !strcmp(priv_ctx->vs_output[i], "result.color.secondary"))
                {
                    compiled->need_color_unclamp = TRUE;
                }
            }
        }
    }
}

4136
/* GL locking is done by the caller */
4137
static GLuint shader_arb_generate_vshader(const struct wined3d_shader *shader,
4138
        const struct wined3d_gl_info *gl_info, struct wined3d_shader_buffer *buffer,
4139
        const struct arb_vs_compile_args *args, struct arb_vs_compiled_shader *compiled)
4140
{
4141 4142
    const struct arb_vshader_private *shader_data = shader->backend_data;
    const struct wined3d_shader_reg_maps *reg_maps = &shader->reg_maps;
4143
    const struct wined3d_shader_lconst *lconst;
4144
    const DWORD *function = shader->function;
4145
    GLuint ret;
4146
    DWORD next_local, *lconst_map = local_const_mapping(shader);
4147
    struct shader_arb_ctx_priv priv_ctx;
4148
    unsigned int i;
4149
    GLint errPos;
4150

4151
    memset(&priv_ctx, 0, sizeof(priv_ctx));
4152
    priv_ctx.cur_vs_args = args;
4153
    list_init(&priv_ctx.control_frames);
4154
    init_output_registers(shader, args->ps_signature, &priv_ctx, compiled);
4155

4156 4157
    /*  Create the hw ARB shader */
    shader_addline(buffer, "!!ARBvp1.0\n");
4158

4159 4160 4161
    /* Always enable the NV extension if available. Unlike fragment shaders, there is no
     * mesurable performance penalty, and we can always make use of it for clipplanes.
     */
4162 4163
    if (gl_info->supported[NV_VERTEX_PROGRAM3])
    {
4164 4165 4166
        shader_addline(buffer, "OPTION NV_vertex_program3;\n");
        priv_ctx.target_version = NV3;
        shader_addline(buffer, "ADDRESS aL;\n");
4167 4168 4169
    }
    else if (gl_info->supported[NV_VERTEX_PROGRAM2_OPTION])
    {
4170 4171
        shader_addline(buffer, "OPTION NV_vertex_program2;\n");
        priv_ctx.target_version = NV2;
4172
        shader_addline(buffer, "ADDRESS aL;\n");
4173 4174 4175 4176
    } else {
        priv_ctx.target_version = ARB;
    }

4177
    shader_addline(buffer, "TEMP TMP_OUT;\n");
4178 4179
    if (reg_maps->fog)
        shader_addline(buffer, "TEMP TMP_FOGCOORD;\n");
4180
    if (need_helper_const(shader_data, reg_maps, gl_info))
4181
    {
4182
        shader_addline(buffer, "PARAM helper_const = { 0.0, 1.0, 2.0, %1.10f};\n", eps);
4183
    }
4184
    if (need_rel_addr_const(shader_data, reg_maps, gl_info))
4185 4186
    {
        shader_addline(buffer, "PARAM rel_addr_const = { 0.5, %d.0, 0.0, 0.0 };\n", shader_data->rel_offset);
4187
        shader_addline(buffer, "TEMP A0_SHADOW;\n");
4188
    }
4189

4190
    shader_addline(buffer, "TEMP TA;\n");
4191
    shader_addline(buffer, "TEMP TB;\n");
4192 4193

    /* Base Declarations */
4194 4195
    next_local = shader_generate_arb_declarations(shader, reg_maps, buffer,
            gl_info, lconst_map, &priv_ctx.vs_clipplanes, &priv_ctx);
4196 4197 4198 4199 4200 4201

    for(i = 0; i < MAX_CONST_I; i++)
    {
        compiled->int_consts[i] = WINED3D_CONST_NUM_UNUSED;
        if(reg_maps->integer_constants & (1 << i) && priv_ctx.target_version >= NV2)
        {
4202
            const DWORD *control_values = find_loop_control_values(shader, i);
4203 4204 4205 4206 4207 4208 4209 4210 4211 4212 4213 4214 4215 4216

            if(control_values)
            {
                shader_addline(buffer, "PARAM I%u = {%u, %u, %u, -1};\n", i,
                                control_values[0], control_values[1], control_values[2]);
            }
            else
            {
                compiled->int_consts[i] = next_local;
                compiled->num_int_consts++;
                shader_addline(buffer, "PARAM I%u = program.local[%u];\n", i, next_local++);
            }
        }
    }
4217 4218

    /* We need a constant to fixup the final position */
4219 4220
    shader_addline(buffer, "PARAM posFixup = program.local[%u];\n", next_local);
    compiled->pos_fixup = next_local++;
4221

4222 4223 4224
    /* Initialize output parameters. GL_ARB_vertex_program does not require special initialization values
     * for output parameters. D3D in theory does not do that either, but some applications depend on a
     * proper initialization of the secondary color, and programs using the fixed function pipeline without
4225
     * a replacement shader depend on the texcoord.w being set properly.
4226 4227
     *
     * GL_NV_vertex_program defines that all output values are initialized to {0.0, 0.0, 0.0, 1.0}. This
Austin English's avatar
Austin English committed
4228
     * assertion is in effect even when using GL_ARB_vertex_program without any NV specific additions. So
4229
     * skip this if NV_vertex_program is supported. Otherwise, initialize the secondary color. For the tex-
4230
     * coords, we have a flag in the opengl caps. Many cards do not require the texcoord being set, and
4231 4232
     * this can eat a number of instructions, so skip it unless this cap is set as well
     */
4233 4234
    if (!gl_info->supported[NV_VERTEX_PROGRAM])
    {
4235
        struct wined3d_device *device = shader->device;
4236 4237
        const char *color_init = arb_get_helper_value(WINED3D_SHADER_TYPE_VERTEX, ARB_0001);
        shader_addline(buffer, "MOV result.color.secondary, %s;\n", color_init);
4238

4239
        if (gl_info->quirks & WINED3D_QUIRK_SET_TEXCOORD_W && !device->frag_pipe->ffp_proj_control)
4240
        {
4241
            int i;
4242 4243 4244
            const char *one = arb_get_helper_value(WINED3D_SHADER_TYPE_VERTEX, ARB_ONE);
            for(i = 0; i < min(8, MAX_REG_TEXCRD); i++)
            {
4245
                if (reg_maps->texcoord_mask[i] && reg_maps->texcoord_mask[i] != WINED3DSP_WRITEMASK_ALL)
4246
                    shader_addline(buffer, "MOV result.texcoord[%u].w, %s\n", i, one);
4247 4248 4249 4250
            }
        }
    }

4251 4252
    /* The shader starts with the main function */
    priv_ctx.in_main_func = TRUE;
4253
    /* Base Shader Body */
4254
    shader_generate_main(shader, buffer, reg_maps, function, &priv_ctx);
4255

4256 4257
    if (!priv_ctx.footer_written) vshader_add_footer(&priv_ctx,
            shader_data, args, reg_maps, gl_info, buffer);
4258 4259 4260 4261

    shader_addline(buffer, "END\n");

    /* TODO: change to resource.glObjectHandle or something like that */
4262
    GL_EXTCALL(glGenProgramsARB(1, &ret));
4263

4264 4265
    TRACE("Creating a hw vertex shader, prg=%d\n", ret);
    GL_EXTCALL(glBindProgramARB(GL_VERTEX_PROGRAM_ARB, ret));
4266

4267
    TRACE("Created hw vertex shader, prg=%d\n", ret);
4268 4269 4270
    /* Create the program and check for errors */
    GL_EXTCALL(glProgramStringARB(GL_VERTEX_PROGRAM_ARB, GL_PROGRAM_FORMAT_ASCII_ARB,
               buffer->bsize, buffer->buffer));
4271
    checkGLcall("glProgramStringARB()");
4272

4273 4274 4275
    glGetIntegerv(GL_PROGRAM_ERROR_POSITION_ARB, &errPos);
    if (errPos != -1)
    {
4276
        FIXME("HW VertexShader Error at position %d: %s\n\n",
4277
              errPos, debugstr_a((const char *)glGetString(GL_PROGRAM_ERROR_STRING_ARB)));
4278
        shader_arb_dump_program_source(buffer->buffer);
4279
        ret = -1;
4280 4281 4282 4283 4284 4285 4286 4287 4288
    }
    else
    {
        GLint native;

        GL_EXTCALL(glGetProgramivARB(GL_FRAGMENT_PROGRAM_ARB, GL_PROGRAM_UNDER_NATIVE_LIMITS_ARB, &native));
        checkGLcall("glGetProgramivARB()");
        if (!native) WARN("Program exceeds native resource limits.\n");

4289
        /* Load immediate constants */
4290 4291
        if (lconst_map)
        {
4292
            LIST_FOR_EACH_ENTRY(lconst, &shader->constantsF, struct wined3d_shader_lconst, entry)
4293
            {
4294
                const float *value = (const float *)lconst->value;
4295
                GL_EXTCALL(glProgramLocalParameter4fvARB(GL_VERTEX_PROGRAM_ARB, lconst_map[lconst->idx], value));
4296
            }
4297 4298
        }
    }
4299 4300
    HeapFree(GetProcessHeap(), 0, lconst_map);

4301
    return ret;
4302 4303
}

4304
/* GL locking is done by the caller */
4305
static struct arb_ps_compiled_shader *find_arb_pshader(struct wined3d_shader *shader,
4306
        const struct arb_ps_compile_args *args)
4307
{
4308
    struct wined3d_device *device = shader->device;
4309
    const struct wined3d_gl_info *gl_info = &device->adapter->gl_info;
4310 4311
    UINT i;
    DWORD new_size;
4312
    struct arb_ps_compiled_shader *new_array;
4313
    struct wined3d_shader_buffer buffer;
4314 4315 4316
    struct arb_pshader_private *shader_data;
    GLuint ret;

4317
    if (!shader->backend_data)
4318
    {
4319 4320
        struct shader_arb_priv *priv = device->shader_priv;

4321 4322 4323
        shader->backend_data = HeapAlloc(GetProcessHeap(), HEAP_ZERO_MEMORY, sizeof(*shader_data));
        shader_data = shader->backend_data;
        shader_data->clamp_consts = shader->reg_maps.shader_version.major == 1;
4324

4325 4326 4327 4328
        if (shader->reg_maps.shader_version.major < 3)
            shader_data->input_signature_idx = ~0;
        else
            shader_data->input_signature_idx = find_input_signature(priv, shader->input_signature);
4329 4330 4331

        TRACE("Shader got assigned input signature index %u\n", shader_data->input_signature_idx);

4332
        if (!device->vs_clipping)
4333
            shader_data->clipplane_emulation = shader_find_free_input_register(&shader->reg_maps,
4334
                    gl_info->limits.texture_stages - 1);
4335 4336
        else
            shader_data->clipplane_emulation = ~0U;
4337
    }
4338
    shader_data = shader->backend_data;
4339 4340 4341 4342 4343

    /* Usually we have very few GL shaders for each d3d shader(just 1 or maybe 2),
     * so a linear search is more performant than a hashmap or a binary search
     * (cache coherency etc)
     */
4344 4345 4346
    for (i = 0; i < shader_data->num_gl_shaders; ++i)
    {
        if (!memcmp(&shader_data->gl_shaders[i].args, args, sizeof(*args)))
4347
            return &shader_data->gl_shaders[i];
4348 4349 4350
    }

    TRACE("No matching GL shader found, compiling a new shader\n");
4351 4352
    if(shader_data->shader_array_size == shader_data->num_gl_shaders) {
        if (shader_data->num_gl_shaders)
4353
        {
4354
            new_size = shader_data->shader_array_size + max(1, shader_data->shader_array_size / 2);
4355
            new_array = HeapReAlloc(GetProcessHeap(), HEAP_ZERO_MEMORY, shader_data->gl_shaders,
4356
                                    new_size * sizeof(*shader_data->gl_shaders));
4357
        } else {
4358
            new_array = HeapAlloc(GetProcessHeap(), HEAP_ZERO_MEMORY, sizeof(*shader_data->gl_shaders));
4359 4360 4361 4362 4363 4364 4365
            new_size = 1;
        }

        if(!new_array) {
            ERR("Out of memory\n");
            return 0;
        }
4366 4367
        shader_data->gl_shaders = new_array;
        shader_data->shader_array_size = new_size;
4368 4369
    }

4370
    shader_data->gl_shaders[shader_data->num_gl_shaders].args = *args;
4371

4372
    pixelshader_update_samplers(&shader->reg_maps, device->stateBlock->state.textures);
4373

4374 4375 4376 4377 4378 4379
    if (!shader_buffer_init(&buffer))
    {
        ERR("Failed to initialize shader buffer.\n");
        return 0;
    }

4380 4381
    ret = shader_arb_generate_pshader(shader, gl_info, &buffer, args,
            &shader_data->gl_shaders[shader_data->num_gl_shaders]);
4382
    shader_buffer_free(&buffer);
4383
    shader_data->gl_shaders[shader_data->num_gl_shaders].prgId = ret;
4384

4385
    return &shader_data->gl_shaders[shader_data->num_gl_shaders++];
4386 4387
}

4388
static inline BOOL vs_args_equal(const struct arb_vs_compile_args *stored, const struct arb_vs_compile_args *new,
4389
                                 const DWORD use_map, BOOL skip_int) {
4390
    if((stored->super.swizzle_map & use_map) != new->super.swizzle_map) return FALSE;
4391
    if(stored->super.clip_enabled != new->super.clip_enabled) return FALSE;
4392
    if(stored->super.fog_src != new->super.fog_src) return FALSE;
4393
    if(stored->clip.boolclip_compare != new->clip.boolclip_compare) return FALSE;
4394
    if(stored->ps_signature != new->ps_signature) return FALSE;
4395
    if(stored->vertex.samplers_compare != new->vertex.samplers_compare) return FALSE;
4396 4397
    if(skip_int) return TRUE;

4398
    return !memcmp(stored->loop_ctrl, new->loop_ctrl, sizeof(stored->loop_ctrl));
4399 4400
}

4401
static struct arb_vs_compiled_shader *find_arb_vshader(struct wined3d_shader *shader,
4402
        const struct arb_vs_compile_args *args)
4403
{
4404
    struct wined3d_device *device = shader->device;
4405 4406
    const struct wined3d_gl_info *gl_info = &device->adapter->gl_info;
    DWORD use_map = device->strided_streams.use_map;
4407
    UINT i;
4408 4409
    DWORD new_size;
    struct arb_vs_compiled_shader *new_array;
4410
    struct wined3d_shader_buffer buffer;
4411
    struct arb_vshader_private *shader_data;
4412 4413
    GLuint ret;

4414
    if (!shader->backend_data)
4415
    {
4416
        const struct wined3d_shader_reg_maps *reg_maps = &shader->reg_maps;
4417

4418 4419
        shader->backend_data = HeapAlloc(GetProcessHeap(), HEAP_ZERO_MEMORY, sizeof(*shader_data));
        shader_data = shader->backend_data;
4420 4421 4422 4423 4424 4425 4426 4427 4428 4429 4430 4431 4432 4433 4434

        if ((gl_info->quirks & WINED3D_QUIRK_ARB_VS_OFFSET_LIMIT)
                && reg_maps->min_rel_offset <= reg_maps->max_rel_offset)
        {
            if (reg_maps->max_rel_offset - reg_maps->min_rel_offset > 127)
            {
                FIXME("The difference between the minimum and maximum relative offset is > 127.\n");
                FIXME("Which this OpenGL implementation does not support. Try using GLSL.\n");
                FIXME("Min: %u, Max: %u.\n", reg_maps->min_rel_offset, reg_maps->max_rel_offset);
            }
            else if (reg_maps->max_rel_offset - reg_maps->min_rel_offset > 63)
                shader_data->rel_offset = reg_maps->min_rel_offset + 63;
            else if (reg_maps->max_rel_offset > 63)
                shader_data->rel_offset = reg_maps->min_rel_offset;
        }
4435
    }
4436
    shader_data = shader->backend_data;
4437

4438 4439 4440 4441
    /* Usually we have very few GL shaders for each d3d shader(just 1 or maybe 2),
     * so a linear search is more performant than a hashmap or a binary search
     * (cache coherency etc)
     */
4442
    for(i = 0; i < shader_data->num_gl_shaders; i++) {
4443 4444 4445
        if (vs_args_equal(&shader_data->gl_shaders[i].args, args,
                use_map, gl_info->supported[NV_VERTEX_PROGRAM2_OPTION]))
        {
4446
            return &shader_data->gl_shaders[i];
4447 4448 4449 4450 4451
        }
    }

    TRACE("No matching GL shader found, compiling a new shader\n");

4452 4453
    if(shader_data->shader_array_size == shader_data->num_gl_shaders) {
        if (shader_data->num_gl_shaders)
4454
        {
4455
            new_size = shader_data->shader_array_size + max(1, shader_data->shader_array_size / 2);
4456
            new_array = HeapReAlloc(GetProcessHeap(), HEAP_ZERO_MEMORY, shader_data->gl_shaders,
4457
                                    new_size * sizeof(*shader_data->gl_shaders));
4458
        } else {
4459
            new_array = HeapAlloc(GetProcessHeap(), HEAP_ZERO_MEMORY, sizeof(*shader_data->gl_shaders));
4460 4461 4462 4463 4464 4465 4466
            new_size = 1;
        }

        if(!new_array) {
            ERR("Out of memory\n");
            return 0;
        }
4467 4468
        shader_data->gl_shaders = new_array;
        shader_data->shader_array_size = new_size;
4469 4470
    }

4471
    shader_data->gl_shaders[shader_data->num_gl_shaders].args = *args;
4472

4473 4474 4475 4476 4477 4478
    if (!shader_buffer_init(&buffer))
    {
        ERR("Failed to initialize shader buffer.\n");
        return 0;
    }

4479
    ret = shader_arb_generate_vshader(shader, gl_info, &buffer, args,
4480
            &shader_data->gl_shaders[shader_data->num_gl_shaders]);
4481
    shader_buffer_free(&buffer);
4482
    shader_data->gl_shaders[shader_data->num_gl_shaders].prgId = ret;
4483

4484
    return &shader_data->gl_shaders[shader_data->num_gl_shaders++];
4485 4486
}

4487
static void find_arb_ps_compile_args(const struct wined3d_state *state,
4488
        const struct wined3d_shader *shader, struct arb_ps_compile_args *args)
4489
{
4490
    struct wined3d_device *device = shader->device;
4491
    const struct wined3d_gl_info *gl_info = &device->adapter->gl_info;
4492
    int i;
4493
    WORD int_skip;
4494

4495
    find_ps_compile_args(state, shader, &args->super);
4496 4497

    /* This forces all local boolean constants to 1 to make them stateblock independent */
4498
    args->bools = shader->reg_maps.local_bool_consts;
4499 4500 4501

    for(i = 0; i < MAX_CONST_B; i++)
    {
4502
        if (state->ps_consts_b[i])
4503
            args->bools |= ( 1 << i);
4504
    }
4505

4506 4507 4508 4509
    /* Only enable the clip plane emulation KIL if at least one clipplane is enabled. The KIL instruction
     * is quite expensive because it forces the driver to disable early Z discards. It is cheaper to
     * duplicate the shader than have a no-op KIL instruction in every shader
     */
4510
    if (!device->vs_clipping && use_vs(state)
4511 4512
            && state->render_states[WINED3D_RS_CLIPPING]
            && state->render_states[WINED3D_RS_CLIPPLANEENABLE])
4513 4514 4515 4516
        args->clip = 1;
    else
        args->clip = 0;

4517
    /* Skip if unused or local, or supported natively */
4518
    int_skip = ~shader->reg_maps.integer_constants | shader->reg_maps.local_int_consts;
4519
    if (int_skip == 0xffff || gl_info->supported[NV_FRAGMENT_PROGRAM_OPTION])
4520
    {
4521
        memset(args->loop_ctrl, 0, sizeof(args->loop_ctrl));
4522 4523 4524 4525 4526 4527 4528 4529 4530 4531 4532 4533 4534
        return;
    }

    for(i = 0; i < MAX_CONST_I; i++)
    {
        if(int_skip & (1 << i))
        {
            args->loop_ctrl[i][0] = 0;
            args->loop_ctrl[i][1] = 0;
            args->loop_ctrl[i][2] = 0;
        }
        else
        {
4535 4536 4537
            args->loop_ctrl[i][0] = state->ps_consts_i[i * 4];
            args->loop_ctrl[i][1] = state->ps_consts_i[i * 4 + 1];
            args->loop_ctrl[i][2] = state->ps_consts_i[i * 4 + 2];
4538 4539
        }
    }
4540 4541
}

4542
static void find_arb_vs_compile_args(const struct wined3d_state *state,
4543
        const struct wined3d_shader *shader, struct arb_vs_compile_args *args)
4544
{
4545
    struct wined3d_device *device = shader->device;
4546
    const struct wined3d_gl_info *gl_info = &device->adapter->gl_info;
4547
    int i;
4548
    WORD int_skip;
4549

4550
    find_vs_compile_args(state, shader, &args->super);
4551

4552
    args->clip.boolclip_compare = 0;
4553
    if (use_ps(state))
4554
    {
4555 4556
        const struct wined3d_shader *ps = state->pixel_shader;
        const struct arb_pshader_private *shader_priv = ps->backend_data;
4557
        args->ps_signature = shader_priv->input_signature_idx;
4558

4559
        args->clip.boolclip.clip_texcoord = shader_priv->clipplane_emulation + 1;
4560 4561 4562 4563
    }
    else
    {
        args->ps_signature = ~0;
4564
        if (!device->vs_clipping && device->adapter->fragment_pipe == &arbfp_fragment_pipeline)
4565
        {
4566
            args->clip.boolclip.clip_texcoord = ffp_clip_emul(state) ? gl_info->limits.texture_stages : 0;
4567
        }
4568
        /* Otherwise: Setting boolclip_compare set clip_texcoord to 0 */
4569 4570
    }

4571
    if (args->clip.boolclip.clip_texcoord)
4572
    {
4573 4574
        if (state->render_states[WINED3D_RS_CLIPPING])
            args->clip.boolclip.clipplane_mask = (unsigned char)state->render_states[WINED3D_RS_CLIPPLANEENABLE];
4575
        /* clipplane_mask was set to 0 by setting boolclip_compare to 0 */
4576 4577
    }

4578
    /* This forces all local boolean constants to 1 to make them stateblock independent */
4579
    args->clip.boolclip.bools = shader->reg_maps.local_bool_consts;
4580 4581 4582
    /* TODO: Figure out if it would be better to store bool constants as bitmasks in the stateblock */
    for(i = 0; i < MAX_CONST_B; i++)
    {
4583
        if (state->vs_consts_b[i])
4584
            args->clip.boolclip.bools |= ( 1 << i);
4585 4586
    }

4587 4588 4589
    args->vertex.samplers[0] = device->texUnitMap[MAX_FRAGMENT_SAMPLERS + 0];
    args->vertex.samplers[1] = device->texUnitMap[MAX_FRAGMENT_SAMPLERS + 1];
    args->vertex.samplers[2] = device->texUnitMap[MAX_FRAGMENT_SAMPLERS + 2];
4590
    args->vertex.samplers[3] = 0;
4591

4592
    /* Skip if unused or local */
4593
    int_skip = ~shader->reg_maps.integer_constants | shader->reg_maps.local_int_consts;
4594 4595
    /* This is about flow control, not clipping. */
    if (int_skip == 0xffff || gl_info->supported[NV_VERTEX_PROGRAM2_OPTION])
4596
    {
4597
        memset(args->loop_ctrl, 0, sizeof(args->loop_ctrl));
4598 4599 4600 4601 4602 4603 4604 4605 4606 4607 4608 4609 4610
        return;
    }

    for(i = 0; i < MAX_CONST_I; i++)
    {
        if(int_skip & (1 << i))
        {
            args->loop_ctrl[i][0] = 0;
            args->loop_ctrl[i][1] = 0;
            args->loop_ctrl[i][2] = 0;
        }
        else
        {
4611 4612 4613
            args->loop_ctrl[i][0] = state->vs_consts_i[i * 4];
            args->loop_ctrl[i][1] = state->vs_consts_i[i * 4 + 1];
            args->loop_ctrl[i][2] = state->vs_consts_i[i * 4 + 2];
4614 4615
        }
    }
4616 4617
}

4618
/* GL locking is done by the caller */
4619 4620
static void shader_arb_select(const struct wined3d_context *context, BOOL usePS, BOOL useVS)
{
4621 4622
    struct wined3d_device *device = context->swapchain->device;
    struct shader_arb_priv *priv = device->shader_priv;
4623
    const struct wined3d_gl_info *gl_info = context->gl_info;
4624
    const struct wined3d_state *state = &device->stateBlock->state;
4625
    int i;
4626

4627
    /* Deal with pixel shaders first so the vertex shader arg function has the input signature ready */
4628 4629
    if (usePS)
    {
4630
        struct wined3d_shader *ps = state->pixel_shader;
4631
        struct arb_ps_compile_args compile_args;
4632
        struct arb_ps_compiled_shader *compiled;
4633

4634
        TRACE("Using pixel shader %p.\n", ps);
4635
        find_arb_ps_compile_args(state, ps, &compile_args);
4636
        compiled = find_arb_pshader(ps, &compile_args);
4637 4638
        priv->current_fprogram_id = compiled->prgId;
        priv->compiled_fprog = compiled;
4639 4640 4641 4642 4643 4644 4645 4646 4647 4648

        /* Bind the fragment program */
        GL_EXTCALL(glBindProgramARB(GL_FRAGMENT_PROGRAM_ARB, priv->current_fprogram_id));
        checkGLcall("glBindProgramARB(GL_FRAGMENT_PROGRAM_ARB, priv->current_fprogram_id);");

        if(!priv->use_arbfp_fixed_func) {
            /* Enable OpenGL fragment programs */
            glEnable(GL_FRAGMENT_PROGRAM_ARB);
            checkGLcall("glEnable(GL_FRAGMENT_PROGRAM_ARB);");
        }
4649 4650
        TRACE("(%p) : Bound fragment program %u and enabled GL_FRAGMENT_PROGRAM_ARB\n",
                device, priv->current_fprogram_id);
4651

4652 4653 4654
        /* Pixel Shader 1.x constants are clamped to [-1;1], Pixel Shader 2.0 constants are not. If switching between
         * a 1.x and newer shader, reload the first 8 constants
         */
4655
        if (priv->last_ps_const_clamped != ((struct arb_pshader_private *)ps->backend_data)->clamp_consts)
4656
        {
4657
            priv->last_ps_const_clamped = ((struct arb_pshader_private *)ps->backend_data)->clamp_consts;
4658
            priv->highest_dirty_ps_const = max(priv->highest_dirty_ps_const, 8);
4659 4660
            for(i = 0; i < 8; i++)
            {
4661
                priv->pshader_const_dirty[i] = 1;
4662 4663
            }
            /* Also takes care of loading local constants */
4664
            shader_arb_load_constants(context, TRUE, FALSE);
4665 4666 4667
        }
        else
        {
4668
            UINT rt_height = state->fb->render_targets[0]->resource.height;
4669
            shader_arb_ps_local_constants(compiled, context, state, rt_height);
4670
        }
4671 4672

        /* Force constant reloading for the NP2 fixup (see comment in shader_glsl_select for more info) */
4673
        if (compiled->np2fixup_info.super.active)
4674
            shader_arb_load_np2fixup_constants(priv, gl_info, state);
4675 4676 4677
    }
    else if (gl_info->supported[ARB_FRAGMENT_PROGRAM] && !priv->use_arbfp_fixed_func)
    {
4678
        /* Disable only if we're not using arbfp fixed function fragment processing. If this is used,
4679 4680 4681
        * keep GL_FRAGMENT_PROGRAM_ARB enabled, and the fixed function pipeline will bind the fixed function
        * replacement shader
        */
4682 4683 4684 4685
        glDisable(GL_FRAGMENT_PROGRAM_ARB);
        checkGLcall("glDisable(GL_FRAGMENT_PROGRAM_ARB)");
        priv->current_fprogram_id = 0;
    }
4686

4687 4688
    if (useVS)
    {
4689
        struct wined3d_shader *vs = state->vertex_shader;
4690 4691 4692
        struct arb_vs_compile_args compile_args;
        struct arb_vs_compiled_shader *compiled;

4693
        TRACE("Using vertex shader %p\n", vs);
4694
        find_arb_vs_compile_args(state, vs, &compile_args);
4695
        compiled = find_arb_vshader(vs, &compile_args);
4696 4697 4698 4699 4700 4701 4702 4703 4704 4705
        priv->current_vprogram_id = compiled->prgId;
        priv->compiled_vprog = compiled;

        /* Bind the vertex program */
        GL_EXTCALL(glBindProgramARB(GL_VERTEX_PROGRAM_ARB, priv->current_vprogram_id));
        checkGLcall("glBindProgramARB(GL_VERTEX_PROGRAM_ARB, priv->current_vprogram_id);");

        /* Enable OpenGL vertex programs */
        glEnable(GL_VERTEX_PROGRAM_ARB);
        checkGLcall("glEnable(GL_VERTEX_PROGRAM_ARB);");
4706
        TRACE("(%p) : Bound vertex program %u and enabled GL_VERTEX_PROGRAM_ARB\n", device, priv->current_vprogram_id);
4707
        shader_arb_vs_local_constants(compiled, context, state);
4708 4709 4710 4711

        if(priv->last_vs_color_unclamp != compiled->need_color_unclamp) {
            priv->last_vs_color_unclamp = compiled->need_color_unclamp;

4712 4713
            if (gl_info->supported[ARB_COLOR_BUFFER_FLOAT])
            {
4714 4715 4716 4717 4718 4719
                GL_EXTCALL(glClampColorARB(GL_CLAMP_VERTEX_COLOR_ARB, !compiled->need_color_unclamp));
                checkGLcall("glClampColorARB");
            } else {
                FIXME("vertex color clamp needs to be changed, but extension not supported.\n");
            }
        }
4720 4721 4722
    }
    else if (gl_info->supported[ARB_VERTEX_PROGRAM])
    {
4723 4724 4725 4726
        priv->current_vprogram_id = 0;
        glDisable(GL_VERTEX_PROGRAM_ARB);
        checkGLcall("glDisable(GL_VERTEX_PROGRAM_ARB)");
    }
4727 4728 4729
}

/* GL locking is done by the caller */
4730 4731
static void shader_arb_select_depth_blt(void *shader_priv, const struct wined3d_gl_info *gl_info,
        enum tex_types tex_type, const SIZE *ds_mask_size)
4732 4733 4734
{
    const float mask[] = {0.0f, 0.0f, (float)ds_mask_size->cx, (float)ds_mask_size->cy};
    BOOL masked = ds_mask_size->cx && ds_mask_size->cy;
4735
    struct shader_arb_priv *priv = shader_priv;
4736
    GLuint *blt_fprogram;
4737 4738 4739 4740 4741

    if (!priv->depth_blt_vprogram_id) priv->depth_blt_vprogram_id = create_arb_blt_vertex_program(gl_info);
    GL_EXTCALL(glBindProgramARB(GL_VERTEX_PROGRAM_ARB, priv->depth_blt_vprogram_id));
    glEnable(GL_VERTEX_PROGRAM_ARB);

4742 4743
    blt_fprogram = masked ? &priv->depth_blt_fprogram_id_masked[tex_type] : &priv->depth_blt_fprogram_id_full[tex_type];
    if (!*blt_fprogram) *blt_fprogram = create_arb_blt_fragment_program(gl_info, tex_type, masked);
4744
    GL_EXTCALL(glBindProgramARB(GL_FRAGMENT_PROGRAM_ARB, *blt_fprogram));
4745
    if (masked) GL_EXTCALL(glProgramLocalParameter4fvARB(GL_FRAGMENT_PROGRAM_ARB, 0, mask));
4746 4747 4748 4749
    glEnable(GL_FRAGMENT_PROGRAM_ARB);
}

/* GL locking is done by the caller */
4750 4751 4752
static void shader_arb_deselect_depth_blt(void *shader_priv, const struct wined3d_gl_info *gl_info)
{
    struct shader_arb_priv *priv = shader_priv;
4753 4754 4755 4756 4757

    if (priv->current_vprogram_id) {
        GL_EXTCALL(glBindProgramARB(GL_VERTEX_PROGRAM_ARB, priv->current_vprogram_id));
        checkGLcall("glBindProgramARB(GL_VERTEX_PROGRAM_ARB, vertexShader->prgId);");

4758 4759 4760 4761
        TRACE("Bound vertex program %u and enabled GL_VERTEX_PROGRAM_ARB.\n", priv->current_vprogram_id);
    }
    else
    {
4762 4763 4764 4765 4766 4767 4768 4769
        glDisable(GL_VERTEX_PROGRAM_ARB);
        checkGLcall("glDisable(GL_VERTEX_PROGRAM_ARB)");
    }

    if (priv->current_fprogram_id) {
        GL_EXTCALL(glBindProgramARB(GL_FRAGMENT_PROGRAM_ARB, priv->current_fprogram_id));
        checkGLcall("glBindProgramARB(GL_FRAGMENT_PROGRAM_ARB, pixelShader->prgId);");

4770 4771 4772 4773
        TRACE("Bound fragment program %u and enabled GL_FRAGMENT_PROGRAM_ARB.\n", priv->current_fprogram_id);
    }
    else if(!priv->use_arbfp_fixed_func)
    {
4774 4775 4776 4777 4778
        glDisable(GL_FRAGMENT_PROGRAM_ARB);
        checkGLcall("glDisable(GL_FRAGMENT_PROGRAM_ARB)");
    }
}

4779
static void shader_arb_destroy(struct wined3d_shader *shader)
4780
{
4781
    struct wined3d_device *device = shader->device;
4782
    const struct wined3d_gl_info *gl_info = &device->adapter->gl_info;
4783

4784
    if (shader_is_pshader_version(shader->reg_maps.shader_version.type))
4785
    {
4786
        struct arb_pshader_private *shader_data = shader->backend_data;
4787 4788
        UINT i;

4789
        if(!shader_data) return; /* This can happen if a shader was never compiled */
4790

4791 4792
        if (shader_data->num_gl_shaders)
        {
4793
            struct wined3d_context *context = context_acquire(device, NULL);
4794 4795 4796 4797 4798 4799 4800 4801

            ENTER_GL();
            for (i = 0; i < shader_data->num_gl_shaders; ++i)
            {
                GL_EXTCALL(glDeleteProgramsARB(1, &shader_data->gl_shaders[i].prgId));
                checkGLcall("GL_EXTCALL(glDeleteProgramsARB(1, &shader_data->gl_shaders[i].prgId))");
            }
            LEAVE_GL();
4802

4803
            context_release(context);
4804
        }
4805

4806 4807
        HeapFree(GetProcessHeap(), 0, shader_data->gl_shaders);
        HeapFree(GetProcessHeap(), 0, shader_data);
4808
        shader->backend_data = NULL;
4809 4810 4811
    }
    else
    {
4812
        struct arb_vshader_private *shader_data = shader->backend_data;
4813 4814
        UINT i;

4815
        if(!shader_data) return; /* This can happen if a shader was never compiled */
4816

4817 4818
        if (shader_data->num_gl_shaders)
        {
4819
            struct wined3d_context *context = context_acquire(device, NULL);
4820 4821 4822 4823 4824 4825 4826 4827

            ENTER_GL();
            for (i = 0; i < shader_data->num_gl_shaders; ++i)
            {
                GL_EXTCALL(glDeleteProgramsARB(1, &shader_data->gl_shaders[i].prgId));
                checkGLcall("GL_EXTCALL(glDeleteProgramsARB(1, &shader_data->gl_shaders[i].prgId))");
            }
            LEAVE_GL();
4828

4829
            context_release(context);
4830
        }
4831

4832 4833
        HeapFree(GetProcessHeap(), 0, shader_data->gl_shaders);
        HeapFree(GetProcessHeap(), 0, shader_data);
4834
        shader->backend_data = NULL;
4835 4836 4837
    }
}

4838 4839 4840 4841 4842 4843
/* Red-black tree comparison callback for the pixel shader signature tree:
 * compares the lookup key against the signature stored in the tree entry
 * using compare_sig(). */
static int sig_tree_compare(const void *key, const struct wine_rb_entry *entry)
{
    struct ps_signature *stored_sig = WINE_RB_ENTRY_VALUE(entry, struct ps_signature, entry);

    return compare_sig(key, stored_sig->sig);
}

static const struct wine_rb_functions sig_tree_functions =
4845 4846 4847 4848 4849 4850 4851
{
    wined3d_rb_alloc,
    wined3d_rb_realloc,
    wined3d_rb_free,
    sig_tree_compare
};

4852
static HRESULT shader_arb_alloc(struct wined3d_device *device)
4853
{
4854
    struct shader_arb_priv *priv = HeapAlloc(GetProcessHeap(), HEAP_ZERO_MEMORY, sizeof(*priv));
4855 4856 4857 4858 4859 4860 4861 4862 4863 4864 4865 4866 4867 4868 4869

    priv->vshader_const_dirty = HeapAlloc(GetProcessHeap(), 0,
            sizeof(*priv->vshader_const_dirty) * device->d3d_vshader_constantF);
    if (!priv->vshader_const_dirty)
        goto fail;
    memset(priv->vshader_const_dirty, 1,
           sizeof(*priv->vshader_const_dirty) * device->d3d_vshader_constantF);

    priv->pshader_const_dirty = HeapAlloc(GetProcessHeap(), 0,
            sizeof(*priv->pshader_const_dirty) * device->d3d_pshader_constantF);
    if (!priv->pshader_const_dirty)
        goto fail;
    memset(priv->pshader_const_dirty, 1,
            sizeof(*priv->pshader_const_dirty) * device->d3d_pshader_constantF);

4870 4871 4872
    if(wine_rb_init(&priv->signature_tree, &sig_tree_functions) == -1)
    {
        ERR("RB tree init failed\n");
4873
        goto fail;
4874
    }
4875
    device->shader_priv = priv;
4876
    return WINED3D_OK;
4877 4878 4879 4880 4881 4882

fail:
    HeapFree(GetProcessHeap(), 0, priv->pshader_const_dirty);
    HeapFree(GetProcessHeap(), 0, priv->vshader_const_dirty);
    HeapFree(GetProcessHeap(), 0, priv);
    return E_OUTOFMEMORY;
4883 4884
}

4885 4886 4887 4888 4889 4890 4891 4892 4893 4894 4895 4896
/* Tree destruction callback: frees one pixel shader signature entry, i.e. the
 * semantic name of each of its MAX_REG_INPUT elements, the element array and
 * the entry itself. The context argument is not used. */
static void release_signature(struct wine_rb_entry *entry, void *context)
{
    struct ps_signature *signature = WINE_RB_ENTRY_VALUE(entry, struct ps_signature, entry);
    int reg;

    for (reg = 0; reg < MAX_REG_INPUT; ++reg)
        HeapFree(GetProcessHeap(), 0, (char *) signature->sig[reg].semantic_name);

    HeapFree(GetProcessHeap(), 0, signature->sig);
    HeapFree(GetProcessHeap(), 0, signature);
}

/* Context activation is done by the caller. */
4898
static void shader_arb_free(struct wined3d_device *device)
4899 4900 4901
{
    const struct wined3d_gl_info *gl_info = &device->adapter->gl_info;
    struct shader_arb_priv *priv = device->shader_priv;
4902 4903 4904 4905 4906 4907
    int i;

    ENTER_GL();
    if(priv->depth_blt_vprogram_id) {
        GL_EXTCALL(glDeleteProgramsARB(1, &priv->depth_blt_vprogram_id));
    }
4908 4909 4910 4911 4912 4913 4914 4915 4916
    for (i = 0; i < tex_type_count; ++i)
    {
        if (priv->depth_blt_fprogram_id_full[i])
        {
            GL_EXTCALL(glDeleteProgramsARB(1, &priv->depth_blt_fprogram_id_full[i]));
        }
        if (priv->depth_blt_fprogram_id_masked[i])
        {
            GL_EXTCALL(glDeleteProgramsARB(1, &priv->depth_blt_fprogram_id_masked[i]));
4917 4918 4919 4920
        }
    }
    LEAVE_GL();

4921
    wine_rb_destroy(&priv->signature_tree, release_signature, NULL);
4922 4923
    HeapFree(GetProcessHeap(), 0, priv->pshader_const_dirty);
    HeapFree(GetProcessHeap(), 0, priv->vshader_const_dirty);
4924
    HeapFree(GetProcessHeap(), 0, device->shader_priv);
4925 4926
}

4927
static void shader_arb_context_destroyed(void *shader_priv, const struct wined3d_context *context)
4928
{
4929 4930 4931 4932
    struct shader_arb_priv *priv = shader_priv;

    if (priv->last_context == context)
        priv->last_context = NULL;
4933 4934
}

4935
static void shader_arb_get_caps(const struct wined3d_gl_info *gl_info, struct shader_caps *caps)
4936
{
4937 4938
    if (gl_info->supported[ARB_VERTEX_PROGRAM])
    {
4939 4940 4941 4942 4943 4944 4945 4946 4947 4948 4949
        DWORD vs_consts;

        /* 96 is the minimum allowed value of MAX_PROGRAM_ENV_PARAMETERS_ARB
         * for vertex programs. If the native limit is less than that it's
         * not very useful, and e.g. Mesa swrast returns 0, probably to
         * indicate it's a software implementation. */
        if (gl_info->limits.arb_vs_native_constants < 96)
            vs_consts = gl_info->limits.arb_vs_float_constants;
        else
            vs_consts = min(gl_info->limits.arb_vs_float_constants, gl_info->limits.arb_vs_native_constants);

4950
        if (gl_info->supported[NV_VERTEX_PROGRAM3])
4951
        {
4952
            caps->VertexShaderVersion = 3;
4953 4954
            TRACE_(d3d_caps)("Hardware vertex shader version 3.0 enabled (NV_VERTEX_PROGRAM3)\n");
        }
4955
        else if (vs_consts >= 256)
4956 4957
        {
            /* Shader Model 2.0 requires at least 256 vertex shader constants */
4958
            caps->VertexShaderVersion = 2;
4959 4960 4961 4962
            TRACE_(d3d_caps)("Hardware vertex shader version 2.0 enabled (ARB_PROGRAM)\n");
        }
        else
        {
4963
            caps->VertexShaderVersion = 1;
4964 4965
            TRACE_(d3d_caps)("Hardware vertex shader version 1.1 enabled (ARB_PROGRAM)\n");
        }
4966
        caps->MaxVertexShaderConst = vs_consts;
4967
    }
4968 4969
    else
    {
4970 4971
        caps->VertexShaderVersion = 0;
        caps->MaxVertexShaderConst = 0;
4972
    }
4973

4974 4975
    if (gl_info->supported[ARB_FRAGMENT_PROGRAM])
    {
4976 4977 4978 4979 4980 4981 4982 4983 4984
        DWORD ps_consts;

        /* Similar as above for vertex programs, but the minimum for fragment
         * programs is 24. */
        if (gl_info->limits.arb_ps_native_constants < 24)
            ps_consts = gl_info->limits.arb_ps_float_constants;
        else
            ps_consts = min(gl_info->limits.arb_ps_float_constants, gl_info->limits.arb_ps_native_constants);

4985
        if (gl_info->supported[NV_FRAGMENT_PROGRAM2])
4986
        {
4987
            caps->PixelShaderVersion = 3;
4988 4989
            TRACE_(d3d_caps)("Hardware pixel shader version 3.0 enabled (NV_FRAGMENT_PROGRAM2)\n");
        }
4990
        else if (ps_consts >= 32)
4991
        {
4992
            /* Shader Model 2.0 requires at least 32 pixel shader constants */
4993
            caps->PixelShaderVersion = 2;
4994 4995 4996 4997
            TRACE_(d3d_caps)("Hardware pixel shader version 2.0 enabled (ARB_PROGRAM)\n");
        }
        else
        {
4998
            caps->PixelShaderVersion = 1;
4999 5000
            TRACE_(d3d_caps)("Hardware pixel shader version 1.4 enabled (ARB_PROGRAM)\n");
        }
5001 5002
        caps->PixelShader1xMaxValue = 8.0f;
        caps->MaxPixelShaderConst = ps_consts;
5003
    }
5004 5005
    else
    {
5006 5007 5008
        caps->PixelShaderVersion = 0;
        caps->PixelShader1xMaxValue = 0.0f;
        caps->MaxPixelShaderConst = 0;
5009
    }
5010

5011
    caps->VSClipping = use_nv_clip(gl_info);
5012 5013
}

5014 5015 5016 5017 5018 5019
static BOOL shader_arb_color_fixup_supported(struct color_fixup_desc fixup)
{
    if (TRACE_ON(d3d_shader) && TRACE_ON(d3d))
    {
        TRACE("Checking support for color_fixup:\n");
        dump_color_fixup_desc(fixup);
5020
    }
5021

5022 5023
    /* We support everything except complex conversions. */
    if (!is_complex_fixup(fixup))
5024 5025 5026 5027 5028 5029 5030
    {
        TRACE("[OK]\n");
        return TRUE;
    }

    TRACE("[FAILED]\n");
    return FALSE;
5031 5032
}

5033
static void shader_arb_add_instruction_modifiers(const struct wined3d_shader_instruction *ins) {
5034 5035
    DWORD shift;
    char write_mask[20], regstr[50];
5036
    struct wined3d_shader_buffer *buffer = ins->ctx->buffer;
5037 5038 5039 5040 5041 5042 5043
    BOOL is_color = FALSE;
    const struct wined3d_shader_dst_param *dst;

    if (!ins->dst_count) return;

    dst = &ins->dst[0];
    shift = dst->shift;
5044
    if (!shift) return; /* Saturate alone is handled by the instructions */
5045 5046

    shader_arb_get_write_mask(ins, dst, write_mask);
5047
    shader_arb_get_register_name(ins, &dst->reg, regstr, &is_color);
5048

5049 5050 5051 5052 5053
    /* Generate a line that does the output modifier computation
     * FIXME: _SAT vs shift? _SAT alone is already handled in the instructions, if this
     * maps problems in e.g. _d4_sat modify shader_arb_get_modifier
     */
    shader_addline(buffer, "MUL%s %s%s, %s, %s;\n", shader_arb_get_modifier(ins),
5054
                   regstr, write_mask, regstr, shift_tab[shift]);
5055 5056
}

5057 5058 5059 5060
/* Dispatch table used by shader_arb_handle_instruction(): it is indexed with
 * ins->handler_idx and yields the function that emits ARB program text for
 * that opcode. The initializer is positional, so the order of the entries has
 * to follow the WINED3DSIH_* values named in the per-line comments.
 * A NULL entry means there is no handler; the dispatcher prints a FIXME for
 * such opcodes unless it has already dealt with them itself (as it does for
 * WINED3DSIH_IF). */
static const SHADER_HANDLER shader_arb_instruction_handler_table[WINED3DSIH_TABLE_SIZE] =
{
    /* WINED3DSIH_ABS           */ shader_hw_map2gl,
    /* WINED3DSIH_ADD           */ shader_hw_map2gl,
5061
    /* WINED3DSIH_AND           */ NULL,
5062
    /* WINED3DSIH_BEM           */ pshader_hw_bem,
5063 5064
    /* WINED3DSIH_BREAK         */ shader_hw_break,
    /* WINED3DSIH_BREAKC        */ shader_hw_breakc,
5065
    /* WINED3DSIH_BREAKP        */ NULL,
5066
    /* WINED3DSIH_CALL          */ shader_hw_call,
5067 5068 5069 5070
    /* WINED3DSIH_CALLNZ        */ NULL,
    /* WINED3DSIH_CMP           */ pshader_hw_cmp,
    /* WINED3DSIH_CND           */ pshader_hw_cnd,
    /* WINED3DSIH_CRS           */ shader_hw_map2gl,
5071
    /* WINED3DSIH_CUT           */ NULL,
5072 5073 5074 5075
    /* WINED3DSIH_DCL           */ NULL,
    /* WINED3DSIH_DEF           */ NULL,
    /* WINED3DSIH_DEFB          */ NULL,
    /* WINED3DSIH_DEFI          */ NULL,
5076
    /* WINED3DSIH_DIV           */ NULL,
5077 5078 5079 5080
    /* WINED3DSIH_DP2ADD        */ pshader_hw_dp2add,
    /* WINED3DSIH_DP3           */ shader_hw_map2gl,
    /* WINED3DSIH_DP4           */ shader_hw_map2gl,
    /* WINED3DSIH_DST           */ shader_hw_map2gl,
5081
    /* WINED3DSIH_DSX           */ shader_hw_map2gl,
5082
    /* WINED3DSIH_DSY           */ shader_hw_dsy,
5083
    /* WINED3DSIH_ELSE          */ shader_hw_else,
5084
    /* WINED3DSIH_EMIT          */ NULL,
5085
    /* WINED3DSIH_ENDIF         */ shader_hw_endif,
5086 5087
    /* WINED3DSIH_ENDLOOP       */ shader_hw_endloop,
    /* WINED3DSIH_ENDREP        */ shader_hw_endrep,
5088
    /* WINED3DSIH_EQ            */ NULL,
5089 5090
    /* WINED3DSIH_EXP           */ shader_hw_scalar_op,
    /* WINED3DSIH_EXPP          */ shader_hw_scalar_op,
5091
    /* WINED3DSIH_FRC           */ shader_hw_map2gl,
5092
    /* WINED3DSIH_FTOI          */ NULL,
5093
    /* WINED3DSIH_GE            */ NULL,
5094
    /* WINED3DSIH_IADD          */ NULL,
5095
    /* WINED3DSIH_IEQ           */ NULL,
5096 5097
    /* WINED3DSIH_IF            */ NULL /* Hardcoded into the shader */,
    /* WINED3DSIH_IFC           */ shader_hw_ifc,
5098
    /* WINED3DSIH_IGE           */ NULL,
5099
    /* WINED3DSIH_IMUL          */ NULL,
5100
    /* WINED3DSIH_ITOF          */ NULL,
5101
    /* WINED3DSIH_LABEL         */ shader_hw_label,
5102
    /* WINED3DSIH_LD            */ NULL,
5103
    /* WINED3DSIH_LIT           */ shader_hw_map2gl,
5104 5105
    /* WINED3DSIH_LOG           */ shader_hw_log,
    /* WINED3DSIH_LOGP          */ shader_hw_log,
5106
    /* WINED3DSIH_LOOP          */ shader_hw_loop,
5107
    /* WINED3DSIH_LRP           */ shader_hw_lrp,
5108
    /* WINED3DSIH_LT            */ NULL,
5109 5110 5111 5112 5113 5114 5115 5116
    /* WINED3DSIH_M3x2          */ shader_hw_mnxn,
    /* WINED3DSIH_M3x3          */ shader_hw_mnxn,
    /* WINED3DSIH_M3x4          */ shader_hw_mnxn,
    /* WINED3DSIH_M4x3          */ shader_hw_mnxn,
    /* WINED3DSIH_M4x4          */ shader_hw_mnxn,
    /* WINED3DSIH_MAD           */ shader_hw_map2gl,
    /* WINED3DSIH_MAX           */ shader_hw_map2gl,
    /* WINED3DSIH_MIN           */ shader_hw_map2gl,
5117 5118
    /* WINED3DSIH_MOV           */ shader_hw_mov,
    /* WINED3DSIH_MOVA          */ shader_hw_mov,
5119
    /* WINED3DSIH_MOVC          */ NULL,
5120
    /* WINED3DSIH_MUL           */ shader_hw_map2gl,
5121
    /* WINED3DSIH_NOP           */ shader_hw_nop,
5122 5123
    /* WINED3DSIH_NRM           */ shader_hw_nrm,
    /* WINED3DSIH_PHASE         */ NULL,
5124
    /* WINED3DSIH_POW           */ shader_hw_pow,
5125
    /* WINED3DSIH_RCP           */ shader_hw_rcp,
5126
    /* WINED3DSIH_REP           */ shader_hw_rep,
5127
    /* WINED3DSIH_RET           */ shader_hw_ret,
5128
    /* WINED3DSIH_ROUND_NI      */ NULL,
5129
    /* WINED3DSIH_RSQ           */ shader_hw_scalar_op,
5130
    /* WINED3DSIH_SAMPLE        */ NULL,
5131
    /* WINED3DSIH_SAMPLE_GRAD   */ NULL,
5132
    /* WINED3DSIH_SAMPLE_LOD    */ NULL,
5133 5134
    /* WINED3DSIH_SETP          */ NULL,
    /* WINED3DSIH_SGE           */ shader_hw_map2gl,
5135
    /* WINED3DSIH_SGN           */ shader_hw_sgn,
5136 5137
    /* WINED3DSIH_SINCOS        */ shader_hw_sincos,
    /* WINED3DSIH_SLT           */ shader_hw_map2gl,
5138
    /* WINED3DSIH_SQRT          */ NULL,
5139 5140 5141 5142 5143 5144 5145 5146 5147
    /* WINED3DSIH_SUB           */ shader_hw_map2gl,
    /* WINED3DSIH_TEX           */ pshader_hw_tex,
    /* WINED3DSIH_TEXBEM        */ pshader_hw_texbem,
    /* WINED3DSIH_TEXBEML       */ pshader_hw_texbem,
    /* WINED3DSIH_TEXCOORD      */ pshader_hw_texcoord,
    /* WINED3DSIH_TEXDEPTH      */ pshader_hw_texdepth,
    /* WINED3DSIH_TEXDP3        */ pshader_hw_texdp3,
    /* WINED3DSIH_TEXDP3TEX     */ pshader_hw_texdp3tex,
    /* WINED3DSIH_TEXKILL       */ pshader_hw_texkill,
5148
    /* WINED3DSIH_TEXLDD        */ shader_hw_texldd,
5149
    /* WINED3DSIH_TEXLDL        */ shader_hw_texldl,
5150 5151 5152 5153 5154 5155 5156 5157 5158 5159 5160 5161
    /* WINED3DSIH_TEXM3x2DEPTH  */ pshader_hw_texm3x2depth,
    /* WINED3DSIH_TEXM3x2PAD    */ pshader_hw_texm3x2pad,
    /* WINED3DSIH_TEXM3x2TEX    */ pshader_hw_texm3x2tex,
    /* WINED3DSIH_TEXM3x3       */ pshader_hw_texm3x3,
    /* WINED3DSIH_TEXM3x3DIFF   */ NULL,
    /* WINED3DSIH_TEXM3x3PAD    */ pshader_hw_texm3x3pad,
    /* WINED3DSIH_TEXM3x3SPEC   */ pshader_hw_texm3x3spec,
    /* WINED3DSIH_TEXM3x3TEX    */ pshader_hw_texm3x3tex,
    /* WINED3DSIH_TEXM3x3VSPEC  */ pshader_hw_texm3x3vspec,
    /* WINED3DSIH_TEXREG2AR     */ pshader_hw_texreg2ar,
    /* WINED3DSIH_TEXREG2GB     */ pshader_hw_texreg2gb,
    /* WINED3DSIH_TEXREG2RGB    */ pshader_hw_texreg2rgb,
5162
    /* WINED3DSIH_UDIV          */ NULL,
5163
    /* WINED3DSIH_USHR          */ NULL,
5164
    /* WINED3DSIH_UTOF          */ NULL,
5165
    /* WINED3DSIH_XOR           */ NULL,
5166 5167
};

5168
/* Resolve the value of boolean constant register idx at compile time.
 *
 * If the shader defines the constant locally, its value is taken from the
 * shader's constantsB list. Otherwise the bit is read from the boolean mask
 * stored in the current vertex or pixel shader compile args.
 *
 * Returns non-zero when the constant is set. FALSE is returned (and an error
 * logged) when a constant flagged as local is missing from the list. */
static BOOL get_bool_const(const struct wined3d_shader_instruction *ins,
        const struct wined3d_shader *shader, DWORD idx)
{
    const struct wined3d_shader_reg_maps *maps = ins->ctx->reg_maps;
    const BOOL is_vs = shader_is_vshader_version(maps->shader_version.type);
    struct shader_arb_ctx_priv *ctx_priv = ins->ctx->backend_data;
    const struct wined3d_shader_lconst *lconst;
    const WORD mask = (1 << idx);
    WORD arg_bools;

    if (!(maps->local_bool_consts & mask))
    {
        /* Not a local constant: the value is part of the compile args. */
        arg_bools = is_vs ? ctx_priv->cur_vs_args->clip.boolclip.bools : ctx_priv->cur_ps_args->bools;
        return arg_bools & mask;
    }

    /* An if(bool) on a hardcoded local constant is odd, but it is legal. */
    LIST_FOR_EACH_ENTRY(lconst, &shader->constantsB, struct wined3d_shader_lconst, entry)
    {
        if (lconst->idx == idx)
            return lconst->value[0];
    }

    ERR("Local constant not found\n");
    return FALSE;
}

5199
/* Fill *loop_control with the count / start / step triple of integer constant
 * register idx.
 *
 * The triple comes either from a constant defined locally in the shader
 * (shader->constantsI) or from the loop_ctrl array in the compile args of the
 * current vertex or pixel shader.
 *
 * *loop_control is written on every path. If the constant cannot be resolved
 * (local constant missing, or unhandled shader type) all three members are
 * set to zero, so that callers never work with indeterminate values. */
static void get_loop_control_const(const struct wined3d_shader_instruction *ins,
        const struct wined3d_shader *shader, UINT idx, struct wined3d_shader_loop_control *loop_control)
{
    const struct wined3d_shader_reg_maps *reg_maps = ins->ctx->reg_maps;
    struct shader_arb_ctx_priv *priv = ins->ctx->backend_data;

    /* Integer constants can either be a local constant, or they can be stored in the shader
     * type specific compile args. */
    if (reg_maps->local_int_consts & (1 << idx))
    {
        const struct wined3d_shader_lconst *constant;

        LIST_FOR_EACH_ENTRY(constant, &shader->constantsI, struct wined3d_shader_lconst, entry)
        {
            if (constant->idx == idx)
            {
                loop_control->count = constant->value[0];
                loop_control->start = constant->value[1];
                /* Step is signed. */
                loop_control->step = (int)constant->value[2];
                return;
            }
        }
        /* If this happens the flag was set incorrectly */
        ERR("Local constant not found\n");
        loop_control->count = 0;
        loop_control->start = 0;
        loop_control->step = 0;
        return;
    }

    switch (reg_maps->shader_version.type)
    {
        case WINED3D_SHADER_TYPE_VERTEX:
            /* Count and aL start value are unsigned */
            loop_control->count = priv->cur_vs_args->loop_ctrl[idx][0];
            loop_control->start = priv->cur_vs_args->loop_ctrl[idx][1];
            /* Step is signed. Plain char may be unsigned on some targets, so
             * ask for signed char explicitly to get the sign extension. */
            loop_control->step = ((signed char)priv->cur_vs_args->loop_ctrl[idx][2]);
            break;

        case WINED3D_SHADER_TYPE_PIXEL:
            loop_control->count = priv->cur_ps_args->loop_ctrl[idx][0];
            loop_control->start = priv->cur_ps_args->loop_ctrl[idx][1];
            /* Same signedness consideration as for vertex shaders. */
            loop_control->step = ((signed char)priv->cur_ps_args->loop_ctrl[idx][2]);
            break;

        default:
            FIXME("Unhandled shader type %#x.\n", reg_maps->shader_version.type);
            /* Do not leave the output untouched on this path. */
            loop_control->count = 0;
            loop_control->start = 0;
            loop_control->step = 0;
            break;
    }
}

/* Append a deep copy of *ins to the recording list.
 *
 * The instruction itself, its first destination parameter, all source
 * parameters and every relative-address parameter hanging off them are
 * duplicated on the process heap. The copy is released later by
 * free_recorded_instruction().
 *
 * On allocation failure an error is logged, everything allocated so far is
 * released and the list is left unchanged. */
static void record_instruction(struct list *list, const struct wined3d_shader_instruction *ins)
{
    unsigned int i;
    struct wined3d_shader_dst_param *dst_param = NULL;
    struct wined3d_shader_src_param *src_param = NULL, *rel_addr = NULL;
    struct recorded_instruction *rec = HeapAlloc(GetProcessHeap(), HEAP_ZERO_MEMORY, sizeof(*rec));
    if(!rec)
    {
        ERR("Out of memory\n");
        return;
    }

    rec->ins = *ins;
    dst_param = HeapAlloc(GetProcessHeap(), 0, sizeof(*dst_param));
    if(!dst_param) goto free;
    *dst_param = *ins->dst;
    /* The struct copy above also copied the caller's rel_addr pointer. Drop it
     * until we own a private copy, so the cleanup path never frees memory that
     * belongs to the original instruction. */
    dst_param->reg.rel_addr = NULL;
    if(ins->dst->reg.rel_addr)
    {
        rel_addr = HeapAlloc(GetProcessHeap(), 0, sizeof(*rel_addr));
        if(!rel_addr) goto free;
        *rel_addr = *ins->dst->reg.rel_addr;
        dst_param->reg.rel_addr = rel_addr;
    }
    rec->ins.dst = dst_param;

    /* Zero the array so that entries the loop below has not reached yet carry
     * a NULL rel_addr if we have to bail out half way through. */
    src_param = HeapAlloc(GetProcessHeap(), HEAP_ZERO_MEMORY, sizeof(*src_param) * ins->src_count);
    if(!src_param) goto free;
    for(i = 0; i < ins->src_count; i++)
    {
        src_param[i] = ins->src[i];
        /* Same ownership rule as for the destination parameter. */
        src_param[i].reg.rel_addr = NULL;
        if(ins->src[i].reg.rel_addr)
        {
            rel_addr = HeapAlloc(GetProcessHeap(), 0, sizeof(*rel_addr));
            if(!rel_addr) goto free;
            *rel_addr = *ins->src[i].reg.rel_addr;
            src_param[i].reg.rel_addr = rel_addr;
        }
    }
    rec->ins.src = src_param;
    list_add_tail(list, &rec->entry);
    return;

free:
    ERR("Out of memory\n");
    if(dst_param)
    {
        /* Only pointers allocated above (or NULL) can be stored here. */
        HeapFree(GetProcessHeap(), 0, (void *) dst_param->reg.rel_addr);
        HeapFree(GetProcessHeap(), 0, dst_param);
    }
    if(src_param)
    {
        for(i = 0; i < ins->src_count; i++)
        {
            HeapFree(GetProcessHeap(), 0, (void *) src_param[i].reg.rel_addr);
        }
        HeapFree(GetProcessHeap(), 0, src_param);
    }
    HeapFree(GetProcessHeap(), 0, rec);
}

/* Release every recorded instruction on the list, together with the
 * destination / source parameter copies and relative-address copies that
 * record_instruction() attached to it. The list is empty afterwards. */
static void free_recorded_instruction(struct list *list)
{
    struct recorded_instruction *cur, *next;
    unsigned int src_idx;

    LIST_FOR_EACH_ENTRY_SAFE(cur, next, list, struct recorded_instruction, entry)
    {
        list_remove(&cur->entry);

        if (cur->ins.dst)
        {
            HeapFree(GetProcessHeap(), 0, (void *)cur->ins.dst->reg.rel_addr);
            HeapFree(GetProcessHeap(), 0, (void *)cur->ins.dst);
        }

        if (cur->ins.src)
        {
            /* Each source parameter may own a relative-address copy. */
            for (src_idx = 0; src_idx < cur->ins.src_count; ++src_idx)
                HeapFree(GetProcessHeap(), 0, (void *)cur->ins.src[src_idx].reg.rel_addr);

            HeapFree(GetProcessHeap(), 0, (void *)cur->ins.src);
        }

        HeapFree(GetProcessHeap(), 0, cur);
    }
}

5337 5338
/* Per-instruction entry point of the ARB backend.
 *
 * Besides looking up and calling the opcode handler from
 * shader_arb_instruction_handler_table, this function keeps the stack of
 * control frames in priv->control_frames up to date and implements two
 * compile-time transformations:
 *
 *  - Loop unrolling. For targets below NV2, the body of the outermost
 *    LOOP/REP is recorded into priv->record instead of being emitted. When
 *    the matching ENDLOOP/ENDREP arrives, the recording is replayed
 *    loop_control.count times by calling this function recursively.
 *  - Static boolean branches. IF on a boolean constant is resolved with
 *    get_bool_const(); the branch that is not taken is suppressed by setting
 *    priv->muted, and only comment lines are written for IF/ELSE/ENDIF.
 *
 * After a handler has run, shader_arb_add_instruction_modifiers() is called
 * for the instruction.
 *
 * NOTE(review): none of the HeapAlloc() results for control frames are checked
 * before they are dereferenced, and list_head() results are used without a
 * NULL check. */
static void shader_arb_handle_instruction(const struct wined3d_shader_instruction *ins) {
    SHADER_HANDLER hw_fct;
5339
    struct shader_arb_ctx_priv *priv = ins->ctx->backend_data;
5340
    const struct wined3d_shader *shader = ins->ctx->shader;
5341
    struct control_frame *control_frame;
5342
    struct wined3d_shader_buffer *buffer = ins->ctx->buffer;
5343
    BOOL bool_const;
5344

5345 5346 5347 5348 5349
    /* Loop start: push a control frame for it. */
    if(ins->handler_idx == WINED3DSIH_LOOP || ins->handler_idx == WINED3DSIH_REP)
    {
        control_frame = HeapAlloc(GetProcessHeap(), HEAP_ZERO_MEMORY, sizeof(*control_frame));
        list_add_head(&priv->control_frames, &control_frame->entry);

5350 5351 5352
        if(ins->handler_idx == WINED3DSIH_LOOP) control_frame->type = LOOP;
        if(ins->handler_idx == WINED3DSIH_REP) control_frame->type = REP;

5353 5354
        if(priv->target_version >= NV2)
        {
5355
            /* Number the loop and track nesting depth; the instruction itself
             * is emitted by the regular handler further down. */
            control_frame->no.loop = priv->num_loops++;
5356 5357 5358 5359 5360 5361 5362 5363 5364 5365 5366 5367 5368 5369 5370
            priv->loop_depth++;
        }
        else
        {
            /* Don't bother recording when we're in a not used if branch */
            if(priv->muted)
            {
                return;
            }

            /* Outermost loop: start a fresh recording and remember the loop
             * parameters for the unrolling done at the matching end token. */
            if(!priv->recording)
            {
                list_init(&priv->record);
                priv->recording = TRUE;
                control_frame->outer_loop = TRUE;
5371
                get_loop_control_const(ins, shader, ins->src[0].reg.idx, &control_frame->loop_control);
5372 5373 5374 5375 5376 5377 5378 5379 5380 5381 5382 5383 5384 5385 5386 5387 5388 5389 5390
                return; /* Instruction is handled */
            }
            /* Record this loop in the outer loop's recording */
        }
    }
    else if(ins->handler_idx == WINED3DSIH_ENDLOOP || ins->handler_idx == WINED3DSIH_ENDREP)
    {
        if(priv->target_version >= NV2)
        {
            /* Nothing to do. The control frame is popped after the HW instr handler */
        }
        else
        {
            /* Pop the frame pushed by the matching LOOP/REP. */
            struct list *e = list_head(&priv->control_frames);
            control_frame = LIST_ENTRY(e, struct control_frame, entry);
            list_remove(&control_frame->entry);

            if(control_frame->outer_loop)
            {
5391 5392
                unsigned int iteration;
                int aL = 0;
5393 5394 5395 5396 5397 5398 5399 5400 5401 5402 5403 5404 5405 5406 5407
                struct list copy;

                /* Turn off recording before playback */
                priv->recording = FALSE;

                /* Move the recorded instructions to a separate list and get them out of the private data
                 * structure. If there are nested loops, the shader_arb_handle_instruction below will
                 * be recorded again, thus priv->record might be overwritten
                 */
                list_init(&copy);
                list_move_tail(&copy, &priv->record);
                list_init(&priv->record);

                if(ins->handler_idx == WINED3DSIH_ENDLOOP)
                {
5408 5409 5410 5411
                    shader_addline(buffer, "#unrolling loop: %u iterations, aL=%u, inc %d\n",
                                   control_frame->loop_control.count, control_frame->loop_control.start,
                                   control_frame->loop_control.step);
                    aL = control_frame->loop_control.start;
5412 5413 5414
                }
                else
                {
5415
                    shader_addline(buffer, "#unrolling rep: %u iterations\n", control_frame->loop_control.count);
5416 5417
                }

5418
                /* Replay the recorded body once per iteration. */
                for (iteration = 0; iteration < control_frame->loop_control.count; ++iteration)
5419 5420 5421 5422 5423
                {
                    struct recorded_instruction *rec_ins;
                    if(ins->handler_idx == WINED3DSIH_ENDLOOP)
                    {
                        /* Publish the loop counter value for this iteration. */
                        priv->aL = aL;
5424
                        shader_addline(buffer, "#Iteration %u, aL=%d\n", iteration, aL);
5425 5426 5427
                    }
                    else
                    {
5428
                        shader_addline(buffer, "#Iteration %u\n", iteration);
5429 5430 5431 5432 5433 5434 5435 5436 5437
                    }

                    LIST_FOR_EACH_ENTRY(rec_ins, &copy, struct recorded_instruction, entry)
                    {
                        shader_arb_handle_instruction(&rec_ins->ins);
                    }

                    if(ins->handler_idx == WINED3DSIH_ENDLOOP)
                    {
5438
                        aL += control_frame->loop_control.step;
5439 5440 5441 5442 5443 5444 5445 5446 5447 5448 5449 5450 5451 5452 5453 5454 5455 5456 5457 5458 5459 5460
                    }
                }
                shader_addline(buffer, "#end loop/rep\n");

                free_recorded_instruction(&copy);
                HeapFree(GetProcessHeap(), 0, control_frame);
                return; /* Instruction is handled */
            }
            else
            {
                /* This is a nested loop. Proceed to the normal recording function */
                HeapFree(GetProcessHeap(), 0, control_frame);
            }
        }
    }

    /* While a loop body is being recorded nothing is emitted; the instruction
     * is stored for later playback instead. */
    if(priv->recording)
    {
        record_instruction(&priv->record, ins);
        return;
    }

5461 5462 5463
    /* boolean if */
    if(ins->handler_idx == WINED3DSIH_IF)
    {
5464 5465
        control_frame = HeapAlloc(GetProcessHeap(), HEAP_ZERO_MEMORY, sizeof(*control_frame));
        list_add_head(&priv->control_frames, &control_frame->entry);
5466
        control_frame->type = IF;
5467

5468
        bool_const = get_bool_const(ins, shader, ins->src[0].reg.idx);
5469
        if(ins->src[0].modifiers == WINED3DSPSM_NOT) bool_const = !bool_const;
5470
        /* A false condition mutes output; the frame remembers that it was the
         * one that turned muting on. */
        if (!priv->muted && !bool_const)
5471 5472 5473
        {
            shader_addline(buffer, "#if(FALSE){\n");
            priv->muted = TRUE;
5474
            control_frame->muting = TRUE;
5475 5476 5477 5478 5479 5480 5481 5482
        }
        else shader_addline(buffer, "#if(TRUE) {\n");

        return; /* Instruction is handled */
    }
    else if(ins->handler_idx == WINED3DSIH_IFC)
    {
        /* IF(bool) and if_cond(a, b) use the same ELSE and ENDIF tokens */
5483
        control_frame = HeapAlloc(GetProcessHeap(), HEAP_ZERO_MEMORY, sizeof(*control_frame));
5484
        control_frame->type = IFC;
5485
        control_frame->no.ifc = priv->num_ifcs++;
5486
        list_add_head(&priv->control_frames, &control_frame->entry);
5487 5488 5489
    }
    else if(ins->handler_idx == WINED3DSIH_ELSE)
    {
5490 5491
        struct list *e = list_head(&priv->control_frames);
        control_frame = LIST_ENTRY(e, struct control_frame, entry);
5492

5493
        if(control_frame->type == IF)
5494 5495
        {
            shader_addline(buffer, "#} else {\n");
5496
            /* Flip the muting state: start muting if the if-branch was live,
             * stop muting if this frame was the one that muted. */
            if(!priv->muted && !control_frame->muting)
5497 5498
            {
                priv->muted = TRUE;
5499
                control_frame->muting = TRUE;
5500
            }
5501
            else if(control_frame->muting) priv->muted = FALSE;
5502 5503 5504 5505 5506 5507
            return; /* Instruction is handled. */
        }
        /* In case of an ifc, generate a HW shader instruction */
    }
    else if(ins->handler_idx == WINED3DSIH_ENDIF)
    {
5508 5509
        struct list *e = list_head(&priv->control_frames);
        control_frame = LIST_ENTRY(e, struct control_frame, entry);
5510

5511
        if(control_frame->type == IF)
5512 5513
        {
            shader_addline(buffer, "#} endif\n");
5514 5515 5516
            if(control_frame->muting) priv->muted = FALSE;
            list_remove(&control_frame->entry);
            HeapFree(GetProcessHeap(), 0, control_frame);
5517 5518 5519 5520 5521
            return; /* Instruction is handled */
        }
    }

    /* Inside a statically false branch nothing is emitted. */
    if(priv->muted) return;
5522 5523 5524 5525 5526 5527 5528 5529 5530 5531 5532 5533

    /* Select handler */
    hw_fct = shader_arb_instruction_handler_table[ins->handler_idx];

    /* Unhandled opcode */
    if (!hw_fct)
    {
        FIXME("Backend can't handle opcode %#x\n", ins->handler_idx);
        return;
    }
    hw_fct(ins);

5534 5535 5536 5537 5538 5539 5540 5541
    /* Loop ends that reach this point had their end instruction emitted by
     * the handler; pop the loop's frame now. */
    if(ins->handler_idx == WINED3DSIH_ENDLOOP || ins->handler_idx == WINED3DSIH_ENDREP)
    {
        struct list *e = list_head(&priv->control_frames);
        control_frame = LIST_ENTRY(e, struct control_frame, entry);
        list_remove(&control_frame->entry);
        HeapFree(GetProcessHeap(), 0, control_frame);
        priv->loop_depth--;
    }
5542 5543 5544 5545 5546 5547 5548 5549 5550
    else if(ins->handler_idx == WINED3DSIH_ENDIF)
    {
        /* Non-ifc ENDIFs don't reach that place because of the return in the if block above */
        struct list *e = list_head(&priv->control_frames);
        control_frame = LIST_ENTRY(e, struct control_frame, entry);
        list_remove(&control_frame->entry);
        HeapFree(GetProcessHeap(), 0, control_frame);
    }

5551

5552 5553 5554
    shader_arb_add_instruction_modifiers(ins);
}

5555 5556
/* Function table through which the ARB vertex / fragment program shader
 * backend is exposed. The initializer is positional: the entries must stay in
 * the member order of struct wined3d_shader_backend_ops, which is declared
 * outside this file. Other code in this file compares
 * device->shader_backend against the address of this object to find out
 * whether the ARB backend is the active one. */
const struct wined3d_shader_backend_ops arb_program_shader_backend =
{
5557
    shader_arb_handle_instruction,
5558 5559
    shader_arb_select,
    shader_arb_select_depth_blt,
5560
    shader_arb_deselect_depth_blt,
5561 5562
    shader_arb_update_float_vertex_constants,
    shader_arb_update_float_pixel_constants,
5563
    shader_arb_load_constants,
5564
    shader_arb_load_np2fixup_constants,
5565 5566 5567
    shader_arb_destroy,
    shader_arb_alloc,
    shader_arb_free,
5568
    shader_arb_context_destroyed,
5569
    shader_arb_get_caps,
5570
    shader_arb_color_fixup_supported,
5571
};
5572 5573

/* ARB_fragment_program fixed function pipeline replacement definitions */
5574 5575 5576 5577 5578
/* Program environment parameter slots used by the fixed function replacement:
 *   0        texture factor
 *   1        specular enable mask
 *   2 ..  9  per-stage constant, index i = 0..7
 *  10 .. 17  per-stage bump environment matrix, index i = 0..7
 *  18 .. 25  per-stage luminance scale / offset, index i = 0..7
 * The macro argument is parenthesised so that any expression may be passed. */
#define ARB_FFP_CONST_TFACTOR           0
#define ARB_FFP_CONST_SPECULAR_ENABLE   ((ARB_FFP_CONST_TFACTOR) + 1)
#define ARB_FFP_CONST_CONSTANT(i)       ((ARB_FFP_CONST_SPECULAR_ENABLE) + 1 + (i))
#define ARB_FFP_CONST_BUMPMAT(i)        ((ARB_FFP_CONST_CONSTANT(7)) + 1 + (i))
#define ARB_FFP_CONST_LUMINANCE(i)      ((ARB_FFP_CONST_BUMPMAT(7)) + 1 + (i))
5579 5580 5581

/* One generated fixed function replacement fragment program. Instances are
 * kept in the fragment_shaders red-black tree of struct shader_arb_priv and
 * are released by arbfp_free_ffpshader(). */
struct arbfp_ffp_desc
{
5582
    /* Generic descriptor; parent.entry is the rbtree node used to get back to
     * this structure. */
    struct ffp_frag_desc parent;
5583 5584 5585 5586
    /* GL program object name, deleted with glDeleteProgramsARB(). */
    GLuint shader;
    /* NOTE(review): presumably the number of texture stages the program
     * samples from - confirm against the code that fills it in. */
    unsigned int num_textures_used;
};

5587
/* Context activation and GL locking are done by the caller. */
5588 5589
/* Switch GL_FRAGMENT_PROGRAM_ARB on or off, depending on enable. */
static void arbfp_enable(BOOL enable)
{
    if (!enable)
    {
        glDisable(GL_FRAGMENT_PROGRAM_ARB);
        checkGLcall("glDisable(GL_FRAGMENT_PROGRAM_ARB)");
        return;
    }

    glEnable(GL_FRAGMENT_PROGRAM_ARB);
    checkGLcall("glEnable(GL_FRAGMENT_PROGRAM_ARB)");
}

5599
static HRESULT arbfp_alloc(struct wined3d_device *device)
5600
{
5601 5602
    struct shader_arb_priv *priv;
    /* Share private data between the shader backend and the pipeline replacement, if both
5603
     * are the arb implementation. This is needed to figure out whether ARBfp should be disabled
5604 5605
     * if no pixel shader is bound or not
     */
5606 5607 5608 5609 5610 5611 5612 5613
    if (device->shader_backend == &arb_program_shader_backend)
    {
        device->fragment_priv = device->shader_priv;
    }
    else
    {
        device->fragment_priv = HeapAlloc(GetProcessHeap(), HEAP_ZERO_MEMORY, sizeof(struct shader_arb_priv));
        if (!device->fragment_priv) return E_OUTOFMEMORY;
5614
    }
5615
    priv = device->fragment_priv;
5616 5617 5618
    if (wine_rb_init(&priv->fragment_shaders, &wined3d_ffp_frag_program_rb_functions) == -1)
    {
        ERR("Failed to initialize rbtree.\n");
5619
        HeapFree(GetProcessHeap(), 0, device->fragment_priv);
5620 5621
        return E_OUTOFMEMORY;
    }
5622 5623 5624 5625
    priv->use_arbfp_fixed_func = TRUE;
    return WINED3D_OK;
}

5626
/* Context activation is done by the caller. */
5627 5628
static void arbfp_free_ffpshader(struct wine_rb_entry *entry, void *context)
{
5629
    const struct wined3d_gl_info *gl_info = context;
5630
    struct arbfp_ffp_desc *entry_arb = WINE_RB_ENTRY_VALUE(entry, struct arbfp_ffp_desc, parent.entry);
5631 5632 5633 5634 5635 5636 5637 5638

    ENTER_GL();
    GL_EXTCALL(glDeleteProgramsARB(1, &entry_arb->shader));
    checkGLcall("glDeleteProgramsARB(1, &entry_arb->shader)");
    HeapFree(GetProcessHeap(), 0, entry_arb);
    LEAVE_GL();
}

5639
/* Context activation is done by the caller. */
5640
static void arbfp_free(struct wined3d_device *device)
5641 5642
{
    struct shader_arb_priv *priv = device->fragment_priv;
5643

5644
    wine_rb_destroy(&priv->fragment_shaders, arbfp_free_ffpshader, &device->adapter->gl_info);
5645 5646
    priv->use_arbfp_fixed_func = FALSE;

5647 5648 5649
    if (device->shader_backend != &arb_program_shader_backend)
    {
        HeapFree(GetProcessHeap(), 0, device->fragment_priv);
5650 5651 5652
    }
}

5653
static void arbfp_get_caps(const struct wined3d_gl_info *gl_info, struct fragment_caps *caps)
5654
{
5655
    caps->PrimitiveMiscCaps = WINED3DPMISCCAPS_TSSARGTEMP;
5656 5657 5658 5659 5660 5661 5662 5663 5664 5665 5666 5667 5668 5669 5670 5671 5672 5673 5674 5675 5676 5677 5678
    caps->TextureOpCaps =  WINED3DTEXOPCAPS_DISABLE                     |
                           WINED3DTEXOPCAPS_SELECTARG1                  |
                           WINED3DTEXOPCAPS_SELECTARG2                  |
                           WINED3DTEXOPCAPS_MODULATE4X                  |
                           WINED3DTEXOPCAPS_MODULATE2X                  |
                           WINED3DTEXOPCAPS_MODULATE                    |
                           WINED3DTEXOPCAPS_ADDSIGNED2X                 |
                           WINED3DTEXOPCAPS_ADDSIGNED                   |
                           WINED3DTEXOPCAPS_ADD                         |
                           WINED3DTEXOPCAPS_SUBTRACT                    |
                           WINED3DTEXOPCAPS_ADDSMOOTH                   |
                           WINED3DTEXOPCAPS_BLENDCURRENTALPHA           |
                           WINED3DTEXOPCAPS_BLENDFACTORALPHA            |
                           WINED3DTEXOPCAPS_BLENDTEXTUREALPHA           |
                           WINED3DTEXOPCAPS_BLENDDIFFUSEALPHA           |
                           WINED3DTEXOPCAPS_BLENDTEXTUREALPHAPM         |
                           WINED3DTEXOPCAPS_MODULATEALPHA_ADDCOLOR      |
                           WINED3DTEXOPCAPS_MODULATECOLOR_ADDALPHA      |
                           WINED3DTEXOPCAPS_MODULATEINVCOLOR_ADDALPHA   |
                           WINED3DTEXOPCAPS_MODULATEINVALPHA_ADDCOLOR   |
                           WINED3DTEXOPCAPS_DOTPRODUCT3                 |
                           WINED3DTEXOPCAPS_MULTIPLYADD                 |
                           WINED3DTEXOPCAPS_LERP                        |
5679 5680
                           WINED3DTEXOPCAPS_BUMPENVMAP                  |
                           WINED3DTEXOPCAPS_BUMPENVMAPLUMINANCE;
5681

5682
    /* TODO: Implement WINED3DTEXOPCAPS_PREMODULATE */
5683 5684

    caps->MaxTextureBlendStages   = 8;
5685
    caps->MaxSimultaneousTextures = min(gl_info->limits.fragment_samplers, 8);
5686 5687
}

5688 5689
/* State handler for WINED3D_RS_TEXTUREFACTOR: upload the texture factor color
 * to program environment parameter ARB_FFP_CONST_TFACTOR.
 *
 * With the ARB shader backend the same parameter space holds the pixel shader
 * constants. In that case nothing is uploaded while a pixel shader is in use,
 * and otherwise the slot is marked dirty so the backend reloads it later. */
static void state_texfactor_arbfp(struct wined3d_context *context,
        const struct wined3d_state *state, DWORD state_id)
{
    const struct wined3d_gl_info *gl_info = context->gl_info;
    struct wined3d_device *device = context->swapchain->device;
    float factor[4];

    if (device->shader_backend == &arb_program_shader_backend)
    {
        struct shader_arb_priv *arb_priv;

        /* Application constants live here; leave them alone. */
        if (use_ps(state))
            return;

        arb_priv = device->shader_priv;
        arb_priv->pshader_const_dirty[ARB_FFP_CONST_TFACTOR] = 1;
        if (arb_priv->highest_dirty_ps_const < ARB_FFP_CONST_TFACTOR + 1)
            arb_priv->highest_dirty_ps_const = ARB_FFP_CONST_TFACTOR + 1;
    }

    D3DCOLORTOGLFLOAT4(state->render_states[WINED3D_RS_TEXTUREFACTOR], factor);
    GL_EXTCALL(glProgramEnvParameter4fvARB(GL_FRAGMENT_PROGRAM_ARB, ARB_FFP_CONST_TFACTOR, factor));
    checkGLcall("glProgramEnvParameter4fvARB(GL_FRAGMENT_PROGRAM_ARB, ARB_FFP_CONST_TFACTOR, col)");
}

5713 5714
/* State handler for WINED3D_RS_SPECULARENABLE: upload a mask to program
 * environment parameter ARB_FFP_CONST_SPECULAR_ENABLE. The mask is
 * (1, 1, 1, 0) when specular lighting is enabled and all zero otherwise.
 *
 * With the ARB shader backend nothing is uploaded while a pixel shader is in
 * use, since that would overwrite application provided constants; without a
 * pixel shader the slot is flagged dirty for the backend. */
static void state_arb_specularenable(struct wined3d_context *context,
        const struct wined3d_state *state, DWORD state_id)
{
    const struct wined3d_gl_info *gl_info = context->gl_info;
    struct wined3d_device *device = context->swapchain->device;
    float specular_mask[4];
    float rgb;

    if (device->shader_backend == &arb_program_shader_backend)
    {
        struct shader_arb_priv *arb_priv;

        if (use_ps(state))
            return;

        arb_priv = device->shader_priv;
        arb_priv->pshader_const_dirty[ARB_FFP_CONST_SPECULAR_ENABLE] = 1;
        if (arb_priv->highest_dirty_ps_const < ARB_FFP_CONST_SPECULAR_ENABLE + 1)
            arb_priv->highest_dirty_ps_const = ARB_FFP_CONST_SPECULAR_ENABLE + 1;
    }

    rgb = state->render_states[WINED3D_RS_SPECULARENABLE] ? 1.0f : 0.0f;
    specular_mask[0] = rgb;
    specular_mask[1] = rgb;
    specular_mask[2] = rgb;
    /* The specular color has no alpha */
    specular_mask[3] = 0.0f;

    GL_EXTCALL(glProgramEnvParameter4fvARB(GL_FRAGMENT_PROGRAM_ARB, ARB_FFP_CONST_SPECULAR_ENABLE, specular_mask));
    checkGLcall("glProgramEnvParameter4fvARB(GL_FRAGMENT_PROGRAM_ARB, ARB_FFP_CONST_SPECULAR_ENABLE, col)");
}

5747
/* State handler for the bump environment matrix of one texture stage. The
 * stage is derived from state_id and the 2x2 matrix is uploaded to program
 * environment parameter ARB_FFP_CONST_BUMPMAT(stage).
 *
 * If a pixel shader is in use and reads this stage's matrix, a pixel shader
 * constant update is applied unless one is already pending. With the ARB
 * shader backend the upload is skipped while a pixel shader is in use, and
 * without one the slot is flagged dirty for the backend. */
static void set_bumpmat_arbfp(struct wined3d_context *context, const struct wined3d_state *state, DWORD state_id)
{
    const struct wined3d_gl_info *gl_info = context->gl_info;
    struct wined3d_device *device = context->swapchain->device;
    DWORD stage = (state_id - STATE_TEXTURESTAGE(0, 0)) / (WINED3D_HIGHEST_TEXTURE_STATE + 1);
    const BOOL arb_backend = device->shader_backend == &arb_program_shader_backend;
    float bump_mat[4];

    if (use_ps(state))
    {
        /* The pixel shader has to know the matrix; schedule a constants
         * update if none is pending. */
        if (stage && (state->pixel_shader->reg_maps.bumpmat & (1 << stage))
                && !isStateDirty(context, STATE_PIXELSHADERCONSTANT))
            context_apply_state(context, state, STATE_PIXELSHADERCONSTANT);

        /* Writing the parameter below could clobber pixel shader constants. */
        if (arb_backend)
            return;
    }
    else if (arb_backend)
    {
        struct shader_arb_priv *arb_priv = device->shader_priv;

        arb_priv->pshader_const_dirty[ARB_FFP_CONST_BUMPMAT(stage)] = 1;
        if (arb_priv->highest_dirty_ps_const < ARB_FFP_CONST_BUMPMAT(stage) + 1)
            arb_priv->highest_dirty_ps_const = ARB_FFP_CONST_BUMPMAT(stage) + 1;
    }

    /* The texture stage states store the float bit patterns. */
    bump_mat[0] = *((float *)&state->texture_states[stage][WINED3D_TSS_BUMPENV_MAT00]);
    bump_mat[1] = *((float *)&state->texture_states[stage][WINED3D_TSS_BUMPENV_MAT01]);
    bump_mat[2] = *((float *)&state->texture_states[stage][WINED3D_TSS_BUMPENV_MAT10]);
    bump_mat[3] = *((float *)&state->texture_states[stage][WINED3D_TSS_BUMPENV_MAT11]);

    GL_EXTCALL(glProgramEnvParameter4fvARB(GL_FRAGMENT_PROGRAM_ARB, ARB_FFP_CONST_BUMPMAT(stage), bump_mat));
    checkGLcall("glProgramEnvParameter4fvARB(GL_FRAGMENT_PROGRAM_ARB, ARB_FFP_CONST_BUMPMAT(stage), &mat[0][0])");
}

5786 5787
/* State handler for the bump environment luminance scale / offset of one
 * texture stage.
 *
 * Mirrors the bump matrix handler: with a pixel shader that reads this
 * stage's luminance parameters a constant update is applied; without a pixel
 * shader {scale, offset, 0, 0} is uploaded to the fixed function replacement
 * program's environment parameter. */
static void tex_bumpenvlum_arbfp(struct wined3d_context *context,
        const struct wined3d_state *state, DWORD state_id)
{
    const struct wined3d_gl_info *gl_info = context->gl_info;
    struct wined3d_device *device = context->swapchain->device;
    const DWORD stage = (state_id - STATE_TEXTURESTAGE(0, 0)) / (WINED3D_HIGHEST_TEXTURE_STATE + 1);
    const BOOL arb_backend = (device->shader_backend == &arb_program_shader_backend);
    float param[4];

    if (use_ps(state))
    {
        /* Stage 0 is never checked against the shader's luminance mask. */
        const BOOL shader_needs_params = stage
                && (state->pixel_shader->reg_maps.luminanceparams & (1 << stage));

        /* The pixel shader consumes the luminance parameters through its
         * constants; update them unless that is scheduled anyway. */
        if (shader_needs_params && !isStateDirty(context, STATE_PIXELSHADERCONSTANT))
            context_apply_state(context, state, STATE_PIXELSHADERCONSTANT);

        /* Don't write the luminance env parameter below, it may overwrite
         * a pixel shader constant. */
        if (arb_backend)
            return;
    }
    else if (arb_backend)
    {
        /* Record the slot as dirty in the ARB backend's constant tracking. */
        struct shader_arb_priv *priv = device->shader_priv;

        priv->pshader_const_dirty[ARB_FFP_CONST_LUMINANCE(stage)] = 1;
        priv->highest_dirty_ps_const = max(priv->highest_dirty_ps_const, ARB_FFP_CONST_LUMINANCE(stage) + 1);
    }

    /* The texture stage states hold the float bit patterns. */
    param[0] = *((float *)&state->texture_states[stage][WINED3D_TSS_BUMPENV_LSCALE]);
    param[1] = *((float *)&state->texture_states[stage][WINED3D_TSS_BUMPENV_LOFFSET]);
    param[2] = 0.0f;
    param[3] = 0.0f;

    GL_EXTCALL(glProgramEnvParameter4fvARB(GL_FRAGMENT_PROGRAM_ARB, ARB_FFP_CONST_LUMINANCE(stage), param));
    checkGLcall("glProgramEnvParameter4fvARB(GL_FRAGMENT_PROGRAM_ARB, ARB_FFP_CONST_LUMINANCE(stage), param)");
}

5826 5827
/* Translate a WINED3DTA_* texture argument into the name of the ARB fragment
 * program register that holds it.
 *
 * buffer: program text; modifier instructions are appended here.
 * argnum: index of the argument (0-2), selects the argN scratch register.
 * stage:  texture stage the argument belongs to.
 * arg:    the argument, including COMPLEMENT / ALPHAREPLICATE modifiers.
 *
 * Returns a static string. If a modifier is set, an instruction computing
 * the modified value into arg<argnum> is emitted, and for argnum 0-2 the
 * name of that scratch register is returned instead of the source. */
static const char *get_argreg(struct wined3d_shader_buffer *buffer, DWORD argnum, unsigned int stage, DWORD arg)
{
    static const char * const texture_regs[] =
    {
        "tex0", "tex1", "tex2", "tex3", "tex4", "tex5", "tex6", "tex7",
    };
    static const char * const constant_regs[] =
    {
        "const0", "const1", "const2", "const3", "const4", "const5", "const6", "const7",
    };
    static const char * const scratch_regs[] = {"arg0", "arg1", "arg2"};
    const DWORD source = arg & WINED3DTA_SELECTMASK;
    const char *reg;

    /* Marker for unused registers. */
    if (arg == ARG_UNUSED)
        return "unused";

    if (source == WINED3DTA_DIFFUSE)
    {
        reg = "fragment.color.primary";
    }
    else if (source == WINED3DTA_CURRENT)
    {
        /* Stage 0 has no previous result; it reads the diffuse color. */
        reg = stage ? "ret" : "fragment.color.primary";
    }
    else if (source == WINED3DTA_TEXTURE)
    {
        if (stage < sizeof(texture_regs) / sizeof(*texture_regs))
            reg = texture_regs[stage];
        else
            reg = "unknown texture";
    }
    else if (source == WINED3DTA_TFACTOR)
    {
        reg = "tfactor";
    }
    else if (source == WINED3DTA_SPECULAR)
    {
        reg = "fragment.color.secondary";
    }
    else if (source == WINED3DTA_TEMP)
    {
        reg = "tempreg";
    }
    else if (source == WINED3DTA_CONSTANT)
    {
        FIXME("Implement perstage constants\n");
        if (stage < sizeof(constant_regs) / sizeof(*constant_regs))
            reg = constant_regs[stage];
        else
            reg = "unknown constant";
    }
    else
    {
        return "unknown";
    }

    if (arg & WINED3DTA_COMPLEMENT)
    {
        /* const.x is 1.0, so this computes 1 - source. */
        shader_addline(buffer, "SUB arg%u, const.x, %s;\n", argnum, reg);
        if (argnum < sizeof(scratch_regs) / sizeof(*scratch_regs))
            reg = scratch_regs[argnum];
    }

    if (arg & WINED3DTA_ALPHAREPLICATE)
    {
        shader_addline(buffer, "MOV arg%u, %s.w;\n", argnum, reg);
        if (argnum < sizeof(scratch_regs) / sizeof(*scratch_regs))
            reg = scratch_regs[argnum];
    }

    return reg;
}

5898 5899 5900
/* Emit the ARB fragment program instructions implementing one fixed function
 * texture stage operation.
 *
 * buffer:  program text the instructions are appended to.
 * stage:   texture stage index.
 * color:   write the rgb channels of the destination.
 * alpha:   write the alpha channel of the destination.
 * dst:     destination selector; tempreg selects "tempreg", anything else
 *          selects "ret".
 * op:      WINED3D_TOP_* operation.
 * dw_arg0, dw_arg1, dw_arg2: WINED3DTA_* arguments of the operation.
 *
 * The destination is always one of the temporaries "tempreg" or "ret"; the
 * final write to result.color is done by the caller. The scaling operations
 * (2X / 4X / DOTPRODUCT3) therefore multiply the destination in place. */
static void gen_ffp_instr(struct wined3d_shader_buffer *buffer, unsigned int stage, BOOL color,
        BOOL alpha, DWORD dst, DWORD op, DWORD dw_arg0, DWORD dw_arg1, DWORD dw_arg2)
{
    const char *dstmask, *dstreg, *arg0, *arg1, *arg2;
    unsigned int mul = 1;

    if (color && alpha)
        dstmask = "";
    else if (color)
        dstmask = ".xyz";
    else
        dstmask = ".w";

    if (dst == tempreg)
        dstreg = "tempreg";
    else
        dstreg = "ret";

    /* Note that resolving the arguments may itself emit instructions for
     * the COMPLEMENT and ALPHAREPLICATE modifiers. */
    arg0 = get_argreg(buffer, 0, stage, dw_arg0);
    arg1 = get_argreg(buffer, 1, stage, dw_arg1);
    arg2 = get_argreg(buffer, 2, stage, dw_arg2);

    switch (op)
    {
        case WINED3D_TOP_DISABLE:
            if (!stage)
                shader_addline(buffer, "MOV %s%s, fragment.color.primary;\n", dstreg, dstmask);
            break;

        case WINED3D_TOP_SELECT_ARG2:
            arg1 = arg2;
            /* FALLTHROUGH */
        case WINED3D_TOP_SELECT_ARG1:
            shader_addline(buffer, "MOV %s%s, %s;\n", dstreg, dstmask, arg1);
            break;

        case WINED3D_TOP_MODULATE_4X:
            mul = 2;
            /* FALLTHROUGH */
        case WINED3D_TOP_MODULATE_2X:
            mul *= 2;
            /* FALLTHROUGH */
        case WINED3D_TOP_MODULATE:
            shader_addline(buffer, "MUL %s%s, %s, %s;\n", dstreg, dstmask, arg1, arg2);
            break;

        case WINED3D_TOP_ADD_SIGNED_2X:
            mul = 2;
            /* FALLTHROUGH */
        case WINED3D_TOP_ADD_SIGNED:
            /* const.w is 0.5, the bias of the signed addition. */
            shader_addline(buffer, "SUB arg2, %s, const.w;\n", arg2);
            arg2 = "arg2";
            /* FALLTHROUGH */
        case WINED3D_TOP_ADD:
            shader_addline(buffer, "ADD_SAT %s%s, %s, %s;\n", dstreg, dstmask, arg1, arg2);
            break;

        case WINED3D_TOP_SUBTRACT:
            shader_addline(buffer, "SUB_SAT %s%s, %s, %s;\n", dstreg, dstmask, arg1, arg2);
            break;

        case WINED3D_TOP_ADD_SMOOTH:
            shader_addline(buffer, "SUB arg1, const.x, %s;\n", arg1);
            shader_addline(buffer, "MAD_SAT %s%s, arg1, %s, %s;\n", dstreg, dstmask, arg2, arg1);
            break;

        /* The BLEND_*_ALPHA operations interpolate between arg1 and arg2
         * using the alpha of a fixed source, which replaces arg0. */
        case WINED3D_TOP_BLEND_CURRENT_ALPHA:
            arg0 = get_argreg(buffer, 0, stage, WINED3DTA_CURRENT);
            shader_addline(buffer, "LRP %s%s, %s.w, %s, %s;\n", dstreg, dstmask, arg0, arg1, arg2);
            break;
        case WINED3D_TOP_BLEND_FACTOR_ALPHA:
            arg0 = get_argreg(buffer, 0, stage, WINED3DTA_TFACTOR);
            shader_addline(buffer, "LRP %s%s, %s.w, %s, %s;\n", dstreg, dstmask, arg0, arg1, arg2);
            break;
        case WINED3D_TOP_BLEND_TEXTURE_ALPHA:
            arg0 = get_argreg(buffer, 0, stage, WINED3DTA_TEXTURE);
            shader_addline(buffer, "LRP %s%s, %s.w, %s, %s;\n", dstreg, dstmask, arg0, arg1, arg2);
            break;
        case WINED3D_TOP_BLEND_DIFFUSE_ALPHA:
            arg0 = get_argreg(buffer, 0, stage, WINED3DTA_DIFFUSE);
            shader_addline(buffer, "LRP %s%s, %s.w, %s, %s;\n", dstreg, dstmask, arg0, arg1, arg2);
            break;

        case WINED3D_TOP_BLEND_TEXTURE_ALPHA_PM:
            arg0 = get_argreg(buffer, 0, stage, WINED3DTA_TEXTURE);
            shader_addline(buffer, "SUB arg0.w, const.x, %s.w;\n", arg0);
            shader_addline(buffer, "MAD_SAT %s%s, %s, arg0.w, %s;\n", dstreg, dstmask, arg2, arg1);
            break;

        /* D3DTOP_PREMODULATE ???? */

        case WINED3D_TOP_MODULATE_INVALPHA_ADD_COLOR:
            shader_addline(buffer, "SUB arg0.w, const.x, %s;\n", arg1);
            shader_addline(buffer, "MAD_SAT %s%s, arg0.w, %s, %s;\n", dstreg, dstmask, arg2, arg1);
            break;
        case WINED3D_TOP_MODULATE_ALPHA_ADD_COLOR:
            shader_addline(buffer, "MAD_SAT %s%s, %s.w, %s, %s;\n", dstreg, dstmask, arg1, arg2, arg1);
            break;
        case WINED3D_TOP_MODULATE_INVCOLOR_ADD_ALPHA:
            shader_addline(buffer, "SUB arg0, const.x, %s;\n", arg1);
            shader_addline(buffer, "MAD_SAT %s%s, arg0, %s, %s.w;\n", dstreg, dstmask, arg2, arg1);
            break;
        case WINED3D_TOP_MODULATE_COLOR_ADD_ALPHA:
            shader_addline(buffer, "MAD_SAT %s%s, %s, %s, %s.w;\n", dstreg, dstmask, arg1, arg2, arg1);
            break;

        case WINED3D_TOP_DOTPRODUCT3:
            mul = 4;
            shader_addline(buffer, "SUB arg1, %s, const.w;\n", arg1);
            shader_addline(buffer, "SUB arg2, %s, const.w;\n", arg2);
            shader_addline(buffer, "DP3_SAT %s%s, arg1, arg2;\n", dstreg, dstmask);
            break;

        case WINED3D_TOP_MULTIPLY_ADD:
            shader_addline(buffer, "MAD_SAT %s%s, %s, %s, %s;\n", dstreg, dstmask, arg1, arg2, arg0);
            break;

        case WINED3D_TOP_LERP:
            /* The msdn is not quite right here */
            shader_addline(buffer, "LRP %s%s, %s, %s, %s;\n", dstreg, dstmask, arg0, arg1, arg2);
            break;

        case WINED3D_TOP_BUMPENVMAP:
        case WINED3D_TOP_BUMPENVMAP_LUMINANCE:
            /* Those are handled in the first pass of the shader(generation pass 1 and 2) already */
            break;

        default:
            FIXME("Unhandled texture op %08x\n", op);
            break;
    }

    /* Apply the 2x / 4x scale in place; const.y is 2.0 and const.z is 4.0. */
    if (mul == 2)
        shader_addline(buffer, "MUL_SAT %s%s, %s, const.y;\n", dstreg, dstmask, dstreg);
    else if (mul == 4)
        shader_addline(buffer, "MUL_SAT %s%s, %s, const.z;\n", dstreg, dstmask, dstreg);
}

6047
static GLuint gen_arbfp_ffp_shader(const struct ffp_frag_settings *settings, const struct wined3d_gl_info *gl_info)
6048
{
6049
    unsigned int stage;
6050
    struct wined3d_shader_buffer buffer;
6051 6052
    BOOL tex_read[MAX_TEXTURES] = {FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE};
    BOOL bump_used[MAX_TEXTURES] = {FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE};
6053
    BOOL luminance_used[MAX_TEXTURES] = {FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE};
6054
    UINT lowest_disabled_stage;
6055
    const char *textype;
6056
    const char *instr, *sat;
6057 6058 6059 6060
    char colorcor_dst[8];
    GLuint ret;
    DWORD arg0, arg1, arg2;
    BOOL tempreg_used = FALSE, tfactor_used = FALSE;
6061
    BOOL op_equal;
6062
    const char *final_combiner_src = "ret";
6063
    GLint pos;
6064 6065

    /* Find out which textures are read */
6066 6067 6068 6069
    for (stage = 0; stage < MAX_TEXTURES; ++stage)
    {
        if (settings->op[stage].cop == WINED3D_TOP_DISABLE)
            break;
6070 6071 6072 6073 6074 6075 6076
        arg0 = settings->op[stage].carg0 & WINED3DTA_SELECTMASK;
        arg1 = settings->op[stage].carg1 & WINED3DTA_SELECTMASK;
        arg2 = settings->op[stage].carg2 & WINED3DTA_SELECTMASK;
        if(arg0 == WINED3DTA_TEXTURE) tex_read[stage] = TRUE;
        if(arg1 == WINED3DTA_TEXTURE) tex_read[stage] = TRUE;
        if(arg2 == WINED3DTA_TEXTURE) tex_read[stage] = TRUE;

6077 6078 6079 6080 6081 6082
        if (settings->op[stage].cop == WINED3D_TOP_BLEND_TEXTURE_ALPHA)
            tex_read[stage] = TRUE;
        if (settings->op[stage].cop == WINED3D_TOP_BLEND_TEXTURE_ALPHA_PM)
            tex_read[stage] = TRUE;
        if (settings->op[stage].cop == WINED3D_TOP_BUMPENVMAP)
        {
6083 6084 6085
            bump_used[stage] = TRUE;
            tex_read[stage] = TRUE;
        }
6086 6087
        if (settings->op[stage].cop == WINED3D_TOP_BUMPENVMAP_LUMINANCE)
        {
6088 6089
            bump_used[stage] = TRUE;
            tex_read[stage] = TRUE;
6090
            luminance_used[stage] = TRUE;
6091 6092 6093
        }
        else if (settings->op[stage].cop == WINED3D_TOP_BLEND_FACTOR_ALPHA)
        {
6094
            tfactor_used = TRUE;
6095 6096 6097 6098 6099 6100 6101 6102 6103 6104 6105
        }

        if(arg0 == WINED3DTA_TFACTOR || arg1 == WINED3DTA_TFACTOR || arg2 == WINED3DTA_TFACTOR) {
            tfactor_used = TRUE;
        }

        if(settings->op[stage].dst == tempreg) tempreg_used = TRUE;
        if(arg0 == WINED3DTA_TEMP || arg1 == WINED3DTA_TEMP || arg2 == WINED3DTA_TEMP) {
            tempreg_used = TRUE;
        }

6106 6107
        if (settings->op[stage].aop == WINED3D_TOP_DISABLE)
            continue;
6108 6109 6110 6111 6112 6113 6114 6115 6116 6117 6118 6119 6120 6121
        arg0 = settings->op[stage].aarg0 & WINED3DTA_SELECTMASK;
        arg1 = settings->op[stage].aarg1 & WINED3DTA_SELECTMASK;
        arg2 = settings->op[stage].aarg2 & WINED3DTA_SELECTMASK;
        if(arg0 == WINED3DTA_TEXTURE) tex_read[stage] = TRUE;
        if(arg1 == WINED3DTA_TEXTURE) tex_read[stage] = TRUE;
        if(arg2 == WINED3DTA_TEXTURE) tex_read[stage] = TRUE;

        if(arg0 == WINED3DTA_TEMP || arg1 == WINED3DTA_TEMP || arg2 == WINED3DTA_TEMP) {
            tempreg_used = TRUE;
        }
        if(arg0 == WINED3DTA_TFACTOR || arg1 == WINED3DTA_TFACTOR || arg2 == WINED3DTA_TFACTOR) {
            tfactor_used = TRUE;
        }
    }
6122
    lowest_disabled_stage = stage;
6123 6124

    /* Shader header */
6125 6126 6127 6128 6129
    if (!shader_buffer_init(&buffer))
    {
        ERR("Failed to initialize shader buffer.\n");
        return 0;
    }
6130 6131 6132 6133 6134 6135 6136 6137 6138 6139 6140 6141

    shader_addline(&buffer, "!!ARBfp1.0\n");

    switch(settings->fog) {
        case FOG_OFF:                                                         break;
        case FOG_LINEAR: shader_addline(&buffer, "OPTION ARB_fog_linear;\n"); break;
        case FOG_EXP:    shader_addline(&buffer, "OPTION ARB_fog_exp;\n");    break;
        case FOG_EXP2:   shader_addline(&buffer, "OPTION ARB_fog_exp2;\n");   break;
        default: FIXME("Unexpected fog setting %d\n", settings->fog);
    }

    shader_addline(&buffer, "PARAM const = {1, 2, 4, 0.5};\n");
6142
    shader_addline(&buffer, "TEMP TMP;\n");
6143
    shader_addline(&buffer, "TEMP ret;\n");
6144
    if(tempreg_used || settings->sRGB_write) shader_addline(&buffer, "TEMP tempreg;\n");
6145 6146 6147 6148 6149 6150 6151 6152
    shader_addline(&buffer, "TEMP arg0;\n");
    shader_addline(&buffer, "TEMP arg1;\n");
    shader_addline(&buffer, "TEMP arg2;\n");
    for(stage = 0; stage < MAX_TEXTURES; stage++) {
        if(!tex_read[stage]) continue;
        shader_addline(&buffer, "TEMP tex%u;\n", stage);
        if(!bump_used[stage]) continue;
        shader_addline(&buffer, "PARAM bumpmat%u = program.env[%u];\n", stage, ARB_FFP_CONST_BUMPMAT(stage));
6153 6154
        if(!luminance_used[stage]) continue;
        shader_addline(&buffer, "PARAM luminance%u = program.env[%u];\n", stage, ARB_FFP_CONST_LUMINANCE(stage));
6155 6156 6157 6158
    }
    if(tfactor_used) {
        shader_addline(&buffer, "PARAM tfactor = program.env[%u];\n", ARB_FFP_CONST_TFACTOR);
    }
6159
        shader_addline(&buffer, "PARAM specular_enable = program.env[%u];\n", ARB_FFP_CONST_SPECULAR_ENABLE);
6160

6161
    if(settings->sRGB_write) {
6162 6163 6164 6165
        shader_addline(&buffer, "PARAM srgb_consts1 = {%f, %f, %f, %f};\n",
                       srgb_mul_low, srgb_cmp, srgb_pow, srgb_mul_high);
        shader_addline(&buffer, "PARAM srgb_consts2 = {%f, %f, %f, %f};\n",
                       srgb_sub_high, 0.0, 0.0, 0.0);
6166 6167
    }

6168
    if (lowest_disabled_stage < 7 && settings->emul_clipplanes)
6169
        shader_addline(&buffer, "KIL fragment.texcoord[7];\n");
6170

6171
    /* Generate texture sampling instructions) */
6172 6173 6174 6175
    for (stage = 0; stage < MAX_TEXTURES && settings->op[stage].cop != WINED3D_TOP_DISABLE; ++stage)
    {
        if (!tex_read[stage])
            continue;
6176 6177 6178 6179 6180 6181 6182 6183 6184 6185

        switch(settings->op[stage].tex_type) {
            case tex_1d:                    textype = "1D";     break;
            case tex_2d:                    textype = "2D";     break;
            case tex_3d:                    textype = "3D";     break;
            case tex_cube:                  textype = "CUBE";   break;
            case tex_rect:                  textype = "RECT";   break;
            default: textype = "unexpected_textype";   break;
        }

6186 6187
        if (settings->op[stage].cop == WINED3D_TOP_BUMPENVMAP
                || settings->op[stage].cop == WINED3D_TOP_BUMPENVMAP_LUMINANCE)
6188
            sat = "";
6189
        else
6190 6191
            sat = "_SAT";

6192 6193
        if(settings->op[stage].projected == proj_none) {
            instr = "TEX";
6194 6195
        } else if(settings->op[stage].projected == proj_count4 ||
                  settings->op[stage].projected == proj_count3) {
6196 6197
            instr = "TXP";
        } else {
6198
            FIXME("Unexpected projection mode %d\n", settings->op[stage].projected);
6199 6200 6201
            instr = "TXP";
        }

6202 6203 6204 6205
        if (stage > 0
                && (settings->op[stage - 1].cop == WINED3D_TOP_BUMPENVMAP
                || settings->op[stage - 1].cop == WINED3D_TOP_BUMPENVMAP_LUMINANCE))
        {
6206
            shader_addline(&buffer, "SWZ arg1, bumpmat%u, x, z, 0, 0;\n", stage - 1);
6207
            shader_addline(&buffer, "DP3 ret.x, arg1, tex%u;\n", stage - 1);
6208
            shader_addline(&buffer, "SWZ arg1, bumpmat%u, y, w, 0, 0;\n", stage - 1);
6209
            shader_addline(&buffer, "DP3 ret.y, arg1, tex%u;\n", stage - 1);
6210 6211

            /* with projective textures, texbem only divides the static texture coord, not the displacement,
6212
             * so multiply the displacement with the dividing parameter before passing it to TXP
6213 6214
             */
            if (settings->op[stage].projected != proj_none) {
6215
                if(settings->op[stage].projected == proj_count4) {
6216 6217
                    shader_addline(&buffer, "MOV ret.w, fragment.texcoord[%u].w;\n", stage);
                    shader_addline(&buffer, "MUL ret.xyz, ret, fragment.texcoord[%u].w, fragment.texcoord[%u];\n", stage, stage);
6218
                } else {
6219 6220
                    shader_addline(&buffer, "MOV ret.w, fragment.texcoord[%u].z;\n", stage);
                    shader_addline(&buffer, "MAD ret.xyz, ret, fragment.texcoord[%u].z, fragment.texcoord[%u];\n", stage, stage);
6221
                }
6222 6223 6224 6225
            } else {
                shader_addline(&buffer, "ADD ret, ret, fragment.texcoord[%u];\n", stage);
            }

6226
            shader_addline(&buffer, "%s%s tex%u, ret, texture[%u], %s;\n",
6227 6228 6229
                    instr, sat, stage, stage, textype);
            if (settings->op[stage - 1].cop == WINED3D_TOP_BUMPENVMAP_LUMINANCE)
            {
6230
                shader_addline(&buffer, "MAD_SAT ret.x, tex%u.z, luminance%u.x, luminance%u.y;\n",
6231
                               stage - 1, stage - 1, stage - 1);
6232
                shader_addline(&buffer, "MUL tex%u, tex%u, ret.x;\n", stage, stage);
6233
            }
6234 6235
        } else if(settings->op[stage].projected == proj_count3) {
            shader_addline(&buffer, "MOV ret, fragment.texcoord[%u];\n", stage);
6236
            shader_addline(&buffer, "MOV ret.w, ret.z;\n");
6237 6238
            shader_addline(&buffer, "%s%s tex%u, ret, texture[%u], %s;\n",
                            instr, sat, stage, stage, textype);
6239
        } else {
6240 6241
            shader_addline(&buffer, "%s%s tex%u, fragment.texcoord[%u], texture[%u], %s;\n",
                            instr, sat, stage, stage, stage, textype);
6242 6243 6244
        }

        sprintf(colorcor_dst, "tex%u", stage);
6245
        gen_color_correction(&buffer, colorcor_dst, WINED3DSP_WRITEMASK_ALL, "const.x", "const.y",
6246
                settings->op[stage].color_fixup);
6247 6248 6249
    }

    /* Generate the main shader */
6250 6251
    for (stage = 0; stage < MAX_TEXTURES; ++stage)
    {
6252
        if (settings->op[stage].cop == WINED3D_TOP_DISABLE)
6253
        {
6254 6255
            if (!stage)
                final_combiner_src = "fragment.color.primary";
6256 6257 6258
            break;
        }

6259 6260
        if (settings->op[stage].cop == WINED3D_TOP_SELECT_ARG1
                && settings->op[stage].aop == WINED3D_TOP_SELECT_ARG1)
6261
            op_equal = settings->op[stage].carg1 == settings->op[stage].aarg1;
6262 6263
        else if (settings->op[stage].cop == WINED3D_TOP_SELECT_ARG1
                && settings->op[stage].aop == WINED3D_TOP_SELECT_ARG2)
6264
            op_equal = settings->op[stage].carg1 == settings->op[stage].aarg2;
6265 6266
        else if (settings->op[stage].cop == WINED3D_TOP_SELECT_ARG2
                && settings->op[stage].aop == WINED3D_TOP_SELECT_ARG1)
6267
            op_equal = settings->op[stage].carg2 == settings->op[stage].aarg1;
6268 6269
        else if (settings->op[stage].cop == WINED3D_TOP_SELECT_ARG2
                && settings->op[stage].aop == WINED3D_TOP_SELECT_ARG2)
6270
            op_equal = settings->op[stage].carg2 == settings->op[stage].aarg2;
6271 6272 6273 6274 6275
        else
            op_equal = settings->op[stage].aop   == settings->op[stage].cop
                    && settings->op[stage].carg0 == settings->op[stage].aarg0
                    && settings->op[stage].carg1 == settings->op[stage].aarg1
                    && settings->op[stage].carg2 == settings->op[stage].aarg2;
6276

6277 6278
        if (settings->op[stage].aop == WINED3D_TOP_DISABLE)
        {
6279
            gen_ffp_instr(&buffer, stage, TRUE, FALSE, settings->op[stage].dst,
6280 6281
                          settings->op[stage].cop, settings->op[stage].carg0,
                          settings->op[stage].carg1, settings->op[stage].carg2);
6282
            if (!stage)
6283
                shader_addline(&buffer, "MOV ret.w, fragment.color.primary.w;\n");
6284 6285 6286
        }
        else if (op_equal)
        {
6287
            gen_ffp_instr(&buffer, stage, TRUE, TRUE, settings->op[stage].dst,
6288 6289 6290
                          settings->op[stage].cop, settings->op[stage].carg0,
                          settings->op[stage].carg1, settings->op[stage].carg2);
        } else {
6291
            gen_ffp_instr(&buffer, stage, TRUE, FALSE, settings->op[stage].dst,
6292 6293
                          settings->op[stage].cop, settings->op[stage].carg0,
                          settings->op[stage].carg1, settings->op[stage].carg2);
6294
            gen_ffp_instr(&buffer, stage, FALSE, TRUE, settings->op[stage].dst,
6295 6296 6297 6298 6299
                          settings->op[stage].aop, settings->op[stage].aarg0,
                          settings->op[stage].aarg1, settings->op[stage].aarg2);
        }
    }

6300
    if(settings->sRGB_write) {
6301
        shader_addline(&buffer, "MAD ret, fragment.color.secondary, specular_enable, %s;\n", final_combiner_src);
6302
        arbfp_add_sRGB_correction(&buffer, "ret", "arg0", "arg1", "arg2", "tempreg", FALSE);
6303
        shader_addline(&buffer, "MOV result.color, ret;\n");
6304 6305
    } else {
        shader_addline(&buffer, "MAD result.color, fragment.color.secondary, specular_enable, %s;\n", final_combiner_src);
6306
    }
6307 6308 6309 6310 6311 6312 6313

    /* Footer */
    shader_addline(&buffer, "END\n");

    /* Generate the shader */
    GL_EXTCALL(glGenProgramsARB(1, &ret));
    GL_EXTCALL(glBindProgramARB(GL_FRAGMENT_PROGRAM_ARB, ret));
6314 6315 6316
    GL_EXTCALL(glProgramStringARB(GL_FRAGMENT_PROGRAM_ARB, GL_PROGRAM_FORMAT_ASCII_ARB,
            strlen(buffer.buffer), buffer.buffer));
    checkGLcall("glProgramStringARB()");
6317

6318 6319 6320
    glGetIntegerv(GL_PROGRAM_ERROR_POSITION_ARB, &pos);
    if (pos != -1)
    {
6321
        FIXME("Fragment program error at position %d: %s\n\n", pos,
6322
              debugstr_a((const char *)glGetString(GL_PROGRAM_ERROR_STRING_ARB)));
6323
        shader_arb_dump_program_source(buffer.buffer);
6324
    }
6325 6326 6327 6328 6329 6330 6331 6332 6333
    else
    {
        GLint native;

        GL_EXTCALL(glGetProgramivARB(GL_FRAGMENT_PROGRAM_ARB, GL_PROGRAM_UNDER_NATIVE_LIMITS_ARB, &native));
        checkGLcall("glGetProgramivARB()");
        if (!native) WARN("Program exceeds native resource limits.\n");
    }

6334
    shader_buffer_free(&buffer);
6335 6336 6337
    return ret;
}

6338
static void fragment_prog_arbfp(struct wined3d_context *context, const struct wined3d_state *state, DWORD state_id)
6339
{
6340
    const struct wined3d_device *device = context->swapchain->device;
6341
    const struct wined3d_gl_info *gl_info = context->gl_info;
6342
    struct shader_arb_priv *priv = device->fragment_priv;
6343 6344
    BOOL use_vshader = use_vs(state);
    BOOL use_pshader = use_ps(state);
6345
    struct ffp_frag_settings settings;
6346
    const struct arbfp_ffp_desc *desc;
6347 6348
    unsigned int i;

6349
    TRACE("context %p, state %p, state_id %#x.\n", context, state, state_id);
6350

6351
    if (isStateDirty(context, STATE_RENDER(WINED3D_RS_FOGENABLE)))
6352 6353 6354 6355 6356 6357 6358
    {
        if (!use_pshader && device->shader_backend == &arb_program_shader_backend && context->last_was_pshader)
        {
            /* Reload fixed function constants since they collide with the
             * pixel shader constants. */
            for (i = 0; i < MAX_TEXTURES; ++i)
            {
6359
                set_bumpmat_arbfp(context, state, STATE_TEXTURESTAGE(i, WINED3D_TSS_BUMPENV_MAT00));
6360
            }
6361 6362
            state_texfactor_arbfp(context, state, STATE_RENDER(WINED3D_RS_TEXTUREFACTOR));
            state_arb_specularenable(context, state, STATE_RENDER(WINED3D_RS_SPECULARENABLE));
6363
        }
6364
        else if (use_pshader && !isStateDirty(context, context->state_table[STATE_VSHADER].representative))
6365
        {
6366
            device->shader_backend->shader_select(context, use_pshader, use_vshader);
6367 6368 6369
        }
        return;
    }
6370

6371 6372 6373 6374 6375
    if (!use_pshader)
    {
        /* Find or create a shader implementing the fixed function pipeline
         * settings, then activate it. */
        gen_ffp_frag_op(device, state, &settings, FALSE);
6376
        desc = (const struct arbfp_ffp_desc *)find_ffp_frag_shader(&priv->fragment_shaders, &settings);
6377
        if(!desc) {
6378 6379 6380
            struct arbfp_ffp_desc *new_desc = HeapAlloc(GetProcessHeap(), 0, sizeof(*new_desc));
            if (!new_desc)
            {
6381 6382 6383
                ERR("Out of memory\n");
                return;
            }
6384
            new_desc->num_textures_used = 0;
6385
            for (i = 0; i < gl_info->limits.texture_stages; ++i)
6386
            {
6387 6388
                if (settings.op[i].cop == WINED3D_TOP_DISABLE)
                    break;
6389
                new_desc->num_textures_used = i;
6390 6391
            }

6392
            memcpy(&new_desc->parent.settings, &settings, sizeof(settings));
6393
            new_desc->shader = gen_arbfp_ffp_shader(&settings, gl_info);
6394
            add_ffp_frag_shader(&priv->fragment_shaders, &new_desc->parent);
6395 6396
            TRACE("Allocated fixed function replacement shader descriptor %p\n", new_desc);
            desc = new_desc;
6397 6398
        }

6399
        /* Now activate the replacement program. GL_FRAGMENT_PROGRAM_ARB is already active (however, note the
6400 6401 6402 6403 6404
         * comment above the shader_select call below). If e.g. GLSL is active, the shader_select call will
         * deactivate it.
         */
        GL_EXTCALL(glBindProgramARB(GL_FRAGMENT_PROGRAM_ARB, desc->shader));
        checkGLcall("glBindProgramARB(GL_FRAGMENT_PROGRAM_ARB, desc->shader)");
6405
        priv->current_fprogram_id = desc->shader;
6406

6407 6408 6409 6410 6411 6412
        if (device->shader_backend == &arb_program_shader_backend && context->last_was_pshader)
        {
            /* Reload fixed function constants since they collide with the
             * pixel shader constants. */
            for (i = 0; i < MAX_TEXTURES; ++i)
            {
6413
                set_bumpmat_arbfp(context, state, STATE_TEXTURESTAGE(i, WINED3D_TSS_BUMPENV_MAT00));
6414
            }
6415 6416
            state_texfactor_arbfp(context, state, STATE_RENDER(WINED3D_RS_TEXTUREFACTOR));
            state_arb_specularenable(context, state, STATE_RENDER(WINED3D_RS_SPECULARENABLE));
6417
        }
6418 6419 6420
        context->last_was_pshader = FALSE;
    } else {
        context->last_was_pshader = TRUE;
6421 6422 6423
    }

    /* Finally, select the shader. If a pixel shader is used, it will be set and enabled by the shader backend.
6424 6425
     * If this shader backend is arbfp(most likely), then it will simply overwrite the last fixed function
     * replacement shader. If the shader backend is not ARB, it currently is important that the opengl implementation
6426 6427 6428
     * type overwrites GL_ARB_fragment_program. This is currently the case with GLSL. If we really want to use
     * atifs or nvrc pixel shaders with arb fragment programs we'd have to disable GL_FRAGMENT_PROGRAM_ARB here
     *
6429
     * Don't call shader_select if the vertex shader is dirty, because it will be called later on by the vertex
6430
     * shader handler.
6431
     */
6432 6433
    if (!isStateDirty(context, context->state_table[STATE_VSHADER].representative))
    {
6434
        device->shader_backend->shader_select(context, use_pshader, use_vshader);
6435

6436
        if (!isStateDirty(context, STATE_VERTEXSHADERCONSTANT) && (use_vshader || use_pshader))
6437
            context_apply_state(context, state, STATE_VERTEXSHADERCONSTANT);
6438
    }
6439 6440
    if (use_pshader)
        context_apply_state(context, state, STATE_PIXELSHADERCONSTANT);
6441 6442
}

6443 6444 6445 6446 6447 6448 6449 6450
/* Forwarding handler for the fog states.
 *
 * The fog states can't be linked to the fragment state directly because the
 * vertex pipeline links them to FOGENABLE, and the combined state table
 * can't express a different linking per pipeline part. As a side effect,
 * changing the fog start or fog end (linked to FOGENABLE on the vertex side)
 * ends up in fragment_prog_arbfp with FOGENABLE dirty, which is how this
 * function gets to run for those changes too. */
static void state_arbfp_fog(struct wined3d_context *context, const struct wined3d_state *state, DWORD state_id)
{
    enum fogsource new_source;

    TRACE("context %p, state %p, state_id %#x.\n", context, state, state_id);

    /* A dirty pixel shader state runs the fragment program handler itself. */
    if (!isStateDirty(context, STATE_PIXELSHADER))
        fragment_prog_arbfp(context, state, state_id);

    if (!state->render_states[WINED3D_RS_FOGENABLE])
        return;

    /* Work out where the fog coordinate comes from. */
    if (state->render_states[WINED3D_RS_FOGTABLEMODE] != WINED3D_FOG_NONE)
        new_source = FOGSOURCE_FFP;
    else if (use_vs(state))
        new_source = FOGSOURCE_VS;
    else if (state->render_states[WINED3D_RS_FOGVERTEXMODE] == WINED3D_FOG_NONE || context->last_was_rhw)
        new_source = FOGSOURCE_COORD;
    else
        new_source = FOGSOURCE_FFP;

    if (new_source == context->fog_source)
        return;

    /* The source changed; re-apply fog start / end for it. */
    context->fog_source = new_source;
    state_fogstartend(context, state, STATE_RENDER(WINED3D_RS_FOGSTART));
}

6488
/* State handler for the per-stage texture transform flags. It forwards to
 * fragment_prog_arbfp(), except when STATE_PIXELSHADER is dirty; that state
 * is mapped to fragment_prog_arbfp() in the state table as well, so the
 * forward would presumably be redundant in that case. */
static void textransform(struct wined3d_context *context, const struct wined3d_state *state, DWORD state_id)
{
    if (isStateDirty(context, STATE_PIXELSHADER))
        return;

    fragment_prog_arbfp(context, state, state_id);
}

6494 6495 6496
static const struct StateEntryTemplate arbfp_fragmentstate_template[] =
{
    {STATE_RENDER(WINED3D_RS_TEXTUREFACTOR),              { STATE_RENDER(WINED3D_RS_TEXTUREFACTOR),             state_texfactor_arbfp   }, WINED3D_GL_EXT_NONE             },
6497 6498 6499 6500 6501 6502 6503 6504 6505 6506 6507 6508 6509 6510 6511 6512 6513 6514 6515 6516 6517 6518 6519 6520 6521 6522 6523 6524 6525 6526 6527 6528 6529 6530 6531 6532 6533 6534 6535 6536 6537 6538 6539 6540 6541 6542 6543 6544 6545 6546 6547 6548 6549 6550 6551 6552 6553 6554 6555 6556 6557 6558 6559 6560 6561 6562 6563 6564 6565 6566 6567 6568 6569 6570 6571 6572 6573 6574 6575 6576 6577 6578 6579 6580 6581 6582 6583 6584 6585 6586 6587 6588 6589 6590 6591 6592 6593 6594 6595 6596 6597 6598 6599 6600 6601 6602 6603 6604 6605 6606 6607 6608 6609 6610 6611 6612 6613 6614 6615 6616
    {STATE_TEXTURESTAGE(0, WINED3D_TSS_COLOR_OP),         { STATE_PIXELSHADER,                                  NULL                    }, WINED3D_GL_EXT_NONE             },
    {STATE_TEXTURESTAGE(0, WINED3D_TSS_COLOR_ARG1),       { STATE_PIXELSHADER,                                  NULL                    }, WINED3D_GL_EXT_NONE             },
    {STATE_TEXTURESTAGE(0, WINED3D_TSS_COLOR_ARG2),       { STATE_PIXELSHADER,                                  NULL                    }, WINED3D_GL_EXT_NONE             },
    {STATE_TEXTURESTAGE(0, WINED3D_TSS_COLOR_ARG0),       { STATE_PIXELSHADER,                                  NULL                    }, WINED3D_GL_EXT_NONE             },
    {STATE_TEXTURESTAGE(0, WINED3D_TSS_ALPHA_OP),         { STATE_PIXELSHADER,                                  NULL                    }, WINED3D_GL_EXT_NONE             },
    {STATE_TEXTURESTAGE(0, WINED3D_TSS_ALPHA_ARG1),       { STATE_PIXELSHADER,                                  NULL                    }, WINED3D_GL_EXT_NONE             },
    {STATE_TEXTURESTAGE(0, WINED3D_TSS_ALPHA_ARG2),       { STATE_PIXELSHADER,                                  NULL                    }, WINED3D_GL_EXT_NONE             },
    {STATE_TEXTURESTAGE(0, WINED3D_TSS_ALPHA_ARG0),       { STATE_PIXELSHADER,                                  NULL                    }, WINED3D_GL_EXT_NONE             },
    {STATE_TEXTURESTAGE(0, WINED3D_TSS_RESULT_ARG),       { STATE_PIXELSHADER,                                  NULL                    }, WINED3D_GL_EXT_NONE             },
    {STATE_TEXTURESTAGE(0, WINED3D_TSS_BUMPENV_MAT00),    { STATE_TEXTURESTAGE(0, WINED3D_TSS_BUMPENV_MAT00),   set_bumpmat_arbfp       }, WINED3D_GL_EXT_NONE             },
    {STATE_TEXTURESTAGE(0, WINED3D_TSS_BUMPENV_MAT01),    { STATE_TEXTURESTAGE(0, WINED3D_TSS_BUMPENV_MAT00),   NULL                    }, WINED3D_GL_EXT_NONE             },
    {STATE_TEXTURESTAGE(0, WINED3D_TSS_BUMPENV_MAT10),    { STATE_TEXTURESTAGE(0, WINED3D_TSS_BUMPENV_MAT00),   NULL                    }, WINED3D_GL_EXT_NONE             },
    {STATE_TEXTURESTAGE(0, WINED3D_TSS_BUMPENV_MAT11),    { STATE_TEXTURESTAGE(0, WINED3D_TSS_BUMPENV_MAT00),   NULL                    }, WINED3D_GL_EXT_NONE             },
    {STATE_TEXTURESTAGE(0, WINED3D_TSS_BUMPENV_LSCALE),   { STATE_TEXTURESTAGE(0, WINED3D_TSS_BUMPENV_LSCALE),  tex_bumpenvlum_arbfp    }, WINED3D_GL_EXT_NONE             },
    {STATE_TEXTURESTAGE(0, WINED3D_TSS_BUMPENV_LOFFSET),  { STATE_TEXTURESTAGE(0, WINED3D_TSS_BUMPENV_LSCALE),  NULL                    }, WINED3D_GL_EXT_NONE             },
    {STATE_TEXTURESTAGE(1, WINED3D_TSS_COLOR_OP),         { STATE_PIXELSHADER,                                  NULL                    }, WINED3D_GL_EXT_NONE             },
    {STATE_TEXTURESTAGE(1, WINED3D_TSS_COLOR_ARG1),       { STATE_PIXELSHADER,                                  NULL                    }, WINED3D_GL_EXT_NONE             },
    {STATE_TEXTURESTAGE(1, WINED3D_TSS_COLOR_ARG2),       { STATE_PIXELSHADER,                                  NULL                    }, WINED3D_GL_EXT_NONE             },
    {STATE_TEXTURESTAGE(1, WINED3D_TSS_COLOR_ARG0),       { STATE_PIXELSHADER,                                  NULL                    }, WINED3D_GL_EXT_NONE             },
    {STATE_TEXTURESTAGE(1, WINED3D_TSS_ALPHA_OP),         { STATE_PIXELSHADER,                                  NULL                    }, WINED3D_GL_EXT_NONE             },
    {STATE_TEXTURESTAGE(1, WINED3D_TSS_ALPHA_ARG1),       { STATE_PIXELSHADER,                                  NULL                    }, WINED3D_GL_EXT_NONE             },
    {STATE_TEXTURESTAGE(1, WINED3D_TSS_ALPHA_ARG2),       { STATE_PIXELSHADER,                                  NULL                    }, WINED3D_GL_EXT_NONE             },
    {STATE_TEXTURESTAGE(1, WINED3D_TSS_ALPHA_ARG0),       { STATE_PIXELSHADER,                                  NULL                    }, WINED3D_GL_EXT_NONE             },
    {STATE_TEXTURESTAGE(1, WINED3D_TSS_RESULT_ARG),       { STATE_PIXELSHADER,                                  NULL                    }, WINED3D_GL_EXT_NONE             },
    {STATE_TEXTURESTAGE(1, WINED3D_TSS_BUMPENV_MAT00),    { STATE_TEXTURESTAGE(1, WINED3D_TSS_BUMPENV_MAT00),   set_bumpmat_arbfp       }, WINED3D_GL_EXT_NONE             },
    {STATE_TEXTURESTAGE(1, WINED3D_TSS_BUMPENV_MAT01),    { STATE_TEXTURESTAGE(1, WINED3D_TSS_BUMPENV_MAT00),   NULL                    }, WINED3D_GL_EXT_NONE             },
    {STATE_TEXTURESTAGE(1, WINED3D_TSS_BUMPENV_MAT10),    { STATE_TEXTURESTAGE(1, WINED3D_TSS_BUMPENV_MAT00),   NULL                    }, WINED3D_GL_EXT_NONE             },
    {STATE_TEXTURESTAGE(1, WINED3D_TSS_BUMPENV_MAT11),    { STATE_TEXTURESTAGE(1, WINED3D_TSS_BUMPENV_MAT00),   NULL                    }, WINED3D_GL_EXT_NONE             },
    {STATE_TEXTURESTAGE(1, WINED3D_TSS_BUMPENV_LSCALE),   { STATE_TEXTURESTAGE(1, WINED3D_TSS_BUMPENV_LSCALE),  tex_bumpenvlum_arbfp    }, WINED3D_GL_EXT_NONE             },
    {STATE_TEXTURESTAGE(1, WINED3D_TSS_BUMPENV_LOFFSET),  { STATE_TEXTURESTAGE(1, WINED3D_TSS_BUMPENV_LSCALE),  NULL                    }, WINED3D_GL_EXT_NONE             },
    {STATE_TEXTURESTAGE(2, WINED3D_TSS_COLOR_OP),         { STATE_PIXELSHADER,                                  NULL                    }, WINED3D_GL_EXT_NONE             },
    {STATE_TEXTURESTAGE(2, WINED3D_TSS_COLOR_ARG1),       { STATE_PIXELSHADER,                                  NULL                    }, WINED3D_GL_EXT_NONE             },
    {STATE_TEXTURESTAGE(2, WINED3D_TSS_COLOR_ARG2),       { STATE_PIXELSHADER,                                  NULL                    }, WINED3D_GL_EXT_NONE             },
    {STATE_TEXTURESTAGE(2, WINED3D_TSS_COLOR_ARG0),       { STATE_PIXELSHADER,                                  NULL                    }, WINED3D_GL_EXT_NONE             },
    {STATE_TEXTURESTAGE(2, WINED3D_TSS_ALPHA_OP),         { STATE_PIXELSHADER,                                  NULL                    }, WINED3D_GL_EXT_NONE             },
    {STATE_TEXTURESTAGE(2, WINED3D_TSS_ALPHA_ARG1),       { STATE_PIXELSHADER,                                  NULL                    }, WINED3D_GL_EXT_NONE             },
    {STATE_TEXTURESTAGE(2, WINED3D_TSS_ALPHA_ARG2),       { STATE_PIXELSHADER,                                  NULL                    }, WINED3D_GL_EXT_NONE             },
    {STATE_TEXTURESTAGE(2, WINED3D_TSS_ALPHA_ARG0),       { STATE_PIXELSHADER,                                  NULL                    }, WINED3D_GL_EXT_NONE             },
    {STATE_TEXTURESTAGE(2, WINED3D_TSS_RESULT_ARG),       { STATE_PIXELSHADER,                                  NULL                    }, WINED3D_GL_EXT_NONE             },
    {STATE_TEXTURESTAGE(2, WINED3D_TSS_BUMPENV_MAT00),    { STATE_TEXTURESTAGE(2, WINED3D_TSS_BUMPENV_MAT00),   set_bumpmat_arbfp       }, WINED3D_GL_EXT_NONE             },
    {STATE_TEXTURESTAGE(2, WINED3D_TSS_BUMPENV_MAT01),    { STATE_TEXTURESTAGE(2, WINED3D_TSS_BUMPENV_MAT00),   NULL                    }, WINED3D_GL_EXT_NONE             },
    {STATE_TEXTURESTAGE(2, WINED3D_TSS_BUMPENV_MAT10),    { STATE_TEXTURESTAGE(2, WINED3D_TSS_BUMPENV_MAT00),   NULL                    }, WINED3D_GL_EXT_NONE             },
    {STATE_TEXTURESTAGE(2, WINED3D_TSS_BUMPENV_MAT11),    { STATE_TEXTURESTAGE(2, WINED3D_TSS_BUMPENV_MAT00),   NULL                    }, WINED3D_GL_EXT_NONE             },
    {STATE_TEXTURESTAGE(2, WINED3D_TSS_BUMPENV_LSCALE),   { STATE_TEXTURESTAGE(2, WINED3D_TSS_BUMPENV_LSCALE),  tex_bumpenvlum_arbfp    }, WINED3D_GL_EXT_NONE             },
    {STATE_TEXTURESTAGE(2, WINED3D_TSS_BUMPENV_LOFFSET),  { STATE_TEXTURESTAGE(2, WINED3D_TSS_BUMPENV_LSCALE),  NULL                    }, WINED3D_GL_EXT_NONE             },
    {STATE_TEXTURESTAGE(3, WINED3D_TSS_COLOR_OP),         { STATE_PIXELSHADER,                                  NULL                    }, WINED3D_GL_EXT_NONE             },
    {STATE_TEXTURESTAGE(3, WINED3D_TSS_COLOR_ARG1),       { STATE_PIXELSHADER,                                  NULL                    }, WINED3D_GL_EXT_NONE             },
    {STATE_TEXTURESTAGE(3, WINED3D_TSS_COLOR_ARG2),       { STATE_PIXELSHADER,                                  NULL                    }, WINED3D_GL_EXT_NONE             },
    {STATE_TEXTURESTAGE(3, WINED3D_TSS_COLOR_ARG0),       { STATE_PIXELSHADER,                                  NULL                    }, WINED3D_GL_EXT_NONE             },
    {STATE_TEXTURESTAGE(3, WINED3D_TSS_ALPHA_OP),         { STATE_PIXELSHADER,                                  NULL                    }, WINED3D_GL_EXT_NONE             },
    {STATE_TEXTURESTAGE(3, WINED3D_TSS_ALPHA_ARG1),       { STATE_PIXELSHADER,                                  NULL                    }, WINED3D_GL_EXT_NONE             },
    {STATE_TEXTURESTAGE(3, WINED3D_TSS_ALPHA_ARG2),       { STATE_PIXELSHADER,                                  NULL                    }, WINED3D_GL_EXT_NONE             },
    {STATE_TEXTURESTAGE(3, WINED3D_TSS_ALPHA_ARG0),       { STATE_PIXELSHADER,                                  NULL                    }, WINED3D_GL_EXT_NONE             },
    {STATE_TEXTURESTAGE(3, WINED3D_TSS_RESULT_ARG),       { STATE_PIXELSHADER,                                  NULL                    }, WINED3D_GL_EXT_NONE             },
    {STATE_TEXTURESTAGE(3, WINED3D_TSS_BUMPENV_MAT00),    { STATE_TEXTURESTAGE(3, WINED3D_TSS_BUMPENV_MAT00),   set_bumpmat_arbfp       }, WINED3D_GL_EXT_NONE             },
    {STATE_TEXTURESTAGE(3, WINED3D_TSS_BUMPENV_MAT01),    { STATE_TEXTURESTAGE(3, WINED3D_TSS_BUMPENV_MAT00),   NULL                    }, WINED3D_GL_EXT_NONE             },
    {STATE_TEXTURESTAGE(3, WINED3D_TSS_BUMPENV_MAT10),    { STATE_TEXTURESTAGE(3, WINED3D_TSS_BUMPENV_MAT00),   NULL                    }, WINED3D_GL_EXT_NONE             },
    {STATE_TEXTURESTAGE(3, WINED3D_TSS_BUMPENV_MAT11),    { STATE_TEXTURESTAGE(3, WINED3D_TSS_BUMPENV_MAT00),   NULL                    }, WINED3D_GL_EXT_NONE             },
    {STATE_TEXTURESTAGE(3, WINED3D_TSS_BUMPENV_LSCALE),   { STATE_TEXTURESTAGE(3, WINED3D_TSS_BUMPENV_LSCALE),  tex_bumpenvlum_arbfp    }, WINED3D_GL_EXT_NONE             },
    {STATE_TEXTURESTAGE(3, WINED3D_TSS_BUMPENV_LOFFSET),  { STATE_TEXTURESTAGE(3, WINED3D_TSS_BUMPENV_LSCALE),  NULL                    }, WINED3D_GL_EXT_NONE             },
    {STATE_TEXTURESTAGE(4, WINED3D_TSS_COLOR_OP),         { STATE_PIXELSHADER,                                  NULL                    }, WINED3D_GL_EXT_NONE             },
    {STATE_TEXTURESTAGE(4, WINED3D_TSS_COLOR_ARG1),       { STATE_PIXELSHADER,                                  NULL                    }, WINED3D_GL_EXT_NONE             },
    {STATE_TEXTURESTAGE(4, WINED3D_TSS_COLOR_ARG2),       { STATE_PIXELSHADER,                                  NULL                    }, WINED3D_GL_EXT_NONE             },
    {STATE_TEXTURESTAGE(4, WINED3D_TSS_COLOR_ARG0),       { STATE_PIXELSHADER,                                  NULL                    }, WINED3D_GL_EXT_NONE             },
    {STATE_TEXTURESTAGE(4, WINED3D_TSS_ALPHA_OP),         { STATE_PIXELSHADER,                                  NULL                    }, WINED3D_GL_EXT_NONE             },
    {STATE_TEXTURESTAGE(4, WINED3D_TSS_ALPHA_ARG1),       { STATE_PIXELSHADER,                                  NULL                    }, WINED3D_GL_EXT_NONE             },
    {STATE_TEXTURESTAGE(4, WINED3D_TSS_ALPHA_ARG2),       { STATE_PIXELSHADER,                                  NULL                    }, WINED3D_GL_EXT_NONE             },
    {STATE_TEXTURESTAGE(4, WINED3D_TSS_ALPHA_ARG0),       { STATE_PIXELSHADER,                                  NULL                    }, WINED3D_GL_EXT_NONE             },
    {STATE_TEXTURESTAGE(4, WINED3D_TSS_RESULT_ARG),       { STATE_PIXELSHADER,                                  NULL                    }, WINED3D_GL_EXT_NONE             },
    {STATE_TEXTURESTAGE(4, WINED3D_TSS_BUMPENV_MAT00),    { STATE_TEXTURESTAGE(4, WINED3D_TSS_BUMPENV_MAT00),   set_bumpmat_arbfp       }, WINED3D_GL_EXT_NONE             },
    {STATE_TEXTURESTAGE(4, WINED3D_TSS_BUMPENV_MAT01),    { STATE_TEXTURESTAGE(4, WINED3D_TSS_BUMPENV_MAT00),   NULL                    }, WINED3D_GL_EXT_NONE             },
    {STATE_TEXTURESTAGE(4, WINED3D_TSS_BUMPENV_MAT10),    { STATE_TEXTURESTAGE(4, WINED3D_TSS_BUMPENV_MAT00),   NULL                    }, WINED3D_GL_EXT_NONE             },
    {STATE_TEXTURESTAGE(4, WINED3D_TSS_BUMPENV_MAT11),    { STATE_TEXTURESTAGE(4, WINED3D_TSS_BUMPENV_MAT00),   NULL                    }, WINED3D_GL_EXT_NONE             },
    {STATE_TEXTURESTAGE(4, WINED3D_TSS_BUMPENV_LSCALE),   { STATE_TEXTURESTAGE(4, WINED3D_TSS_BUMPENV_LSCALE),  tex_bumpenvlum_arbfp    }, WINED3D_GL_EXT_NONE             },
    {STATE_TEXTURESTAGE(4, WINED3D_TSS_BUMPENV_LOFFSET),  { STATE_TEXTURESTAGE(4, WINED3D_TSS_BUMPENV_LSCALE),  NULL                    }, WINED3D_GL_EXT_NONE             },
    {STATE_TEXTURESTAGE(5, WINED3D_TSS_COLOR_OP),         { STATE_PIXELSHADER,                                  NULL                    }, WINED3D_GL_EXT_NONE             },
    {STATE_TEXTURESTAGE(5, WINED3D_TSS_COLOR_ARG1),       { STATE_PIXELSHADER,                                  NULL                    }, WINED3D_GL_EXT_NONE             },
    {STATE_TEXTURESTAGE(5, WINED3D_TSS_COLOR_ARG2),       { STATE_PIXELSHADER,                                  NULL                    }, WINED3D_GL_EXT_NONE             },
    {STATE_TEXTURESTAGE(5, WINED3D_TSS_COLOR_ARG0),       { STATE_PIXELSHADER,                                  NULL                    }, WINED3D_GL_EXT_NONE             },
    {STATE_TEXTURESTAGE(5, WINED3D_TSS_ALPHA_OP),         { STATE_PIXELSHADER,                                  NULL                    }, WINED3D_GL_EXT_NONE             },
    {STATE_TEXTURESTAGE(5, WINED3D_TSS_ALPHA_ARG1),       { STATE_PIXELSHADER,                                  NULL                    }, WINED3D_GL_EXT_NONE             },
    {STATE_TEXTURESTAGE(5, WINED3D_TSS_ALPHA_ARG2),       { STATE_PIXELSHADER,                                  NULL                    }, WINED3D_GL_EXT_NONE             },
    {STATE_TEXTURESTAGE(5, WINED3D_TSS_ALPHA_ARG0),       { STATE_PIXELSHADER,                                  NULL                    }, WINED3D_GL_EXT_NONE             },
    {STATE_TEXTURESTAGE(5, WINED3D_TSS_RESULT_ARG),       { STATE_PIXELSHADER,                                  NULL                    }, WINED3D_GL_EXT_NONE             },
    {STATE_TEXTURESTAGE(5, WINED3D_TSS_BUMPENV_MAT00),    { STATE_TEXTURESTAGE(5, WINED3D_TSS_BUMPENV_MAT00),   set_bumpmat_arbfp       }, WINED3D_GL_EXT_NONE             },
    {STATE_TEXTURESTAGE(5, WINED3D_TSS_BUMPENV_MAT01),    { STATE_TEXTURESTAGE(5, WINED3D_TSS_BUMPENV_MAT00),   NULL                    }, WINED3D_GL_EXT_NONE             },
    {STATE_TEXTURESTAGE(5, WINED3D_TSS_BUMPENV_MAT10),    { STATE_TEXTURESTAGE(5, WINED3D_TSS_BUMPENV_MAT00),   NULL                    }, WINED3D_GL_EXT_NONE             },
    {STATE_TEXTURESTAGE(5, WINED3D_TSS_BUMPENV_MAT11),    { STATE_TEXTURESTAGE(5, WINED3D_TSS_BUMPENV_MAT00),   NULL                    }, WINED3D_GL_EXT_NONE             },
    {STATE_TEXTURESTAGE(5, WINED3D_TSS_BUMPENV_LSCALE),   { STATE_TEXTURESTAGE(5, WINED3D_TSS_BUMPENV_LSCALE),  tex_bumpenvlum_arbfp    }, WINED3D_GL_EXT_NONE             },
    {STATE_TEXTURESTAGE(5, WINED3D_TSS_BUMPENV_LOFFSET),  { STATE_TEXTURESTAGE(5, WINED3D_TSS_BUMPENV_LSCALE),  NULL                    }, WINED3D_GL_EXT_NONE             },
    {STATE_TEXTURESTAGE(6, WINED3D_TSS_COLOR_OP),         { STATE_PIXELSHADER,                                  NULL                    }, WINED3D_GL_EXT_NONE             },
    {STATE_TEXTURESTAGE(6, WINED3D_TSS_COLOR_ARG1),       { STATE_PIXELSHADER,                                  NULL                    }, WINED3D_GL_EXT_NONE             },
    {STATE_TEXTURESTAGE(6, WINED3D_TSS_COLOR_ARG2),       { STATE_PIXELSHADER,                                  NULL                    }, WINED3D_GL_EXT_NONE             },
    {STATE_TEXTURESTAGE(6, WINED3D_TSS_COLOR_ARG0),       { STATE_PIXELSHADER,                                  NULL                    }, WINED3D_GL_EXT_NONE             },
    {STATE_TEXTURESTAGE(6, WINED3D_TSS_ALPHA_OP),         { STATE_PIXELSHADER,                                  NULL                    }, WINED3D_GL_EXT_NONE             },
    {STATE_TEXTURESTAGE(6, WINED3D_TSS_ALPHA_ARG1),       { STATE_PIXELSHADER,                                  NULL                    }, WINED3D_GL_EXT_NONE             },
    {STATE_TEXTURESTAGE(6, WINED3D_TSS_ALPHA_ARG2),       { STATE_PIXELSHADER,                                  NULL                    }, WINED3D_GL_EXT_NONE             },
    {STATE_TEXTURESTAGE(6, WINED3D_TSS_ALPHA_ARG0),       { STATE_PIXELSHADER,                                  NULL                    }, WINED3D_GL_EXT_NONE             },
    {STATE_TEXTURESTAGE(6, WINED3D_TSS_RESULT_ARG),       { STATE_PIXELSHADER,                                  NULL                    }, WINED3D_GL_EXT_NONE             },
    {STATE_TEXTURESTAGE(6, WINED3D_TSS_BUMPENV_MAT00),    { STATE_TEXTURESTAGE(6, WINED3D_TSS_BUMPENV_MAT00),   set_bumpmat_arbfp       }, WINED3D_GL_EXT_NONE             },
    {STATE_TEXTURESTAGE(6, WINED3D_TSS_BUMPENV_MAT01),    { STATE_TEXTURESTAGE(6, WINED3D_TSS_BUMPENV_MAT00),   NULL                    }, WINED3D_GL_EXT_NONE             },
    {STATE_TEXTURESTAGE(6, WINED3D_TSS_BUMPENV_MAT10),    { STATE_TEXTURESTAGE(6, WINED3D_TSS_BUMPENV_MAT00),   NULL                    }, WINED3D_GL_EXT_NONE             },
    {STATE_TEXTURESTAGE(6, WINED3D_TSS_BUMPENV_MAT11),    { STATE_TEXTURESTAGE(6, WINED3D_TSS_BUMPENV_MAT00),   NULL                    }, WINED3D_GL_EXT_NONE             },
    {STATE_TEXTURESTAGE(6, WINED3D_TSS_BUMPENV_LSCALE),   { STATE_TEXTURESTAGE(6, WINED3D_TSS_BUMPENV_LSCALE),  tex_bumpenvlum_arbfp    }, WINED3D_GL_EXT_NONE             },
    {STATE_TEXTURESTAGE(6, WINED3D_TSS_BUMPENV_LOFFSET),  { STATE_TEXTURESTAGE(6, WINED3D_TSS_BUMPENV_LSCALE),  NULL                    }, WINED3D_GL_EXT_NONE             },
    {STATE_TEXTURESTAGE(7, WINED3D_TSS_COLOR_OP),         { STATE_PIXELSHADER,                                  NULL                    }, WINED3D_GL_EXT_NONE             },
    {STATE_TEXTURESTAGE(7, WINED3D_TSS_COLOR_ARG1),       { STATE_PIXELSHADER,                                  NULL                    }, WINED3D_GL_EXT_NONE             },
    {STATE_TEXTURESTAGE(7, WINED3D_TSS_COLOR_ARG2),       { STATE_PIXELSHADER,                                  NULL                    }, WINED3D_GL_EXT_NONE             },
    {STATE_TEXTURESTAGE(7, WINED3D_TSS_COLOR_ARG0),       { STATE_PIXELSHADER,                                  NULL                    }, WINED3D_GL_EXT_NONE             },
    {STATE_TEXTURESTAGE(7, WINED3D_TSS_ALPHA_OP),         { STATE_PIXELSHADER,                                  NULL                    }, WINED3D_GL_EXT_NONE             },
    {STATE_TEXTURESTAGE(7, WINED3D_TSS_ALPHA_ARG1),       { STATE_PIXELSHADER,                                  NULL                    }, WINED3D_GL_EXT_NONE             },
    {STATE_TEXTURESTAGE(7, WINED3D_TSS_ALPHA_ARG2),       { STATE_PIXELSHADER,                                  NULL                    }, WINED3D_GL_EXT_NONE             },
    {STATE_TEXTURESTAGE(7, WINED3D_TSS_ALPHA_ARG0),       { STATE_PIXELSHADER,                                  NULL                    }, WINED3D_GL_EXT_NONE             },
    {STATE_TEXTURESTAGE(7, WINED3D_TSS_RESULT_ARG),       { STATE_PIXELSHADER,                                  NULL                    }, WINED3D_GL_EXT_NONE             },
    {STATE_TEXTURESTAGE(7, WINED3D_TSS_BUMPENV_MAT00),    { STATE_TEXTURESTAGE(7, WINED3D_TSS_BUMPENV_MAT00),   set_bumpmat_arbfp       }, WINED3D_GL_EXT_NONE             },
    {STATE_TEXTURESTAGE(7, WINED3D_TSS_BUMPENV_MAT01),    { STATE_TEXTURESTAGE(7, WINED3D_TSS_BUMPENV_MAT00),   NULL                    }, WINED3D_GL_EXT_NONE             },
    {STATE_TEXTURESTAGE(7, WINED3D_TSS_BUMPENV_MAT10),    { STATE_TEXTURESTAGE(7, WINED3D_TSS_BUMPENV_MAT00),   NULL                    }, WINED3D_GL_EXT_NONE             },
    {STATE_TEXTURESTAGE(7, WINED3D_TSS_BUMPENV_MAT11),    { STATE_TEXTURESTAGE(7, WINED3D_TSS_BUMPENV_MAT00),   NULL                    }, WINED3D_GL_EXT_NONE             },
    {STATE_TEXTURESTAGE(7, WINED3D_TSS_BUMPENV_LSCALE),   { STATE_TEXTURESTAGE(7, WINED3D_TSS_BUMPENV_LSCALE),  tex_bumpenvlum_arbfp    }, WINED3D_GL_EXT_NONE             },
    {STATE_TEXTURESTAGE(7, WINED3D_TSS_BUMPENV_LOFFSET),  { STATE_TEXTURESTAGE(7, WINED3D_TSS_BUMPENV_LSCALE),  NULL                    }, WINED3D_GL_EXT_NONE             },
6617
    {STATE_PIXELSHADER,                                   { STATE_PIXELSHADER,                                  fragment_prog_arbfp     }, WINED3D_GL_EXT_NONE             },
6618 6619 6620 6621 6622 6623 6624 6625
    {STATE_RENDER(WINED3D_RS_FOGENABLE),                  { STATE_RENDER(WINED3D_RS_FOGENABLE),                 state_arbfp_fog         }, WINED3D_GL_EXT_NONE             },
    {STATE_RENDER(WINED3D_RS_FOGTABLEMODE),               { STATE_RENDER(WINED3D_RS_FOGENABLE),                 NULL                    }, WINED3D_GL_EXT_NONE             },
    {STATE_RENDER(WINED3D_RS_FOGVERTEXMODE),              { STATE_RENDER(WINED3D_RS_FOGENABLE),                 NULL                    }, WINED3D_GL_EXT_NONE             },
    {STATE_RENDER(WINED3D_RS_FOGSTART),                   { STATE_RENDER(WINED3D_RS_FOGSTART),                  state_fogstartend       }, WINED3D_GL_EXT_NONE             },
    {STATE_RENDER(WINED3D_RS_FOGEND),                     { STATE_RENDER(WINED3D_RS_FOGSTART),                  NULL                    }, WINED3D_GL_EXT_NONE             },
    {STATE_RENDER(WINED3D_RS_SRGBWRITEENABLE),            { STATE_PIXELSHADER,                                  NULL                    }, WINED3D_GL_EXT_NONE             },
    {STATE_RENDER(WINED3D_RS_FOGCOLOR),                   { STATE_RENDER(WINED3D_RS_FOGCOLOR),                  state_fogcolor          }, WINED3D_GL_EXT_NONE             },
    {STATE_RENDER(WINED3D_RS_FOGDENSITY),                 { STATE_RENDER(WINED3D_RS_FOGDENSITY),                state_fogdensity        }, WINED3D_GL_EXT_NONE             },
6626 6627 6628 6629 6630 6631 6632 6633
    {STATE_TEXTURESTAGE(0,WINED3D_TSS_TEXTURE_TRANSFORM_FLAGS), {STATE_TEXTURESTAGE(0, WINED3D_TSS_TEXTURE_TRANSFORM_FLAGS), textransform      }, WINED3D_GL_EXT_NONE             },
    {STATE_TEXTURESTAGE(1,WINED3D_TSS_TEXTURE_TRANSFORM_FLAGS), {STATE_TEXTURESTAGE(1, WINED3D_TSS_TEXTURE_TRANSFORM_FLAGS), textransform      }, WINED3D_GL_EXT_NONE             },
    {STATE_TEXTURESTAGE(2,WINED3D_TSS_TEXTURE_TRANSFORM_FLAGS), {STATE_TEXTURESTAGE(2, WINED3D_TSS_TEXTURE_TRANSFORM_FLAGS), textransform      }, WINED3D_GL_EXT_NONE             },
    {STATE_TEXTURESTAGE(3,WINED3D_TSS_TEXTURE_TRANSFORM_FLAGS), {STATE_TEXTURESTAGE(3, WINED3D_TSS_TEXTURE_TRANSFORM_FLAGS), textransform      }, WINED3D_GL_EXT_NONE             },
    {STATE_TEXTURESTAGE(4,WINED3D_TSS_TEXTURE_TRANSFORM_FLAGS), {STATE_TEXTURESTAGE(4, WINED3D_TSS_TEXTURE_TRANSFORM_FLAGS), textransform      }, WINED3D_GL_EXT_NONE             },
    {STATE_TEXTURESTAGE(5,WINED3D_TSS_TEXTURE_TRANSFORM_FLAGS), {STATE_TEXTURESTAGE(5, WINED3D_TSS_TEXTURE_TRANSFORM_FLAGS), textransform      }, WINED3D_GL_EXT_NONE             },
    {STATE_TEXTURESTAGE(6,WINED3D_TSS_TEXTURE_TRANSFORM_FLAGS), {STATE_TEXTURESTAGE(6, WINED3D_TSS_TEXTURE_TRANSFORM_FLAGS), textransform      }, WINED3D_GL_EXT_NONE             },
    {STATE_TEXTURESTAGE(7,WINED3D_TSS_TEXTURE_TRANSFORM_FLAGS), {STATE_TEXTURESTAGE(7, WINED3D_TSS_TEXTURE_TRANSFORM_FLAGS), textransform      }, WINED3D_GL_EXT_NONE             },
6634
    {STATE_RENDER(WINED3D_RS_SPECULARENABLE),             { STATE_RENDER(WINED3D_RS_SPECULARENABLE),            state_arb_specularenable}, WINED3D_GL_EXT_NONE             },
6635
    {0 /* Terminate */,                                   { 0,                                                  0                       }, WINED3D_GL_EXT_NONE             },
6636 6637 6638 6639 6640 6641 6642
};

const struct fragment_pipeline arbfp_fragment_pipeline = {
    arbfp_enable,
    arbfp_get_caps,
    arbfp_alloc,
    arbfp_free,
6643
    shader_arb_color_fixup_supported,
6644 6645
    arbfp_fragmentstate_template,
    TRUE /* We can disable projected textures */
6646
};
6647 6648 6649 6650

struct arbfp_blit_priv {
    GLenum yuy2_rect_shader, yuy2_2d_shader;
    GLenum uyvy_rect_shader, uyvy_2d_shader;
6651
    GLenum yv12_rect_shader, yv12_2d_shader;
6652
    GLenum p8_rect_shader, p8_2d_shader;
6653
    GLuint palette_texture;
6654 6655
};

6656
static HRESULT arbfp_blit_alloc(struct wined3d_device *device)
6657
{
6658 6659 6660 6661 6662 6663 6664
    device->blit_priv = HeapAlloc(GetProcessHeap(), HEAP_ZERO_MEMORY, sizeof(struct arbfp_blit_priv));
    if(!device->blit_priv) {
        ERR("Out of memory\n");
        return E_OUTOFMEMORY;
    }
    return WINED3D_OK;
}
6665 6666

/* Context activation is done by the caller. */
6667
static void arbfp_blit_free(struct wined3d_device *device)
6668
{
6669
    const struct wined3d_gl_info *gl_info = &device->adapter->gl_info;
6670
    struct arbfp_blit_priv *priv = device->blit_priv;
6671 6672 6673 6674 6675 6676

    ENTER_GL();
    GL_EXTCALL(glDeleteProgramsARB(1, &priv->yuy2_rect_shader));
    GL_EXTCALL(glDeleteProgramsARB(1, &priv->yuy2_2d_shader));
    GL_EXTCALL(glDeleteProgramsARB(1, &priv->uyvy_rect_shader));
    GL_EXTCALL(glDeleteProgramsARB(1, &priv->uyvy_2d_shader));
6677 6678
    GL_EXTCALL(glDeleteProgramsARB(1, &priv->yv12_rect_shader));
    GL_EXTCALL(glDeleteProgramsARB(1, &priv->yv12_2d_shader));
6679 6680 6681
    GL_EXTCALL(glDeleteProgramsARB(1, &priv->p8_rect_shader));
    GL_EXTCALL(glDeleteProgramsARB(1, &priv->p8_2d_shader));
    checkGLcall("Delete yuv and p8 programs");
6682 6683

    if(priv->palette_texture) glDeleteTextures(1, &priv->palette_texture);
6684
    LEAVE_GL();
6685 6686 6687

    HeapFree(GetProcessHeap(), 0, device->blit_priv);
    device->blit_priv = NULL;
6688 6689
}

6690
static BOOL gen_planar_yuv_read(struct wined3d_shader_buffer *buffer, enum complex_fixup fixup,
6691
        GLenum textype, char *luminance)
6692
{
6693
    char chroma;
6694 6695
    const char *tex, *texinstr;

6696
    if (fixup == COMPLEX_FIXUP_UYVY) {
6697 6698
        chroma = 'x';
        *luminance = 'w';
6699
    } else {
6700 6701
        chroma = 'w';
        *luminance = 'x';
6702
    }
6703 6704 6705 6706 6707 6708 6709 6710
    switch(textype) {
        case GL_TEXTURE_2D:             tex = "2D";     texinstr = "TXP"; break;
        case GL_TEXTURE_RECTANGLE_ARB:  tex = "RECT";   texinstr = "TEX"; break;
        default:
            /* This is more tricky than just replacing the texture type - we have to navigate
             * properly in the texture to find the correct chroma values
             */
            FIXME("Implement yuv correction for non-2d, non-rect textures\n");
6711
            return FALSE;
6712 6713
    }

6714 6715 6716 6717 6718 6719 6720 6721 6722 6723 6724 6725 6726
    /* First we have to read the chroma values. This means we need at least two pixels(no filtering),
     * or 4 pixels(with filtering). To get the unmodified chromas, we have to rid ourselves of the
     * filtering when we sample the texture.
     *
     * These are the rules for reading the chroma:
     *
     * Even pixel: Cr
     * Even pixel: U
     * Odd pixel: V
     *
     * So we have to get the sampling x position in non-normalized coordinates in integers
     */
    if(textype != GL_TEXTURE_RECTANGLE_ARB) {
6727 6728
        shader_addline(buffer, "MUL texcrd.xy, fragment.texcoord[0], size.x;\n");
        shader_addline(buffer, "MOV texcrd.w, size.x;\n");
6729
    } else {
6730 6731 6732 6733 6734 6735 6736 6737 6738 6739 6740 6741 6742 6743 6744 6745 6746 6747 6748 6749
        shader_addline(buffer, "MOV texcrd, fragment.texcoord[0];\n");
    }
    /* We must not allow filtering between pixel x and x+1, this would mix U and V
     * Vertical filtering is ok. However, bear in mind that the pixel center is at
     * 0.5, so add 0.5.
     */
    shader_addline(buffer, "FLR texcrd.x, texcrd.x;\n");
    shader_addline(buffer, "ADD texcrd.x, texcrd.x, coef.y;\n");

    /* Divide the x coordinate by 0.5 and get the fraction. This gives 0.25 and 0.75 for the
     * even and odd pixels respectively
     */
    shader_addline(buffer, "MUL texcrd2, texcrd, coef.y;\n");
    shader_addline(buffer, "FRC texcrd2, texcrd2;\n");

    /* Sample Pixel 1 */
    shader_addline(buffer, "%s luminance, texcrd, texture[0], %s;\n", texinstr, tex);

    /* Put the value into either of the chroma values */
    shader_addline(buffer, "SGE temp.x, texcrd2.x, coef.y;\n");
6750
    shader_addline(buffer, "MUL chroma.x, luminance.%c, temp.x;\n", chroma);
6751
    shader_addline(buffer, "SLT temp.x, texcrd2.x, coef.y;\n");
6752
    shader_addline(buffer, "MUL chroma.y, luminance.%c, temp.x;\n", chroma);
6753 6754 6755 6756 6757 6758 6759 6760 6761 6762 6763

    /* Sample pixel 2. If we read an even pixel(SLT above returned 1), sample
     * the pixel right to the current one. Otherwise, sample the left pixel.
     * Bias and scale the SLT result to -1;1 and add it to the texcrd.x.
     */
    shader_addline(buffer, "MAD temp.x, temp.x, coef.z, -coef.x;\n");
    shader_addline(buffer, "ADD texcrd.x, texcrd, temp.x;\n");
    shader_addline(buffer, "%s luminance, texcrd, texture[0], %s;\n", texinstr, tex);

    /* Put the value into the other chroma */
    shader_addline(buffer, "SGE temp.x, texcrd2.x, coef.y;\n");
6764
    shader_addline(buffer, "MAD chroma.y, luminance.%c, temp.x, chroma.y;\n", chroma);
6765
    shader_addline(buffer, "SLT temp.x, texcrd2.x, coef.y;\n");
6766
    shader_addline(buffer, "MAD chroma.x, luminance.%c, temp.x, chroma.x;\n", chroma);
6767 6768 6769 6770 6771 6772 6773 6774 6775 6776 6777

    /* TODO: If filtering is enabled, sample a 2nd pair of pixels left or right of
     * the current one and lerp the two U and V values
     */

    /* This gives the correctly filtered luminance value */
    shader_addline(buffer, "TEX luminance, fragment.texcoord[0], texture[0], %s;\n", tex);

    return TRUE;
}

6778
/* Append the ARB fragment program instructions that read a YV12 texel.
 *
 * The emitted code relies on the registers declared by the caller (texcrd,
 * texcrd2, temp, chroma, luminance, coef and size). It stores the first
 * chroma plane sample in chroma.x, the second one in chroma.y and the
 * luminance sample in the "luminance" register.
 *
 * buffer:    shader buffer the instructions are appended to.
 * textype:   GL texture target; only GL_TEXTURE_2D and
 *            GL_TEXTURE_RECTANGLE_ARB are handled.
 * luminance: receives the component of the "luminance" register that holds
 *            the luminance value ('a').
 *
 * Returns TRUE on success, FALSE for an unsupported texture target. */
static BOOL gen_yv12_read(struct wined3d_shader_buffer *buffer, GLenum textype, char *luminance)
{
    const char *tex;

    switch(textype) {
        case GL_TEXTURE_2D:             tex = "2D";     break;
        case GL_TEXTURE_RECTANGLE_ARB:  tex = "RECT";   break;
        default:
            FIXME("Implement yv12 correction for non-2d, non-rect textures\n");
            return FALSE;
    }

    /* YV12 surfaces contain a WxH sized luminance plane, followed by a (W/2)x(H/2)
     * V and a (W/2)x(H/2) U plane, each with 8 bit per pixel. So the effective
     * bitdepth is 12 bits per pixel. Since the U and V planes have only half the
     * pitch of the luminance plane, the packing into the gl texture is a bit
     * unfortunate. If the whole texture is interpreted as luminance data it looks
     * approximately like this:
     *
     *        +----------------------------------+----
     *        |                                  |
     *        |                                  |
     *        |                                  |
     *        |                                  |
     *        |                                  |   2
     *        |            LUMINANCE             |   -
     *        |                                  |   3
     *        |                                  |
     *        |                                  |
     *        |                                  |
     *        |                                  |
     *        +----------------+-----------------+----
     *        |                |                 |
     *        |  V even rows   |  V odd rows     |
     *        |                |                 |   1
     *        +----------------+------------------   -
     *        |                |                 |   3
     *        |  U even rows   |  U odd rows     |
     *        |                |                 |
     *        +----------------+-----------------+----
     *        |                |                 |
     *        |     0.5        |       0.5       |
     *
     * So it appears as if there are 4 chroma images, but in fact the odd rows
     * in the chroma images are in the same row as the even ones. So it is
     * kinda tricky to read.
     *
     * When reading from rectangle textures, keep in mind that the input y coordinates
     * go from 0 to d3d_height, whereas the opengl texture height is 1.5 * d3d_height
     */
    /* yv12_coef = {2/3, 1/6, 5/6, 1/3} */
    shader_addline(buffer, "PARAM yv12_coef = {%f, %f, %f, %f};\n",
            2.0f / 3.0f, 1.0f / 6.0f, (2.0f / 3.0f) + (1.0f / 6.0f), 1.0f / 3.0f);

    shader_addline(buffer, "MOV texcrd, fragment.texcoord[0];\n");
    /* the chroma planes have only half the width */
    shader_addline(buffer, "MUL texcrd.x, texcrd.x, coef.y;\n");

    /* The first value is between 2/3 and 5/6th of the texture's height, so scale+bias
     * the coordinate. Also read the right side of the image when reading odd lines
     *
     * Don't forget to clamp the y values into the range, otherwise we'll get filtering
     * bleeding
     */
    if(textype == GL_TEXTURE_2D) {

        shader_addline(buffer, "RCP chroma.w, size.y;\n");

        shader_addline(buffer, "MUL texcrd2.y, texcrd.y, size.y;\n");

        shader_addline(buffer, "FLR texcrd2.y, texcrd2.y;\n");
        shader_addline(buffer, "MAD texcrd.y, texcrd.y, yv12_coef.y, yv12_coef.x;\n");

        /* Read odd lines from the right side (add size * 0.5 to the x coordinate) */
        shader_addline(buffer, "ADD texcrd2.x, texcrd2.y, yv12_coef.y;\n"); /* To avoid 0.5 == 0.5 comparisons */
        shader_addline(buffer, "FRC texcrd2.x, texcrd2.x;\n");
        shader_addline(buffer, "SGE texcrd2.x, texcrd2.x, coef.y;\n");
        shader_addline(buffer, "MAD texcrd.x, texcrd2.x, coef.y, texcrd.x;\n");

        /* clamp, keep the half pixel origin in mind */
        shader_addline(buffer, "MAD temp.y, coef.y, chroma.w, yv12_coef.x;\n");
        shader_addline(buffer, "MAX texcrd.y, temp.y, texcrd.y;\n");
        shader_addline(buffer, "MAD temp.y, -coef.y, chroma.w, yv12_coef.z;\n");
        shader_addline(buffer, "MIN texcrd.y, temp.y, texcrd.y;\n");
    } else {
        /* Read from [size - size+size/4] */
        shader_addline(buffer, "FLR texcrd.y, texcrd.y;\n");
        shader_addline(buffer, "MAD texcrd.y, texcrd.y, coef.w, size.y;\n");

        /* Read odd lines from the right side (add size * 0.5 to the x coordinate) */
        shader_addline(buffer, "ADD texcrd2.x, texcrd.y, yv12_coef.y;\n"); /* To avoid 0.5 == 0.5 comparisons */
        shader_addline(buffer, "FRC texcrd2.x, texcrd2.x;\n");
        shader_addline(buffer, "SGE texcrd2.x, texcrd2.x, coef.y;\n");
        shader_addline(buffer, "MUL texcrd2.x, texcrd2.x, size.x;\n");
        shader_addline(buffer, "MAD texcrd.x, texcrd2.x, coef.y, texcrd.x;\n");

        /* Make sure to read exactly from the pixel center */
        shader_addline(buffer, "FLR texcrd.y, texcrd.y;\n");
        shader_addline(buffer, "ADD texcrd.y, texcrd.y, coef.y;\n");

        /* Clamp */
        shader_addline(buffer, "MAD temp.y, size.y, coef.w, size.y;\n");
        shader_addline(buffer, "ADD temp.y, temp.y, -coef.y;\n");
        shader_addline(buffer, "MIN texcrd.y, temp.y, texcrd.y;\n");
        shader_addline(buffer, "ADD temp.y, size.y, -coef.y;\n");
        shader_addline(buffer, "MAX texcrd.y, temp.y, texcrd.y;\n");
    }
    /* Read the texture, put the result into the output register */
    shader_addline(buffer, "TEX temp, texcrd, texture[0], %s;\n", tex);
    shader_addline(buffer, "MOV chroma.x, temp.w;\n");

    /* The other chroma value is 1/6th of the texture lower, from 5/6th to 6/6th
     * No need to clamp because we're just reusing the already clamped value from above
     */
    if(textype == GL_TEXTURE_2D) {
        shader_addline(buffer, "ADD texcrd.y, texcrd.y, yv12_coef.y;\n");
    } else {
        shader_addline(buffer, "MAD texcrd.y, size.y, coef.w, texcrd.y;\n");
    }
    shader_addline(buffer, "TEX temp, texcrd, texture[0], %s;\n", tex);
    shader_addline(buffer, "MOV chroma.y, temp.w;\n");

    /* Sample the luminance value. It is in the top 2/3rd of the texture, so scale the y coordinate.
     * Clamp the y coordinate to prevent the chroma values from bleeding into the sampled luminance
     * values due to filtering
     */
    shader_addline(buffer, "MOV texcrd, fragment.texcoord[0];\n");
    if(textype == GL_TEXTURE_2D) {
        /* Multiply the y coordinate by 2/3 and clamp it */
        shader_addline(buffer, "MUL texcrd.y, texcrd.y, yv12_coef.x;\n");
        shader_addline(buffer, "MAD temp.y, -coef.y, chroma.w, yv12_coef.x;\n");
        shader_addline(buffer, "MIN texcrd.y, temp.y, texcrd.y;\n");
        shader_addline(buffer, "TEX luminance, texcrd, texture[0], %s;\n", tex);
    } else {
        /* Reading from texture_rectangles is pretty straightforward, just use the unmodified
         * texture coordinate. It is still a good idea to clamp it though, since the opengl texture
         * is bigger. The upper bound is the center of the last luminance row
         * (size.y - 0.5), which is what temp.x holds; clamping against the
         * width (size.x) would be wrong for non-square surfaces.
         */
        shader_addline(buffer, "ADD temp.x, size.y, -coef.y;\n");
        shader_addline(buffer, "MIN texcrd.y, texcrd.y, temp.x;\n");
        shader_addline(buffer, "TEX luminance, texcrd, texture[0], %s;\n", tex);
    }
    /* The luminance ends up in the alpha component of the sampled texel. */
    *luminance = 'a';

    return TRUE;
}

6924 6925
/* Build the ARB fragment program used to blit P8 (palettized) surfaces.
 *
 * The program samples the palette index from the alpha component of
 * texture[0] and uses it to look up the final colour in the 1D palette
 * texture bound to texture[1].
 *
 * priv:    blitter private data; the new program name is cached in
 *          p8_rect_shader or p8_2d_shader depending on textype.
 * gl_info: GL info used for the extension calls.
 * textype: GL_TEXTURE_RECTANGLE_ARB selects the RECT variant, anything else
 *          the 2D variant.
 *
 * Returns the program name, or 0 if the shader buffer or the program object
 * could not be created. */
static GLuint gen_p8_shader(struct arbfp_blit_priv *priv,
        const struct wined3d_gl_info *gl_info, GLenum textype)
{
    /* Program names are GLuint. Start from 0 so the check below is well
     * defined even if glGenProgramsARB() fails and leaves the value untouched. */
    GLuint shader = 0;
    struct wined3d_shader_buffer buffer;
    GLint pos;

    /* Shader header */
    if (!shader_buffer_init(&buffer))
    {
        ERR("Failed to initialize shader buffer.\n");
        return 0;
    }

    ENTER_GL();
    GL_EXTCALL(glGenProgramsARB(1, &shader));
    GL_EXTCALL(glBindProgramARB(GL_FRAGMENT_PROGRAM_ARB, shader));
    LEAVE_GL();
    if(!shader) {
        shader_buffer_free(&buffer);
        return 0;
    }

    shader_addline(&buffer, "!!ARBfp1.0\n");
    shader_addline(&buffer, "TEMP index;\n");

    /* { 255/256, 0.5/255*255/256, 0, 0 } */
    shader_addline(&buffer, "PARAM constants = { 0.996, 0.00195, 0, 0 };\n");

    /* The alpha-component contains the palette index */
    if(textype == GL_TEXTURE_RECTANGLE_ARB)
        shader_addline(&buffer, "TXP index, fragment.texcoord[0], texture[0], RECT;\n");
    else
        shader_addline(&buffer, "TEX index, fragment.texcoord[0], texture[0], 2D;\n");

    /* Scale the index by 255/256 and add a bias of '0.5' in order to sample in the middle */
    shader_addline(&buffer, "MAD index.a, index.a, constants.x, constants.y;\n");

    /* Use the alpha-component as an index in the palette to get the final color */
    shader_addline(&buffer, "TEX result.color, index.a, texture[1], 1D;\n");
    shader_addline(&buffer, "END\n");

    ENTER_GL();
    GL_EXTCALL(glProgramStringARB(GL_FRAGMENT_PROGRAM_ARB, GL_PROGRAM_FORMAT_ASCII_ARB,
            strlen(buffer.buffer), buffer.buffer));
    checkGLcall("glProgramStringARB()");

    /* A position other than -1 means the program string was rejected. */
    glGetIntegerv(GL_PROGRAM_ERROR_POSITION_ARB, &pos);
    if (pos != -1)
    {
        FIXME("Fragment program error at position %d: %s\n\n", pos,
              debugstr_a((const char *)glGetString(GL_PROGRAM_ERROR_STRING_ARB)));
        shader_arb_dump_program_source(buffer.buffer);
    }

    /* Cache the program so it is only generated once per texture target. */
    if (textype == GL_TEXTURE_RECTANGLE_ARB)
        priv->p8_rect_shader = shader;
    else
        priv->p8_2d_shader = shader;

    shader_buffer_free(&buffer);
    LEAVE_GL();

    return shader;
}

6990
/* Context activation is done by the caller. */
6991
static void upload_palette(const struct wined3d_surface *surface, struct wined3d_context *context)
6992 6993
{
    BYTE table[256][4];
6994
    struct wined3d_device *device = surface->resource.device;
6995
    const struct wined3d_gl_info *gl_info = &device->adapter->gl_info;
6996 6997 6998 6999 7000 7001
    struct arbfp_blit_priv *priv = device->blit_priv;
    BOOL colorkey = (surface->CKeyFlags & WINEDDSD_CKSRCBLT) ? TRUE : FALSE;

    d3dfmt_p8_init_palette(surface, table, colorkey);

    ENTER_GL();
7002 7003 7004 7005 7006 7007 7008

    if (gl_info->supported[APPLE_CLIENT_STORAGE])
    {
        glPixelStorei(GL_UNPACK_CLIENT_STORAGE_APPLE, GL_FALSE);
        checkGLcall("glPixelStorei(GL_UNPACK_CLIENT_STORAGE_APPLE, GL_FALSE)");
    }

7009 7010 7011 7012 7013 7014 7015 7016 7017 7018 7019 7020 7021
    if (!priv->palette_texture)
        glGenTextures(1, &priv->palette_texture);

    GL_EXTCALL(glActiveTextureARB(GL_TEXTURE1));
    glBindTexture(GL_TEXTURE_1D, priv->palette_texture);

    glTexEnvi(GL_TEXTURE_ENV, GL_TEXTURE_ENV_MODE, GL_REPLACE);

    glTexParameteri(GL_TEXTURE_1D, GL_TEXTURE_MIN_FILTER, GL_NEAREST);
    /* Make sure we have discrete color levels. */
    glTexParameteri(GL_TEXTURE_1D, GL_TEXTURE_MAG_FILTER, GL_NEAREST);
    glTexParameteri(GL_TEXTURE_1D, GL_TEXTURE_WRAP_S, GL_CLAMP_TO_EDGE);
    /* Upload the palette */
7022
    /* TODO: avoid unneeded uploads in the future by adding some SFLAG_PALETTE_DIRTY mechanism */
7023 7024
    glTexImage1D(GL_TEXTURE_1D, 0, GL_RGBA, 256, 0, GL_RGBA, GL_UNSIGNED_BYTE, table);

7025 7026 7027 7028 7029 7030
    if (gl_info->supported[APPLE_CLIENT_STORAGE])
    {
        glPixelStorei(GL_UNPACK_CLIENT_STORAGE_APPLE, GL_TRUE);
        checkGLcall("glPixelStorei(GL_UNPACK_CLIENT_STORAGE_APPLE, GL_TRUE)");
    }

7031
    /* Switch back to unit 0 in which the 2D texture will be stored. */
7032
    context_active_texture(context, gl_info, 0);
7033 7034 7035
    LEAVE_GL();
}

7036
/* Context activation is done by the caller. */
7037 7038
static GLuint gen_yuv_shader(struct arbfp_blit_priv *priv, const struct wined3d_gl_info *gl_info,
        enum complex_fixup yuv_fixup, GLenum textype)
7039
{
7040
    GLenum shader;
7041
    struct wined3d_shader_buffer buffer;
7042
    char luminance_component;
7043
    GLint pos;
7044 7045

    /* Shader header */
7046 7047 7048 7049 7050
    if (!shader_buffer_init(&buffer))
    {
        ERR("Failed to initialize shader buffer.\n");
        return 0;
    }
7051

7052
    ENTER_GL();
7053 7054 7055 7056
    GL_EXTCALL(glGenProgramsARB(1, &shader));
    checkGLcall("GL_EXTCALL(glGenProgramsARB(1, &shader))");
    GL_EXTCALL(glBindProgramARB(GL_FRAGMENT_PROGRAM_ARB, shader));
    checkGLcall("glBindProgramARB(GL_FRAGMENT_PROGRAM_ARB, shader)");
7057
    LEAVE_GL();
7058
    if(!shader) {
7059
        shader_buffer_free(&buffer);
7060 7061
        return 0;
    }
7062 7063 7064 7065 7066 7067 7068 7069 7070 7071 7072

    /* The YUY2 and UYVY formats contain two pixels packed into a 32 bit macropixel,
     * giving effectively 16 bit per pixel. The color consists of a luminance(Y) and
     * two chroma(U and V) values. Each macropixel has two luminance values, one for
     * each single pixel it contains, and one U and one V value shared between both
     * pixels.
     *
     * The data is loaded into an A8L8 texture. With YUY2, the luminance component
     * contains the luminance and alpha the chroma. With UYVY it is vice versa. Thus
     * take the format into account when generating the read swizzles
     *
7073
     * Reading the Y value is straightforward - just sample the texture. The hardware
7074 7075 7076 7077 7078 7079 7080 7081
     * takes care of filtering in the horizontal and vertical direction.
     *
     * Reading the U and V values is harder. We have to avoid filtering horizontally,
     * because that would mix the U and V values of one pixel or two adjacent pixels.
     * Thus floor the texture coordinate and add 0.5 to get an unfiltered read,
     * regardless of the filtering setting. Vertical filtering works automatically
     * though - the U and V values of two rows are mixed nicely.
     *
7082
     * Apart of avoiding filtering issues, the code has to know which value it just
7083 7084 7085 7086 7087 7088 7089 7090 7091 7092 7093 7094 7095 7096 7097 7098 7099 7100 7101 7102 7103
     * read, and where it can find the other one. To determine this, it checks if
     * it sampled an even or odd pixel, and shifts the 2nd read accordingly.
     *
     * Handling horizontal filtering of U and V values requires reading a 2nd pair
     * of pixels, extracting U and V and mixing them. This is not implemented yet.
     *
     * An alternative implementation idea is to load the texture as A8R8G8B8 texture,
     * with width / 2. This way one read gives all 3 values, finding U and V is easy
     * in an unfiltered situation. Finding the luminance on the other hand requires
     * finding out if it is an odd or even pixel. The real drawback of this approach
     * is filtering. This would have to be emulated completely in the shader, reading
     * up two 2 packed pixels in up to 2 rows and interpolating both horizontally and
     * vertically. Beyond that it would require adjustments to the texture handling
     * code to deal with the width scaling
     */
    shader_addline(&buffer, "!!ARBfp1.0\n");
    shader_addline(&buffer, "TEMP luminance;\n");
    shader_addline(&buffer, "TEMP temp;\n");
    shader_addline(&buffer, "TEMP chroma;\n");
    shader_addline(&buffer, "TEMP texcrd;\n");
    shader_addline(&buffer, "TEMP texcrd2;\n");
7104
    shader_addline(&buffer, "PARAM coef = {1.0, 0.5, 2.0, 0.25};\n");
7105 7106 7107
    shader_addline(&buffer, "PARAM yuv_coef = {1.403, 0.344, 0.714, 1.770};\n");
    shader_addline(&buffer, "PARAM size = program.local[0];\n");

7108 7109
    switch (yuv_fixup)
    {
7110 7111
        case COMPLEX_FIXUP_UYVY:
        case COMPLEX_FIXUP_YUY2:
7112 7113
            if (!gen_planar_yuv_read(&buffer, yuv_fixup, textype, &luminance_component))
            {
7114
                shader_buffer_free(&buffer);
7115 7116 7117 7118
                return 0;
            }
            break;

7119
        case COMPLEX_FIXUP_YV12:
7120 7121
            if (!gen_yv12_read(&buffer, textype, &luminance_component))
            {
7122
                shader_buffer_free(&buffer);
7123 7124 7125 7126 7127 7128
                return 0;
            }
            break;

        default:
            FIXME("Unsupported YUV fixup %#x\n", yuv_fixup);
7129
            shader_buffer_free(&buffer);
7130
            return 0;
7131 7132 7133 7134 7135 7136
    }

    /* Calculate the final result. Formula is taken from
     * http://www.fourcc.org/fccyvrgb.php. Note that the chroma
     * ranges from -0.5 to 0.5
     */
7137
    shader_addline(&buffer, "SUB chroma.xy, chroma, coef.y;\n");
7138

7139 7140 7141 7142
    shader_addline(&buffer, "MAD result.color.x, chroma.x, yuv_coef.x, luminance.%c;\n", luminance_component);
    shader_addline(&buffer, "MAD temp.x, -chroma.y, yuv_coef.y, luminance.%c;\n", luminance_component);
    shader_addline(&buffer, "MAD result.color.y, -chroma.x, yuv_coef.z, temp.x;\n");
    shader_addline(&buffer, "MAD result.color.z, chroma.y, yuv_coef.w, luminance.%c;\n", luminance_component);
7143 7144
    shader_addline(&buffer, "END\n");

7145
    ENTER_GL();
7146 7147 7148
    GL_EXTCALL(glProgramStringARB(GL_FRAGMENT_PROGRAM_ARB, GL_PROGRAM_FORMAT_ASCII_ARB,
            strlen(buffer.buffer), buffer.buffer));
    checkGLcall("glProgramStringARB()");
7149

7150 7151 7152
    glGetIntegerv(GL_PROGRAM_ERROR_POSITION_ARB, &pos);
    if (pos != -1)
    {
7153
        FIXME("Fragment program error at position %d: %s\n\n", pos,
7154
              debugstr_a((const char *)glGetString(GL_PROGRAM_ERROR_STRING_ARB)));
7155
        shader_arb_dump_program_source(buffer.buffer);
7156
    }
7157 7158 7159 7160 7161 7162 7163 7164 7165
    else
    {
        GLint native;

        GL_EXTCALL(glGetProgramivARB(GL_FRAGMENT_PROGRAM_ARB, GL_PROGRAM_UNDER_NATIVE_LIMITS_ARB, &native));
        checkGLcall("glGetProgramivARB()");
        if (!native) WARN("Program exceeds native resource limits.\n");
    }

7166
    shader_buffer_free(&buffer);
7167
    LEAVE_GL();
7168

7169 7170
    switch (yuv_fixup)
    {
7171
        case COMPLEX_FIXUP_YUY2:
7172 7173 7174 7175
            if (textype == GL_TEXTURE_RECTANGLE_ARB) priv->yuy2_rect_shader = shader;
            else priv->yuy2_2d_shader = shader;
            break;

7176
        case COMPLEX_FIXUP_UYVY:
7177 7178 7179 7180
            if (textype == GL_TEXTURE_RECTANGLE_ARB) priv->uyvy_rect_shader = shader;
            else priv->uyvy_2d_shader = shader;
            break;

7181
        case COMPLEX_FIXUP_YV12:
7182 7183 7184
            if (textype == GL_TEXTURE_RECTANGLE_ARB) priv->yv12_rect_shader = shader;
            else priv->yv12_2d_shader = shader;
            break;
7185 7186
        default:
            ERR("Unsupported complex fixup: %d\n", yuv_fixup);
7187
    }
7188

7189 7190 7191
    return shader;
}

7192
/* Context activation is done by the caller. */
7193
static HRESULT arbfp_blit_set(void *blit_priv, struct wined3d_context *context, const struct wined3d_surface *surface)
7194
{
7195
    GLenum shader;
7196
    float size[4] = {(float) surface->pow2Width, (float) surface->pow2Height, 1.0f, 1.0f};
7197
    struct arbfp_blit_priv *priv = blit_priv;
7198
    enum complex_fixup fixup;
7199
    GLenum textype = surface->texture_target;
7200
    const struct wined3d_gl_info *gl_info = context->gl_info;
7201

7202 7203 7204 7205 7206 7207 7208 7209 7210
    if (surface->flags & SFLAG_CONVERTED)
    {
        ENTER_GL();
        glEnable(textype);
        checkGLcall("glEnable(textype)");
        LEAVE_GL();
        return WINED3D_OK;
    }

7211
    if (!is_complex_fixup(surface->resource.format->color_fixup))
7212 7213
    {
        TRACE("Fixup:\n");
7214
        dump_color_fixup_desc(surface->resource.format->color_fixup);
7215
        /* Don't bother setting up a shader for unconverted formats */
7216
        ENTER_GL();
7217 7218
        glEnable(textype);
        checkGLcall("glEnable(textype)");
7219
        LEAVE_GL();
7220 7221 7222
        return WINED3D_OK;
    }

7223
    fixup = get_complex_fixup(surface->resource.format->color_fixup);
7224

7225
    switch(fixup)
7226
    {
7227
        case COMPLEX_FIXUP_YUY2:
7228 7229 7230
            shader = textype == GL_TEXTURE_RECTANGLE_ARB ? priv->yuy2_rect_shader : priv->yuy2_2d_shader;
            break;

7231
        case COMPLEX_FIXUP_UYVY:
7232 7233 7234
            shader = textype == GL_TEXTURE_RECTANGLE_ARB ? priv->uyvy_rect_shader : priv->uyvy_2d_shader;
            break;

7235
        case COMPLEX_FIXUP_YV12:
7236 7237 7238
            shader = textype == GL_TEXTURE_RECTANGLE_ARB ? priv->yv12_rect_shader : priv->yv12_2d_shader;
            break;

7239 7240
        case COMPLEX_FIXUP_P8:
            shader = textype == GL_TEXTURE_RECTANGLE_ARB ? priv->p8_rect_shader : priv->p8_2d_shader;
7241
            if (!shader) shader = gen_p8_shader(priv, gl_info, textype);
7242

7243
            upload_palette(surface, context);
7244 7245
            break;

7246
        default:
7247
            FIXME("Unsupported complex fixup %#x, not setting a shader\n", fixup);
7248 7249 7250 7251 7252
            ENTER_GL();
            glEnable(textype);
            checkGLcall("glEnable(textype)");
            LEAVE_GL();
            return E_NOTIMPL;
7253 7254
    }

7255
    if (!shader) shader = gen_yuv_shader(priv, gl_info, fixup, textype);
7256

7257
    ENTER_GL();
7258 7259 7260 7261 7262 7263
    glEnable(GL_FRAGMENT_PROGRAM_ARB);
    checkGLcall("glEnable(GL_FRAGMENT_PROGRAM_ARB)");
    GL_EXTCALL(glBindProgramARB(GL_FRAGMENT_PROGRAM_ARB, shader));
    checkGLcall("glBindProgramARB(GL_FRAGMENT_PROGRAM_ARB, shader)");
    GL_EXTCALL(glProgramLocalParameter4fvARB(GL_FRAGMENT_PROGRAM_ARB, 0, size));
    checkGLcall("glProgramLocalParameter4fvARB");
7264
    LEAVE_GL();
7265 7266 7267 7268

    return WINED3D_OK;
}

7269
/* Context activation is done by the caller. */
7270 7271
static void arbfp_blit_unset(const struct wined3d_gl_info *gl_info)
{
7272
    ENTER_GL();
7273 7274 7275 7276
    glDisable(GL_FRAGMENT_PROGRAM_ARB);
    checkGLcall("glDisable(GL_FRAGMENT_PROGRAM_ARB)");
    glDisable(GL_TEXTURE_2D);
    checkGLcall("glDisable(GL_TEXTURE_2D)");
7277 7278
    if (gl_info->supported[ARB_TEXTURE_CUBE_MAP])
    {
7279 7280 7281
        glDisable(GL_TEXTURE_CUBE_MAP_ARB);
        checkGLcall("glDisable(GL_TEXTURE_CUBE_MAP_ARB)");
    }
7282 7283
    if (gl_info->supported[ARB_TEXTURE_RECTANGLE])
    {
7284 7285 7286
        glDisable(GL_TEXTURE_RECTANGLE_ARB);
        checkGLcall("glDisable(GL_TEXTURE_RECTANGLE_ARB)");
    }
7287
    LEAVE_GL();
7288 7289
}

7290
/* Tell whether the ARB fragment program blitter can perform a blit.
 *
 * Requires ARB_fragment_program, a colour blit, and neither surface in the
 * system memory pool. The destination format must have an identity fixup;
 * the source format must have either an identity fixup or one of the
 * complex fixups YUY2, UYVY, YV12 or P8. The rectangles and usage flags are
 * not inspected.
 *
 * Returns TRUE if the blit is supported, FALSE otherwise. */
static BOOL arbfp_blit_supported(const struct wined3d_gl_info *gl_info, enum wined3d_blit_op blit_op,
        const RECT *src_rect, DWORD src_usage, enum wined3d_pool src_pool, const struct wined3d_format *src_format,
        const RECT *dst_rect, DWORD dst_usage, enum wined3d_pool dst_pool, const struct wined3d_format *dst_format)
{
    enum complex_fixup src_fixup;
    BOOL supported;

    if (!gl_info->supported[ARB_FRAGMENT_PROGRAM])
    {
        return FALSE;
    }

    if (blit_op != WINED3D_BLIT_OP_COLOR_BLIT)
    {
        TRACE("Unsupported blit_op=%d\n", blit_op);
        return FALSE;
    }

    if (src_pool == WINED3D_POOL_SYSTEM_MEM || dst_pool == WINED3D_POOL_SYSTEM_MEM)
    {
        return FALSE;
    }

    src_fixup = get_complex_fixup(src_format->color_fixup);
    if (TRACE_ON(d3d_shader) && TRACE_ON(d3d))
    {
        TRACE("Checking support for fixup:\n");
        dump_color_fixup_desc(src_format->color_fixup);
    }

    if (!is_identity_fixup(dst_format->color_fixup))
    {
        TRACE("Destination fixups are not supported\n");
        return FALSE;
    }

    if (is_identity_fixup(src_format->color_fixup))
    {
        TRACE("[OK]\n");
        return TRUE;
    }

    /* Of the non-identity source fixups only complex ones are handled. */
    if (!is_complex_fixup(src_format->color_fixup))
    {
        TRACE("[FAILED]\n");
        return FALSE;
    }

    supported = src_fixup == COMPLEX_FIXUP_YUY2
            || src_fixup == COMPLEX_FIXUP_UYVY
            || src_fixup == COMPLEX_FIXUP_YV12
            || src_fixup == COMPLEX_FIXUP_P8;

    if (supported)
    {
        TRACE("[OK]\n");
        return TRUE;
    }

    FIXME("Unsupported YUV fixup %#x\n", src_fixup);
    TRACE("[FAILED]\n");
    return FALSE;
}

7350
/* Blit src_rect_in of src_surface to dst_rect_in of dst_surface by drawing a
 * textured quad with the ARB fragment program blitter state applied.
 *
 * The source is loaded into its texture first, a context for the destination
 * is acquired and released again, and the destination's draw binding
 * location is marked as modified afterwards. Always returns WINED3D_OK. */
HRESULT arbfp_blit_surface(struct wined3d_device *device, DWORD filter,
        struct wined3d_surface *src_surface, const RECT *src_rect_in,
        struct wined3d_surface *dst_surface, const RECT *dst_rect_in)
{
    RECT src_rect = *src_rect_in;
    RECT dst_rect = *dst_rect_in;
    struct wined3d_context *context;
    BOOL need_flush;

    /* Now load the surface */
    if (wined3d_settings.offscreen_rendering_mode == ORM_FBO
            || (src_surface->flags & (SFLAG_INTEXTURE | SFLAG_INDRAWABLE)) != SFLAG_INDRAWABLE)
    {
        surface_internal_preload(src_surface, SRGB_RGB);
    }
    else
    {
        /* Without FBO blits transferring from the drawable to the texture is
         * expensive, because we have to flip the data in sysmem. Since we can
         * flip in the blitter, we don't actually need that flip anyway. So we
         * use the surface's texture as scratch texture, and flip the source
         * rectangle instead. */
        surface_load_fb_texture(src_surface, FALSE);

        src_rect.top = src_surface->resource.height - src_rect.top;
        src_rect.bottom = src_surface->resource.height - src_rect.bottom;
    }

    /* Activate the destination context, set it up for blitting */
    context = context_acquire(device, dst_surface);
    context_apply_blit_state(context, device);

    if (!surface_is_offscreen(dst_surface))
    {
        surface_translate_drawable_coords(dst_surface, context->win_handle, &dst_rect);
    }

    arbfp_blit_set(device->blit_priv, context, src_surface);

    /* Draw a textured quad */
    ENTER_GL();
    draw_textured_quad(src_surface, context, &src_rect, &dst_rect, filter);
    LEAVE_GL();

    /* Leave the opengl state valid for blitting */
    arbfp_blit_unset(context->gl_info);

    /* Flush to ensure ordering across contexts. */
    need_flush = wined3d_settings.strict_draw_ordering
            || (dst_surface->container.type == WINED3D_CONTAINER_SWAPCHAIN
            && (dst_surface->container.u.swapchain->front_buffer == dst_surface));
    if (need_flush)
    {
        wglFlush();
    }

    context_release(context);

    surface_modify_location(dst_surface, dst_surface->draw_binding, TRUE);
    return WINED3D_OK;
}

7405
/* Do not call while under the GL lock. */
7406
static HRESULT arbfp_blit_color_fill(struct wined3d_device *device, struct wined3d_surface *dst_surface,
7407
        const RECT *dst_rect, const struct wined3d_color *color)
7408 7409 7410 7411 7412
{
    FIXME("Color filling not implemented by arbfp_blit\n");
    return WINED3DERR_INVALIDCALL;
}

7413
/* Do not call while under the GL lock. */
7414
static HRESULT arbfp_blit_depth_fill(struct wined3d_device *device,
7415
        struct wined3d_surface *surface, const RECT *rect, float depth)
7416 7417 7418 7419 7420
{
    FIXME("Depth filling not implemented by arbfp_blit.\n");
    return WINED3DERR_INVALIDCALL;
}

7421 7422 7423 7424 7425
const struct blit_shader arbfp_blit = {
    arbfp_blit_alloc,
    arbfp_blit_free,
    arbfp_blit_set,
    arbfp_blit_unset,
7426
    arbfp_blit_supported,
7427 7428
    arbfp_blit_color_fill,
    arbfp_blit_depth_fill,
7429
};