vcomp: Implement C2VectParallel().

Used by Ancient Cities. Signed-off-by: Paul Gofman <pgofman@codeweavers.com> Signed-off-by: Alexandre Julliard <julliard@winehq.org>

vcomp: Implement C2VectParallel().
ab89486e · Paul Gofman · Alexandre Julliard · 5366f67e · ab89486e · ab89486e
Commit ab89486e authored Feb 12, 2021 by Paul Gofman Committed by Alexandre Julliard Feb 15, 2021
Showing with 161 additions and 3 deletions

main.c dlls/vcomp/main.c +158 -0

vcomp110.spec dlls/vcomp110/vcomp110.spec +1 -1

vcomp120.spec dlls/vcomp120/vcomp120.spec +1 -1

vcomp140.spec dlls/vcomp140/vcomp140.spec +1 -1

No files found.
--- a/dlls/vcomp/main.c
+++ b/dlls/vcomp/main.c
@@ -33,6 +33,8 @@
 WINE_DEFAULT_DEBUG_CHANNEL(vcomp);
+#define MAX_VECT_PARALLEL_CALLBACK_ARGS 128
 typedef CRITICAL_SECTION *omp_lock_t;
 typedef CRITICAL_SECTION *omp_nest_lock_t;
@@ -122,6 +124,14 @@ static void **ptr_from_va_list(__ms_va_list valist)
    return *(void ***)&valist;
 }
+static void copy_va_list_data(void **args, __ms_va_list valist, int args_count)
+{
+    unsigned int i;
+    for (i = 0; i < args_count; ++i)
+        args[i] = va_arg(valist, void *);
+}
 #if defined(__i386__)
 extern void CDECL _vcomp_fork_call_wrapper(void *wrapper, int nargs, void **args);
@@ -1665,6 +1675,154 @@ void CDECL _vcomp_leave_critsect(CRITICAL_SECTION *critsect)
    LeaveCriticalSection(critsect);
 }
+static unsigned int get_step_count(int start, int end, int range_offset, int step)
+{
+    int range = end - start + step - range_offset;
+    if (step < 0)
+        return (unsigned)-range / -step;
+    else
+        return (unsigned)range / step;
+}
+static void CDECL c2vectparallel_wrapper(int start, int end, int step, int end_included, BOOL dynamic_distribution,
+        int volatile *dynamic_start, void *function, int nargs, __ms_va_list valist)
+{
+    void *wrapper_args[MAX_VECT_PARALLEL_CALLBACK_ARGS];
+    unsigned int step_count, steps_per_call, remainder;
+    int thread_count = omp_get_num_threads();
+    int curr_start, curr_end, range_offset;
+    int thread = _vcomp_get_thread_num();
+    int step_sign;
+    copy_va_list_data(&wrapper_args[2], valist, nargs - 2);
+    step_sign = step > 0 ? 1 : -1;
+    range_offset = step_sign * !end_included;
+    if (dynamic_distribution)
+    {
+        int next_start, new_start, end_value;
+        start = *dynamic_start;
+        end_value = end + !!end_included * step;
+        while (start != end_value)
+        {
+            step_count = get_step_count(start, end, range_offset, step);
+            curr_end = start + (step_count + thread_count - 1) / thread_count * step
+                    + range_offset;
+            if ((curr_end - end) * step_sign > 0)
+            {
+                next_start = end_value;
+                curr_end = end;
+            }
+            else
+            {
+                next_start = curr_end - range_offset;
+                curr_end -= step;
+            }
+            if ((new_start = InterlockedCompareExchange(dynamic_start, next_start, start)) != start)
+            {
+                start = new_start;
+                continue;
+            }
+            wrapper_args[0] = (void *)(ULONG_PTR)start;
+            wrapper_args[1] = (void *)(ULONG_PTR)curr_end;
+            _vcomp_fork_call_wrapper(function, nargs, wrapper_args);
+            start = *dynamic_start;
+        }
+        return;
+    }
+    step_count = get_step_count(start, end, range_offset, step);
+    /* According to the tests native vcomp still makes extra calls
+     * with empty range from excessive threads under certain conditions
+     * for unclear reason. */
+    if (thread >= step_count && (end_included || (step != 1 && step != -1)))
+        return;
+    steps_per_call = step_count / thread_count;
+    remainder = step_count % thread_count;
+    if (thread < remainder)
+    {
+        curr_start = thread * (steps_per_call + 1);
+        curr_end = curr_start + steps_per_call + 1;
+    }
+    else if (thread < step_count)
+    {
+        curr_start = remainder + steps_per_call * thread;
+        curr_end = curr_start + steps_per_call;
+    }
+    else
+    {
+        curr_start = curr_end = 0;
+    }
+    curr_start = start + curr_start * step;
+    curr_end = start + (curr_end - 1) * step + range_offset;
+    wrapper_args[0] = (void *)(ULONG_PTR)curr_start;
+    wrapper_args[1] = (void *)(ULONG_PTR)curr_end;
+    _vcomp_fork_call_wrapper(function, nargs, wrapper_args);
+}
+void CDECL C2VectParallel(int start, int end, int step, BOOL end_included, int thread_count,
+        BOOL dynamic_distribution, void *function, int nargs, ...)
+{
+    struct vcomp_thread_data *thread_data;
+    int volatile dynamic_start;
+    int prev_thread_count;
+    __ms_va_list valist;
+    TRACE("start %d, end %d, step %d, end_included %d, thread_count %d, dynamic_distribution %#x,"
+            " function %p, nargs %d.\n", start, end, step, end_included, thread_count,
+            dynamic_distribution, function, nargs);
+    if (nargs > MAX_VECT_PARALLEL_CALLBACK_ARGS)
+    {
+        FIXME("Number of arguments %u exceeds supported maximum %u"
+                " (not calling the loop code, expect problems).\n",
+                nargs, MAX_VECT_PARALLEL_CALLBACK_ARGS);
+        return;
+    }
+    __ms_va_start(valist, nargs);
+    /* This expression can result in integer overflow. According to the tests,
+     * native vcomp runs the function as a single thread both for empty range
+     * and (end - start) not fitting the integer range. */
+    if ((step > 0 && end < start) || (step < 0 && end > start)
+            || (end - start) / step < 2 || thread_count < 0)
+    {
+        void *wrapper_args[MAX_VECT_PARALLEL_CALLBACK_ARGS];
+        wrapper_args[0] = (void *)(ULONG_PTR)start;
+        wrapper_args[1] = (void *)(ULONG_PTR)end;
+        copy_va_list_data(&wrapper_args[2], valist, nargs - 2);
+        _vcomp_fork_call_wrapper(function, nargs, wrapper_args);
+        __ms_va_end(valist);
+        return;
+    }
+    thread_data = vcomp_init_thread_data();
+    prev_thread_count = thread_data->fork_threads;
+    thread_data->fork_threads = thread_count;
+    dynamic_start = start;
+    _vcomp_fork(TRUE, 9, c2vectparallel_wrapper, start, end, step, end_included, dynamic_distribution,
+            &dynamic_start, function, nargs, valist);
+    thread_data->fork_threads = prev_thread_count;
+    __ms_va_end(valist);
+}
 BOOL WINAPI DllMain(HINSTANCE instance, DWORD reason, LPVOID reserved)
 {
    TRACE("(%p, %d, %p)\n", instance, reason, reserved);

--- a/dlls/vcomp110/vcomp110.spec
+++ b/dlls/vcomp110/vcomp110.spec
-@ stub C2VectParallel
+@ varargs C2VectParallel(long long long long long long ptr long)
 @ cdecl _vcomp_atomic_add_i1(ptr long)
 @ cdecl _vcomp_atomic_add_i2(ptr long)
 @ cdecl _vcomp_atomic_add_i4(ptr long)

--- a/dlls/vcomp120/vcomp120.spec
+++ b/dlls/vcomp120/vcomp120.spec
-@ stub C2VectParallel
+@ varargs C2VectParallel(long long long long long long ptr long)
 @ cdecl _vcomp_atomic_add_i1(ptr long)
 @ cdecl _vcomp_atomic_add_i2(ptr long)
 @ cdecl _vcomp_atomic_add_i4(ptr long)

--- a/dlls/vcomp140/vcomp140.spec
+++ b/dlls/vcomp140/vcomp140.spec
-@ stub C2VectParallel
+@ varargs C2VectParallel(long long long long long long ptr long)
 @ cdecl _vcomp_atomic_add_i1(ptr long)
 @ cdecl _vcomp_atomic_add_i2(ptr long)
 @ cdecl _vcomp_atomic_add_i4(ptr long)