diff options
author | Android Build Coastguard Worker <android-build-coastguard-worker@google.com> | 2023-07-07 05:13:25 +0000 |
---|---|---|
committer | Android Build Coastguard Worker <android-build-coastguard-worker@google.com> | 2023-07-07 05:13:25 +0000 |
commit | 31557ec760365acee4625de6e25b57d5d382b3bf (patch) | |
tree | 10d072d3cc9f927a72050c5420635bde93dfa25a | |
parent | fe9f7930146e6ad2645522d5c90d218a616adc3f (diff) | |
parent | 6b9ff13286194c7a2b38d624eeee38dc35987dc8 (diff) | |
download | OpenCL-CTS-android14-mainline-uwb-release.tar.gz |
Snap for 10453563 from 6b9ff13286194c7a2b38d624eeee38dc35987dc8 to mainline-uwb-releaseaml_uwb_341513070aml_uwb_341511050aml_uwb_341310300aml_uwb_341310030aml_uwb_341111010aml_uwb_341011000android14-mainline-uwb-release
Change-Id: Ic2583ca37fe8118671784cb9f028d161e35bbf09
346 files changed, 35047 insertions, 21226 deletions
diff --git a/.appveyor.yml b/.appveyor.yml deleted file mode 100644 index ea010778..00000000 --- a/.appveyor.yml +++ /dev/null @@ -1,54 +0,0 @@ -os: - - Visual Studio 2017 - -shallow_clone: true - -platform: - - Win32 - - x64 - -configuration: - - Release - -environment: - matrix: - - SETARCH: i686 - - SETARCH: x86_64 - -matrix: - exclude: - - platform: Win32 - SETARCH: x86_64 - - platform: x64 - SETARCH: i686 - -before_build: - # Setup environment: - - ps: $env:TOP = $env:APPVEYOR_BUILD_FOLDER - - ps: $env:TOP - - echo %TOP% - # Get the OpenCL Headers: - - git clone --depth=1 https://github.com/KhronosGroup/OpenCL-Headers OpenCL-Headers - # Get and build the OpenCL ICD Loader: - - git clone --depth=1 https://github.com/KhronosGroup/OpenCL-ICD-Loader.git - - ps: cd OpenCL-ICD-Loader - - ps: mkdir build - - ps: cd build - - cmake -A%PLATFORM% -DENABLE_OPENCL30_PROVISIONAL=1 -DOPENCL_ICD_LOADER_HEADERS_DIR=%TOP%/OpenCL-Headers/ .. - - cmake --build . --config %CONFIGURATION% - - ps: cd $env:TOP - # Get the libclcxx standard library: - - git clone --depth=1 https://github.com/KhronosGroup/libclcxx.git libclcxx - # Generate the CTS solution file: - - cmake -DCL_INCLUDE_DIR=%TOP%/OpenCL-Headers - -DCL_LIB_DIR=%TOP%/OpenCL-ICD-Loader/build - -DCL_LIBCLCXX_DIR=%TOP%/libclcxx - -DCMAKE_RUNTIME_OUTPUT_DIRECTORY=./bin - -DOPENCL_LIBRARIES="OpenCL" - -H. 
-Bbuild_win -A%PLATFORM% - -DD3D10_IS_SUPPORTED=ON -DD3D11_IS_SUPPORTED=ON -DARCH=%SETARCH% - -build: - project: build_win\CLConform.sln - parallel: true - verbosity: normal diff --git a/.github/workflows/presubmit.yml b/.github/workflows/presubmit.yml index 0c1778eb..1dfdb963 100644 --- a/.github/workflows/presubmit.yml +++ b/.github/workflows/presubmit.yml @@ -3,37 +3,62 @@ on: [push, pull_request] jobs: build: - name: Build ${{ matrix.os }} ${{ matrix.name }} + name: Build ${{ matrix.os }} ${{ matrix.arch }}${{ matrix.extra }} runs-on: ${{ matrix.os }} env: JOB_ARCHITECTURE: ${{ matrix.arch }} JOB_ENABLE_GL: ${{ matrix.gl }} + JOB_ENABLE_DEBUG: ${{ matrix.debug }} strategy: + fail-fast: false matrix: mainmatrix: [true] - os: [ubuntu-20.04, macos-11.0] + os: [ubuntu-20.04, macos-latest, windows-latest] include: - os: ubuntu-20.04 mainmatrix: true gl: 1 + extra: " gl" - os: ubuntu-20.04 mainmatrix: false - name: Arm arch: arm - os: ubuntu-20.04 mainmatrix: false - name: AArch64 arch: aarch64 + debug: 1 + extra: " debug" steps: - uses: actions/checkout@v2 + - name: Setup Ninja + uses: seanmiddleditch/gha-setup-ninja@master + - name: Setup OpenGL build dependencies + if: ${{ matrix.gl }} + run: | + sudo apt-get update + sudo apt-get -y install libglu1-mesa-dev freeglut3-dev mesa-common-dev libglew-dev + - name: Setup MSVC with Ninja + uses: ilammy/msvc-dev-cmd@v1 + - name: Setup ccache + uses: hendrikmuhs/ccache-action@v1.2 + with: + variant: sccache + key: ${{ matrix.os }}-${{ matrix.arch }} + - name: Fetch OpenCL Headers + shell: bash + run: | + git clone https://github.com/KhronosGroup/OpenCL-Headers.git + cd OpenCL-Headers + ln -s CL OpenCL # For OSX builds + cd .. 
- name: Build + shell: bash run: ./presubmit.sh formatcheck: name: Check code format runs-on: ubuntu-20.04 steps: - name: Install packages - run: sudo apt install -y clang-format + run: sudo apt install -y clang-format clang-format-9 - uses: actions/checkout@v2 with: fetch-depth: 0 @@ -1,24 +1,7 @@ -// *** THIS PACKAGE HAS SPECIAL LICENSING CONDITIONS. PLEASE -// CONSULT THE OWNERS AND opensource-licensing@google.com BEFORE -// DEPENDING ON IT IN YOUR PROJECT. *** package { default_applicable_licenses: ["external_OpenCL-CTS_license"], } -// Added automatically by a large-scale-change that took the approach of -// 'apply every license found to every target'. While this makes sure we respect -// every license restriction, it may not be entirely correct. -// -// e.g. GPL in an MIT project might only apply to the contrib/ directory. -// -// Please consider splitting the single license below into multiple licenses, -// taking care not to lose any license_kind information, and overriding the -// default license using the 'licenses: [...]' property on targets as needed. -// -// For unused files, consider creating a 'fileGroup' with "//visibility:private" -// to attach the license to, and including a comment whether the files may be -// used in the current project. 
-// See: http://go/android-license-faq license { name: "external_OpenCL-CTS_license", visibility: [":__subpackages__"], @@ -27,9 +10,6 @@ license { "SPDX-license-identifier-BSD", "SPDX-license-identifier-MIT", "SPDX-license-identifier-Unlicense", - "legacy_by_exception_only", // by exception only - "legacy_proprietary", // by exception only - "legacy_unencumbered", ], license_text: [ "LICENSE.txt", @@ -40,8 +20,8 @@ cc_library_headers { name: "ocl-harness-headers", export_include_dirs: [ "test_common/harness", - "test_common" - ] + "test_common", + ], } cc_defaults { @@ -56,54 +36,36 @@ cc_defaults { "-DCL_EXPERIMENTAL", "-DCL_TARGET_OPENCL_VERSION=300", "-Wno-#warnings", - "-Wno-absolute-value", - "-Wno-asm-operand-widths", "-Wno-c++11-narrowing", - "-Wno-dangling-else", "-Wno-date-time", "-Wno-deprecated-declarations", "-Wno-format", - "-Wno-ignored-pragmas", "-Wno-ignored-qualifiers", "-Wno-implicit-fallthrough", - "-Wno-logical-op-parentheses", - "-Wno-macro-redefined", "-Wno-missing-braces", - "-Wno-missing-declarations", "-Wno-missing-field-initializers", "-Wno-non-virtual-dtor", "-Wno-overloaded-virtual", - "-Wno-parentheses", - "-Wno-parentheses-equality", "-Wno-reorder-ctor", - "-Wno-return-stack-address", - "-Wno-shift-negative-value", "-Wno-sometimes-uninitialized", - "-Wno-switch", - "-Wno-unknown-pragmas", - "-Wno-unneeded-internal-declaration", - "-Wno-unused-function", - "-Wno-unused-label", "-Wno-unused-parameter", - "-Wno-unused-variable", - "-Wno-writable-strings", "-fexceptions", ], static_libs: [ - "ocl-stubs" + "ocl-stubs", ], } cc_library { name: "ocl-harness", - srcs: [ "test_common/harness/*.cpp", ], - defaults: [ "ocl-harness-defaults" ], + srcs: ["test_common/harness/*.cpp"], + defaults: ["ocl-harness-defaults"], } cc_defaults { name: "ocl-test-defaults", - defaults: [ "ocl-harness-defaults" ], - static_libs: [ "ocl-harness" ], + defaults: ["ocl-harness-defaults"], + static_libs: ["ocl-harness"], compile_multilib: "64", multilib: { lib64: 
{ @@ -114,398 +76,366 @@ cc_defaults { cc_defaults { name: "ocl-test-image-defaults", - srcs: [ "test_conformance/images/common.cpp" ], - export_include_dirs: [ "test_conformance/images" ], - defaults: [ "ocl-test-defaults" ], + srcs: ["test_conformance/images/common.cpp"], + export_include_dirs: ["test_conformance/images"], + defaults: ["ocl-test-defaults"], } - cc_test { name: "ocl-test-allocations", - srcs: [ "test_conformance/allocations/*.cpp" ], - defaults: [ "ocl-test-defaults" ], + srcs: ["test_conformance/allocations/*.cpp"], + defaults: ["ocl-test-defaults"], rtti: false, - gtest: false + gtest: false, } - cc_test { name: "ocl-test-api", - srcs: [ "test_conformance/api/*.cpp" ], - defaults: [ "ocl-test-defaults" ], + srcs: ["test_conformance/api/*.cpp"], + defaults: ["ocl-test-defaults"], rtti: false, - gtest: false + gtest: false, } - cc_test { name: "ocl-test-atomics", - srcs: [ "test_conformance/atomics/*.cpp" ], - defaults: [ "ocl-test-defaults" ], + srcs: ["test_conformance/atomics/*.cpp"], + defaults: ["ocl-test-defaults"], rtti: false, - gtest: false + gtest: false, } - cc_test { name: "ocl-test-basic", - srcs: [ "test_conformance/basic/*.cpp" ], - defaults: [ "ocl-test-defaults" ], + srcs: ["test_conformance/basic/*.cpp"], + defaults: ["ocl-test-defaults"], rtti: false, - gtest: false + gtest: false, } - cc_test { name: "ocl-test-buffers", - srcs: [ "test_conformance/buffers/*.cpp" ], - defaults: [ "ocl-test-defaults" ], + srcs: ["test_conformance/buffers/*.cpp"], + defaults: ["ocl-test-defaults"], rtti: false, - gtest: false + gtest: false, } - cc_test { name: "ocl-test-c11-atomics", - srcs: [ "test_conformance/c11_atomics/*.cpp" ], - defaults: [ "ocl-test-defaults" ], + srcs: ["test_conformance/c11_atomics/*.cpp"], + defaults: ["ocl-test-defaults"], rtti: false, - gtest: false + gtest: false, } - cc_test { name: "ocl-test-commonfns", - srcs: [ "test_conformance/commonfns/*.cpp" ], - defaults: [ "ocl-test-defaults" ], + srcs: 
["test_conformance/commonfns/*.cpp"], + defaults: ["ocl-test-defaults"], rtti: false, - gtest: false + gtest: false, } - cc_test { name: "ocl-test-compiler", - srcs: [ "test_conformance/compiler/*.cpp" ], - data: [ "test_conformance/compiler/includeTestDirectory/testIncludeFile.h", "test_conformance/compiler/secondIncludeTestDirectory/testIncludeFile.h" ], - defaults: [ "ocl-test-defaults" ], + srcs: ["test_conformance/compiler/*.cpp"], + data: [ + "test_conformance/compiler/includeTestDirectory/testIncludeFile.h", + "test_conformance/compiler/secondIncludeTestDirectory/testIncludeFile.h", + ], + defaults: ["ocl-test-defaults"], rtti: false, - gtest: false + gtest: false, } - cc_test { name: "ocl-test-computeinfo", - srcs: [ "test_conformance/computeinfo/*.cpp" ], - defaults: [ "ocl-test-defaults" ], + srcs: ["test_conformance/computeinfo/*.cpp"], + defaults: ["ocl-test-defaults"], rtti: false, - gtest: false + gtest: false, } - cc_test { name: "ocl-test-contractions", - srcs: [ "test_conformance/contractions/*.cpp" ], - defaults: [ "ocl-test-defaults" ], + srcs: ["test_conformance/contractions/*.cpp"], + defaults: ["ocl-test-defaults"], rtti: false, - gtest: false + gtest: false, } - cc_test { name: "ocl-test-conversions", - srcs: [ "test_conformance/conversions/*.cpp" ], - defaults: [ "ocl-test-defaults" ], + srcs: ["test_conformance/conversions/*.cpp"], + defaults: ["ocl-test-defaults"], rtti: false, - gtest: false + gtest: false, } - cc_test { name: "ocl-test-device-execution", - srcs: [ "test_conformance/device_execution/*.cpp" ], - defaults: [ "ocl-test-defaults" ], + srcs: ["test_conformance/device_execution/*.cpp"], + defaults: ["ocl-test-defaults"], rtti: false, - gtest: false + gtest: false, } - cc_test { name: "ocl-test-device-partition", - srcs: [ "test_conformance/device_partition/*.cpp" ], - defaults: [ "ocl-test-defaults" ], + srcs: ["test_conformance/device_partition/*.cpp"], + defaults: ["ocl-test-defaults"], rtti: false, - gtest: false + gtest: 
false, } - cc_test { name: "ocl-test-device-timer", - srcs: [ "test_conformance/device_timer/*.cpp" ], - defaults: [ "ocl-test-defaults" ], + srcs: ["test_conformance/device_timer/*.cpp"], + defaults: ["ocl-test-defaults"], rtti: false, - gtest: false + gtest: false, } - cc_test { name: "ocl-test-events", - srcs: [ "test_conformance/events/*.cpp" ], - defaults: [ "ocl-test-defaults" ], + srcs: ["test_conformance/events/*.cpp"], + defaults: ["ocl-test-defaults"], rtti: false, - gtest: false + gtest: false, } - cc_test { name: "ocl-test-generic-address-space", - srcs: [ "test_conformance/generic_address_space/*.cpp" ], - defaults: [ "ocl-test-defaults" ], + srcs: ["test_conformance/generic_address_space/*.cpp"], + defaults: ["ocl-test-defaults"], rtti: false, - gtest: false + gtest: false, } - cc_test { name: "ocl-test-geometrics", - srcs: [ "test_conformance/geometrics/*.cpp" ], - defaults: [ "ocl-test-defaults" ], + srcs: ["test_conformance/geometrics/*.cpp"], + defaults: ["ocl-test-defaults"], rtti: false, - gtest: false + gtest: false, } - cc_test { name: "ocl-test-half", - srcs: [ "test_conformance/half/*.cpp" ], - defaults: [ "ocl-test-defaults" ], + srcs: ["test_conformance/half/*.cpp"], + defaults: ["ocl-test-defaults"], rtti: false, - gtest: false + gtest: false, } - cc_test { name: "ocl-test-integer-ops", - srcs: [ "test_conformance/integer_ops/*.cpp" ], - defaults: [ "ocl-test-defaults" ], + srcs: ["test_conformance/integer_ops/*.cpp"], + defaults: ["ocl-test-defaults"], rtti: false, - gtest: false + gtest: false, } - cc_test { name: "ocl-test-math-brute-force", - srcs: [ "test_conformance/math_brute_force/*.cpp" ], - defaults: [ "ocl-test-defaults" ], + srcs: ["test_conformance/math_brute_force/*.cpp"], + defaults: ["ocl-test-defaults"], rtti: false, - gtest: false + gtest: false, } - cc_test { name: "ocl-test-mem-host-flags", - srcs: [ "test_conformance/mem_host_flags/*.cpp" ], - defaults: [ "ocl-test-defaults" ], + srcs: 
["test_conformance/mem_host_flags/*.cpp"], + defaults: ["ocl-test-defaults"], rtti: false, - gtest: false + gtest: false, } - cc_test { name: "ocl-test-multiple-device-context", - srcs: [ "test_conformance/multiple_device_context/*.cpp" ], - defaults: [ "ocl-test-defaults" ], + srcs: ["test_conformance/multiple_device_context/*.cpp"], + defaults: ["ocl-test-defaults"], rtti: false, - gtest: false + gtest: false, } - cc_test { name: "ocl-test-non-uniform-work-group", - srcs: [ "test_conformance/non_uniform_work_group/*.cpp" ], - defaults: [ "ocl-test-defaults" ], + srcs: ["test_conformance/non_uniform_work_group/*.cpp"], + defaults: ["ocl-test-defaults"], rtti: false, - gtest: false + gtest: false, } - cc_test { name: "ocl-test-pipes", - srcs: [ "test_conformance/pipes/*.cpp" ], - defaults: [ "ocl-test-defaults" ], + srcs: ["test_conformance/pipes/*.cpp"], + defaults: ["ocl-test-defaults"], rtti: false, - gtest: false + gtest: false, } - cc_test { name: "ocl-test-printf", - srcs: [ "test_conformance/printf/*.cpp" ], - defaults: [ "ocl-test-defaults" ], + srcs: ["test_conformance/printf/*.cpp"], + defaults: ["ocl-test-defaults"], rtti: false, - gtest: false + gtest: false, } - cc_test { name: "ocl-test-profiling", - srcs: [ "test_conformance/profiling/*.cpp" ], - defaults: [ "ocl-test-defaults" ], + srcs: ["test_conformance/profiling/*.cpp"], + defaults: ["ocl-test-defaults"], rtti: false, - gtest: false + gtest: false, } - cc_test { name: "ocl-test-relationals", - srcs: [ "test_conformance/relationals/*.cpp" ], - defaults: [ "ocl-test-defaults" ], + srcs: ["test_conformance/relationals/*.cpp"], + defaults: ["ocl-test-defaults"], rtti: false, - gtest: false + gtest: false, } - cc_test { name: "ocl-test-select", - srcs: [ "test_conformance/select/*.cpp" ], - defaults: [ "ocl-test-defaults" ], + srcs: ["test_conformance/select/*.cpp"], + defaults: ["ocl-test-defaults"], rtti: false, - gtest: false + gtest: false, } - cc_test { name: "ocl-test-spir", - srcs: [ 
"test_conformance/spir/*.cpp", "test_conformance/math_brute_force/function_list.cpp", "test_common/miniz/miniz.c" ], - data: [ "test_conformance/spir/*.zip" ], - cflags: [ "-DFUNCTION_LIST_ULPS_ONLY", "-Wno-unused-private-field" ], - defaults: [ "ocl-test-defaults" ], + srcs: [ + "test_conformance/spir/*.cpp", + "test_conformance/math_brute_force/function_list.cpp", + "test_common/miniz/miniz.c", + ], + data: ["test_conformance/spir/*.zip"], + cflags: [ + "-DFUNCTION_LIST_ULPS_ONLY", + "-Wno-unused-private-field", + ], + defaults: ["ocl-test-defaults"], rtti: true, - gtest: false + gtest: false, } - cc_test { name: "ocl-test-spirv-new", - srcs: [ "test_conformance/spirv_new/*.cpp", "test_conformance/math_brute_force/reference_math.cpp", "test_conformance/math_brute_force/utility.cpp" ], - data: [ "test_conformance/spirv_new/spirv_asm/*", "test_conformance/spirv_new/spirv_bin/*" ], - defaults: [ "ocl-test-defaults" ], + srcs: [ + "test_conformance/spirv_new/*.cpp", + "test_conformance/math_brute_force/reference_math.cpp", + "test_conformance/math_brute_force/utility.cpp", + ], + data: [ + "test_conformance/spirv_new/spirv_asm/*", + "test_conformance/spirv_new/spirv_bin/*", + ], + defaults: ["ocl-test-defaults"], rtti: false, - gtest: false + gtest: false, } - cc_test { name: "ocl-test-subgroups", - srcs: [ "test_conformance/subgroups/*.cpp" ], - defaults: [ "ocl-test-defaults" ], + srcs: ["test_conformance/subgroups/*.cpp"], + defaults: ["ocl-test-defaults"], rtti: false, - gtest: false + gtest: false, } - cc_test { name: "ocl-test-svm", - srcs: [ "test_conformance/SVM/*.cpp" ], - defaults: [ "ocl-test-defaults" ], + srcs: ["test_conformance/SVM/*.cpp"], + defaults: ["ocl-test-defaults"], rtti: false, - gtest: false + gtest: false, } - cc_test { name: "ocl-test-thread-dimensions", - srcs: [ "test_conformance/thread_dimensions/*.cpp" ], - defaults: [ "ocl-test-defaults" ], + srcs: ["test_conformance/thread_dimensions/*.cpp"], + defaults: ["ocl-test-defaults"], rtti: 
false, - gtest: false + gtest: false, } - cc_test { name: "ocl-test-vectors", - srcs: [ "test_conformance/vectors/*.cpp" ], - defaults: [ "ocl-test-defaults" ], + srcs: ["test_conformance/vectors/*.cpp"], + defaults: ["ocl-test-defaults"], rtti: false, - gtest: false + gtest: false, } - cc_test { name: "ocl-test-image-clcopyimage", - srcs: [ "test_conformance/images/clCopyImage/*.cpp" ], - defaults: [ "ocl-test-image-defaults" ], + srcs: ["test_conformance/images/clCopyImage/*.cpp"], + defaults: ["ocl-test-image-defaults"], rtti: false, - gtest: false + gtest: false, } - cc_test { name: "ocl-test-image-clfillimage", - srcs: [ "test_conformance/images/clFillImage/*.cpp" ], - defaults: [ "ocl-test-image-defaults" ], + srcs: ["test_conformance/images/clFillImage/*.cpp"], + defaults: ["ocl-test-image-defaults"], rtti: false, - gtest: false + gtest: false, } - cc_test { name: "ocl-test-image-clgetinfo", - srcs: [ "test_conformance/images/clGetInfo/*.cpp" ], - defaults: [ "ocl-test-image-defaults" ], + srcs: ["test_conformance/images/clGetInfo/*.cpp"], + defaults: ["ocl-test-image-defaults"], rtti: false, - gtest: false + gtest: false, } - cc_test { name: "ocl-test-image-clreadwriteimage", - srcs: [ "test_conformance/images/clReadWriteImage/*.cpp" ], - defaults: [ "ocl-test-image-defaults" ], + srcs: ["test_conformance/images/clReadWriteImage/*.cpp"], + defaults: ["ocl-test-image-defaults"], rtti: false, - gtest: false + gtest: false, } - cc_test { name: "ocl-test-image-kernel-image-methods", - srcs: [ "test_conformance/images/kernel_image_methods/*.cpp" ], - defaults: [ "ocl-test-image-defaults" ], + srcs: ["test_conformance/images/kernel_image_methods/*.cpp"], + defaults: ["ocl-test-image-defaults"], rtti: false, - gtest: false + gtest: false, } - cc_test { name: "ocl-test-image-kernel-read-write", - srcs: [ "test_conformance/images/kernel_read_write/*.cpp" ], - defaults: [ "ocl-test-image-defaults" ], + srcs: ["test_conformance/images/kernel_read_write/*.cpp"], + 
defaults: ["ocl-test-image-defaults"], rtti: false, - gtest: false + gtest: false, } - cc_test { name: "ocl-test-image-samplerlessreads", - srcs: [ "test_conformance/images/samplerlessReads/*.cpp" ], - defaults: [ "ocl-test-image-defaults" ], + srcs: ["test_conformance/images/samplerlessReads/*.cpp"], + defaults: ["ocl-test-image-defaults"], rtti: false, - gtest: false + gtest: false, } python_test_host { name: "opencl_cts", main: "scripts/test_opencl_cts.py", - srcs: [ "scripts/test_opencl_cts.py" ], - data: [ "scripts/test_opencl_cts.xml" ], + srcs: ["scripts/test_opencl_cts.py"], + data: ["scripts/test_opencl_cts.xml"], test_config: "scripts/test_opencl_cts.xml", - version: { - py2: { - enabled: false, - }, - py3: { - enabled: true - } - }, test_options: { unit_test: false, }, @@ -514,15 +444,5 @@ python_test_host { python_test { name: "run_conformance", main: "test_conformance/run_conformance.py", - srcs: [ "test_conformance/run_conformance.py" ], - version: { - py2: { - enabled: true, - embedded_launcher: true, - }, - py3: { - enabled: false, - } - }, + srcs: ["test_conformance/run_conformance.py"], } - diff --git a/CMakeLists.txt b/CMakeLists.txt index 083ea96d..6a25d5b5 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -10,12 +10,6 @@ set(CMAKE_C_STANDARD_REQUIRED ON) set(CMAKE_CXX_STANDARD 11) set(CMAKE_CXX_STANDARD_REQUIRED ON) -if(CMAKE_BUILD_TYPE STREQUAL "release") - set (BUILD_FLAVOR "release") -else(CMAKE_BUILD_TYPE STREQUAL "release") - set (BUILD_FLAVOR "debug") -endif(CMAKE_BUILD_TYPE STREQUAL "release") - add_definitions(-DCL_TARGET_OPENCL_VERSION=300) add_definitions(-DCL_USE_DEPRECATED_OPENCL_2_2_APIS=1) add_definitions(-DCL_USE_DEPRECATED_OPENCL_2_1_APIS=1) @@ -29,14 +23,6 @@ if(USE_CL_EXPERIMENTAL) add_definitions(-DCL_EXPERIMENTAL) endif(USE_CL_EXPERIMENTAL) -# Support both VS2008 and VS2012. 
-set(BUILD_DIR "$ENV{ADRENO_DRIVER}/build") -if(MSVC90) - set(VS_BUILD_DIR "${BUILD_DIR}/vs2008") -else(MSVC110) - set(VS_BUILD_DIR "${BUILD_DIR}/vs2012") -endif(MSVC90) - #----------------------------------------------------------- # Default Configurable Test Set #----------------------------------------------------------- @@ -102,14 +88,14 @@ macro(add_cxx_flag_if_supported flag) endmacro(add_cxx_flag_if_supported) if(CMAKE_COMPILER_IS_GNUCC OR "${CMAKE_CXX_COMPILER_ID}" MATCHES "(Apple)?Clang") + add_cxx_flag_if_supported(-Wmisleading-indentation) + add_cxx_flag_if_supported(-Wunused-variable) add_cxx_flag_if_supported(-Wno-narrowing) add_cxx_flag_if_supported(-Wno-format) add_cxx_flag_if_supported(-Werror) add_cxx_flag_if_supported(-Wno-error=cpp) # Allow #warning directive - add_cxx_flag_if_supported(-Wno-error=absolute-value) # Issue 783 add_cxx_flag_if_supported(-Wno-error=unknown-pragmas) # Issue #785 add_cxx_flag_if_supported(-Wno-error=asm-operand-widths) # Issue #784 - add_cxx_flag_if_supported(-Wno-error=overflow) # Fixed by #699 # -msse -mfpmath=sse to force gcc to use sse for float math, # avoiding excess precision problems that cause tests like int2float @@ -127,9 +113,24 @@ else() set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /D__SSE__") endif() +# Set a module's COMPILE_FLAGS if using gcc or clang. +macro(set_gnulike_module_compile_flags flags) + if(CMAKE_COMPILER_IS_GNUCC OR "${CMAKE_CXX_COMPILER_ID}" MATCHES "(Apple)?Clang") + SET_SOURCE_FILES_PROPERTIES( + ${${MODULE_NAME}_SOURCES} + PROPERTIES + COMPILE_FLAGS ${flags} + ) + endif() +endmacro(set_gnulike_module_compile_flags) + if(MSVC) # Don't warn when using standard non-secure functions. add_compile_definitions(_CRT_SECURE_NO_WARNINGS) + # Don't warn about using the portable "strdup" function. + add_compile_definitions(_CRT_NONSTDC_NO_DEPRECATE) + # Fix std::min and std::max handling with windows.harness. 
+ add_compile_definitions(NOMINMAX) endif() if( WIN32 AND "${CMAKE_CXX_COMPILER_ID}" MATCHES "Intel" ) @@ -152,10 +153,6 @@ if(LINK_PTHREAD) list(APPEND CLConform_LIBRARIES pthread) endif() -if(DEFINED USE_GLES3) - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DGLES3") -endif() - if(APPLE) find_library(corefoundation CoreFoundation) find_library(iokit IOKit) @@ -169,38 +166,5 @@ include_directories(${CLConform_SOURCE_DIR}/test_common/harness ${CLConform_SOURCE_DIR}/test_common/gl ${CLConform_SOURCE_DIR}/test_common) -if(CMAKE_BUILD_TYPE STREQUAL "release") - set (BUILD_FLAVOR "release") -elseif (CMAKE_BUILD_TYPE STREQUAL "debug") - set (BUILD_FLAVOR "debug") -endif(CMAKE_BUILD_TYPE STREQUAL "release") - - add_subdirectory(test_common) add_subdirectory(test_conformance) - -# Support both VS2008 and VS2012. -set (DLL_FILES "${VS_BUILD_DIR}/Debug/*.dll") -set (DST_DIR "${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/Debug/") - -if (WIN32) - set (COPY "echo") - add_custom_target(COPY_DLL${CONFORMANCE_SUFFIX} ALL - COMMAND ${COPY} "${DLL_FILES}" "${DST_DIR}" - COMMENT "Copying dll files.. ") -else (WIN32) - set (COPY cp) - add_custom_target(COPY_DLL${CONFORMANCE_SUFFIX}) -endif(WIN32) - -set_property(TARGET COPY_DLL${CONFORMANCE_SUFFIX} PROPERTY FOLDER "CONFORMANCE${CONFORMANCE_SUFFIX}") - -if(WIN32) - add_custom_target( COPY_FILES${CONFORMANCE_SUFFIX} ALL - COMMAND ${COPY} ${DLL_FILES} ${DST_DIR} - COMMENT "Copying other files to output folder..." ) -else(WIN32) - add_custom_target( COPY_FILES${CONFORMANCE_SUFFIX} ) -endif(WIN32) - -set_property(TARGET COPY_FILES${CONFORMANCE_SUFFIX} PROPERTY FOLDER "CONFORMANCE${CONFORMANCE_SUFFIX}") diff --git a/LICENSE b/LICENSE new file mode 120000 index 00000000..85de3d45 --- /dev/null +++ b/LICENSE @@ -0,0 +1 @@ +LICENSE.txt
\ No newline at end of file @@ -1,7 +1,19 @@ -# *** THIS PACKAGE HAS SPECIAL LICENSING CONDITIONS. PLEASE -# CONSULT THE OWNERS AND opensource-licensing@google.com BEFORE -# DEPENDING ON IT IN YOUR PROJECT. *** +# This project was upgraded with external_updater. +# Usage: tools/external_updater/updater.sh update OpenCL-CTS +# For more info, check https://cs.android.com/android/platform/superproject/+/master:tools/external_updater/README.md + +name: "OpenCL-CTS" +description: "OpenCL Conformance Tests" third_party { - license_note: "Khronos proprietary" - license_type: BY_EXCEPTION_ONLY + url { + type: GIT + value: "https://github.com/KhronosGroup/OpenCL-CTS.git" + } + version: "90a5183ec499d5b4701f58f6134dd424d82c4dca" + license_type: NOTICE + last_upgrade_date { + year: 2022 + month: 10 + day: 26 + } } diff --git a/MODULE_LICENSE_APACHE2 b/MODULE_LICENSE_APACHE2 new file mode 100644 index 00000000..e69de29b --- /dev/null +++ b/MODULE_LICENSE_APACHE2 @@ -1,2 +1,115 @@ -# OpenCL-CTS [![Build Status](https://api.travis-ci.org/KhronosGroup/OpenCL-CTS.svg?branch=master)](https://travis-ci.org/KhronosGroup/OpenCL-CTS/branches) -The OpenCL Conformance Tests +# OpenCL Conformance Test Suite (CTS) + +This is the OpenCL CTS for all versions of the Khronos +[OpenCL](https://www.khronos.org/opencl/) standard. + +## Building the CTS + +The CTS supports Linux, Windows, macOS, and Android platforms. In particular, +GitHub Actions CI builds against Ubuntu 20.04, Windows-latest, and +macos-latest. + +Compiling the CTS requires the following CMake configuration options to be set: + +* `CL_INCLUDE_DIR` Points to the unified + [OpenCL-Headers](https://github.com/KhronosGroup/OpenCL-Headers). +* `CL_LIB_DIR` Directory containing the OpenCL library to build against. +* `OPENCL_LIBRARIES` Name of the OpenCL library to link. + +It is advised that the [OpenCL ICD-Loader](https://github.com/KhronosGroup/OpenCL-ICD-Loader) +is used as the OpenCL library to build against. 
Where `CL_LIB_DIR` points to a +build of the ICD loader and `OPENCL_LIBRARIES` is "OpenCL". + +### Example Build + +Steps on a Linux platform to clone dependencies from GitHub sources, configure +a build, and compile. + +```sh +git clone https://github.com/KhronosGroup/OpenCL-CTS.git +git clone https://github.com/KhronosGroup/OpenCL-Headers.git +git clone https://github.com/KhronosGroup/OpenCL-ICD-Loader.git + +mkdir OpenCL-ICD-Loader/build +cmake -S OpenCL-ICD-Loader -B OpenCL-ICD-Loader/build \ + -DOPENCL_ICD_LOADER_HEADERS_DIR=$PWD/OpenCL-Headers +cmake --build ./OpenCL-ICD-Loader/build --config Release + +mkdir OpenCL-CTS/build +cmake -S OpenCL-CTS -B OpenCL-CTS/build \ + -DCL_INCLUDE_DIR=$PWD/OpenCL-Headers \ + -DCL_LIB_DIR=$PWD/OpenCL-ICD-Loader/build \ + -DOPENCL_LIBRARIES=OpenCL +cmake --build OpenCL-CTS/build --config Release +``` + +## Running the CTS + +A build of the CTS contains multiple executables representing the directories in +the `test_conformance` folder. Each of these executables contains sub-tests, and +possibly smaller granularities of testing within the sub-tests. + +See the `--help` output on each executable for the list of sub-tests available, +as well as other options for configuring execution. + +If the OpenCL library built against is the ICD Loader, and the vendor library to +be tested is not registered in the +[default ICD Loader location](https://github.com/KhronosGroup/OpenCL-ICD-Loader#registering-icds) +then the [OCL_ICD_FILENAMES](https://github.com/KhronosGroup/OpenCL-ICD-Loader#table-of-debug-environment-variables) +environment variable will need to be set for the ICD Loader to detect the OpenCL +library to use at runtime. 
For example, to run the basic tests on a Linux +platform: + +```sh +OCL_ICD_FILENAMES=/path/to/vendor_lib.so ./test_basic +``` + +### Offline Compilation + +Testing OpenCL drivers which do not have a runtime compiler can be done by using +additional command line arguments provided by the test harness for tests which +require compilation, these are: + +* `--compilation-mode` Selects if OpenCL-C source code should be compiled using + an external tool before being passed on to the OpenCL driver in that form for + testing. Online is the default mode, but also accepts the values `spir-v`, and + `binary`. + +* `--compilation-cache-mode` Controls how the compiled OpenCL-C source code + should be cached on disk. + +* `--compilation-cache-path` Accepts a path to a directory where the compiled + binary cache should be stored on disk. + +* `--compilation-program` Accepts a path to an executable (default: + cl_offline_compiler) invoked by the test harness to perform offline + compilation of OpenCL-C source code. This executable must match the + [interface description](test_common/harness/cl_offline_compiler-interface.txt). + +## Generating a Conformance Report + +The Khronos [Conformance Process Document](https://members.khronos.org/document/dl/911) +details the steps required for a conformance submission. +In this repository [opencl_conformance_tests_full.csv](test_conformance/opencl_conformance_tests_full.csv) +defines the full list of tests which must be run for conformance. The output log +of which must be included alongside a filled in +[submission details template](test_conformance/submission_details_template.txt). + +Utility script [run_conformance.py](test_conformance/run_conformance.py) can be +used to help generate the submission log, although it is not required. + +Git [tags](https://github.com/KhronosGroup/OpenCL-CTS/tags) are used to define +the version of the repository conformance submissions are made against. 
+ +## Contributing + +Contributions are welcome to the project from Khronos members and non-members +alike via GitHub Pull Requests (PR). Alternatively, if you've found a bug or have +a questions please file an issue in the GitHub project. First time contributors +will be required to sign the Khronos Contributor License Agreement (CLA) before +their PR can be merged. + +PRs to the repository are required to be `clang-format` clean to pass CI. +Developers can either use the `git-clang-format` tool locally to verify this +before contributing, or update their PR based on the diff provided by a failing +CI job. diff --git a/check-format.sh b/check-format.sh index 7de2bd2c..be8f9d78 100755 --- a/check-format.sh +++ b/check-format.sh @@ -1,7 +1,7 @@ #!/usr/bin/env bash -# Arg used to specify non-'origin/master' comparison branch -ORIGIN_BRANCH=${1:-"origin/master"} +# Arg used to specify non-'origin/main' comparison branch +ORIGIN_BRANCH=${1:-"origin/main"} CLANG_BINARY=${2:-"`which clang-format-9`"} # Run git-clang-format to check for violations diff --git a/dependencies/Android.bp b/dependencies/Android.bp index a8dbeeea..e521ca8e 100644 --- a/dependencies/Android.bp +++ b/dependencies/Android.bp @@ -33,7 +33,6 @@ genrule { "ocl-headers/CL/cl_gl.h", "ocl-headers/CL/cl_egl.h", "ocl-headers/CL/cl_ext.h", - "ocl-headers/CL/cl_gl_ext.h", ], cmd: "python3 $(location) $(in) > $(out)" } diff --git a/dependencies/ocl-headers/CL/cl.h b/dependencies/ocl-headers/CL/cl.h index 0018a0f4..6c700ab1 100644 --- a/dependencies/ocl-headers/CL/cl.h +++ b/dependencies/ocl-headers/CL/cl.h @@ -141,6 +141,10 @@ typedef struct _cl_image_desc { #pragma warning( push ) #pragma warning( disable : 4201 ) /* Prevents warning about nameless struct/union in /W4 builds */ #endif +#ifdef __clang__ +#pragma clang diagnostic push +#pragma clang diagnostic ignored "-Wc11-extensions" /* Prevents warning about nameless union being C11 extension*/ +#endif #if defined(_MSC_VER) && defined(__STDC__) /* 
Anonymous unions are not supported in /Za builds */ #else @@ -158,6 +162,9 @@ typedef struct _cl_image_desc { #if defined(_MSC_VER) && !defined(__STDC__) #pragma warning( pop ) #endif +#ifdef __clang__ +#pragma clang diagnostic pop +#endif #endif } cl_image_desc; diff --git a/dependencies/ocl-headers/CL/cl_ext.h b/dependencies/ocl-headers/CL/cl_ext.h index 80a81dea..3eba7ed1 100644 --- a/dependencies/ocl-headers/CL/cl_ext.h +++ b/dependencies/ocl-headers/CL/cl_ext.h @@ -26,6 +26,494 @@ extern "C" { #include <CL/cl.h> +/*************************************************************** +* cl_khr_command_buffer +***************************************************************/ +#define cl_khr_command_buffer 1 +#define CL_KHR_COMMAND_BUFFER_EXTENSION_NAME \ + "cl_khr_command_buffer" + +typedef cl_bitfield cl_device_command_buffer_capabilities_khr; +typedef struct _cl_command_buffer_khr* cl_command_buffer_khr; +typedef cl_uint cl_sync_point_khr; +typedef cl_uint cl_command_buffer_info_khr; +typedef cl_uint cl_command_buffer_state_khr; +typedef cl_properties cl_command_buffer_properties_khr; +typedef cl_bitfield cl_command_buffer_flags_khr; +typedef cl_properties cl_ndrange_kernel_command_properties_khr; +typedef struct _cl_mutable_command_khr* cl_mutable_command_khr; + +/* cl_device_info */ +#define CL_DEVICE_COMMAND_BUFFER_CAPABILITIES_KHR 0x12A9 +#define CL_DEVICE_COMMAND_BUFFER_REQUIRED_QUEUE_PROPERTIES_KHR 0x12AA + +/* cl_device_command_buffer_capabilities_khr - bitfield */ +#define CL_COMMAND_BUFFER_CAPABILITY_KERNEL_PRINTF_KHR (1 << 0) +#define CL_COMMAND_BUFFER_CAPABILITY_DEVICE_SIDE_ENQUEUE_KHR (1 << 1) +#define CL_COMMAND_BUFFER_CAPABILITY_SIMULTANEOUS_USE_KHR (1 << 2) +#define CL_COMMAND_BUFFER_CAPABILITY_OUT_OF_ORDER_KHR (1 << 3) + +/* cl_command_buffer_properties_khr */ +#define CL_COMMAND_BUFFER_FLAGS_KHR 0x1293 + +/* cl_command_buffer_flags_khr */ +#define CL_COMMAND_BUFFER_SIMULTANEOUS_USE_KHR (1 << 0) + +/* Error codes */ +#define 
CL_INVALID_COMMAND_BUFFER_KHR -1138 +#define CL_INVALID_SYNC_POINT_WAIT_LIST_KHR -1139 +#define CL_INCOMPATIBLE_COMMAND_QUEUE_KHR -1140 + +/* cl_command_buffer_info_khr */ +#define CL_COMMAND_BUFFER_QUEUES_KHR 0x1294 +#define CL_COMMAND_BUFFER_NUM_QUEUES_KHR 0x1295 +#define CL_COMMAND_BUFFER_REFERENCE_COUNT_KHR 0x1296 +#define CL_COMMAND_BUFFER_STATE_KHR 0x1297 +#define CL_COMMAND_BUFFER_PROPERTIES_ARRAY_KHR 0x1298 + +/* cl_command_buffer_state_khr */ +#define CL_COMMAND_BUFFER_STATE_RECORDING_KHR 0 +#define CL_COMMAND_BUFFER_STATE_EXECUTABLE_KHR 1 +#define CL_COMMAND_BUFFER_STATE_PENDING_KHR 2 +#define CL_COMMAND_BUFFER_STATE_INVALID_KHR 3 + +/* cl_command_type */ +#define CL_COMMAND_COMMAND_BUFFER_KHR 0x12A8 + + +typedef cl_command_buffer_khr (CL_API_CALL * +clCreateCommandBufferKHR_fn)( + cl_uint num_queues, + const cl_command_queue* queues, + const cl_command_buffer_properties_khr* properties, + cl_int* errcode_ret) ; + +typedef cl_int (CL_API_CALL * +clFinalizeCommandBufferKHR_fn)( + cl_command_buffer_khr command_buffer) ; + +typedef cl_int (CL_API_CALL * +clRetainCommandBufferKHR_fn)( + cl_command_buffer_khr command_buffer) ; + +typedef cl_int (CL_API_CALL * +clReleaseCommandBufferKHR_fn)( + cl_command_buffer_khr command_buffer) ; + +typedef cl_int (CL_API_CALL * +clEnqueueCommandBufferKHR_fn)( + cl_uint num_queues, + cl_command_queue* queues, + cl_command_buffer_khr command_buffer, + cl_uint num_events_in_wait_list, + const cl_event* event_wait_list, + cl_event* event) ; + +typedef cl_int (CL_API_CALL * +clCommandBarrierWithWaitListKHR_fn)( + cl_command_buffer_khr command_buffer, + cl_command_queue command_queue, + cl_uint num_sync_points_in_wait_list, + const cl_sync_point_khr* sync_point_wait_list, + cl_sync_point_khr* sync_point, + cl_mutable_command_khr* mutable_handle) ; + +typedef cl_int (CL_API_CALL * +clCommandCopyBufferKHR_fn)( + cl_command_buffer_khr command_buffer, + cl_command_queue command_queue, + cl_mem src_buffer, + cl_mem dst_buffer, + 
size_t src_offset, + size_t dst_offset, + size_t size, + cl_uint num_sync_points_in_wait_list, + const cl_sync_point_khr* sync_point_wait_list, + cl_sync_point_khr* sync_point, + cl_mutable_command_khr* mutable_handle) ; + +typedef cl_int (CL_API_CALL * +clCommandCopyBufferRectKHR_fn)( + cl_command_buffer_khr command_buffer, + cl_command_queue command_queue, + cl_mem src_buffer, + cl_mem dst_buffer, + const size_t* src_origin, + const size_t* dst_origin, + const size_t* region, + size_t src_row_pitch, + size_t src_slice_pitch, + size_t dst_row_pitch, + size_t dst_slice_pitch, + cl_uint num_sync_points_in_wait_list, + const cl_sync_point_khr* sync_point_wait_list, + cl_sync_point_khr* sync_point, + cl_mutable_command_khr* mutable_handle) ; + +typedef cl_int (CL_API_CALL * +clCommandCopyBufferToImageKHR_fn)( + cl_command_buffer_khr command_buffer, + cl_command_queue command_queue, + cl_mem src_buffer, + cl_mem dst_image, + size_t src_offset, + const size_t* dst_origin, + const size_t* region, + cl_uint num_sync_points_in_wait_list, + const cl_sync_point_khr* sync_point_wait_list, + cl_sync_point_khr* sync_point, + cl_mutable_command_khr* mutable_handle) ; + +typedef cl_int (CL_API_CALL * +clCommandCopyImageKHR_fn)( + cl_command_buffer_khr command_buffer, + cl_command_queue command_queue, + cl_mem src_image, + cl_mem dst_image, + const size_t* src_origin, + const size_t* dst_origin, + const size_t* region, + cl_uint num_sync_points_in_wait_list, + const cl_sync_point_khr* sync_point_wait_list, + cl_sync_point_khr* sync_point, + cl_mutable_command_khr* mutable_handle) ; + +typedef cl_int (CL_API_CALL * +clCommandCopyImageToBufferKHR_fn)( + cl_command_buffer_khr command_buffer, + cl_command_queue command_queue, + cl_mem src_image, + cl_mem dst_buffer, + const size_t* src_origin, + const size_t* region, + size_t dst_offset, + cl_uint num_sync_points_in_wait_list, + const cl_sync_point_khr* sync_point_wait_list, + cl_sync_point_khr* sync_point, + cl_mutable_command_khr* 
mutable_handle) ; + +typedef cl_int (CL_API_CALL * +clCommandFillBufferKHR_fn)( + cl_command_buffer_khr command_buffer, + cl_command_queue command_queue, + cl_mem buffer, + const void* pattern, + size_t pattern_size, + size_t offset, + size_t size, + cl_uint num_sync_points_in_wait_list, + const cl_sync_point_khr* sync_point_wait_list, + cl_sync_point_khr* sync_point, + cl_mutable_command_khr* mutable_handle) ; + +typedef cl_int (CL_API_CALL * +clCommandFillImageKHR_fn)( + cl_command_buffer_khr command_buffer, + cl_command_queue command_queue, + cl_mem image, + const void* fill_color, + const size_t* origin, + const size_t* region, + cl_uint num_sync_points_in_wait_list, + const cl_sync_point_khr* sync_point_wait_list, + cl_sync_point_khr* sync_point, + cl_mutable_command_khr* mutable_handle) ; + +typedef cl_int (CL_API_CALL * +clCommandNDRangeKernelKHR_fn)( + cl_command_buffer_khr command_buffer, + cl_command_queue command_queue, + const cl_ndrange_kernel_command_properties_khr* properties, + cl_kernel kernel, + cl_uint work_dim, + const size_t* global_work_offset, + const size_t* global_work_size, + const size_t* local_work_size, + cl_uint num_sync_points_in_wait_list, + const cl_sync_point_khr* sync_point_wait_list, + cl_sync_point_khr* sync_point, + cl_mutable_command_khr* mutable_handle) ; + +typedef cl_int (CL_API_CALL * +clGetCommandBufferInfoKHR_fn)( + cl_command_buffer_khr command_buffer, + cl_command_buffer_info_khr param_name, + size_t param_value_size, + void* param_value, + size_t* param_value_size_ret) ; + +#ifndef CL_NO_PROTOTYPES + +extern CL_API_ENTRY cl_command_buffer_khr CL_API_CALL +clCreateCommandBufferKHR( + cl_uint num_queues, + const cl_command_queue* queues, + const cl_command_buffer_properties_khr* properties, + cl_int* errcode_ret) ; + +extern CL_API_ENTRY cl_int CL_API_CALL +clFinalizeCommandBufferKHR( + cl_command_buffer_khr command_buffer) ; + +extern CL_API_ENTRY cl_int CL_API_CALL +clRetainCommandBufferKHR( + cl_command_buffer_khr 
command_buffer) ; + +extern CL_API_ENTRY cl_int CL_API_CALL +clReleaseCommandBufferKHR( + cl_command_buffer_khr command_buffer) ; + +extern CL_API_ENTRY cl_int CL_API_CALL +clEnqueueCommandBufferKHR( + cl_uint num_queues, + cl_command_queue* queues, + cl_command_buffer_khr command_buffer, + cl_uint num_events_in_wait_list, + const cl_event* event_wait_list, + cl_event* event) ; + +extern CL_API_ENTRY cl_int CL_API_CALL +clCommandBarrierWithWaitListKHR( + cl_command_buffer_khr command_buffer, + cl_command_queue command_queue, + cl_uint num_sync_points_in_wait_list, + const cl_sync_point_khr* sync_point_wait_list, + cl_sync_point_khr* sync_point, + cl_mutable_command_khr* mutable_handle) ; + +extern CL_API_ENTRY cl_int CL_API_CALL +clCommandCopyBufferKHR( + cl_command_buffer_khr command_buffer, + cl_command_queue command_queue, + cl_mem src_buffer, + cl_mem dst_buffer, + size_t src_offset, + size_t dst_offset, + size_t size, + cl_uint num_sync_points_in_wait_list, + const cl_sync_point_khr* sync_point_wait_list, + cl_sync_point_khr* sync_point, + cl_mutable_command_khr* mutable_handle) ; + +extern CL_API_ENTRY cl_int CL_API_CALL +clCommandCopyBufferRectKHR( + cl_command_buffer_khr command_buffer, + cl_command_queue command_queue, + cl_mem src_buffer, + cl_mem dst_buffer, + const size_t* src_origin, + const size_t* dst_origin, + const size_t* region, + size_t src_row_pitch, + size_t src_slice_pitch, + size_t dst_row_pitch, + size_t dst_slice_pitch, + cl_uint num_sync_points_in_wait_list, + const cl_sync_point_khr* sync_point_wait_list, + cl_sync_point_khr* sync_point, + cl_mutable_command_khr* mutable_handle) ; + +extern CL_API_ENTRY cl_int CL_API_CALL +clCommandCopyBufferToImageKHR( + cl_command_buffer_khr command_buffer, + cl_command_queue command_queue, + cl_mem src_buffer, + cl_mem dst_image, + size_t src_offset, + const size_t* dst_origin, + const size_t* region, + cl_uint num_sync_points_in_wait_list, + const cl_sync_point_khr* sync_point_wait_list, + 
cl_sync_point_khr* sync_point, + cl_mutable_command_khr* mutable_handle) ; + +extern CL_API_ENTRY cl_int CL_API_CALL +clCommandCopyImageKHR( + cl_command_buffer_khr command_buffer, + cl_command_queue command_queue, + cl_mem src_image, + cl_mem dst_image, + const size_t* src_origin, + const size_t* dst_origin, + const size_t* region, + cl_uint num_sync_points_in_wait_list, + const cl_sync_point_khr* sync_point_wait_list, + cl_sync_point_khr* sync_point, + cl_mutable_command_khr* mutable_handle) ; + +extern CL_API_ENTRY cl_int CL_API_CALL +clCommandCopyImageToBufferKHR( + cl_command_buffer_khr command_buffer, + cl_command_queue command_queue, + cl_mem src_image, + cl_mem dst_buffer, + const size_t* src_origin, + const size_t* region, + size_t dst_offset, + cl_uint num_sync_points_in_wait_list, + const cl_sync_point_khr* sync_point_wait_list, + cl_sync_point_khr* sync_point, + cl_mutable_command_khr* mutable_handle) ; + +extern CL_API_ENTRY cl_int CL_API_CALL +clCommandFillBufferKHR( + cl_command_buffer_khr command_buffer, + cl_command_queue command_queue, + cl_mem buffer, + const void* pattern, + size_t pattern_size, + size_t offset, + size_t size, + cl_uint num_sync_points_in_wait_list, + const cl_sync_point_khr* sync_point_wait_list, + cl_sync_point_khr* sync_point, + cl_mutable_command_khr* mutable_handle) ; + +extern CL_API_ENTRY cl_int CL_API_CALL +clCommandFillImageKHR( + cl_command_buffer_khr command_buffer, + cl_command_queue command_queue, + cl_mem image, + const void* fill_color, + const size_t* origin, + const size_t* region, + cl_uint num_sync_points_in_wait_list, + const cl_sync_point_khr* sync_point_wait_list, + cl_sync_point_khr* sync_point, + cl_mutable_command_khr* mutable_handle) ; + +extern CL_API_ENTRY cl_int CL_API_CALL +clCommandNDRangeKernelKHR( + cl_command_buffer_khr command_buffer, + cl_command_queue command_queue, + const cl_ndrange_kernel_command_properties_khr* properties, + cl_kernel kernel, + cl_uint work_dim, + const size_t* 
global_work_offset, + const size_t* global_work_size, + const size_t* local_work_size, + cl_uint num_sync_points_in_wait_list, + const cl_sync_point_khr* sync_point_wait_list, + cl_sync_point_khr* sync_point, + cl_mutable_command_khr* mutable_handle) ; + +extern CL_API_ENTRY cl_int CL_API_CALL +clGetCommandBufferInfoKHR( + cl_command_buffer_khr command_buffer, + cl_command_buffer_info_khr param_name, + size_t param_value_size, + void* param_value, + size_t* param_value_size_ret) ; + +#endif /* CL_NO_PROTOTYPES */ + +/*************************************************************** +* cl_khr_command_buffer_mutable_dispatch +***************************************************************/ +#define cl_khr_command_buffer_mutable_dispatch 1 +#define CL_KHR_COMMAND_BUFFER_MUTABLE_DISPATCH_EXTENSION_NAME \ + "cl_khr_command_buffer_mutable_dispatch" + +typedef cl_uint cl_command_buffer_structure_type_khr; +typedef cl_bitfield cl_mutable_dispatch_fields_khr; +typedef cl_uint cl_mutable_command_info_khr; +typedef struct _cl_mutable_dispatch_arg_khr { + cl_uint arg_index; + size_t arg_size; + const void* arg_value; +} cl_mutable_dispatch_arg_khr; +typedef struct _cl_mutable_dispatch_exec_info_khr { + cl_uint param_name; + size_t param_value_size; + const void* param_value; +} cl_mutable_dispatch_exec_info_khr; +typedef struct _cl_mutable_dispatch_config_khr { + cl_command_buffer_structure_type_khr type; + const void* next; + cl_mutable_command_khr command; + cl_uint num_args; + cl_uint num_svm_args; + cl_uint num_exec_infos; + cl_uint work_dim; + const cl_mutable_dispatch_arg_khr* arg_list; + const cl_mutable_dispatch_arg_khr* arg_svm_list; + const cl_mutable_dispatch_exec_info_khr* exec_info_list; + const size_t* global_work_offset; + const size_t* global_work_size; + const size_t* local_work_size; +} cl_mutable_dispatch_config_khr; +typedef struct _cl_mutable_base_config_khr { + cl_command_buffer_structure_type_khr type; + const void* next; + cl_uint num_mutable_dispatch; + 
const cl_mutable_dispatch_config_khr* mutable_dispatch_list; +} cl_mutable_base_config_khr; + +/* cl_command_buffer_flags_khr - bitfield */ +#define CL_COMMAND_BUFFER_MUTABLE_KHR (1 << 1) + +/* Error codes */ +#define CL_INVALID_MUTABLE_COMMAND_KHR -1141 + +/* cl_device_info */ +#define CL_DEVICE_MUTABLE_DISPATCH_CAPABILITIES_KHR 0x12B0 + +/* cl_ndrange_kernel_command_properties_khr */ +#define CL_MUTABLE_DISPATCH_UPDATABLE_FIELDS_KHR 0x12B1 + +/* cl_mutable_dispatch_fields_khr - bitfield */ +#define CL_MUTABLE_DISPATCH_GLOBAL_OFFSET_KHR (1 << 0) +#define CL_MUTABLE_DISPATCH_GLOBAL_SIZE_KHR (1 << 1) +#define CL_MUTABLE_DISPATCH_LOCAL_SIZE_KHR (1 << 2) +#define CL_MUTABLE_DISPATCH_ARGUMENTS_KHR (1 << 3) +#define CL_MUTABLE_DISPATCH_EXEC_INFO_KHR (1 << 4) + +/* cl_mutable_command_info_khr */ +#define CL_MUTABLE_COMMAND_COMMAND_QUEUE_KHR 0x12A0 +#define CL_MUTABLE_COMMAND_COMMAND_BUFFER_KHR 0x12A1 +#define CL_MUTABLE_COMMAND_COMMAND_TYPE_KHR 0x12AD +#define CL_MUTABLE_DISPATCH_PROPERTIES_ARRAY_KHR 0x12A2 +#define CL_MUTABLE_DISPATCH_KERNEL_KHR 0x12A3 +#define CL_MUTABLE_DISPATCH_DIMENSIONS_KHR 0x12A4 +#define CL_MUTABLE_DISPATCH_GLOBAL_WORK_OFFSET_KHR 0x12A5 +#define CL_MUTABLE_DISPATCH_GLOBAL_WORK_SIZE_KHR 0x12A6 +#define CL_MUTABLE_DISPATCH_LOCAL_WORK_SIZE_KHR 0x12A7 + +/* cl_command_buffer_structure_type_khr */ +#define CL_STRUCTURE_TYPE_MUTABLE_BASE_CONFIG_KHR 0 +#define CL_STRUCTURE_TYPE_MUTABLE_DISPATCH_CONFIG_KHR 1 + + +typedef cl_int (CL_API_CALL * +clUpdateMutableCommandsKHR_fn)( + cl_command_buffer_khr command_buffer, + const cl_mutable_base_config_khr* mutable_config) ; + +typedef cl_int (CL_API_CALL * +clGetMutableCommandInfoKHR_fn)( + cl_mutable_command_khr command, + cl_mutable_command_info_khr param_name, + size_t param_value_size, + void* param_value, + size_t* param_value_size_ret) ; + +#ifndef CL_NO_PROTOTYPES + +extern CL_API_ENTRY cl_int CL_API_CALL +clUpdateMutableCommandsKHR( + cl_command_buffer_khr command_buffer, + const 
cl_mutable_base_config_khr* mutable_config) ; + +extern CL_API_ENTRY cl_int CL_API_CALL +clGetMutableCommandInfoKHR( + cl_mutable_command_khr command, + cl_mutable_command_info_khr param_name, + size_t param_value_size, + void* param_value, + size_t* param_value_size_ret) ; + +#endif /* CL_NO_PROTOTYPES */ + /* cl_khr_fp64 extension - no extension #define since it has no functions */ /* CL_DEVICE_DOUBLE_FP_CONFIG is defined in CL.h for OpenCL >= 120 */ @@ -734,6 +1222,321 @@ clGetKernelSuggestedLocalWorkSizeKHR_fn)( size_t* suggested_local_work_size) CL_API_SUFFIX__VERSION_3_0; +/*************************************************************** +* cl_khr_integer_dot_product +***************************************************************/ +#define cl_khr_integer_dot_product 1 + +typedef cl_bitfield cl_device_integer_dot_product_capabilities_khr; + +/* cl_device_integer_dot_product_capabilities_khr */ +#define CL_DEVICE_INTEGER_DOT_PRODUCT_INPUT_4x8BIT_PACKED_KHR (1 << 0) +#define CL_DEVICE_INTEGER_DOT_PRODUCT_INPUT_4x8BIT_KHR (1 << 1) + +typedef struct _cl_device_integer_dot_product_acceleration_properties_khr { + cl_bool signed_accelerated; + cl_bool unsigned_accelerated; + cl_bool mixed_signedness_accelerated; + cl_bool accumulating_saturating_signed_accelerated; + cl_bool accumulating_saturating_unsigned_accelerated; + cl_bool accumulating_saturating_mixed_signedness_accelerated; +} cl_device_integer_dot_product_acceleration_properties_khr; + +/* cl_device_info */ +#define CL_DEVICE_INTEGER_DOT_PRODUCT_CAPABILITIES_KHR 0x1073 +#define CL_DEVICE_INTEGER_DOT_PRODUCT_ACCELERATION_PROPERTIES_8BIT_KHR 0x1074 +#define CL_DEVICE_INTEGER_DOT_PRODUCT_ACCELERATION_PROPERTIES_4x8BIT_PACKED_KHR 0x1075 + + +/*************************************************************** +* cl_khr_external_memory +***************************************************************/ +#define cl_khr_external_memory 1 + +typedef cl_uint cl_external_memory_handle_type_khr; + +/* cl_platform_info */ 
+#define CL_PLATFORM_EXTERNAL_MEMORY_IMPORT_HANDLE_TYPES_KHR 0x2044 + +/* cl_device_info */ +#define CL_DEVICE_EXTERNAL_MEMORY_IMPORT_HANDLE_TYPES_KHR 0x204F + +/* cl_mem_properties */ +#define CL_DEVICE_HANDLE_LIST_KHR 0x2051 +#define CL_DEVICE_HANDLE_LIST_END_KHR 0 + +/* cl_command_type */ +#define CL_COMMAND_ACQUIRE_EXTERNAL_MEM_OBJECTS_KHR 0x2047 +#define CL_COMMAND_RELEASE_EXTERNAL_MEM_OBJECTS_KHR 0x2048 + + +typedef cl_int (CL_API_CALL * +clEnqueueAcquireExternalMemObjectsKHR_fn)( + cl_command_queue command_queue, + cl_uint num_mem_objects, + const cl_mem* mem_objects, + cl_uint num_events_in_wait_list, + const cl_event* event_wait_list, + cl_event* event) CL_API_SUFFIX__VERSION_3_0; + +typedef cl_int (CL_API_CALL * +clEnqueueReleaseExternalMemObjectsKHR_fn)( + cl_command_queue command_queue, + cl_uint num_mem_objects, + const cl_mem* mem_objects, + cl_uint num_events_in_wait_list, + const cl_event* event_wait_list, + cl_event* event) CL_API_SUFFIX__VERSION_3_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clEnqueueAcquireExternalMemObjectsKHR( + cl_command_queue command_queue, + cl_uint num_mem_objects, + const cl_mem* mem_objects, + cl_uint num_events_in_wait_list, + const cl_event* event_wait_list, + cl_event* event) CL_API_SUFFIX__VERSION_3_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clEnqueueReleaseExternalMemObjectsKHR( + cl_command_queue command_queue, + cl_uint num_mem_objects, + const cl_mem* mem_objects, + cl_uint num_events_in_wait_list, + const cl_event* event_wait_list, + cl_event* event) CL_API_SUFFIX__VERSION_3_0; + +/*************************************************************** +* cl_khr_external_memory_dma_buf +***************************************************************/ +#define cl_khr_external_memory_dma_buf 1 + +/* cl_external_memory_handle_type_khr */ +#define CL_EXTERNAL_MEMORY_HANDLE_DMA_BUF_KHR 0x2067 + +/*************************************************************** +* cl_khr_external_memory_dx 
+***************************************************************/ +#define cl_khr_external_memory_dx 1 + +/* cl_external_memory_handle_type_khr */ +#define CL_EXTERNAL_MEMORY_HANDLE_D3D11_TEXTURE_KHR 0x2063 +#define CL_EXTERNAL_MEMORY_HANDLE_D3D11_TEXTURE_KMT_KHR 0x2064 +#define CL_EXTERNAL_MEMORY_HANDLE_D3D12_HEAP_KHR 0x2065 +#define CL_EXTERNAL_MEMORY_HANDLE_D3D12_RESOURCE_KHR 0x2066 + +/*************************************************************** +* cl_khr_external_memory_opaque_fd +***************************************************************/ +#define cl_khr_external_memory_opaque_fd 1 + +/* cl_external_memory_handle_type_khr */ +#define CL_EXTERNAL_MEMORY_HANDLE_OPAQUE_FD_KHR 0x2060 + +/*************************************************************** +* cl_khr_external_memory_win32 +***************************************************************/ +#define cl_khr_external_memory_win32 1 + +/* cl_external_memory_handle_type_khr */ +#define CL_EXTERNAL_MEMORY_HANDLE_OPAQUE_WIN32_KHR 0x2061 +#define CL_EXTERNAL_MEMORY_HANDLE_OPAQUE_WIN32_KMT_KHR 0x2062 + +/*************************************************************** +* cl_khr_external_semaphore +***************************************************************/ +#define cl_khr_external_semaphore 1 + +typedef struct _cl_semaphore_khr * cl_semaphore_khr; +typedef cl_uint cl_external_semaphore_handle_type_khr; + +/* cl_platform_info */ +#define CL_PLATFORM_SEMAPHORE_IMPORT_HANDLE_TYPES_KHR 0x2037 +#define CL_PLATFORM_SEMAPHORE_EXPORT_HANDLE_TYPES_KHR 0x2038 + +/* cl_device_info */ +#define CL_DEVICE_SEMAPHORE_IMPORT_HANDLE_TYPES_KHR 0x204D +#define CL_DEVICE_SEMAPHORE_EXPORT_HANDLE_TYPES_KHR 0x204E + +/* cl_semaphore_properties_khr */ +#define CL_SEMAPHORE_EXPORT_HANDLE_TYPES_KHR 0x203F +#define CL_SEMAPHORE_EXPORT_HANDLE_TYPES_LIST_END_KHR 0 + + +typedef cl_int (CL_API_CALL * +clGetSemaphoreHandleForTypeKHR_fn)( + cl_semaphore_khr sema_object, + cl_device_id device, + cl_external_semaphore_handle_type_khr 
handle_type, + size_t handle_size, + void* handle_ptr, + size_t* handle_size_ret) CL_API_SUFFIX__VERSION_1_2; + +extern CL_API_ENTRY cl_int CL_API_CALL +clGetSemaphoreHandleForTypeKHR( + cl_semaphore_khr sema_object, + cl_device_id device, + cl_external_semaphore_handle_type_khr handle_type, + size_t handle_size, + void* handle_ptr, + size_t* handle_size_ret) CL_API_SUFFIX__VERSION_1_2; + +/*************************************************************** +* cl_khr_external_semaphore_dx_fence +***************************************************************/ +#define cl_khr_external_semaphore_dx_fence 1 + +/* cl_external_semaphore_handle_type_khr */ +#define CL_SEMAPHORE_HANDLE_D3D12_FENCE_KHR 0x2059 + +/*************************************************************** +* cl_khr_external_semaphore_opaque_fd +***************************************************************/ +#define cl_khr_external_semaphore_opaque_fd 1 + +/* cl_external_semaphore_handle_type_khr */ +#define CL_SEMAPHORE_HANDLE_OPAQUE_FD_KHR 0x2055 + +/*************************************************************** +* cl_khr_external_semaphore_sync_fd +***************************************************************/ +#define cl_khr_external_semaphore_sync_fd 1 + +/* cl_external_semaphore_handle_type_khr */ +#define CL_SEMAPHORE_HANDLE_SYNC_FD_KHR 0x2058 + +/*************************************************************** +* cl_khr_external_semaphore_win32 +***************************************************************/ +#define cl_khr_external_semaphore_win32 1 + +/* cl_external_semaphore_handle_type_khr */ +#define CL_SEMAPHORE_HANDLE_OPAQUE_WIN32_KHR 0x2056 +#define CL_SEMAPHORE_HANDLE_OPAQUE_WIN32_KMT_KHR 0x2057 + +/*************************************************************** +* cl_khr_semaphore +***************************************************************/ +#define cl_khr_semaphore 1 + +/* type cl_semaphore_khr */ +typedef cl_properties cl_semaphore_properties_khr; +typedef cl_uint 
cl_semaphore_info_khr; +typedef cl_uint cl_semaphore_type_khr; +typedef cl_ulong cl_semaphore_payload_khr; + +/* cl_semaphore_type */ +#define CL_SEMAPHORE_TYPE_BINARY_KHR 1 + +/* cl_platform_info */ +#define CL_PLATFORM_SEMAPHORE_TYPES_KHR 0x2036 + +/* cl_device_info */ +#define CL_DEVICE_SEMAPHORE_TYPES_KHR 0x204C + +/* cl_semaphore_info_khr */ +#define CL_SEMAPHORE_CONTEXT_KHR 0x2039 +#define CL_SEMAPHORE_REFERENCE_COUNT_KHR 0x203A +#define CL_SEMAPHORE_PROPERTIES_KHR 0x203B +#define CL_SEMAPHORE_PAYLOAD_KHR 0x203C + +/* cl_semaphore_info_khr or cl_semaphore_properties_khr */ +#define CL_SEMAPHORE_TYPE_KHR 0x203D +/* enum CL_DEVICE_HANDLE_LIST_KHR */ +/* enum CL_DEVICE_HANDLE_LIST_END_KHR */ + +/* cl_command_type */ +#define CL_COMMAND_SEMAPHORE_WAIT_KHR 0x2042 +#define CL_COMMAND_SEMAPHORE_SIGNAL_KHR 0x2043 + +/* Error codes */ +#define CL_INVALID_SEMAPHORE_KHR -1142 + + +typedef cl_semaphore_khr (CL_API_CALL * +clCreateSemaphoreWithPropertiesKHR_fn)( + cl_context context, + const cl_semaphore_properties_khr* sema_props, + cl_int* errcode_ret) CL_API_SUFFIX__VERSION_1_2; + +typedef cl_int (CL_API_CALL * +clEnqueueWaitSemaphoresKHR_fn)( + cl_command_queue command_queue, + cl_uint num_sema_objects, + const cl_semaphore_khr* sema_objects, + const cl_semaphore_payload_khr* sema_payload_list, + cl_uint num_events_in_wait_list, + const cl_event* event_wait_list, + cl_event* event) CL_API_SUFFIX__VERSION_1_2; + +typedef cl_int (CL_API_CALL * +clEnqueueSignalSemaphoresKHR_fn)( + cl_command_queue command_queue, + cl_uint num_sema_objects, + const cl_semaphore_khr* sema_objects, + const cl_semaphore_payload_khr* sema_payload_list, + cl_uint num_events_in_wait_list, + const cl_event* event_wait_list, + cl_event* event) CL_API_SUFFIX__VERSION_1_2; + +typedef cl_int (CL_API_CALL * +clGetSemaphoreInfoKHR_fn)( + cl_semaphore_khr sema_object, + cl_semaphore_info_khr param_name, + size_t param_value_size, + void* param_value, + size_t* param_value_size_ret) 
CL_API_SUFFIX__VERSION_1_2; + +typedef cl_int (CL_API_CALL * +clReleaseSemaphoreKHR_fn)( + cl_semaphore_khr sema_object) CL_API_SUFFIX__VERSION_1_2; + +typedef cl_int (CL_API_CALL * +clRetainSemaphoreKHR_fn)( + cl_semaphore_khr sema_object) CL_API_SUFFIX__VERSION_1_2; + +extern CL_API_ENTRY cl_semaphore_khr CL_API_CALL +clCreateSemaphoreWithPropertiesKHR( + cl_context context, + const cl_semaphore_properties_khr* sema_props, + cl_int* errcode_ret) CL_API_SUFFIX__VERSION_1_2; + +extern CL_API_ENTRY cl_int CL_API_CALL +clEnqueueWaitSemaphoresKHR( + cl_command_queue command_queue, + cl_uint num_sema_objects, + const cl_semaphore_khr* sema_objects, + const cl_semaphore_payload_khr* sema_payload_list, + cl_uint num_events_in_wait_list, + const cl_event* event_wait_list, + cl_event* event) CL_API_SUFFIX__VERSION_1_2; + +extern CL_API_ENTRY cl_int CL_API_CALL +clEnqueueSignalSemaphoresKHR( + cl_command_queue command_queue, + cl_uint num_sema_objects, + const cl_semaphore_khr* sema_objects, + const cl_semaphore_payload_khr* sema_payload_list, + cl_uint num_events_in_wait_list, + const cl_event* event_wait_list, + cl_event* event) CL_API_SUFFIX__VERSION_1_2; + +extern CL_API_ENTRY cl_int CL_API_CALL +clGetSemaphoreInfoKHR( + cl_semaphore_khr sema_object, + cl_semaphore_info_khr param_name, + size_t param_value_size, + void* param_value, + size_t* param_value_size_ret) CL_API_SUFFIX__VERSION_1_2; + +extern CL_API_ENTRY cl_int CL_API_CALL +clReleaseSemaphoreKHR( + cl_semaphore_khr sema_object) CL_API_SUFFIX__VERSION_1_2; + +extern CL_API_ENTRY cl_int CL_API_CALL +clRetainSemaphoreKHR( + cl_semaphore_khr sema_object) CL_API_SUFFIX__VERSION_1_2; + /********************************** * cl_arm_import_memory extension * **********************************/ @@ -941,12 +1744,20 @@ typedef cl_bitfield cl_device_scheduling_controls_capabilities_arm; #define CL_DEVICE_SCHEDULING_WORKGROUP_BATCH_SIZE_MODIFIER_ARM (1 << 2) #define CL_DEVICE_SCHEDULING_DEFERRED_FLUSH_ARM (1 << 3) #define 
CL_DEVICE_SCHEDULING_REGISTER_ALLOCATION_ARM (1 << 4) +#define CL_DEVICE_SCHEDULING_WARP_THROTTLING_ARM (1 << 5) +#define CL_DEVICE_SCHEDULING_COMPUTE_UNIT_BATCH_QUEUE_SIZE_ARM (1 << 6) #define CL_DEVICE_SUPPORTED_REGISTER_ALLOCATIONS_ARM 0x41EB +#define CL_DEVICE_MAX_WARP_COUNT_ARM 0x41EA /* cl_kernel_info */ +#define CL_KERNEL_MAX_WARP_COUNT_ARM 0x41E9 + +/* cl_kernel_exec_info */ #define CL_KERNEL_EXEC_INFO_WORKGROUP_BATCH_SIZE_ARM 0x41E5 #define CL_KERNEL_EXEC_INFO_WORKGROUP_BATCH_SIZE_MODIFIER_ARM 0x41E6 +#define CL_KERNEL_EXEC_INFO_WARP_COUNT_LIMIT_ARM 0x41E8 +#define CL_KERNEL_EXEC_INFO_COMPUTE_UNIT_MAX_QUEUED_BATCHES_ARM 0x41F1 /* cl_queue_properties */ #define CL_QUEUE_KERNEL_BATCHING_ARM 0x41E7 @@ -982,14 +1793,43 @@ typedef cl_uint cl_command_termination_reason_arm; #define CL_COMMAND_TERMINATION_CONTROLLED_FAILURE_ARM 2 #define CL_COMMAND_TERMINATION_ERROR_ARM 3 -/*************************************** -* cl_intel_thread_local_exec extension * -****************************************/ +/************************************* +* cl_arm_protected_memory_allocation * +*************************************/ + +#define cl_arm_protected_memory_allocation 1 + +#define CL_MEM_PROTECTED_ALLOC_ARM (1ULL << 36) -#define cl_intel_thread_local_exec 1 +/****************************************** +* cl_intel_exec_by_local_thread extension * +******************************************/ + +#define cl_intel_exec_by_local_thread 1 #define CL_QUEUE_THREAD_LOCAL_EXEC_ENABLE_INTEL (((cl_bitfield)1) << 31) +/*************************************************************** +* cl_intel_device_attribute_query +***************************************************************/ + +#define cl_intel_device_attribute_query 1 + +typedef cl_bitfield cl_device_feature_capabilities_intel; + +/* cl_device_feature_capabilities_intel */ +#define CL_DEVICE_FEATURE_FLAG_DP4A_INTEL (1 << 0) +#define CL_DEVICE_FEATURE_FLAG_DPAS_INTEL (1 << 1) + +/* cl_device_info */ +#define 
CL_DEVICE_IP_VERSION_INTEL 0x4250 +#define CL_DEVICE_ID_INTEL 0x4251 +#define CL_DEVICE_NUM_SLICES_INTEL 0x4252 +#define CL_DEVICE_NUM_SUB_SLICES_PER_SLICE_INTEL 0x4253 +#define CL_DEVICE_NUM_EUS_PER_SUB_SLICE_INTEL 0x4254 +#define CL_DEVICE_NUM_THREADS_PER_EU_INTEL 0x4255 +#define CL_DEVICE_FEATURE_CAPABILITIES_INTEL 0x4256 + /*********************************************** * cl_intel_device_partition_by_names extension * ************************************************/ @@ -1342,57 +2182,47 @@ typedef cl_uint cl_diagnostics_verbose_level; /******************************************* * cl_intel_unified_shared_memory extension * ********************************************/ - -/* These APIs are in sync with Revision Q of the cl_intel_unified_shared_memory spec! */ - #define cl_intel_unified_shared_memory 1 -/* cl_device_info */ -#define CL_DEVICE_HOST_MEM_CAPABILITIES_INTEL 0x4190 -#define CL_DEVICE_DEVICE_MEM_CAPABILITIES_INTEL 0x4191 -#define CL_DEVICE_SINGLE_DEVICE_SHARED_MEM_CAPABILITIES_INTEL 0x4192 -#define CL_DEVICE_CROSS_DEVICE_SHARED_MEM_CAPABILITIES_INTEL 0x4193 -#define CL_DEVICE_SHARED_SYSTEM_MEM_CAPABILITIES_INTEL 0x4194 +typedef cl_bitfield cl_device_unified_shared_memory_capabilities_intel; +typedef cl_properties cl_mem_properties_intel; +typedef cl_bitfield cl_mem_alloc_flags_intel; +typedef cl_uint cl_mem_info_intel; +typedef cl_uint cl_unified_shared_memory_type_intel; +typedef cl_uint cl_mem_advice_intel; -typedef cl_bitfield cl_device_unified_shared_memory_capabilities_intel; +/* cl_device_info */ +#define CL_DEVICE_HOST_MEM_CAPABILITIES_INTEL 0x4190 +#define CL_DEVICE_DEVICE_MEM_CAPABILITIES_INTEL 0x4191 +#define CL_DEVICE_SINGLE_DEVICE_SHARED_MEM_CAPABILITIES_INTEL 0x4192 +#define CL_DEVICE_CROSS_DEVICE_SHARED_MEM_CAPABILITIES_INTEL 0x4193 +#define CL_DEVICE_SHARED_SYSTEM_MEM_CAPABILITIES_INTEL 0x4194 /* cl_device_unified_shared_memory_capabilities_intel - bitfield */ -#define CL_UNIFIED_SHARED_MEMORY_ACCESS_INTEL (1 << 0) -#define 
CL_UNIFIED_SHARED_MEMORY_ATOMIC_ACCESS_INTEL (1 << 1) -#define CL_UNIFIED_SHARED_MEMORY_CONCURRENT_ACCESS_INTEL (1 << 2) +#define CL_UNIFIED_SHARED_MEMORY_ACCESS_INTEL (1 << 0) +#define CL_UNIFIED_SHARED_MEMORY_ATOMIC_ACCESS_INTEL (1 << 1) +#define CL_UNIFIED_SHARED_MEMORY_CONCURRENT_ACCESS_INTEL (1 << 2) #define CL_UNIFIED_SHARED_MEMORY_CONCURRENT_ATOMIC_ACCESS_INTEL (1 << 3) -typedef cl_properties cl_mem_properties_intel; - /* cl_mem_properties_intel */ -#define CL_MEM_ALLOC_FLAGS_INTEL 0x4195 - -typedef cl_bitfield cl_mem_alloc_flags_intel; +#define CL_MEM_ALLOC_FLAGS_INTEL 0x4195 /* cl_mem_alloc_flags_intel - bitfield */ -#define CL_MEM_ALLOC_WRITE_COMBINED_INTEL (1 << 0) - -typedef cl_uint cl_mem_info_intel; +#define CL_MEM_ALLOC_WRITE_COMBINED_INTEL (1 << 0) +#define CL_MEM_ALLOC_INITIAL_PLACEMENT_DEVICE_INTEL (1 << 1) +#define CL_MEM_ALLOC_INITIAL_PLACEMENT_HOST_INTEL (1 << 2) /* cl_mem_alloc_info_intel */ -#define CL_MEM_ALLOC_TYPE_INTEL 0x419A -#define CL_MEM_ALLOC_BASE_PTR_INTEL 0x419B -#define CL_MEM_ALLOC_SIZE_INTEL 0x419C -#define CL_MEM_ALLOC_DEVICE_INTEL 0x419D -/* Enum values 0x419E-0x419F are reserved for future queries. */ - -typedef cl_uint cl_unified_shared_memory_type_intel; +#define CL_MEM_ALLOC_TYPE_INTEL 0x419A +#define CL_MEM_ALLOC_BASE_PTR_INTEL 0x419B +#define CL_MEM_ALLOC_SIZE_INTEL 0x419C +#define CL_MEM_ALLOC_DEVICE_INTEL 0x419D /* cl_unified_shared_memory_type_intel */ -#define CL_MEM_TYPE_UNKNOWN_INTEL 0x4196 -#define CL_MEM_TYPE_HOST_INTEL 0x4197 -#define CL_MEM_TYPE_DEVICE_INTEL 0x4198 -#define CL_MEM_TYPE_SHARED_INTEL 0x4199 - -typedef cl_uint cl_mem_advice_intel; - -/* cl_mem_advice_intel */ -/* Enum values 0x4208-0x420F are reserved for future memory advices. 
*/ +#define CL_MEM_TYPE_UNKNOWN_INTEL 0x4196 +#define CL_MEM_TYPE_HOST_INTEL 0x4197 +#define CL_MEM_TYPE_DEVICE_INTEL 0x4198 +#define CL_MEM_TYPE_SHARED_INTEL 0x4199 /* cl_kernel_exec_info */ #define CL_KERNEL_EXEC_INFO_INDIRECT_HOST_ACCESS_INTEL 0x4200 @@ -1401,223 +2231,249 @@ typedef cl_uint cl_mem_advice_intel; #define CL_KERNEL_EXEC_INFO_USM_PTRS_INTEL 0x4203 /* cl_command_type */ -#define CL_COMMAND_MEMFILL_INTEL 0x4204 -#define CL_COMMAND_MEMCPY_INTEL 0x4205 -#define CL_COMMAND_MIGRATEMEM_INTEL 0x4206 -#define CL_COMMAND_MEMADVISE_INTEL 0x4207 +#define CL_COMMAND_MEMFILL_INTEL 0x4204 +#define CL_COMMAND_MEMCPY_INTEL 0x4205 +#define CL_COMMAND_MIGRATEMEM_INTEL 0x4206 +#define CL_COMMAND_MEMADVISE_INTEL 0x4207 -extern CL_API_ENTRY void* CL_API_CALL -clHostMemAllocINTEL( - cl_context context, - const cl_mem_properties_intel* properties, - size_t size, - cl_uint alignment, - cl_int* errcode_ret); typedef void* (CL_API_CALL * clHostMemAllocINTEL_fn)( - cl_context context, - const cl_mem_properties_intel* properties, - size_t size, - cl_uint alignment, - cl_int* errcode_ret); - -extern CL_API_ENTRY void* CL_API_CALL -clDeviceMemAllocINTEL( - cl_context context, - cl_device_id device, - const cl_mem_properties_intel* properties, - size_t size, - cl_uint alignment, - cl_int* errcode_ret); + cl_context context, + const cl_mem_properties_intel* properties, + size_t size, + cl_uint alignment, + cl_int* errcode_ret) ; typedef void* (CL_API_CALL * clDeviceMemAllocINTEL_fn)( - cl_context context, - cl_device_id device, - const cl_mem_properties_intel* properties, - size_t size, - cl_uint alignment, - cl_int* errcode_ret); - -extern CL_API_ENTRY void* CL_API_CALL -clSharedMemAllocINTEL( - cl_context context, - cl_device_id device, - const cl_mem_properties_intel* properties, - size_t size, - cl_uint alignment, - cl_int* errcode_ret); + cl_context context, + cl_device_id device, + const cl_mem_properties_intel* properties, + size_t size, + cl_uint alignment, + cl_int* 
errcode_ret) ; typedef void* (CL_API_CALL * clSharedMemAllocINTEL_fn)( - cl_context context, - cl_device_id device, - const cl_mem_properties_intel* properties, - size_t size, - cl_uint alignment, - cl_int* errcode_ret); - -extern CL_API_ENTRY cl_int CL_API_CALL -clMemFreeINTEL( - cl_context context, - void* ptr); + cl_context context, + cl_device_id device, + const cl_mem_properties_intel* properties, + size_t size, + cl_uint alignment, + cl_int* errcode_ret) ; typedef cl_int (CL_API_CALL * clMemFreeINTEL_fn)( - cl_context context, - void* ptr); - -extern CL_API_ENTRY cl_int CL_API_CALL -clMemBlockingFreeINTEL( - cl_context context, - void* ptr); + cl_context context, + void* ptr) ; typedef cl_int (CL_API_CALL * clMemBlockingFreeINTEL_fn)( - cl_context context, - void* ptr); - -extern CL_API_ENTRY cl_int CL_API_CALL -clGetMemAllocInfoINTEL( - cl_context context, - const void* ptr, - cl_mem_info_intel param_name, - size_t param_value_size, - void* param_value, - size_t* param_value_size_ret); + cl_context context, + void* ptr) ; typedef cl_int (CL_API_CALL * clGetMemAllocInfoINTEL_fn)( - cl_context context, - const void* ptr, - cl_mem_info_intel param_name, - size_t param_value_size, - void* param_value, - size_t* param_value_size_ret); - -extern CL_API_ENTRY cl_int CL_API_CALL -clSetKernelArgMemPointerINTEL( - cl_kernel kernel, - cl_uint arg_index, - const void* arg_value); + cl_context context, + const void* ptr, + cl_mem_info_intel param_name, + size_t param_value_size, + void* param_value, + size_t* param_value_size_ret) ; typedef cl_int (CL_API_CALL * clSetKernelArgMemPointerINTEL_fn)( - cl_kernel kernel, - cl_uint arg_index, - const void* arg_value); + cl_kernel kernel, + cl_uint arg_index, + const void* arg_value) ; -extern CL_API_ENTRY cl_int CL_API_CALL -clEnqueueMemsetINTEL( - cl_command_queue command_queue, - void* dst_ptr, - cl_int value, - size_t size, - cl_uint num_events_in_wait_list, - const cl_event* event_wait_list, - cl_event* event); +typedef 
cl_int (CL_API_CALL * +clEnqueueMemFillINTEL_fn)( + cl_command_queue command_queue, + void* dst_ptr, + const void* pattern, + size_t pattern_size, + size_t size, + cl_uint num_events_in_wait_list, + const cl_event* event_wait_list, + cl_event* event) ; typedef cl_int (CL_API_CALL * -clEnqueueMemsetINTEL_fn)( - cl_command_queue command_queue, - void* dst_ptr, - cl_int value, - size_t size, - cl_uint num_events_in_wait_list, - const cl_event* event_wait_list, - cl_event* event); +clEnqueueMemcpyINTEL_fn)( + cl_command_queue command_queue, + cl_bool blocking, + void* dst_ptr, + const void* src_ptr, + size_t size, + cl_uint num_events_in_wait_list, + const cl_event* event_wait_list, + cl_event* event) ; + +typedef cl_int (CL_API_CALL * +clEnqueueMemAdviseINTEL_fn)( + cl_command_queue command_queue, + const void* ptr, + size_t size, + cl_mem_advice_intel advice, + cl_uint num_events_in_wait_list, + const cl_event* event_wait_list, + cl_event* event) ; + +#ifndef CL_NO_PROTOTYPES + +extern CL_API_ENTRY void* CL_API_CALL +clHostMemAllocINTEL( + cl_context context, + const cl_mem_properties_intel* properties, + size_t size, + cl_uint alignment, + cl_int* errcode_ret) ; + +extern CL_API_ENTRY void* CL_API_CALL +clDeviceMemAllocINTEL( + cl_context context, + cl_device_id device, + const cl_mem_properties_intel* properties, + size_t size, + cl_uint alignment, + cl_int* errcode_ret) ; + +extern CL_API_ENTRY void* CL_API_CALL +clSharedMemAllocINTEL( + cl_context context, + cl_device_id device, + const cl_mem_properties_intel* properties, + size_t size, + cl_uint alignment, + cl_int* errcode_ret) ; extern CL_API_ENTRY cl_int CL_API_CALL -clEnqueueMemFillINTEL( - cl_command_queue command_queue, - void* dst_ptr, - const void* pattern, - size_t pattern_size, - size_t size, - cl_uint num_events_in_wait_list, - const cl_event* event_wait_list, - cl_event* event); +clMemFreeINTEL( + cl_context context, + void* ptr) ; -typedef cl_int (CL_API_CALL * -clEnqueueMemFillINTEL_fn)( - 
cl_command_queue command_queue, - void* dst_ptr, - const void* pattern, - size_t pattern_size, - size_t size, - cl_uint num_events_in_wait_list, - const cl_event* event_wait_list, - cl_event* event); +extern CL_API_ENTRY cl_int CL_API_CALL +clMemBlockingFreeINTEL( + cl_context context, + void* ptr) ; extern CL_API_ENTRY cl_int CL_API_CALL -clEnqueueMemcpyINTEL( - cl_command_queue command_queue, - cl_bool blocking, - void* dst_ptr, - const void* src_ptr, - size_t size, - cl_uint num_events_in_wait_list, - const cl_event* event_wait_list, - cl_event* event); +clGetMemAllocInfoINTEL( + cl_context context, + const void* ptr, + cl_mem_info_intel param_name, + size_t param_value_size, + void* param_value, + size_t* param_value_size_ret) ; -typedef cl_int (CL_API_CALL * -clEnqueueMemcpyINTEL_fn)( - cl_command_queue command_queue, - cl_bool blocking, - void* dst_ptr, - const void* src_ptr, - size_t size, - cl_uint num_events_in_wait_list, - const cl_event* event_wait_list, - cl_event* event); +extern CL_API_ENTRY cl_int CL_API_CALL +clSetKernelArgMemPointerINTEL( + cl_kernel kernel, + cl_uint arg_index, + const void* arg_value) ; -#ifdef CL_VERSION_1_2 +extern CL_API_ENTRY cl_int CL_API_CALL +clEnqueueMemFillINTEL( + cl_command_queue command_queue, + void* dst_ptr, + const void* pattern, + size_t pattern_size, + size_t size, + cl_uint num_events_in_wait_list, + const cl_event* event_wait_list, + cl_event* event) ; -/* Because these APIs use cl_mem_migration_flags, they require - OpenCL 1.2: */ +extern CL_API_ENTRY cl_int CL_API_CALL +clEnqueueMemcpyINTEL( + cl_command_queue command_queue, + cl_bool blocking, + void* dst_ptr, + const void* src_ptr, + size_t size, + cl_uint num_events_in_wait_list, + const cl_event* event_wait_list, + cl_event* event) ; extern CL_API_ENTRY cl_int CL_API_CALL -clEnqueueMigrateMemINTEL( - cl_command_queue command_queue, - const void* ptr, - size_t size, - cl_mem_migration_flags flags, - cl_uint num_events_in_wait_list, - const cl_event* 
event_wait_list, - cl_event* event); +clEnqueueMemAdviseINTEL( + cl_command_queue command_queue, + const void* ptr, + size_t size, + cl_mem_advice_intel advice, + cl_uint num_events_in_wait_list, + const cl_event* event_wait_list, + cl_event* event) ; + +#endif /* CL_NO_PROTOTYPES */ + +#if defined(CL_VERSION_1_2) +/* Requires OpenCL 1.2 for cl_mem_migration_flags: */ typedef cl_int (CL_API_CALL * clEnqueueMigrateMemINTEL_fn)( - cl_command_queue command_queue, - const void* ptr, - size_t size, - cl_mem_migration_flags flags, - cl_uint num_events_in_wait_list, - const cl_event* event_wait_list, - cl_event* event); + cl_command_queue command_queue, + const void* ptr, + size_t size, + cl_mem_migration_flags flags, + cl_uint num_events_in_wait_list, + const cl_event* event_wait_list, + cl_event* event) ; -#endif +#ifndef CL_NO_PROTOTYPES extern CL_API_ENTRY cl_int CL_API_CALL -clEnqueueMemAdviseINTEL( - cl_command_queue command_queue, - const void* ptr, - size_t size, - cl_mem_advice_intel advice, - cl_uint num_events_in_wait_list, - const cl_event* event_wait_list, - cl_event* event); +clEnqueueMigrateMemINTEL( + cl_command_queue command_queue, + const void* ptr, + size_t size, + cl_mem_migration_flags flags, + cl_uint num_events_in_wait_list, + const cl_event* event_wait_list, + cl_event* event) ; + +#endif /* CL_NO_PROTOTYPES */ + +#endif /* defined(CL_VERSION_1_2) */ + +/* deprecated, use clEnqueueMemFillINTEL instead */ typedef cl_int (CL_API_CALL * -clEnqueueMemAdviseINTEL_fn)( - cl_command_queue command_queue, - const void* ptr, - size_t size, - cl_mem_advice_intel advice, - cl_uint num_events_in_wait_list, - const cl_event* event_wait_list, - cl_event* event); +clEnqueueMemsetINTEL_fn)( + cl_command_queue command_queue, + void* dst_ptr, + cl_int value, + size_t size, + cl_uint num_events_in_wait_list, + const cl_event* event_wait_list, + cl_event* event) ; + +#ifndef CL_NO_PROTOTYPES + +extern CL_API_ENTRY cl_int CL_API_CALL +clEnqueueMemsetINTEL( + 
cl_command_queue command_queue, + void* dst_ptr, + cl_int value, + size_t size, + cl_uint num_events_in_wait_list, + const cl_event* event_wait_list, + cl_event* event) ; + +#endif /* CL_NO_PROTOTYPES */ + +/*************************************************************** +* cl_intel_mem_alloc_buffer_location +***************************************************************/ +#define cl_intel_mem_alloc_buffer_location 1 +#define CL_INTEL_MEM_ALLOC_BUFFER_LOCATION_EXTENSION_NAME \ + "cl_intel_mem_alloc_buffer_location" + +/* cl_mem_properties_intel */ +#define CL_MEM_ALLOC_BUFFER_LOCATION_INTEL 0x419E + +/* cl_mem_alloc_info_intel */ +/* enum CL_MEM_ALLOC_BUFFER_LOCATION_INTEL */ /*************************************************** * cl_intel_create_buffer_with_properties extension * @@ -1700,6 +2556,76 @@ typedef struct _cl_queue_family_properties_intel { #define CL_QUEUE_CAPABILITY_BARRIER_INTEL (1 << 25) #define CL_QUEUE_CAPABILITY_KERNEL_INTEL (1 << 26) +/*************************************************************** +* cl_intel_queue_no_sync_operations +***************************************************************/ + +#define cl_intel_queue_no_sync_operations 1 + +/* addition to cl_command_queue_properties */ +#define CL_QUEUE_NO_SYNC_OPERATIONS_INTEL (1 << 29) + +/*************************************************************** +* cl_intel_sharing_format_query +***************************************************************/ +#define cl_intel_sharing_format_query 1 + +/*************************************************************** +* cl_ext_image_requirements_info +***************************************************************/ + +#ifdef CL_VERSION_3_0 + +#define cl_ext_image_requirements_info 1 + +typedef cl_uint cl_image_requirements_info_ext; + +#define CL_IMAGE_REQUIREMENTS_ROW_PITCH_ALIGNMENT_EXT 0x1290 +#define CL_IMAGE_REQUIREMENTS_BASE_ADDRESS_ALIGNMENT_EXT 0x1292 +#define CL_IMAGE_REQUIREMENTS_SIZE_EXT 0x12B2 +#define 
CL_IMAGE_REQUIREMENTS_MAX_WIDTH_EXT 0x12B3 +#define CL_IMAGE_REQUIREMENTS_MAX_HEIGHT_EXT 0x12B4 +#define CL_IMAGE_REQUIREMENTS_MAX_DEPTH_EXT 0x12B5 +#define CL_IMAGE_REQUIREMENTS_MAX_ARRAY_SIZE_EXT 0x12B6 + +extern CL_API_ENTRY cl_int CL_API_CALL +clGetImageRequirementsInfoEXT( + cl_context context, + const cl_mem_properties* properties, + cl_mem_flags flags, + const cl_image_format* image_format, + const cl_image_desc* image_desc, + cl_image_requirements_info_ext param_name, + size_t param_value_size, + void* param_value, + size_t* param_value_size_ret) CL_API_SUFFIX__VERSION_3_0; + +typedef cl_int (CL_API_CALL * +clGetImageRequirementsInfoEXT_fn)( + cl_context context, + const cl_mem_properties* properties, + cl_mem_flags flags, + const cl_image_format* image_format, + const cl_image_desc* image_desc, + cl_image_requirements_info_ext param_name, + size_t param_value_size, + void* param_value, + size_t* param_value_size_ret) CL_API_SUFFIX__VERSION_3_0; + +#endif + +/*************************************************************** +* cl_ext_image_from_buffer +***************************************************************/ + +#ifdef CL_VERSION_3_0 + +#define cl_ext_image_from_buffer 1 + +#define CL_IMAGE_REQUIREMENTS_SLICE_PITCH_ALIGNMENT_EXT 0x1291 + +#endif + #ifdef __cplusplus } #endif diff --git a/dependencies/ocl-headers/CL/cl_gl.h b/dependencies/ocl-headers/CL/cl_gl.h index 5ea0fd8b..32774650 100644 --- a/dependencies/ocl-headers/CL/cl_gl.h +++ b/dependencies/ocl-headers/CL/cl_gl.h @@ -162,6 +162,31 @@ clCreateEventFromGLsyncKHR(cl_context context, cl_GLsync sync, cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_1; +/*************************************************************** +* cl_intel_sharing_format_query_gl +***************************************************************/ +#define cl_intel_sharing_format_query_gl 1 + +/* when cl_khr_gl_sharing is supported */ + +extern CL_API_ENTRY cl_int CL_API_CALL +clGetSupportedGLTextureFormatsINTEL( + cl_context 
context, + cl_mem_flags flags, + cl_mem_object_type image_type, + cl_uint num_entries, + cl_GLenum* gl_formats, + cl_uint* num_texture_formats) ; + +typedef cl_int (CL_API_CALL * +clGetSupportedGLTextureFormatsINTEL_fn)( + cl_context context, + cl_mem_flags flags, + cl_mem_object_type image_type, + cl_uint num_entries, + cl_GLenum* gl_formats, + cl_uint* num_texture_formats) ; + #ifdef __cplusplus } #endif diff --git a/dependencies/ocl-headers/CL/cl_gl_ext.h b/dependencies/ocl-headers/CL/cl_gl_ext.h deleted file mode 100644 index 8ec81816..00000000 --- a/dependencies/ocl-headers/CL/cl_gl_ext.h +++ /dev/null @@ -1,18 +0,0 @@ -/******************************************************************************* - * Copyright (c) 2008-2021 The Khronos Group Inc. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - ******************************************************************************/ - -#include <CL/cl_gl.h> -#pragma message("All OpenGL-related extensions have been moved into cl_gl.h. 
Please include cl_gl.h directly.") diff --git a/dependencies/ocl-headers/CL/cl_platform.h b/dependencies/ocl-headers/CL/cl_platform.h index 8ae655d1..e7a0d6f4 100644 --- a/dependencies/ocl-headers/CL/cl_platform.h +++ b/dependencies/ocl-headers/CL/cl_platform.h @@ -135,6 +135,11 @@ extern "C" { #if (defined (_WIN32) && defined(_MSC_VER)) +#if defined(__clang__) +#pragma clang diagnostic push +#pragma clang diagnostic ignored "-Wlanguage-extension-token" +#endif + /* intptr_t is used in cl.h and provided by stddef.h in Visual C++, but not in clang */ /* stdint.h was missing before Visual Studio 2010, include it for later versions and for clang */ #if defined(__clang__) || _MSC_VER >= 1600 @@ -155,6 +160,10 @@ typedef unsigned __int16 cl_half; typedef float cl_float; typedef double cl_double; +#if defined(__clang__) +#pragma clang diagnostic pop +#endif + /* Macro names and corresponding values defined by OpenCL */ #define CL_CHAR_BIT 8 #define CL_SCHAR_MAX 127 @@ -501,25 +510,26 @@ typedef unsigned int cl_GLenum; #if !defined(__cplusplus) && defined(__STDC_VERSION__) && __STDC_VERSION__ >= 201112L #define __CL_HAS_ANON_STRUCT__ 1 #define __CL_ANON_STRUCT__ -#elif defined( __GNUC__) && ! defined( __STRICT_ANSI__ ) +#elif defined(_WIN32) && defined(_MSC_VER) && !defined(__STDC__) +#define __CL_HAS_ANON_STRUCT__ 1 +#define __CL_ANON_STRUCT__ +#elif defined(__GNUC__) && ! defined(__STRICT_ANSI__) +#define __CL_HAS_ANON_STRUCT__ 1 +#define __CL_ANON_STRUCT__ __extension__ +#elif defined(__clang__) #define __CL_HAS_ANON_STRUCT__ 1 #define __CL_ANON_STRUCT__ __extension__ -#elif defined( _WIN32) && defined(_MSC_VER) && ! defined(__STDC__) - #if _MSC_VER >= 1500 - /* Microsoft Developer Studio 2008 supports anonymous structs, but - * complains by default. 
*/ - #define __CL_HAS_ANON_STRUCT__ 1 - #define __CL_ANON_STRUCT__ - /* Disable warning C4201: nonstandard extension used : nameless - * struct/union */ - #pragma warning( push ) - #pragma warning( disable : 4201 ) - #endif #else #define __CL_HAS_ANON_STRUCT__ 0 #define __CL_ANON_STRUCT__ #endif +#if defined(_WIN32) && defined(_MSC_VER) && __CL_HAS_ANON_STRUCT__ + /* Disable warning C4201: nonstandard extension used : nameless struct/union */ + #pragma warning( push ) + #pragma warning( disable : 4201 ) +#endif + /* Define alignment keys */ #if defined( __GNUC__ ) || defined(__INTEGRITY) #define CL_ALIGNED(_x) __attribute__ ((aligned(_x))) @@ -1395,10 +1405,8 @@ typedef union } #endif -#if defined( _WIN32) && defined(_MSC_VER) && ! defined(__STDC__) - #if _MSC_VER >=1500 +#if defined(_WIN32) && defined(_MSC_VER) && __CL_HAS_ANON_STRUCT__ #pragma warning( pop ) - #endif #endif #endif /* __CL_PLATFORM_H */ diff --git a/dependencies/ocl-stubs/apis_generator.py b/dependencies/ocl-stubs/apis_generator.py index 8cc09542..8cdbc403 100644 --- a/dependencies/ocl-stubs/apis_generator.py +++ b/dependencies/ocl-stubs/apis_generator.py @@ -80,7 +80,7 @@ def process_type(raw): def parse_api(api_signature): m = None - api_signature = re.sub('extern', '', api_signature) + api_signature = re.sub(r'\bextern\b', '', api_signature) api_signature = re.sub('CL_\w+', '', api_signature) m = re.match(r'\s*(.*)\s+(\w+)\((.*)\)\s*;', api_signature) diff --git a/dependencies/ocl-stubs/stubs.cpp b/dependencies/ocl-stubs/stubs.cpp index 2cf37001..fe9a9126 100644 --- a/dependencies/ocl-stubs/stubs.cpp +++ b/dependencies/ocl-stubs/stubs.cpp @@ -2,7 +2,6 @@ #include <CL/cl_gl.h> #include <CL/cl_egl.h> #include <CL/cl_ext.h> -#include <CL/cl_gl_ext.h> #include <dlfcn.h> @@ -31,4 +30,3 @@ rettype fname fargs { #define CL_MACRO FUNC_SYM #include "apis.h" #undef CL_MACRO - diff --git a/presubmit.sh b/presubmit.sh index 6fc037c8..ca39b9a2 100755 --- a/presubmit.sh +++ b/presubmit.sh @@ -14,8 +14,11 @@ 
TOOLCHAIN_FILE=${TOP}/toolchain.cmake touch ${TOOLCHAIN_FILE} BUILD_OPENGL_TEST="OFF" +cmake --version +echo + # Prepare toolchain if needed -if [[ ${JOB_ARCHITECTURE} != "" ]]; then +if [[ ${JOB_ARCHITECTURE} != "" && ${RUNNER_OS} != "Windows" ]]; then TOOLCHAIN_URL_VAR=TOOLCHAIN_URL_${JOB_ARCHITECTURE} TOOLCHAIN_URL=${!TOOLCHAIN_URL_VAR} wget ${TOOLCHAIN_URL} @@ -38,35 +41,67 @@ fi if [[ ( ${JOB_ARCHITECTURE} == "" && ${JOB_ENABLE_GL} == "1" ) ]]; then BUILD_OPENGL_TEST="ON" - sudo apt-get update - sudo apt-get -y install libglu1-mesa-dev freeglut3-dev mesa-common-dev libglew-dev fi -# Prepare headers -git clone https://github.com/KhronosGroup/OpenCL-Headers.git -cd OpenCL-Headers -ln -s CL OpenCL # For OSX builds -cd .. + +if [[ ${JOB_ENABLE_DEBUG} == 1 ]]; then + BUILD_CONFIG="Debug" +else + BUILD_CONFIG="Release" +fi + +#Vulkan Headers +git clone https://github.com/KhronosGroup/Vulkan-Headers.git # Get and build loader git clone https://github.com/KhronosGroup/OpenCL-ICD-Loader.git cd ${TOP}/OpenCL-ICD-Loader mkdir build cd build -cmake -DCMAKE_TOOLCHAIN_FILE=${TOOLCHAIN_FILE} -DOPENCL_ICD_LOADER_HEADERS_DIR=${TOP}/OpenCL-Headers/ .. -make +cmake .. -G Ninja \ + -DCMAKE_BUILD_TYPE=Release \ + -DCMAKE_TOOLCHAIN_FILE=${TOOLCHAIN_FILE} \ + -DOPENCL_ICD_LOADER_HEADERS_DIR=${TOP}/OpenCL-Headers/ +cmake --build . -j2 + +#Vulkan Loader +cd ${TOP} +git clone https://github.com/KhronosGroup/Vulkan-Loader.git +cd Vulkan-Loader +mkdir build +cd build +python3 ../scripts/update_deps.py +cmake .. -G Ninja \ + -DCMAKE_BUILD_TYPE=Release \ + -DCMAKE_TOOLCHAIN_FILE=${TOOLCHAIN_FILE} \ + -DBUILD_WSI_XLIB_SUPPORT=OFF \ + -DBUILD_WSI_XCB_SUPPORT=OFF \ + -DBUILD_WSI_WAYLAND_SUPPORT=OFF \ + -DUSE_GAS=OFF \ + -C helper.cmake .. +cmake --build . 
-j2 # Build CTS cd ${TOP} ls -l mkdir build cd build -cmake -DCL_INCLUDE_DIR=${TOP}/OpenCL-Headers \ +if [[ ${RUNNER_OS} == "Windows" ]]; then + CMAKE_OPENCL_LIBRARIES_OPTION="OpenCL" + CMAKE_CACHE_OPTIONS="" +else + CMAKE_OPENCL_LIBRARIES_OPTION="-lOpenCL -lpthread" + CMAKE_CACHE_OPTIONS="-DCMAKE_C_COMPILER_LAUNCHER=sccache -DCMAKE_CXX_COMPILER_LAUNCHER=sccache" +fi +cmake .. -G Ninja \ + -DCMAKE_BUILD_TYPE="${BUILD_CONFIG}" \ + ${CMAKE_CACHE_OPTIONS} \ + -DCL_INCLUDE_DIR=${TOP}/OpenCL-Headers \ -DCL_LIB_DIR=${TOP}/OpenCL-ICD-Loader/build \ -DCMAKE_TOOLCHAIN_FILE=${TOOLCHAIN_FILE} \ -DCMAKE_RUNTIME_OUTPUT_DIRECTORY=./bin \ - -DOPENCL_LIBRARIES="-lOpenCL -lpthread" \ + -DOPENCL_LIBRARIES="${CMAKE_OPENCL_LIBRARIES_OPTION}" \ -DUSE_CL_EXPERIMENTAL=ON \ -DGL_IS_SUPPORTED=${BUILD_OPENGL_TEST} \ - .. -make -j2 - + -DVULKAN_INCLUDE_DIR=${TOP}/Vulkan-Headers/include/ \ + -DVULKAN_LIB_DIR=${TOP}/Vulkan-Loader/build/loader/ +cmake --build . -j3 diff --git a/scripts/android_bp_head b/scripts/android_bp_head index c5cd3949..42c3e2c7 100644 --- a/scripts/android_bp_head +++ b/scripts/android_bp_head @@ -1,24 +1,7 @@ -// *** THIS PACKAGE HAS SPECIAL LICENSING CONDITIONS. PLEASE -// CONSULT THE OWNERS AND opensource-licensing@google.com BEFORE -// DEPENDING ON IT IN YOUR PROJECT. *** package { default_applicable_licenses: ["external_OpenCL-CTS_license"], } -// Added automatically by a large-scale-change that took the approach of -// 'apply every license found to every target'. While this makes sure we respect -// every license restriction, it may not be entirely correct. -// -// e.g. GPL in an MIT project might only apply to the contrib/ directory. -// -// Please consider splitting the single license below into multiple licenses, -// taking care not to lose any license_kind information, and overriding the -// default license using the 'licenses: [...]' property on targets as needed. 
-// -// For unused files, consider creating a 'fileGroup' with "//visibility:private" -// to attach the license to, and including a comment whether the files may be -// used in the current project. -// See: http://go/android-license-faq license { name: "external_OpenCL-CTS_license", visibility: [":__subpackages__"], @@ -27,9 +10,6 @@ license { "SPDX-license-identifier-BSD", "SPDX-license-identifier-MIT", "SPDX-license-identifier-Unlicense", - "legacy_by_exception_only", // by exception only - "legacy_proprietary", // by exception only - "legacy_unencumbered", ], license_text: [ "LICENSE.txt", @@ -56,37 +36,19 @@ cc_defaults { "-DCL_EXPERIMENTAL", "-DCL_TARGET_OPENCL_VERSION=300", "-Wno-#warnings", - "-Wno-absolute-value", - "-Wno-asm-operand-widths", "-Wno-c++11-narrowing", - "-Wno-dangling-else", "-Wno-date-time", "-Wno-deprecated-declarations", "-Wno-format", - "-Wno-ignored-pragmas", "-Wno-ignored-qualifiers", "-Wno-implicit-fallthrough", - "-Wno-logical-op-parentheses", - "-Wno-macro-redefined", "-Wno-missing-braces", - "-Wno-missing-declarations", "-Wno-missing-field-initializers", "-Wno-non-virtual-dtor", "-Wno-overloaded-virtual", - "-Wno-parentheses", - "-Wno-parentheses-equality", "-Wno-reorder-ctor", - "-Wno-return-stack-address", - "-Wno-shift-negative-value", "-Wno-sometimes-uninitialized", - "-Wno-switch", - "-Wno-unknown-pragmas", - "-Wno-unneeded-internal-declaration", - "-Wno-unused-function", - "-Wno-unused-label", "-Wno-unused-parameter", - "-Wno-unused-variable", - "-Wno-writable-strings", "-fexceptions", ], static_libs: [ @@ -118,4 +80,3 @@ cc_defaults { export_include_dirs: [ "test_conformance/images" ], defaults: [ "ocl-test-defaults" ], } - diff --git a/scripts/android_bp_tail b/scripts/android_bp_tail index a073f337..c0488738 100644 --- a/scripts/android_bp_tail +++ b/scripts/android_bp_tail @@ -4,14 +4,6 @@ python_test_host { srcs: [ "scripts/test_opencl_cts.py" ], data: [ "scripts/test_opencl_cts.xml" ], test_config: 
"scripts/test_opencl_cts.xml", - version: { - py2: { - enabled: false, - }, - py3: { - enabled: true - } - }, test_options: { unit_test: false, }, @@ -21,14 +13,4 @@ python_test { name: "run_conformance", main: "test_conformance/run_conformance.py", srcs: [ "test_conformance/run_conformance.py" ], - version: { - py2: { - enabled: true, - embedded_launcher: true, - }, - py3: { - enabled: false, - } - }, } - diff --git a/scripts/generate_test_files.py b/scripts/generate_test_files.py index cdb10dbf..1155a0ce 100644 --- a/scripts/generate_test_files.py +++ b/scripts/generate_test_files.py @@ -1,6 +1,8 @@ import json import os import re +import shutil +import subprocess from xml.dom import minidom from xml.etree import ElementTree @@ -45,7 +47,8 @@ cc_test {{ f.write(cc_test_string) -def generate_android_bp(): +# Return value indicates whether the output should be formatted with bpfmt +def generate_android_bp() -> bool: android_bp_head_path = os.path.join(SCRIPT_DIR, 'android_bp_head') android_bp_tail_path = os.path.join(SCRIPT_DIR, 'android_bp_tail') @@ -61,6 +64,12 @@ def generate_android_bp(): with open(android_bp_tail_path, 'r') as android_bp_tail: android_bp.write(android_bp_tail.read()) + if shutil.which('bpfmt') is not None: + subprocess.run(['bpfmt', '-w', 'Android.bp']) + return True + + return False + def create_subelement_with_attribs(element, tag, attribs): subelement = ElementTree.SubElement(element, tag) @@ -142,12 +151,15 @@ def generate_test_xml(): def main(): - generate_android_bp() + android_bp_formatted = generate_android_bp() generate_test_xml() print("Don't forget to move -") print(" Android.bp -> {ANDROID_ROOT}/external/OpenCL-CTS/Android.bp") print(" test_opencl_cts.xml -> {ANDROID_ROOT}/external/OpenCL-CTS/scripts/test_opencl_cts.xml") + if not android_bp_formatted: + print("then run the blueprint autoformatter:") + print(" bpfmt -w {ANDROID_ROOT}/external/OpenCL-CTS/Android.bp") if __name__ == '__main__': diff --git a/test_common/CMakeLists.txt 
b/test_common/CMakeLists.txt index 2d4bc190..b0505345 100644 --- a/test_common/CMakeLists.txt +++ b/test_common/CMakeLists.txt @@ -1,6 +1,5 @@ set(HARNESS_SOURCES - harness/threadTesting.cpp harness/typeWrappers.cpp harness/mt19937.cpp harness/conversions.cpp @@ -22,4 +21,3 @@ set(HARNESS_SOURCES ) add_library(harness STATIC ${HARNESS_SOURCES}) - diff --git a/test_common/gl/helpers.cpp b/test_common/gl/helpers.cpp index def78d75..b9f95a94 100644 --- a/test_common/gl/helpers.cpp +++ b/test_common/gl/helpers.cpp @@ -1381,7 +1381,6 @@ void * CreateGLTexture2DArrayMultisample(size_t width, size_t height, //calculating colors double color_delta = 1.0 / (total_layers * samples); - double color = color_delta; if (attachment != GL_DEPTH_ATTACHMENT && attachment != GL_DEPTH_STENCIL_ATTACHMENT) { glDisable(GL_DEPTH_TEST); diff --git a/test_common/gl/setup_win32.cpp b/test_common/gl/setup_win32.cpp index b120a36d..708e681d 100644 --- a/test_common/gl/setup_win32.cpp +++ b/test_common/gl/setup_win32.cpp @@ -13,14 +13,11 @@ // See the License for the specific language governing permissions and // limitations under the License. // -#define GL_GLEXT_PROTOTYPES #include "setup.h" #include "testBase.h" #include "harness/errorHelpers.h" -#include <GL/gl.h> -#include <GL/glut.h> #include <CL/cl_ext.h> typedef CL_API_ENTRY cl_int (CL_API_CALL *clGetGLContextInfoKHR_fn)( diff --git a/test_common/gl/setup_x11.cpp b/test_common/gl/setup_x11.cpp index c54ecdec..abc065c9 100644 --- a/test_common/gl/setup_x11.cpp +++ b/test_common/gl/setup_x11.cpp @@ -13,16 +13,11 @@ // See the License for the specific language governing permissions and // limitations under the License. 
// -#define GL_GLEXT_PROTOTYPES #include "setup.h" #include "testBase.h" #include "harness/errorHelpers.h" -#include <GL/gl.h> -#include <GL/glut.h> -#include <GL/glext.h> -#include <GL/freeglut.h> #include <GL/glx.h> #include <CL/cl_ext.h> @@ -90,10 +85,17 @@ public: } for (int i=0; i<(int)num_of_devices; i++) { - if (!is_extension_available(devices[i], "cl_khr_gl_sharing ")) { - log_info("Device %d of %d does not support required extension cl_khr_gl_sharing.\n", i+1, num_of_devices); - } else { - log_info("Device %d of %d supports required extension cl_khr_gl_sharing.\n", i+1, num_of_devices); + if (!is_extension_available(devices[i], "cl_khr_gl_sharing")) + { + log_info("Device %d of %d does not support required extension " + "cl_khr_gl_sharing.\n", + i + 1, num_of_devices); + } + else + { + log_info("Device %d of %d supports required extension " + "cl_khr_gl_sharing.\n", + i + 1, num_of_devices); found_valid_device = 1; m_devices[m_device_count++] = devices[i]; } diff --git a/test_common/gles/helpers.cpp b/test_common/gles/helpers.cpp index 34f40b4c..57a4ddc1 100644 --- a/test_common/gles/helpers.cpp +++ b/test_common/gles/helpers.cpp @@ -22,7 +22,7 @@ {GLint __error = glGetError(); if(__error) {log_error( "GL ERROR: %s!\n", gluErrorString( err ));}} #if defined(__linux__) || defined(GL_ES_VERSION_2_0) -// On linux we dont link to GLU library to avoid comaptibility issues with +// On linux we don't link to GLU library to avoid compatibility issues with // libstdc++ // FIXME: Implement this const GLubyte* gluErrorString (GLenum error) @@ -271,8 +271,6 @@ void * ReadGLTexture( GLenum glTarget, GLuint glTexture, // Read results from the GL texture glBindTexture(get_base_gl_target(glTarget), glTexture); - GLint realWidth, realHeight; - GLint realInternalFormat; GLenum readBackFormat = GL_RGBA; GLenum readBackType = glType; glFramebufferWrapper glFramebuffer; @@ -301,7 +299,7 @@ void * ReadGLTexture( GLenum glTarget, GLuint glTexture, 
GetGLFormatName(readBackFormat), GetGLTypeName(readBackType)); - DumpGLBuffer(readBackType, realWidth, realHeight, (void*)outBuffer); + DumpGLBuffer(readBackType, outWidth, outHeight, (void *)outBuffer); #endif diff --git a/test_common/gles/helpers.h b/test_common/gles/helpers.h index 5bd0fdf1..20768787 100644 --- a/test_common/gles/helpers.h +++ b/test_common/gles/helpers.h @@ -30,11 +30,10 @@ #if !defined (__APPLE__) #include <CL/cl.h> -#include "gl_headers.h" #include <CL/cl_gl.h> -#else -#include "gl_headers.h" +#include <CL/cl_half.h> #endif +#include "gl_headers.h" #include "harness/errorHelpers.h" #include "harness/kernelHelpers.h" diff --git a/test_common/harness/ThreadPool.cpp b/test_common/harness/ThreadPool.cpp index 31985aa0..62798045 100644 --- a/test_common/harness/ThreadPool.cpp +++ b/test_common/harness/ThreadPool.cpp @@ -22,6 +22,8 @@ #if defined(__APPLE__) || defined(__linux__) || defined(_WIN32) // or any other POSIX system +#include <atomic> + #if defined(_WIN32) #include <windows.h> #if defined(_MSC_VER) @@ -241,7 +243,7 @@ pthread_cond_t cond_var; // Condition variable state. How many iterations on the function left to run, // set to CL_INT_MAX to cause worker threads to exit. Note: this value might // go negative. -volatile cl_int gRunCount = 0; +std::atomic<cl_int> gRunCount{ 0 }; // State that only changes when the threadpool is not working. volatile TPFuncPtr gFunc_ptr = NULL; @@ -261,19 +263,20 @@ pthread_cond_t caller_cond_var; // # of threads intended to be running. Running threads will decrement this // as they discover they've run out of work to do. -volatile cl_int gRunning = 0; +std::atomic<cl_int> gRunning{ 0 }; // The total number of threads launched. 
-volatile cl_int gThreadCount = 0; +std::atomic<cl_int> gThreadCount{ 0 }; + #ifdef _WIN32 void ThreadPool_WorkerFunc(void *p) #else void *ThreadPool_WorkerFunc(void *p) #endif { - cl_uint threadID = ThreadPool_AtomicAdd((volatile cl_int *)p, 1); - cl_int item = ThreadPool_AtomicAdd(&gRunCount, -1); - // log_info( "ThreadPool_WorkerFunc start: gRunning = %d\n", gRunning ); + auto &tid = *static_cast<std::atomic<cl_uint> *>(p); + cl_uint threadID = tid++; + cl_int item = gRunCount--; while (MAX_COUNT > item) { @@ -282,8 +285,6 @@ void *ThreadPool_WorkerFunc(void *p) // check for more work to do if (0 >= item) { - // log_info("Thread %d has run out of work.\n", threadID); - // No work to do. Attempt to block waiting for work #if defined(_WIN32) EnterCriticalSection(cond_lock); @@ -298,9 +299,7 @@ void *ThreadPool_WorkerFunc(void *p) } #endif // !_WIN32 - cl_int remaining = ThreadPool_AtomicAdd(&gRunning, -1); - // log_info("ThreadPool_WorkerFunc: gRunning = %d\n", - // remaining - 1); + cl_int remaining = gRunning--; if (1 == remaining) { // last thread out signal the main thread to wake up #if defined(_WIN32) @@ -350,7 +349,7 @@ void *ThreadPool_WorkerFunc(void *p) #endif // !_WIN32 // try again to get a valid item id - item = ThreadPool_AtomicAdd(&gRunCount, -1); + item = gRunCount--; if (MAX_COUNT <= item) // exit if we are done { #if defined(_WIN32) @@ -362,8 +361,7 @@ void *ThreadPool_WorkerFunc(void *p) } } - ThreadPool_AtomicAdd(&gRunning, 1); - // log_info("Thread %d has found work.\n", threadID); + gRunning++; #if defined(_WIN32) LeaveCriticalSection(cond_lock); @@ -447,12 +445,12 @@ void *ThreadPool_WorkerFunc(void *p) } // get the next item - item = ThreadPool_AtomicAdd(&gRunCount, -1); + item = gRunCount--; } exit: log_info("ThreadPool: thread %d exiting.\n", threadID); - ThreadPool_AtomicAdd(&gThreadCount, -1); + gThreadCount--; #if !defined(_WIN32) return NULL; #endif @@ -487,7 +485,7 @@ void ThreadPool_Init(void) { cl_int i; int err; - volatile cl_uint 
threadID = 0; + std::atomic<cl_uint> threadID{ 0 }; // Check for manual override of multithreading code. We add this for better // debuggability. @@ -523,7 +521,7 @@ void ThreadPool_Init(void) { // Count the number of bits in ProcessorMask (number of // logical cores) - ULONG mask = ptr->ProcessorMask; + ULONG_PTR mask = ptr->ProcessorMask; while (mask) { ++gThreadCount; @@ -624,7 +622,7 @@ void ThreadPool_Init(void) } #endif // !_WIN32 - gRunning = gThreadCount; + gRunning = gThreadCount.load(); // init threads for (i = 0; i < gThreadCount; i++) { @@ -688,7 +686,6 @@ static BOOL CALLBACK _ThreadPool_Init(_PINIT_ONCE InitOnce, PVOID Parameter, void ThreadPool_Exit(void) { - int err, count; gRunCount = CL_INT_MAX; #if defined(__GNUC__) @@ -702,13 +699,13 @@ void ThreadPool_Exit(void) #endif // spin waiting for threads to die - for (count = 0; 0 != gThreadCount && count < 1000; count++) + for (int count = 0; 0 != gThreadCount && count < 1000; count++) { #if defined(_WIN32) _WakeAllConditionVariable(cond_var); Sleep(1); #else // !_WIN32 - if ((err = pthread_cond_broadcast(&cond_var))) + if (int err = pthread_cond_broadcast(&cond_var)) { log_error("Error %d from pthread_cond_broadcast. Unable to wake up " "work threads. ThreadPool_Exit failed.\n", @@ -722,7 +719,7 @@ void ThreadPool_Exit(void) if (gThreadCount) log_error("Error: Thread pool timed out after 1 second with %d threads " "still active.\n", - gThreadCount); + gThreadCount.load()); else log_info("Thread pool exited in a orderly fashion.\n"); } @@ -738,7 +735,9 @@ void ThreadPool_Exit(void) // all available then it would make more sense to use those features. 
cl_int ThreadPool_Do(TPFuncPtr func_ptr, cl_uint count, void *userInfo) { +#ifndef _WIN32 cl_int newErr; +#endif cl_int err = 0; // Lazily set up our threads #if defined(_MSC_VER) && (_WIN32_WINNT >= 0x600) @@ -913,7 +912,9 @@ cl_int ThreadPool_Do(TPFuncPtr func_ptr, cl_uint count, void *userInfo) err = jobError; +#ifndef _WIN32 exit: +#endif // exit critical region #if defined(_WIN32) LeaveCriticalSection(gThreadPoolLock); diff --git a/test_common/harness/alloc.h b/test_common/harness/alloc.h index 653dde05..3b00d7c9 100644 --- a/test_common/harness/alloc.h +++ b/test_common/harness/alloc.h @@ -29,7 +29,7 @@ #include "mingw_compat.h" #endif -static void* align_malloc(size_t size, size_t alignment) +inline void* align_malloc(size_t size, size_t alignment) { #if defined(_WIN32) && defined(_MSC_VER) return _aligned_malloc(size, alignment); @@ -53,7 +53,7 @@ static void* align_malloc(size_t size, size_t alignment) #endif } -static void align_free(void* ptr) +inline void align_free(void* ptr) { #if defined(_WIN32) && defined(_MSC_VER) _aligned_free(ptr); diff --git a/test_common/harness/compat.h b/test_common/harness/compat.h index 7aad15a0..4053b7ee 100644 --- a/test_common/harness/compat.h +++ b/test_common/harness/compat.h @@ -18,13 +18,13 @@ #if defined(_WIN32) && defined(_MSC_VER) #include <Windows.h> -#endif - +#else #ifdef __cplusplus #define EXTERN_C extern "C" #else #define EXTERN_C #endif +#endif // @@ -309,13 +309,6 @@ EXTERN_C int __builtin_clz(unsigned int pattern); #endif -#ifndef MIN -#define MIN(x, y) (((x) < (y)) ? (x) : (y)) -#endif -#ifndef MAX -#define MAX(x, y) (((x) > (y)) ? 
(x) : (y)) -#endif - /*----------------------------------------------------------------------------- WARNING: DO NOT USE THESE MACROS: diff --git a/test_common/harness/conversions.cpp b/test_common/harness/conversions.cpp index fc3317c7..d52a2ac6 100644 --- a/test_common/harness/conversions.cpp +++ b/test_common/harness/conversions.cpp @@ -14,6 +14,7 @@ // limitations under the License. // #include "conversions.h" +#include <cinttypes> #include <limits.h> #include <time.h> #include <assert.h> @@ -50,10 +51,10 @@ void print_type_to_string(ExplicitType type, void *data, char *string) case kInt: sprintf(string, "%d", *((cl_int *)data)); return; case kUInt: case kUnsignedInt: sprintf(string, "%u", *((cl_uint *)data)); return; - case kLong: sprintf(string, "%lld", *((cl_long *)data)); return; + case kLong: sprintf(string, "%" PRId64 "", *((cl_long *)data)); return; case kULong: case kUnsignedLong: - sprintf(string, "%llu", *((cl_ulong *)data)); + sprintf(string, "%" PRIu64 "", *((cl_ulong *)data)); return; case kFloat: sprintf(string, "%f", *((cl_float *)data)); return; case kHalf: sprintf(string, "half"); return; @@ -181,8 +182,8 @@ static ULong sUpperLimits[kNumExplicitTypes] = { 0xffffffffLL, 0xffffffffLL, 0x7fffffffffffffffLL, - 0xffffffffffffffffLL, - 0xffffffffffffffffLL, + 0xffffffffffffffffULL, + 0xffffffffffffffffULL, 0, 0 }; // Last two values aren't stored here diff --git a/test_common/harness/deviceInfo.cpp b/test_common/harness/deviceInfo.cpp index 287a1423..97ab8c85 100644 --- a/test_common/harness/deviceInfo.cpp +++ b/test_common/harness/deviceInfo.cpp @@ -63,6 +63,40 @@ int is_extension_available(cl_device_id device, const char *extensionName) return false; } +cl_version get_extension_version(cl_device_id device, const char *extensionName) +{ + cl_int err; + size_t size; + + err = clGetDeviceInfo(device, CL_DEVICE_EXTENSIONS_WITH_VERSION, 0, nullptr, + &size); + if (err != CL_SUCCESS) + { + throw 
std::runtime_error("clGetDeviceInfo(CL_DEVICE_EXTENSIONS_WITH_" + "VERSION) failed to return size\n"); + } + + std::vector<cl_name_version> extensions(size / sizeof(cl_name_version)); + err = clGetDeviceInfo(device, CL_DEVICE_EXTENSIONS_WITH_VERSION, size, + extensions.data(), &size); + if (err != CL_SUCCESS) + { + throw std::runtime_error("clGetDeviceInfo(CL_DEVICE_EXTENSIONS_WITH_" + "VERSION) failed to return value\n"); + } + + for (auto &ext : extensions) + { + if (!strcmp(extensionName, ext.name)) + { + return ext.version; + } + } + + throw std::runtime_error("Extension " + std::string(extensionName) + + " not supported by device!"); +} + /* Returns a string containing the supported extensions list for a device. */ std::string get_device_extensions_string(cl_device_id device) { diff --git a/test_common/harness/deviceInfo.h b/test_common/harness/deviceInfo.h index f8c55805..912dd198 100644 --- a/test_common/harness/deviceInfo.h +++ b/test_common/harness/deviceInfo.h @@ -31,6 +31,11 @@ std::string get_device_info_string(cl_device_id device, /* Determines if an extension is supported by a device. */ int is_extension_available(cl_device_id device, const char *extensionName); +/* Returns the version of the extension the device supports or throws an + * exception if the extension is not supported by the device. */ +cl_version get_extension_version(cl_device_id device, + const char *extensionName); + /* Returns a string containing the supported extensions list for a device. 
*/ std::string get_device_extensions_string(cl_device_id device); diff --git a/test_common/harness/errorHelpers.cpp b/test_common/harness/errorHelpers.cpp index 22a2677d..eaccf641 100644 --- a/test_common/harness/errorHelpers.cpp +++ b/test_common/harness/errorHelpers.cpp @@ -18,9 +18,12 @@ #include <stdlib.h> #include <string.h> +#include <algorithm> + #include "errorHelpers.h" #include "parseParameters.h" +#include "testHarness.h" #include <CL/cl_half.h> @@ -300,10 +303,6 @@ const char *GetQueuePropertyName(cl_command_queue_properties property) } } -#ifndef MAX -#define MAX(_a, _b) ((_a) > (_b) ? (_a) : (_b)) -#endif - #if defined(_MSC_VER) #define scalbnf(_a, _i) ldexpf(_a, _i) #define scalbn(_a, _i) ldexp(_a, _i) @@ -356,7 +355,7 @@ static float Ulp_Error_Half_Float(float test, double reference) // The unbiased exponent of the ulp unit place int ulp_exp = - HALF_MANT_DIG - 1 - MAX(ilogb(reference), HALF_MIN_EXP - 1); + HALF_MANT_DIG - 1 - std::max(ilogb(reference), HALF_MIN_EXP - 1); // Scale the exponent of the error return (float)scalbn(testVal - reference, ulp_exp); @@ -364,7 +363,7 @@ static float Ulp_Error_Half_Float(float test, double reference) // reference is a normal power of two or a zero int ulp_exp = - HALF_MANT_DIG - 1 - MAX(ilogb(reference) - 1, HALF_MIN_EXP - 1); + HALF_MANT_DIG - 1 - std::max(ilogb(reference) - 1, HALF_MIN_EXP - 1); // Scale the exponent of the error return (float)scalbn(testVal - reference, ulp_exp); @@ -436,7 +435,8 @@ float Ulp_Error(float test, double reference) return 0.0f; // if we are expecting a NaN, any NaN is fine // The unbiased exponent of the ulp unit place - int ulp_exp = FLT_MANT_DIG - 1 - MAX(ilogb(reference), FLT_MIN_EXP - 1); + int ulp_exp = + FLT_MANT_DIG - 1 - std::max(ilogb(reference), FLT_MIN_EXP - 1); // Scale the exponent of the error return (float)scalbn(testVal - reference, ulp_exp); @@ -444,7 +444,8 @@ float Ulp_Error(float test, double reference) // reference is a normal power of two or a zero // The 
unbiased exponent of the ulp unit place - int ulp_exp = FLT_MANT_DIG - 1 - MAX(ilogb(reference) - 1, FLT_MIN_EXP - 1); + int ulp_exp = + FLT_MANT_DIG - 1 - std::max(ilogb(reference) - 1, FLT_MIN_EXP - 1); // Scale the exponent of the error return (float)scalbn(testVal - reference, ulp_exp); @@ -512,7 +513,7 @@ float Ulp_Error_Double(double test, long double reference) // The unbiased exponent of the ulp unit place int ulp_exp = - DBL_MANT_DIG - 1 - MAX(ilogbl(reference), DBL_MIN_EXP - 1); + DBL_MANT_DIG - 1 - std::max(ilogbl(reference), DBL_MIN_EXP - 1); // Scale the exponent of the error float result = (float)scalbnl(testVal - reference, ulp_exp); @@ -528,7 +529,7 @@ float Ulp_Error_Double(double test, long double reference) // reference is a normal power of two or a zero // The unbiased exponent of the ulp unit place int ulp_exp = - DBL_MANT_DIG - 1 - MAX(ilogbl(reference) - 1, DBL_MIN_EXP - 1); + DBL_MANT_DIG - 1 - std::max(ilogbl(reference) - 1, DBL_MIN_EXP - 1); // Scale the exponent of the error float result = (float)scalbnl(testVal - reference, ulp_exp); @@ -564,7 +565,7 @@ cl_int OutputBuildLogs(cl_program program, cl_uint num_devices, error = clGetContextInfo(context, CL_CONTEXT_DEVICES, 0, NULL, &size_ret); test_error(error, "Unable to query context's device size"); - num_devices = size_ret / sizeof(cl_device_id); + num_devices = static_cast<cl_uint>(size_ret / sizeof(cl_device_id)); device_list = (cl_device_id *)malloc(size_ret); if (device_list == NULL) { @@ -690,21 +691,19 @@ const char *subtests_to_skip_with_offline_compiler[] = { "library_function" }; -int check_functions_for_offline_compiler(const char *subtestname, - cl_device_id device) +bool check_functions_for_offline_compiler(const char *subtestname) { if (gCompilationMode != kOnline) { - int nNotRequiredWithOfflineCompiler = - sizeof(subtests_to_skip_with_offline_compiler) / sizeof(char *); - size_t i; - for (i = 0; i < nNotRequiredWithOfflineCompiler; ++i) + size_t 
nNotRequiredWithOfflineCompiler = + ARRAY_SIZE(subtests_to_skip_with_offline_compiler); + for (size_t i = 0; i < nNotRequiredWithOfflineCompiler; ++i) { if (!strcmp(subtestname, subtests_to_skip_with_offline_compiler[i])) { - return 1; + return false; } } } - return 0; + return true; } diff --git a/test_common/harness/errorHelpers.h b/test_common/harness/errorHelpers.h index 19446014..80eb3b58 100644 --- a/test_common/harness/errorHelpers.h +++ b/test_common/harness/errorHelpers.h @@ -56,17 +56,13 @@ static int vlog_win32(const char *format, ...); #define vlog printf #endif -#define ct_assert(b) ct_assert_i(b, __LINE__) -#define ct_assert_i(b, line) ct_assert_ii(b, line) -#define ct_assert_ii(b, line) \ - int _compile_time_assertion_on_line_##line[b ? 1 : -1]; - #define test_fail(msg, ...) \ { \ log_error(msg, ##__VA_ARGS__); \ return TEST_FAIL; \ } #define test_error(errCode, msg) test_error_ret(errCode, msg, errCode) +#define test_error_fail(errCode, msg) test_error_ret(errCode, msg, TEST_FAIL) #define test_error_ret(errCode, msg, retValue) \ { \ auto errCodeResult = errCode; \ @@ -97,21 +93,6 @@ static int vlog_win32(const char *format, ...); "the device version! (from %s:%d)\n", \ msg, __FILE__, __LINE__); -#define test_missing_support_offline_cmpiler(errCode, msg) \ - test_missing_support_offline_cmpiler_ret(errCode, msg, errCode) -// this macro should always return CL_SUCCESS, but print the skip message on -// test not supported with offline compiler -#define test_missing_support_offline_cmpiler_ret(errCode, msg, retValue) \ - { \ - if (errCode != CL_SUCCESS) \ - { \ - log_info("INFO: Subtest %s tests is not supported in offline " \ - "compiler execution path! (from %s:%d)\n", \ - msg, __FILE__, __LINE__); \ - return TEST_SKIP; \ - } \ - } - // expected error code vs. 
what we got #define test_failure_error(errCode, expectedErrCode, msg) \ test_failure_error_ret(errCode, expectedErrCode, msg, \ @@ -186,8 +167,7 @@ extern const char *GetAddressModeName(cl_addressing_mode mode); extern const char *GetQueuePropertyName(cl_command_queue_properties properties); extern const char *GetDeviceTypeName(cl_device_type type); -int check_functions_for_offline_compiler(const char *subtestname, - cl_device_id device); +bool check_functions_for_offline_compiler(const char *subtestname); cl_int OutputBuildLogs(cl_program program, cl_uint num_devices, cl_device_id *device_list); diff --git a/test_common/harness/fpcontrol.h b/test_common/harness/fpcontrol.h index 40826c5c..222aa2c4 100644 --- a/test_common/harness/fpcontrol.h +++ b/test_common/harness/fpcontrol.h @@ -16,6 +16,8 @@ #ifndef _fpcontrol_h #define _fpcontrol_h +#include <cstdint> + // In order to get tests for correctly rounded operations (e.g. multiply) to // work properly we need to be able to set the reference hardware to FTZ mode if // the device hardware is running in that mode. We have explored all other @@ -30,7 +32,11 @@ // that rounding mode. 
#if defined(__APPLE__) || defined(_MSC_VER) || defined(__linux__) \ || defined(__MINGW32__) +#ifdef _MSC_VER typedef int FPU_mode_type; +#else +typedef int64_t FPU_mode_type; +#endif #if defined(__i386__) || defined(__x86_64__) || defined(_MSC_VER) \ || defined(__MINGW32__) #include <xmmintrin.h> @@ -39,7 +45,7 @@ typedef int FPU_mode_type; extern __thread fpu_control_t fpu_control; #endif // Set the reference hardware floating point unit to FTZ mode -static inline void ForceFTZ(FPU_mode_type *mode) +inline void ForceFTZ(FPU_mode_type *mode) { #if defined(__i386__) || defined(__x86_64__) || defined(_MSC_VER) \ || defined(__MINGW32__) @@ -55,7 +61,7 @@ static inline void ForceFTZ(FPU_mode_type *mode) __asm__ volatile("fmxr fpscr, %0" ::"r"(fpscr | (1U << 24))); // Add 64 bit support #elif defined(__aarch64__) - unsigned fpscr; + uint64_t fpscr; __asm__ volatile("mrs %0, fpcr" : "=r"(fpscr)); *mode = fpscr; __asm__ volatile("msr fpcr, %0" ::"r"(fpscr | (1U << 24))); @@ -65,7 +71,7 @@ static inline void ForceFTZ(FPU_mode_type *mode) } // Disable the denorm flush to zero -static inline void DisableFTZ(FPU_mode_type *mode) +inline void DisableFTZ(FPU_mode_type *mode) { #if defined(__i386__) || defined(__x86_64__) || defined(_MSC_VER) \ || defined(__MINGW32__) @@ -81,7 +87,7 @@ static inline void DisableFTZ(FPU_mode_type *mode) __asm__ volatile("fmxr fpscr, %0" ::"r"(fpscr & ~(1U << 24))); // Add 64 bit support #elif defined(__aarch64__) - unsigned fpscr; + uint64_t fpscr; __asm__ volatile("mrs %0, fpcr" : "=r"(fpscr)); *mode = fpscr; __asm__ volatile("msr fpcr, %0" ::"r"(fpscr & ~(1U << 24))); @@ -91,7 +97,7 @@ static inline void DisableFTZ(FPU_mode_type *mode) } // Restore the reference hardware to floating point state indicated by *mode -static inline void RestoreFPState(FPU_mode_type *mode) +inline void RestoreFPState(FPU_mode_type *mode) { #if defined(__i386__) || defined(__x86_64__) || defined(_MSC_VER) \ || defined(__MINGW32__) diff --git 
a/test_common/harness/imageHelpers.cpp b/test_common/harness/imageHelpers.cpp index 72a2f0c0..f1694e88 100644 --- a/test_common/harness/imageHelpers.cpp +++ b/test_common/harness/imageHelpers.cpp @@ -23,6 +23,7 @@ #include <malloc.h> #endif #include <algorithm> +#include <cinttypes> #include <iterator> #if !defined(_WIN32) #include <cmath> @@ -421,7 +422,7 @@ void print_first_pixel_difference_error(size_t where, const char *sourcePixel, (int)thirdDim, (int)imageInfo->rowPitch, (int)imageInfo->rowPitch - (int)imageInfo->width * (int)pixel_size); - log_error("Failed at column: %ld ", where); + log_error("Failed at column: %zu ", where); switch (pixel_size) { @@ -454,7 +455,7 @@ void print_first_pixel_difference_error(size_t where, const char *sourcePixel, ((cl_ushort *)destPixel)[1], ((cl_ushort *)destPixel)[2]); break; case 8: - log_error("*0x%16.16llx vs. 0x%16.16llx\n", + log_error("*0x%16.16" PRIx64 " vs. 0x%16.16" PRIx64 "\n", ((cl_ulong *)sourcePixel)[0], ((cl_ulong *)destPixel)[0]); break; case 12: @@ -473,12 +474,53 @@ void print_first_pixel_difference_error(size_t where, const char *sourcePixel, ((cl_uint *)destPixel)[2], ((cl_uint *)destPixel)[3]); break; default: - log_error("Don't know how to print pixel size of %ld\n", + log_error("Don't know how to print pixel size of %zu\n", pixel_size); break; } } +size_t compare_scanlines(const image_descriptor *imageInfo, const char *aPtr, + const char *bPtr) +{ + size_t pixel_size = get_pixel_size(imageInfo->format); + size_t column; + + for (column = 0; column < imageInfo->width; column++) + { + switch (imageInfo->format->image_channel_data_type) + { + // If the data type is 101010, then ignore bits 31 and 32 when + // comparing the row + case CL_UNORM_INT_101010: { + cl_uint aPixel = *(cl_uint *)aPtr; + cl_uint bPixel = *(cl_uint *)bPtr; + if ((aPixel & 0x3fffffff) != (bPixel & 0x3fffffff)) + return column; + } + break; + + // If the data type is 555, ignore bit 15 when comparing the row + case 
CL_UNORM_SHORT_555: { + cl_ushort aPixel = *(cl_ushort *)aPtr; + cl_ushort bPixel = *(cl_ushort *)bPtr; + if ((aPixel & 0x7fff) != (bPixel & 0x7fff)) return column; + } + break; + + default: + if (memcmp(aPtr, bPtr, pixel_size) != 0) return column; + break; + } + + aPtr += pixel_size; + bPtr += pixel_size; + } + + // If we didn't find a difference, return the width of the image + return column; +} + int random_log_in_range(int minV, int maxV, MTdata d) { double v = log2(((double)genrand_int32(d) / (double)0xffffffff) + 1); @@ -554,8 +596,8 @@ struct AddressingTable { AddressingTable() { - ct_assert((CL_ADDRESS_MIRRORED_REPEAT - CL_ADDRESS_NONE < 6)); - ct_assert(CL_FILTER_NEAREST - CL_FILTER_LINEAR < 2); + static_assert(CL_ADDRESS_MIRRORED_REPEAT - CL_ADDRESS_NONE < 6, ""); + static_assert(CL_FILTER_NEAREST - CL_FILTER_LINEAR < 2, ""); mTable[CL_ADDRESS_NONE - CL_ADDRESS_NONE] [CL_FILTER_NEAREST - CL_FILTER_NEAREST] = NoAddressFn; @@ -649,9 +691,6 @@ int has_alpha(const cl_image_format *format) _b ^= _a; \ _a ^= _b; \ } while (0) -#ifndef MAX -#define MAX(_a, _b) ((_a) > (_b) ? (_a) : (_b)) -#endif void get_max_sizes( size_t *numberOfSizes, const int maxNumberOfSizes, size_t sizes[][3], @@ -719,7 +758,7 @@ void get_max_sizes( if (usingMaxPixelSizeBuffer || raw_pixel_size == 12) raw_pixel_size = 16; size_t max_pixels = (size_t)maxAllocSize / raw_pixel_size; - log_info("Maximums: [%ld x %ld x %ld], raw pixel size %lu bytes, " + log_info("Maximums: [%zu x %zu x %zu], raw pixel size %zu bytes, " "per-allocation limit %gMB.\n", maxWidth, maxHeight, isArray ? 
maxArraySize : maxDepth, raw_pixel_size, (maxAllocSize / (1024.0 * 1024.0))); @@ -760,10 +799,10 @@ void get_max_sizes( if (image_type == CL_MEM_OBJECT_IMAGE1D) { - double M = maximum_sizes[0]; + size_t M = maximum_sizes[0]; // Store the size - sizes[(*numberOfSizes)][0] = (size_t)M; + sizes[(*numberOfSizes)][0] = M; sizes[(*numberOfSizes)][1] = 1; sizes[(*numberOfSizes)][2] = 1; ++(*numberOfSizes); @@ -777,17 +816,17 @@ void get_max_sizes( { // Determine the size of the fixed dimension - double M = maximum_sizes[fixed_dim]; - double A = max_pixels; + size_t M = maximum_sizes[fixed_dim]; + size_t A = max_pixels; int x0_dim = !fixed_dim; - double x0 = + size_t x0 = static_cast<size_t>( fmin(fmin(other_sizes[(other_size++) % num_other_sizes], A / M), - maximum_sizes[x0_dim]); + maximum_sizes[x0_dim])); // Store the size - sizes[(*numberOfSizes)][fixed_dim] = (size_t)M; - sizes[(*numberOfSizes)][x0_dim] = (size_t)x0; + sizes[(*numberOfSizes)][fixed_dim] = M; + sizes[(*numberOfSizes)][x0_dim] = x0; sizes[(*numberOfSizes)][2] = 1; ++(*numberOfSizes); } @@ -802,16 +841,17 @@ void get_max_sizes( { // Determine the size of the fixed dimension - double M = maximum_sizes[fixed_dim]; - double A = max_pixels; + size_t M = maximum_sizes[fixed_dim]; + size_t A = max_pixels; // Find two other dimensions, x0 and x1 int x0_dim = (fixed_dim == 0) ? 1 : 0; int x1_dim = (fixed_dim == 2) ? 1 : 2; // Choose two other sizes for these dimensions - double x0 = fmin(fmin(A / M, maximum_sizes[x0_dim]), - other_sizes[(other_size++) % num_other_sizes]); + size_t x0 = static_cast<size_t>( + fmin(fmin(A / M, maximum_sizes[x0_dim]), + other_sizes[(other_size++) % num_other_sizes])); // GPUs have certain restrictions on minimum width (row alignment) // of images which has given us issues testing small widths in this // test (say we set width to 3 for testing, and compute size based @@ -820,8 +860,9 @@ void get_max_sizes( // width of 16 which doesnt fit in vram). 
For this purpose we are // not testing width < 16 for this test. if (x0_dim == 0 && x0 < 16) x0 = 16; - double x1 = fmin(fmin(A / M / x0, maximum_sizes[x1_dim]), - other_sizes[(other_size++) % num_other_sizes]); + size_t x1 = static_cast<size_t>( + fmin(fmin(A / M / x0, maximum_sizes[x1_dim]), + other_sizes[(other_size++) % num_other_sizes])); // Valid image sizes cannot be below 1. Due to the workaround for // the xo_dim where x0 is overidden to 16 there might not be enough @@ -834,9 +875,9 @@ void get_max_sizes( assert(x0 > 0 && M > 0); // Store the size - sizes[(*numberOfSizes)][fixed_dim] = (size_t)M; - sizes[(*numberOfSizes)][x0_dim] = (size_t)x0; - sizes[(*numberOfSizes)][x1_dim] = (size_t)x1; + sizes[(*numberOfSizes)][fixed_dim] = M; + sizes[(*numberOfSizes)][x0_dim] = x0; + sizes[(*numberOfSizes)][x1_dim] = x1; ++(*numberOfSizes); } } @@ -847,20 +888,20 @@ void get_max_sizes( switch (image_type) { case CL_MEM_OBJECT_IMAGE1D: - log_info(" size[%d] = [%ld] (%g MB image)\n", j, sizes[j][0], + log_info(" size[%d] = [%zu] (%g MB image)\n", j, sizes[j][0], raw_pixel_size * sizes[j][0] * sizes[j][1] * sizes[j][2] / (1024.0 * 1024.0)); break; case CL_MEM_OBJECT_IMAGE1D_ARRAY: case CL_MEM_OBJECT_IMAGE2D: - log_info(" size[%d] = [%ld %ld] (%g MB image)\n", j, + log_info(" size[%d] = [%zu %zu] (%g MB image)\n", j, sizes[j][0], sizes[j][1], raw_pixel_size * sizes[j][0] * sizes[j][1] * sizes[j][2] / (1024.0 * 1024.0)); break; case CL_MEM_OBJECT_IMAGE2D_ARRAY: case CL_MEM_OBJECT_IMAGE3D: - log_info(" size[%d] = [%ld %ld %ld] (%g MB image)\n", j, + log_info(" size[%d] = [%zu %zu %zu] (%g MB image)\n", j, sizes[j][0], sizes[j][1], sizes[j][2], raw_pixel_size * sizes[j][0] * sizes[j][1] * sizes[j][2] / (1024.0 * 1024.0)); @@ -884,6 +925,8 @@ float get_max_absolute_error(const cl_image_format *format, #ifdef CL_SFIXED14_APPLE case CL_SFIXED14_APPLE: return 0x1.0p-14f; #endif + case CL_UNORM_SHORT_555: + case CL_UNORM_SHORT_565: return 1.0f / 31.0f; default: return 0.0f; } } 
@@ -1124,12 +1167,13 @@ void escape_inf_nan_values(char *data, size_t allocSize) char *generate_random_image_data(image_descriptor *imageInfo, BufferOwningPtr<char> &P, MTdata d) { - size_t allocSize = get_image_size(imageInfo); + size_t allocSize = static_cast<size_t>(get_image_size(imageInfo)); size_t pixelRowBytes = imageInfo->width * get_pixel_size(imageInfo->format); size_t i; if (imageInfo->num_mip_levels > 1) - allocSize = compute_mipmapped_image_size(*imageInfo); + allocSize = + static_cast<size_t>(compute_mipmapped_image_size(*imageInfo)); #if defined(__APPLE__) char *data = NULL; @@ -1161,7 +1205,7 @@ char *generate_random_image_data(image_descriptor *imageInfo, if (data == NULL) { - log_error("ERROR: Unable to malloc %lu bytes for " + log_error("ERROR: Unable to malloc %zu bytes for " "generate_random_image_data\n", allocSize); return 0; @@ -1678,24 +1722,26 @@ bool get_integer_coords_offset(float x, float y, float z, float xAddressOffset, // At this point, we're dealing with non-normalized coordinates. - outX = adFn(floorf(x), width); + outX = adFn(static_cast<int>(floorf(x)), width); // 1D and 2D arrays require special care for the index coordinate: switch (imageInfo->type) { case CL_MEM_OBJECT_IMAGE1D_ARRAY: - outY = calculate_array_index(y, (float)imageInfo->arraySize - 1.0f); - outZ = 0.0f; /* don't care! */ + outY = static_cast<int>( + calculate_array_index(y, (float)imageInfo->arraySize - 1.0f)); + outZ = 0; /* don't care! 
*/ break; case CL_MEM_OBJECT_IMAGE2D_ARRAY: - outY = adFn(floorf(y), height); - outZ = calculate_array_index(z, (float)imageInfo->arraySize - 1.0f); + outY = adFn(static_cast<int>(floorf(y)), height); + outZ = static_cast<int>( + calculate_array_index(z, (float)imageInfo->arraySize - 1.0f)); break; default: // legacy path: - if (height != 0) outY = adFn(floorf(y), height); - if (depth != 0) outZ = adFn(floorf(z), depth); + if (height != 0) outY = adFn(static_cast<int>(floorf(y)), height); + if (depth != 0) outZ = adFn(static_cast<int>(floorf(z)), depth); } return !((int)refX == outX && (int)refY == outY && (int)refZ == outZ); @@ -1766,7 +1812,7 @@ static float unnormalize_coordinate(const char *name, float coord, float offset, switch (addressing_mode) { case CL_ADDRESS_REPEAT: - ret = RepeatNormalizedAddressFn(coord, extent); + ret = RepeatNormalizedAddressFn(coord, static_cast<size_t>(extent)); if (verbose) { @@ -1790,7 +1836,8 @@ static float unnormalize_coordinate(const char *name, float coord, float offset, break; case CL_ADDRESS_MIRRORED_REPEAT: - ret = MirroredRepeatNormalizedAddressFn(coord, extent); + ret = MirroredRepeatNormalizedAddressFn( + coord, static_cast<size_t>(extent)); if (verbose) { @@ -1948,7 +1995,7 @@ FloatPixel sample_image_pixel_float_offset( break; case CL_MEM_OBJECT_IMAGE1D: case CL_MEM_OBJECT_IMAGE1D_BUFFER: - log_info("Starting coordinate: %f\b", x); + log_info("Starting coordinate: %f\n", x); break; case CL_MEM_OBJECT_IMAGE2D: log_info("Starting coordinate: %f, %f\n", x, y); @@ -1968,13 +2015,13 @@ FloatPixel sample_image_pixel_float_offset( // coordinates. Note that the array cases again require special // care, per section 8.4 in the OpenCL 1.2 Specification. 
- ix = adFn(floorf(x), width_lod); + ix = adFn(static_cast<int>(floorf(x)), width_lod); switch (imageInfo->type) { case CL_MEM_OBJECT_IMAGE1D_ARRAY: - iy = - calculate_array_index(y, (float)(imageInfo->arraySize - 1)); + iy = static_cast<int>(calculate_array_index( + y, (float)(imageInfo->arraySize - 1))); iz = 0; if (verbose) { @@ -1982,18 +2029,18 @@ FloatPixel sample_image_pixel_float_offset( } break; case CL_MEM_OBJECT_IMAGE2D_ARRAY: - iy = adFn(floorf(y), height_lod); - iz = - calculate_array_index(z, (float)(imageInfo->arraySize - 1)); + iy = adFn(static_cast<int>(floorf(y)), height_lod); + iz = static_cast<int>(calculate_array_index( + z, (float)(imageInfo->arraySize - 1))); if (verbose) { log_info("\tArray index %f evaluates to %d\n", z, iz); } break; default: - iy = adFn(floorf(y), height_lod); + iy = adFn(static_cast<int>(floorf(y)), height_lod); if (depth_lod != 0) - iz = adFn(floorf(z), depth_lod); + iz = adFn(static_cast<int>(floorf(z)), depth_lod); else iz = 0; } @@ -2047,16 +2094,16 @@ FloatPixel sample_image_pixel_float_offset( height = 1; } - int x1 = adFn(floorf(x - 0.5f), width); + int x1 = adFn(static_cast<int>(floorf(x - 0.5f)), width); int y1 = 0; - int x2 = adFn(floorf(x - 0.5f) + 1, width); + int x2 = adFn(static_cast<int>(floorf(x - 0.5f) + 1), width); int y2 = 0; if ((imageInfo->type != CL_MEM_OBJECT_IMAGE1D) && (imageInfo->type != CL_MEM_OBJECT_IMAGE1D_ARRAY) && (imageInfo->type != CL_MEM_OBJECT_IMAGE1D_BUFFER)) { - y1 = adFn(floorf(y - 0.5f), height); - y2 = adFn(floorf(y - 0.5f) + 1, height); + y1 = adFn(static_cast<int>(floorf(y - 0.5f)), height); + y2 = adFn(static_cast<int>(floorf(y - 0.5f) + 1), height); } else { @@ -2147,12 +2194,12 @@ FloatPixel sample_image_pixel_float_offset( else { // 3D linear filtering - int x1 = adFn(floorf(x - 0.5f), width_lod); - int y1 = adFn(floorf(y - 0.5f), height_lod); - int z1 = adFn(floorf(z - 0.5f), depth_lod); - int x2 = adFn(floorf(x - 0.5f) + 1, width_lod); - int y2 = adFn(floorf(y - 0.5f) + 1, 
height_lod); - int z2 = adFn(floorf(z - 0.5f) + 1, depth_lod); + int x1 = adFn(static_cast<int>(floorf(x - 0.5f)), width_lod); + int y1 = adFn(static_cast<int>(floorf(y - 0.5f)), height_lod); + int z1 = adFn(static_cast<int>(floorf(z - 0.5f)), depth_lod); + int x2 = adFn(static_cast<int>(floorf(x - 0.5f) + 1), width_lod); + int y2 = adFn(static_cast<int>(floorf(y - 0.5f) + 1), height_lod); + int z2 = adFn(static_cast<int>(floorf(z - 0.5f) + 1), depth_lod); if (verbose) log_info("\tActual integer coords used (i = floor(x-.5)): " @@ -2580,11 +2627,11 @@ void pack_image_pixel(int *srcVector, const cl_image_format *imageFormat, } } -int round_to_even(float v) +cl_int round_to_even(float v) { // clamp overflow - if (v >= -(float)INT_MIN) return INT_MAX; - if (v <= (float)INT_MIN) return INT_MIN; + if (v >= -(float)CL_INT_MIN) return CL_INT_MAX; + if (v <= (float)CL_INT_MIN) return CL_INT_MIN; // round fractional values to integer value if (fabsf(v) < MAKE_HEX_FLOAT(0x1.0p23f, 0x1L, 23)) @@ -2596,7 +2643,7 @@ int round_to_even(float v) v -= magicVal; } - return (int)v; + return (cl_int)v; } void pack_image_pixel(float *srcVector, const cl_image_format *imageFormat, @@ -2721,10 +2768,7 @@ void pack_image_pixel(float *srcVector, const cl_image_format *imageFormat, case CL_SIGNED_INT32: { cl_int *ptr = (cl_int *)outData; for (unsigned int i = 0; i < channelCount; i++) - ptr[i] = (int)CONVERT_INT( - srcVector[i], MAKE_HEX_FLOAT(-0x1.0p31f, -1, 31), - MAKE_HEX_FLOAT(0x1.fffffep30f, 0x1fffffe, 30 - 23), - CL_INT_MAX); + ptr[i] = round_to_even(srcVector[i]); break; } case CL_UNSIGNED_INT8: { @@ -2888,26 +2932,25 @@ void pack_image_pixel_error(const float *srcVector, case CL_SIGNED_INT32: { const cl_int *ptr = (const cl_int *)results; for (unsigned int i = 0; i < channelCount; i++) - errors[i] = (cl_float)( - (cl_long)ptr[i] - - (cl_long)CONVERT_INT( - srcVector[i], MAKE_HEX_FLOAT(-0x1.0p31f, -1, 31), - MAKE_HEX_FLOAT(0x1.fffffep30f, 0x1fffffe, 30 - 23), - CL_INT_MAX)); + 
errors[i] = (cl_float)((cl_long)ptr[i] + - (cl_long)round_to_even(srcVector[i])); break; } case CL_UNSIGNED_INT8: { const cl_uchar *ptr = (const cl_uchar *)results; for (unsigned int i = 0; i < channelCount; i++) - errors[i] = (cl_int)ptr[i] - - (cl_int)CONVERT_UINT(srcVector[i], 255.f, CL_UCHAR_MAX); + errors[i] = static_cast<float>( + (cl_int)ptr[i] + - (cl_int)CONVERT_UINT(srcVector[i], 255.f, CL_UCHAR_MAX)); break; } case CL_UNSIGNED_INT16: { const cl_ushort *ptr = (const cl_ushort *)results; for (unsigned int i = 0; i < channelCount; i++) - errors[i] = (cl_int)ptr[i] - - (cl_int)CONVERT_UINT(srcVector[i], 32767.f, CL_USHRT_MAX); + errors[i] = static_cast<float>( + (cl_int)ptr[i] + - (cl_int)CONVERT_UINT(srcVector[i], 32767.f, + CL_USHRT_MAX)); break; } case CL_UNSIGNED_INT32: { @@ -3228,7 +3271,7 @@ char *create_random_image_data(ExplicitType dataType, if (data == NULL) { log_error( - "ERROR: Unable to malloc %lu bytes for create_random_image_data\n", + "ERROR: Unable to malloc %zu bytes for create_random_image_data\n", allocSize); return NULL; } @@ -3988,7 +4031,8 @@ bool is_image_format_required(cl_image_format format, cl_mem_flags flags, cl_uint compute_max_mip_levels(size_t width, size_t height, size_t depth) { - cl_uint retMaxMipLevels = 0, max_dim = 0; + cl_uint retMaxMipLevels = 0; + size_t max_dim = 0; max_dim = width; max_dim = height > max_dim ? height : max_dim; diff --git a/test_common/harness/imageHelpers.h b/test_common/harness/imageHelpers.h index 848ec655..f8ae4fb9 100644 --- a/test_common/harness/imageHelpers.h +++ b/test_common/harness/imageHelpers.h @@ -63,7 +63,7 @@ typedef struct bool normalized_coords; } image_sampler_data; -int round_to_even(float v); +cl_int round_to_even(float v); #define NORMALIZE(v, max) (v < 0 ? 0 : (v > 1.f ? max : round_to_even(v * max))) #define NORMALIZE_UNROUNDED(v, max) (v < 0 ? 0 : (v > 1.f ? 
max : v * max)) @@ -139,6 +139,9 @@ void print_first_pixel_difference_error(size_t where, const char *sourcePixel, image_descriptor *imageInfo, size_t y, size_t thirdDim); +size_t compare_scanlines(const image_descriptor *imageInfo, const char *aPtr, + const char *bPtr); + void get_max_sizes(size_t *numberOfSizes, const int maxNumberOfSizes, size_t sizes[][3], size_t maxWidth, size_t maxHeight, size_t maxDepth, size_t maxArraySize, @@ -479,6 +482,13 @@ void read_image_pixel(void *imageData, image_descriptor *imageInfo, int x, outData[2] = tempData[3]; outData[3] = tempData[0]; } + else if (format->image_channel_order == CL_ABGR) + { + outData[0] = tempData[3]; + outData[1] = tempData[2]; + outData[2] = tempData[1]; + outData[3] = tempData[0]; + } else if ((format->image_channel_order == CL_BGRA) || (format->image_channel_order == CL_sBGRA)) { diff --git a/test_common/harness/integer_ops_test_info.h b/test_common/harness/integer_ops_test_info.h new file mode 100644 index 00000000..ad7b303b --- /dev/null +++ b/test_common/harness/integer_ops_test_info.h @@ -0,0 +1,92 @@ +// +// Copyright (c) 2021 The Khronos Group Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// + +#ifndef INTEGER_OPS_TEST_INFO_H +#define INTEGER_OPS_TEST_INFO_H + +#include "conversions.h" +#include "testHarness.h" + +// TODO: expand usage to other tests. 
+ +template <typename T> struct TestInfo +{ +}; +template <> struct TestInfo<cl_char> +{ + static const ExplicitType explicitType = kChar; + static constexpr const char* deviceTypeName = "char"; + static constexpr const char* deviceTypeNameSigned = "char"; + static constexpr const char* deviceTypeNameUnsigned = "uchar"; +}; +template <> struct TestInfo<cl_uchar> +{ + static const ExplicitType explicitType = kUChar; + static constexpr const char* deviceTypeName = "uchar"; + static constexpr const char* deviceTypeNameSigned = "char"; + static constexpr const char* deviceTypeNameUnsigned = "uchar"; +}; +template <> struct TestInfo<cl_short> +{ + static const ExplicitType explicitType = kShort; + static constexpr const char* deviceTypeName = "short"; + static constexpr const char* deviceTypeNameSigned = "short"; + static constexpr const char* deviceTypeNameUnsigned = "ushort"; +}; +template <> struct TestInfo<cl_ushort> +{ + static const ExplicitType explicitType = kUShort; + static constexpr const char* deviceTypeName = "ushort"; + static constexpr const char* deviceTypeNameSigned = "short"; + static constexpr const char* deviceTypeNameUnsigned = "ushort"; +}; +template <> struct TestInfo<cl_int> +{ + static const ExplicitType explicitType = kInt; + static constexpr const char* deviceTypeName = "int"; + static constexpr const char* deviceTypeNameSigned = "int"; + static constexpr const char* deviceTypeNameUnsigned = "uint"; +}; +template <> struct TestInfo<cl_uint> +{ + static const ExplicitType explicitType = kUInt; + static constexpr const char* deviceTypeName = "uint"; + static constexpr const char* deviceTypeNameSigned = "int"; + static constexpr const char* deviceTypeNameUnsigned = "uint"; +}; +template <> struct TestInfo<cl_long> +{ + static const ExplicitType explicitType = kLong; + static constexpr const char* deviceTypeName = "long"; + static constexpr const char* deviceTypeNameSigned = "long"; + static constexpr const char* deviceTypeNameUnsigned = "ulong"; 
+}; +template <> struct TestInfo<cl_ulong> +{ + static const ExplicitType explicitType = kULong; + static constexpr const char* deviceTypeName = "ulong"; + static constexpr const char* deviceTypeNameSigned = "long"; + static constexpr const char* deviceTypeNameUnsigned = "ulong"; +}; + +template <typename T> +static void fill_vector_with_random_data(std::vector<T>& v) +{ + MTdataHolder d(gRandomSeed); + generate_random_data(TestInfo<T>::explicitType, v.size(), d, v.data()); +} + +#endif /* INTEGER_OPS_TEST_INFO_H */ diff --git a/test_common/harness/kernelHelpers.cpp b/test_common/harness/kernelHelpers.cpp index 95b9555e..13ebcbc9 100644 --- a/test_common/harness/kernelHelpers.cpp +++ b/test_common/harness/kernelHelpers.cpp @@ -530,7 +530,7 @@ static int get_offline_compiler_output( sourceFilename, outputFilename); if (error != CL_SUCCESS) return error; - // read output file + // open output file for reading ifs.open(outputFilename.c_str(), std::ios::binary); if (!ifs.good()) { @@ -540,6 +540,26 @@ static int get_offline_compiler_output( } } } + + if (compilationMode == kSpir_v && !gDisableSPIRVValidation) + { + std::string runString = gSPIRVValidator + " " + outputFilename; + + int returnCode = system(runString.c_str()); + if (returnCode == -1) + { + log_error("Error: failed to invoke SPIR-V validator\n"); + return CL_COMPILE_PROGRAM_FAILURE; + } + else if (returnCode != 0) + { + log_error( + "Failed to validate SPIR-V file %s: system() returned 0x%x\n", + outputFilename.c_str(), returnCode); + return CL_COMPILE_PROGRAM_FAILURE; + } + } + return CL_SUCCESS; } @@ -579,7 +599,7 @@ static int create_single_kernel_helper_create_program_offline( if (error != CL_SUCCESS) return error; ifs.seekg(0, ifs.end); - int length = ifs.tellg(); + size_t length = static_cast<size_t>(ifs.tellg()); ifs.seekg(0, ifs.beg); // treat modifiedProgram as input for clCreateProgramWithBinary @@ -1226,7 +1246,7 @@ int is_image_format_supported(cl_context context, cl_mem_flags flags, list = 
(cl_image_format *)malloc(count * sizeof(cl_image_format)); if (NULL == list) { - log_error("Error: unable to allocate %ld byte buffer for image format " + log_error("Error: unable to allocate %zu byte buffer for image format " "list at %s:%d (err = %d)\n", count * sizeof(cl_image_format), __FILE__, __LINE__, err); return 0; @@ -1641,8 +1661,10 @@ Version get_device_latest_cl_c_version(cl_device_id device) Version max_supported_cl_c_version{}; for (const auto &name_version : name_versions) { - Version current_version{ CL_VERSION_MAJOR(name_version.version), - CL_VERSION_MINOR(name_version.version) }; + Version current_version{ + static_cast<int>(CL_VERSION_MAJOR(name_version.version)), + static_cast<int>(CL_VERSION_MINOR(name_version.version)) + }; max_supported_cl_c_version = (current_version > max_supported_cl_c_version) ? current_version @@ -1687,7 +1709,7 @@ Version get_max_OpenCL_C_for_context(cl_context context) else { current_version = - (std::min)(device_version, current_version); + std::min(device_version, current_version); } }); return current_version; @@ -1725,8 +1747,10 @@ bool device_supports_cl_c_version(cl_device_id device, Version version) for (const auto &name_version : name_versions) { - Version current_version{ CL_VERSION_MAJOR(name_version.version), - CL_VERSION_MINOR(name_version.version) }; + Version current_version{ + static_cast<int>(CL_VERSION_MAJOR(name_version.version)), + static_cast<int>(CL_VERSION_MINOR(name_version.version)) + }; if (current_version == version) { return true; diff --git a/test_common/harness/mt19937.cpp b/test_common/harness/mt19937.cpp index c32d9bac..f5665deb 100644 --- a/test_common/harness/mt19937.cpp +++ b/test_common/harness/mt19937.cpp @@ -277,3 +277,5 @@ double genrand_res53(MTdata d) unsigned long a = genrand_int32(d) >> 5, b = genrand_int32(d) >> 6; return (a * 67108864.0 + b) * (1.0 / 9007199254740992.0); } + +bool genrand_bool(MTdata d) { return ((cl_uint)genrand_int32(d) & 1); } diff --git 
a/test_common/harness/mt19937.h b/test_common/harness/mt19937.h index 35c84933..447ca25a 100644 --- a/test_common/harness/mt19937.h +++ b/test_common/harness/mt19937.h @@ -90,24 +90,46 @@ double genrand_res53(MTdata /*data*/); #ifdef __cplusplus +/* generates a random boolean */ +bool genrand_bool(MTdata /*data*/); + #include <cassert> +#include <utility> -struct MTdataHolder -{ - MTdataHolder(cl_uint seed) +class MTdataHolder { +public: + MTdataHolder() = default; + explicit MTdataHolder(cl_uint seed) { m_mtdata = init_genrand(seed); assert(m_mtdata != nullptr); } - MTdataHolder(MTdata mtdata): m_mtdata(mtdata) {} + // Forbid copy. + MTdataHolder(const MTdataHolder&) = delete; + MTdataHolder& operator=(const MTdataHolder&) = delete; + + // Support move semantics. + MTdataHolder(MTdataHolder&& h) { std::swap(m_mtdata, h.m_mtdata); } + MTdataHolder& operator=(MTdataHolder&& h) + { + std::swap(m_mtdata, h.m_mtdata); + return *this; + } - ~MTdataHolder() { free_mtdata(m_mtdata); } + ~MTdataHolder() + { + if (m_mtdata) free_mtdata(m_mtdata); + } - operator MTdata() const { return m_mtdata; } + operator MTdata() const + { + assert(m_mtdata && "Object wasn't initialised"); + return m_mtdata; + } private: - MTdata m_mtdata; + MTdata m_mtdata = nullptr; }; #endif // #ifdef __cplusplus diff --git a/test_common/harness/os_helpers.cpp b/test_common/harness/os_helpers.cpp index cd350cf8..8fc91108 100644 --- a/test_common/harness/os_helpers.cpp +++ b/test_common/harness/os_helpers.cpp @@ -333,9 +333,6 @@ std::string exe_dir() #include <windows.h> -#if defined(max) -#undef max -#endif #include <cctype> #include <algorithm> @@ -404,7 +401,8 @@ std::string exe_path() for (;;) { - DWORD len = GetModuleFileNameA(NULL, &path.front(), path.size()); + DWORD len = GetModuleFileNameA(NULL, &path.front(), + static_cast<DWORD>(path.size())); if (len == 0) { diff --git a/test_common/harness/parseParameters.cpp b/test_common/harness/parseParameters.cpp index b2ab5b02..e946d744 100644 --- 
a/test_common/harness/parseParameters.cpp +++ b/test_common/harness/parseParameters.cpp @@ -28,11 +28,14 @@ using namespace std; #define DEFAULT_COMPILATION_PROGRAM "cl_offline_compiler" +#define DEFAULT_SPIRV_VALIDATOR "spirv-val" CompilationMode gCompilationMode = kOnline; CompilationCacheMode gCompilationCacheMode = kCacheModeCompileIfAbsent; std::string gCompilationCachePath = "."; std::string gCompilationProgram = DEFAULT_COMPILATION_PROGRAM; +bool gDisableSPIRVValidation = false; +std::string gSPIRVValidator = DEFAULT_SPIRV_VALIDATOR; void helpInfo() { @@ -62,7 +65,14 @@ For offline compilation (binary and spir-v modes) only: Path for offline compiler output and CL source --compilation-program <prog> Program to use for offline compilation, defaults to: - )" DEFAULT_COMPILATION_PROGRAM "\n\n"); + )" DEFAULT_COMPILATION_PROGRAM R"( + +For spir-v mode only: + --disable-spirv-validation + Disable validation of SPIR-V using the SPIR-V validator + --spirv-validator + Path for SPIR-V validator, defaults to )" DEFAULT_SPIRV_VALIDATOR "\n" + "\n"); } int parseCustomParam(int argc, const char *argv[], const char *ignore) @@ -198,6 +208,26 @@ int parseCustomParam(int argc, const char *argv[], const char *ignore) return -1; } } + else if (!strcmp(argv[i], "--disable-spirv-validation")) + { + delArg++; + gDisableSPIRVValidation = true; + } + else if (!strcmp(argv[i], "--spirv-validator")) + { + delArg++; + if ((i + 1) < argc) + { + delArg++; + gSPIRVValidator = argv[i + 1]; + } + else + { + log_error("Program argument for --spirv-validator was not " + "specified.\n"); + return -1; + } + } // cleaning parameters from argv tab for (int j = i; j < argc - delArg; j++) argv[j] = argv[j + delArg]; diff --git a/test_common/harness/parseParameters.h b/test_common/harness/parseParameters.h index b0f8328a..437e12f9 100644 --- a/test_common/harness/parseParameters.h +++ b/test_common/harness/parseParameters.h @@ -38,6 +38,8 @@ extern CompilationMode gCompilationMode; extern 
CompilationCacheMode gCompilationCacheMode; extern std::string gCompilationCachePath; extern std::string gCompilationProgram; +extern bool gDisableSPIRVValidation; +extern std::string gSPIRVValidator; extern int parseCustomParam(int argc, const char *argv[], const char *ignore = 0); diff --git a/test_common/harness/propertyHelpers.cpp b/test_common/harness/propertyHelpers.cpp index 3157ca80..6a10c076 100644 --- a/test_common/harness/propertyHelpers.cpp +++ b/test_common/harness/propertyHelpers.cpp @@ -19,6 +19,7 @@ #include <assert.h> #include <algorithm> +#include <cinttypes> #include <vector> static bool findProperty(const std::vector<cl_properties>& props, @@ -97,14 +98,15 @@ int compareProperties(const std::vector<cl_properties>& queried, if (!found) { - log_error("ERROR: expected property 0x%x not found!\n", + log_error("ERROR: expected property 0x%" PRIx64 " not found!\n", check_prop); return TEST_FAIL; } else if (check_value != queried_value) { - log_error("ERROR: mis-matched value for property 0x%x: wanted " - "0x%x, got 0x%x\n", + log_error("ERROR: mis-matched value for property 0x%" PRIx64 + ": wanted " + "0x%" PRIx64 ", got 0x%" PRIx64 "\n", check_prop, check_value, queried_value); return TEST_FAIL; } @@ -113,7 +115,7 @@ int compareProperties(const std::vector<cl_properties>& queried, if (queried.size() > check.size()) { log_error("ERROR: all properties found but there are extra " - "properties: expected %d, got %d.\n", + "properties: expected %zu, got %zu.\n", check.size(), queried.size()); return TEST_FAIL; } diff --git a/test_common/harness/rounding_mode.cpp b/test_common/harness/rounding_mode.cpp index 681ccdd8..1f531478 100644 --- a/test_common/harness/rounding_mode.cpp +++ b/test_common/harness/rounding_mode.cpp @@ -48,7 +48,7 @@ RoundingMode set_round(RoundingMode r, Type outType) const int *p = int_rounds; if (outType == kfloat || outType == kdouble) p = flt_rounds; - int fpscr = 0; + int64_t fpscr = 0; RoundingMode oldRound = get_round(); 
_FPU_GETCW(fpscr); @@ -59,7 +59,7 @@ RoundingMode set_round(RoundingMode r, Type outType) RoundingMode get_round(void) { - int fpscr; + int64_t fpscr; int oldRound; _FPU_GETCW(fpscr); @@ -203,13 +203,13 @@ void *FlushToZero(void) #if defined(__APPLE__) || defined(__linux__) || defined(_WIN32) #if defined(__i386__) || defined(__x86_64__) || defined(_MSC_VER) union { - int i; + unsigned int i; void *p; } u = { _mm_getcsr() }; _mm_setcsr(u.i | 0x8040); return u.p; #elif defined(__arm__) || defined(__aarch64__) - int fpscr; + int64_t fpscr; _FPU_GETCW(fpscr); _FPU_SETCW(fpscr | FPSCR_FZ); return NULL; @@ -239,7 +239,7 @@ void UnFlushToZero(void *p) } u = { p }; _mm_setcsr(u.i); #elif defined(__arm__) || defined(__aarch64__) - int fpscr; + int64_t fpscr; _FPU_GETCW(fpscr); _FPU_SETCW(fpscr & ~FPSCR_FZ); #elif defined(__PPC__) diff --git a/test_common/harness/rounding_mode.h b/test_common/harness/rounding_mode.h index 064a3a63..6f52f0a0 100644 --- a/test_common/harness/rounding_mode.h +++ b/test_common/harness/rounding_mode.h @@ -16,8 +16,6 @@ #ifndef __ROUNDING_MODE_H__ #define __ROUNDING_MODE_H__ -#pragma STDC FENV_ACCESS ON - #include "compat.h" #if (defined(_WIN32) && defined(_MSC_VER)) diff --git a/test_common/harness/testHarness.cpp b/test_common/harness/testHarness.cpp index 1aec3d07..a309f53d 100644 --- a/test_common/harness/testHarness.cpp +++ b/test_common/harness/testHarness.cpp @@ -60,6 +60,54 @@ bool gCoreILProgram = true; #define DEFAULT_NUM_ELEMENTS 0x4000 +static int saveResultsToJson(const char *suiteName, test_definition testList[], + unsigned char selectedTestList[], + test_status resultTestList[], int testNum) +{ + char *fileName = getenv("CL_CONFORMANCE_RESULTS_FILENAME"); + if (fileName == nullptr) + { + return EXIT_SUCCESS; + } + + FILE *file = fopen(fileName, "w"); + if (NULL == file) + { + log_error("ERROR: Failed to open '%s' for writing results.\n", + fileName); + return EXIT_FAILURE; + } + + const char *save_map[] = { "success", "failure" }; + 
const char *result_map[] = { "pass", "fail", "skip" }; + const char *linebreak[] = { "", ",\n" }; + int add_linebreak = 0; + + fprintf(file, "{\n"); + fprintf(file, "\t\"cmd\": \"%s\",\n", suiteName); + fprintf(file, "\t\"results\": {\n"); + + for (int i = 0; i < testNum; ++i) + { + if (selectedTestList[i]) + { + fprintf(file, "%s\t\t\"%s\": \"%s\"", linebreak[add_linebreak], + testList[i].name, result_map[(int)resultTestList[i]]); + add_linebreak = 1; + } + } + fprintf(file, "\n"); + + fprintf(file, "\t}\n"); + fprintf(file, "}\n"); + + int ret = fclose(file) ? EXIT_FAILURE : EXIT_SUCCESS; + + log_info("Saving results to %s: %s!\n", fileName, save_map[ret]); + + return ret; +} + int runTestHarness(int argc, const char *argv[], int testNum, test_definition testList[], int forceNoContextCreation, cl_command_queue_properties queueProps) @@ -68,19 +116,28 @@ int runTestHarness(int argc, const char *argv[], int testNum, forceNoContextCreation, queueProps, NULL); } -int skip_init_info(int count) +int suite_did_not_pass_init(const char *suiteName, test_status status, + int testNum, test_definition testList[]) { - log_info("Test skipped while initialization\n"); - log_info("SKIPPED %d of %d tests.\n", count, count); - return EXIT_SUCCESS; -} + std::vector<unsigned char> selectedTestList(testNum, 1); + std::vector<test_status> resultTestList(testNum, status); -int fail_init_info(int count) -{ - log_info("Test failed while initialization\n"); - log_info("FAILED %d of %d tests.\n", count, count); - return EXIT_FAILURE; + int ret = saveResultsToJson(suiteName, testList, selectedTestList.data(), + resultTestList.data(), testNum); + + log_info("Test %s while initialization\n", + status == TEST_SKIP ? "skipped" : "failed"); + log_info("%s %d of %d tests.\n", status == TEST_SKIP ? "SKIPPED" : "FAILED", + testNum, testNum); + + if (ret != EXIT_SUCCESS) + { + return ret; + } + + return status == TEST_SKIP ? 
EXIT_SUCCESS : EXIT_FAILURE; } + void version_expected_info(const char *test_name, const char *api_name, const char *expected_version, const char *device_version) @@ -470,6 +527,7 @@ int runTestHarnessWithCheck(int argc, const char *argv[], int testNum, log_error("Invalid device address bit size returned by device.\n"); return EXIT_FAILURE; } + const char *suiteName = argv[0]; if (gCompilationMode == kSpir_v) { test_status spirv_readiness = check_spirv_compilation_readiness(device); @@ -478,9 +536,15 @@ int runTestHarnessWithCheck(int argc, const char *argv[], int testNum, switch (spirv_readiness) { case TEST_PASS: break; - case TEST_FAIL: return fail_init_info(testNum); - case TEST_SKIP: return skip_init_info(testNum); - case TEST_SKIPPED_ITSELF: return skip_init_info(testNum); + case TEST_FAIL: + return suite_did_not_pass_init(suiteName, TEST_FAIL, + testNum, testList); + case TEST_SKIP: + return suite_did_not_pass_init(suiteName, TEST_SKIP, + testNum, testList); + case TEST_SKIPPED_ITSELF: + return suite_did_not_pass_init(suiteName, TEST_SKIP, + testNum, testList); } } } @@ -492,9 +556,15 @@ int runTestHarnessWithCheck(int argc, const char *argv[], int testNum, switch (status) { case TEST_PASS: break; - case TEST_FAIL: return fail_init_info(testNum); - case TEST_SKIP: return skip_init_info(testNum); - case TEST_SKIPPED_ITSELF: return skip_init_info(testNum); + case TEST_FAIL: + return suite_did_not_pass_init(suiteName, TEST_FAIL, testNum, + testList); + case TEST_SKIP: + return suite_did_not_pass_init(suiteName, TEST_SKIP, testNum, + testList); + case TEST_SKIPPED_ITSELF: + return suite_did_not_pass_init(suiteName, TEST_SKIP, testNum, + testList); } } @@ -574,49 +644,6 @@ static int find_matching_tests(test_definition testList[], return EXIT_SUCCESS; } -static int saveResultsToJson(const char *fileName, const char *suiteName, - test_definition testList[], - unsigned char selectedTestList[], - test_status resultTestList[], int testNum) -{ - FILE *file = 
fopen(fileName, "w"); - if (NULL == file) - { - log_error("ERROR: Failed to open '%s' for writing results.\n", - fileName); - return EXIT_FAILURE; - } - - const char *save_map[] = { "success", "failure" }; - const char *result_map[] = { "pass", "fail", "skip" }; - const char *linebreak[] = { "", ",\n" }; - int add_linebreak = 0; - - fprintf(file, "{\n"); - fprintf(file, "\t\"cmd\": \"%s\",\n", suiteName); - fprintf(file, "\t\"results\": {\n"); - - for (int i = 0; i < testNum; ++i) - { - if (selectedTestList[i]) - { - fprintf(file, "%s\t\t\"%s\": \"%s\"", linebreak[add_linebreak], - testList[i].name, result_map[(int)resultTestList[i]]); - add_linebreak = 1; - } - } - fprintf(file, "\n"); - - fprintf(file, "\t}\n"); - fprintf(file, "}\n"); - - int ret = fclose(file) ? 1 : 0; - - log_info("Saving results to %s: %s!\n", fileName, save_map[ret]); - - return ret; -} - static void print_results(int failed, int count, const char *name) { if (count < failed) @@ -658,7 +685,6 @@ int parseAndCallCommandLineTests(int argc, const char *argv[], int ret = EXIT_SUCCESS; unsigned char *selectedTestList = (unsigned char *)calloc(testNum, 1); - test_status *resultTestList = NULL; if (argc == 1) { @@ -697,24 +723,19 @@ int parseAndCallCommandLineTests(int argc, const char *argv[], if (ret == EXIT_SUCCESS) { - resultTestList = - (test_status *)calloc(testNum, sizeof(*resultTestList)); + std::vector<test_status> resultTestList(testNum, TEST_PASS); - callTestFunctions(testList, selectedTestList, resultTestList, testNum, - device, forceNoContextCreation, num_elements, + callTestFunctions(testList, selectedTestList, resultTestList.data(), + testNum, device, forceNoContextCreation, num_elements, queueProps); print_results(gFailCount, gTestCount, "sub-test"); print_results(gTestsFailed, gTestsFailed + gTestsPassed, "test"); - char *filename = getenv("CL_CONFORMANCE_RESULTS_FILENAME"); - if (filename != NULL) - { - ret = saveResultsToJson(filename, argv[0], testList, - selectedTestList, 
resultTestList, testNum); - } + ret = saveResultsToJson(argv[0], testList, selectedTestList, + resultTestList.data(), testNum); - if (std::any_of(resultTestList, resultTestList + testNum, + if (std::any_of(resultTestList.begin(), resultTestList.end(), [](test_status result) { switch (result) { @@ -730,7 +751,6 @@ int parseAndCallCommandLineTests(int argc, const char *argv[], } free(selectedTestList); - free(resultTestList); return ret; } @@ -783,6 +803,14 @@ test_status callSingleTestFunction(test_definition test, return TEST_SKIP; } + if (!check_functions_for_offline_compiler(test.name)) + { + log_info("Subtest %s tests is not supported in offline compiler " + "execution path!\n", + test.name); + return TEST_SKIP; + } + /* Create a context to work with, unless we're told not to */ if (!forceNoContextCreation) { @@ -812,14 +840,12 @@ test_status callSingleTestFunction(test_definition test, if (queue == NULL) { print_error(error, "Unable to create testing command queue"); + clReleaseContext(context); return TEST_FAIL; } } /* Run the test and print the result */ - error = check_functions_for_offline_compiler(test.name, deviceToUse); - test_missing_support_offline_cmpiler(error, test.name); - if (test.func == NULL) { // Skip unimplemented test, can happen when all of the tests are @@ -1172,7 +1198,7 @@ cl_platform_id getPlatformFromDevice(cl_device_id deviceID) void PrintArch(void) { - vlog("sizeof( void*) = %ld\n", sizeof(void *)); + vlog("sizeof( void*) = %zu\n", sizeof(void *)); #if defined(__ppc__) vlog("ARCH:\tppc\n"); #elif defined(__ppc64__) diff --git a/test_common/harness/threadTesting.cpp b/test_common/harness/threadTesting.cpp deleted file mode 100644 index 875ee59b..00000000 --- a/test_common/harness/threadTesting.cpp +++ /dev/null @@ -1,98 +0,0 @@ -// -// Copyright (c) 2017 The Khronos Group Inc. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// -#include "compat.h" -#include "threadTesting.h" -#include "errorHelpers.h" -#include <stdio.h> -#include <string.h> - -#if !defined(_WIN32) -#include <pthread.h> -#endif - -#if 0 // Disabed for now - -typedef struct -{ - basefn mFunction; - cl_device_id mDevice; - cl_context mContext; - int mNumElements; -} TestFnArgs; - -//////////////////////////////////////////////////////////////////////////////// -// Thread-based testing. Spawns a new thread to run the given test function, -// then waits for it to complete. The entire idea is that, if the thread crashes, -// we can catch it and report it as a failure instead of crashing the entire suite -//////////////////////////////////////////////////////////////////////////////// - -void *test_thread_wrapper( void *data ) -{ - TestFnArgs *args; - int retVal; - cl_context context; - - args = (TestFnArgs *)data; - - /* Create a new context to use (contexts can't cross threads) */ - context = clCreateContext(NULL, args->mDeviceGroup); - if( context == NULL ) - { - log_error("clCreateContext failed for new thread\n"); - return (void *)(-1); - } - - /* Call function */ - retVal = args->mFunction( args->mDeviceGroup, args->mDevice, context, args->mNumElements ); - - clReleaseContext( context ); - - return (void *)retVal; -} - -int test_threaded_function( basefn fnToTest, cl_device_id device, cl_context context, cl_command_queue queue, int numElements ) -{ - int error; - pthread_t threadHdl; - void *retVal; - TestFnArgs args; - - - args.mFunction = fnToTest; - args.mDeviceGroup = deviceGroup; - args.mDevice 
= device; - args.mContext = context; - args.mNumElements = numElements; - - - error = pthread_create( &threadHdl, NULL, test_thread_wrapper, (void *)&args ); - if( error != 0 ) - { - log_error( "ERROR: Unable to create thread for testing!\n" ); - return -1; - } - - /* Thread has been started, now just wait for it to complete (or crash) */ - error = pthread_join( threadHdl, &retVal ); - if( error != 0 ) - { - log_error( "ERROR: Unable to join testing thread!\n" ); - return -1; - } - - return (int)((intptr_t)retVal); -} -#endif diff --git a/test_common/harness/threadTesting.h b/test_common/harness/threadTesting.h index 765eabcc..2f3c1873 100644 --- a/test_common/harness/threadTesting.h +++ b/test_common/harness/threadTesting.h @@ -24,8 +24,5 @@ typedef int (*basefn)(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements); -extern int test_threaded_function(basefn fnToTest, cl_device_id device, - cl_context context, cl_command_queue queue, - int numElements); -#endif // _threadTesting_h +#endif // _threadTesting_h
\ No newline at end of file diff --git a/test_common/harness/typeWrappers.h b/test_common/harness/typeWrappers.h index 9a58a9d2..50c7c938 100644 --- a/test_common/harness/typeWrappers.h +++ b/test_common/harness/typeWrappers.h @@ -16,122 +16,134 @@ #ifndef _typeWrappers_h #define _typeWrappers_h -#include <stdio.h> -#include <stdlib.h> - #if !defined(_WIN32) #include <sys/mman.h> #endif #include "compat.h" -#include <stdio.h> #include "mt19937.h" #include "errorHelpers.h" #include "kernelHelpers.h" -/* cl_context wrapper */ +#include <cstdlib> +#include <type_traits> -class clContextWrapper { -public: - clContextWrapper() { mContext = NULL; } - clContextWrapper(cl_context program) { mContext = program; } - ~clContextWrapper() - { - if (mContext != NULL) clReleaseContext(mContext); - } +namespace wrapper_details { + +// clRetain*() and clRelease*() functions share the same type. +template <typename T> // T should be cl_context, cl_program, ... +using RetainReleaseType = cl_int CL_API_CALL(T); - clContextWrapper &operator=(const cl_context &rhs) +// A generic wrapper class that follows OpenCL retain/release semantics. +// +// This Wrapper class implement copy and move semantics, which makes it +// compatible with standard containers for example. +// +// Template parameters: +// - T is the cl_* type (e.g. cl_context, cl_program, ...) +// - Retain is the clRetain* function (e.g. clRetainContext, ...) +// - Release is the clRelease* function (e.g. clReleaseContext, ...) 
+template <typename T, RetainReleaseType<T> Retain, RetainReleaseType<T> Release> +class Wrapper { + static_assert(std::is_pointer<T>::value, "T should be a pointer type."); + T object = nullptr; + + void retain() { - mContext = rhs; - return *this; + if (!object) return; + + auto err = Retain(object); + if (err != CL_SUCCESS) + { + print_error(err, "clRetain*() failed"); + std::abort(); + } } - operator cl_context() const { return mContext; } - cl_context *operator&() { return &mContext; } + void release() + { + if (!object) return; - bool operator==(const cl_context &rhs) { return mContext == rhs; } + auto err = Release(object); + if (err != CL_SUCCESS) + { + print_error(err, "clRelease*() failed"); + std::abort(); + } + } -protected: - cl_context mContext; -}; +public: + Wrapper() = default; -/* cl_program wrapper */ + // On initialisation, assume the object has a refcount of one. + Wrapper(T object): object(object) {} -class clProgramWrapper { -public: - clProgramWrapper() { mProgram = NULL; } - clProgramWrapper(cl_program program) { mProgram = program; } - ~clProgramWrapper() + // On assignment, assume the object has a refcount of one. + Wrapper &operator=(T rhs) { - if (mProgram != NULL) clReleaseProgram(mProgram); + reset(rhs); + return *this; } - clProgramWrapper &operator=(const cl_program &rhs) + // Copy semantics, increase retain count. + Wrapper(Wrapper const &w) { *this = w; } + Wrapper &operator=(Wrapper const &w) { - mProgram = rhs; + reset(w.object); + retain(); return *this; } - operator cl_program() const { return mProgram; } - - cl_program *operator&() { return &mProgram; } - bool operator==(const cl_program &rhs) { return mProgram == rhs; } - -protected: - cl_program mProgram; -}; - -/* cl_kernel wrapper */ - -class clKernelWrapper { -public: - clKernelWrapper() { mKernel = NULL; } - clKernelWrapper(cl_kernel kernel) { mKernel = kernel; } - ~clKernelWrapper() + // Move semantics, directly take ownership. 
+ Wrapper(Wrapper &&w) { *this = std::move(w); } + Wrapper &operator=(Wrapper &&w) { - if (mKernel != NULL) clReleaseKernel(mKernel); + reset(w.object); + w.object = nullptr; + return *this; } - clKernelWrapper &operator=(const cl_kernel &rhs) + ~Wrapper() { reset(); } + + // Release the existing object, if any, and own the new one, if any. + void reset(T new_object = nullptr) { - mKernel = rhs; - return *this; + release(); + object = new_object; } - operator cl_kernel() const { return mKernel; } - cl_kernel *operator&() { return &mKernel; } + operator T() const { return object; } - bool operator==(const cl_kernel &rhs) { return mKernel == rhs; } - -protected: - cl_kernel mKernel; + // Ideally this function should not exist as it breaks encapsulation by + // allowing external mutation of the Wrapper internal state. However, too + // much code currently relies on this. For example, instead of using T* as + // output parameters, existing code can be updated to use Wrapper& instead. + T *operator&() { return &object; } }; -/* cl_mem (stream) wrapper */ +} // namespace wrapper_details -class clMemWrapper { -public: - clMemWrapper() { mMem = NULL; } - clMemWrapper(cl_mem mem) { mMem = mem; } - ~clMemWrapper() - { - if (mMem != NULL) clReleaseMemObject(mMem); - } +using clContextWrapper = + wrapper_details::Wrapper<cl_context, clRetainContext, clReleaseContext>; - clMemWrapper &operator=(const cl_mem &rhs) - { - mMem = rhs; - return *this; - } - operator cl_mem() const { return mMem; } +using clProgramWrapper = + wrapper_details::Wrapper<cl_program, clRetainProgram, clReleaseProgram>; - cl_mem *operator&() { return &mMem; } +using clKernelWrapper = + wrapper_details::Wrapper<cl_kernel, clRetainKernel, clReleaseKernel>; - bool operator==(const cl_mem &rhs) { return mMem == rhs; } +using clMemWrapper = + wrapper_details::Wrapper<cl_mem, clRetainMemObject, clReleaseMemObject>; -protected: - cl_mem mMem; -}; +using clCommandQueueWrapper = + 
wrapper_details::Wrapper<cl_command_queue, clRetainCommandQueue, + clReleaseCommandQueue>; + +using clSamplerWrapper = + wrapper_details::Wrapper<cl_sampler, clRetainSampler, clReleaseSampler>; + +using clEventWrapper = + wrapper_details::Wrapper<cl_event, clRetainEvent, clReleaseEvent>; class clProtectedImage { public: @@ -183,92 +195,12 @@ public: cl_mem *operator&() { return ℑ } - bool operator==(const cl_mem &rhs) { return image == rhs; } - protected: void *backingStore; size_t backingStoreSize; cl_mem image; }; -/* cl_command_queue wrapper */ -class clCommandQueueWrapper { -public: - clCommandQueueWrapper() { mMem = NULL; } - clCommandQueueWrapper(cl_command_queue mem) { mMem = mem; } - ~clCommandQueueWrapper() - { - if (mMem != NULL) - { - clReleaseCommandQueue(mMem); - } - } - - clCommandQueueWrapper &operator=(const cl_command_queue &rhs) - { - mMem = rhs; - return *this; - } - operator cl_command_queue() const { return mMem; } - - cl_command_queue *operator&() { return &mMem; } - - bool operator==(const cl_command_queue &rhs) { return mMem == rhs; } - -protected: - cl_command_queue mMem; -}; - -/* cl_sampler wrapper */ -class clSamplerWrapper { -public: - clSamplerWrapper() { mMem = NULL; } - clSamplerWrapper(cl_sampler mem) { mMem = mem; } - ~clSamplerWrapper() - { - if (mMem != NULL) clReleaseSampler(mMem); - } - - clSamplerWrapper &operator=(const cl_sampler &rhs) - { - mMem = rhs; - return *this; - } - operator cl_sampler() const { return mMem; } - - cl_sampler *operator&() { return &mMem; } - - bool operator==(const cl_sampler &rhs) { return mMem == rhs; } - -protected: - cl_sampler mMem; -}; - -/* cl_event wrapper */ -class clEventWrapper { -public: - clEventWrapper() { mMem = NULL; } - clEventWrapper(cl_event mem) { mMem = mem; } - ~clEventWrapper() - { - if (mMem != NULL) clReleaseEvent(mMem); - } - - clEventWrapper &operator=(const cl_event &rhs) - { - mMem = rhs; - return *this; - } - operator cl_event() const { return mMem; } - - cl_event 
*operator&() { return &mMem; } - - bool operator==(const cl_event &rhs) { return mMem == rhs; } - -protected: - cl_event mMem; -}; - /* Generic protected memory buffer, for verifying access within bounds */ class clProtectedArray { public: diff --git a/test_conformance/CMakeLists.txt b/test_conformance/CMakeLists.txt index 363ece86..f9514f1e 100644 --- a/test_conformance/CMakeLists.txt +++ b/test_conformance/CMakeLists.txt @@ -52,6 +52,7 @@ add_subdirectory( pipes ) add_subdirectory( device_timer ) add_subdirectory( spirv_new ) add_subdirectory( spir ) +add_subdirectory( vulkan ) file(GLOB CSV_FILES "opencl_conformance_tests_*.csv") diff --git a/test_conformance/SVM/test_byte_granularity.cpp b/test_conformance/SVM/test_byte_granularity.cpp index 403528b9..6dbb3649 100644 --- a/test_conformance/SVM/test_byte_granularity.cpp +++ b/test_conformance/SVM/test_byte_granularity.cpp @@ -58,7 +58,6 @@ int test_svm_byte_granularity(cl_device_id deviceID, cl_context c, cl_command_qu cl_uint num_devices = 0; cl_int err = CL_SUCCESS; - cl_int rval = CL_SUCCESS; err = create_cl_objects(deviceID, &byte_manipulation_kernels[0], &context, &program, &queues[0], &num_devices, CL_DEVICE_SVM_FINE_GRAIN_BUFFER); if(err == 1) return 0; // no devices capable of requested SVM level, so don't execute but count test as passing. 
diff --git a/test_conformance/SVM/test_cross_buffer_pointers.cpp b/test_conformance/SVM/test_cross_buffer_pointers.cpp index c1caebb9..2baa7ad7 100644 --- a/test_conformance/SVM/test_cross_buffer_pointers.cpp +++ b/test_conformance/SVM/test_cross_buffer_pointers.cpp @@ -162,7 +162,8 @@ int test_svm_cross_buffer_pointers_coarse_grain(cl_device_id deviceID, cl_contex test_error(error, "clCreateBuffer failed."); // this buffer holds the index into the nodes buffer that is used for node allocation - clMemWrapper allocator = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(cl_int), NULL, &error); + clMemWrapper allocator = clCreateBuffer(context, CL_MEM_READ_WRITE, + sizeof(size_t), NULL, &error); test_error(error, "clCreateBuffer failed."); // this buffer holds the count of correct nodes which is computed by the verify kernel. diff --git a/test_conformance/SVM/test_migrate.cpp b/test_conformance/SVM/test_migrate.cpp index 2a1ce051..f624bcd9 100644 --- a/test_conformance/SVM/test_migrate.cpp +++ b/test_conformance/SVM/test_migrate.cpp @@ -78,9 +78,6 @@ int test_svm_migrate(cl_device_id deviceID, cl_context c, cl_command_queue queue cl_uint amem[GLOBAL_SIZE]; cl_uint bmem[GLOBAL_SIZE]; cl_uint cmem[GLOBAL_SIZE]; - cl_uint ramem[GLOBAL_SIZE]; - cl_uint rbmem[GLOBAL_SIZE]; - cl_uint rcmem[GLOBAL_SIZE]; cl_event evs[20]; const size_t global_size = GLOBAL_SIZE; diff --git a/test_conformance/SVM/test_shared_address_space_coarse_grain.cpp b/test_conformance/SVM/test_shared_address_space_coarse_grain.cpp index f26981bc..12358167 100644 --- a/test_conformance/SVM/test_shared_address_space_coarse_grain.cpp +++ b/test_conformance/SVM/test_shared_address_space_coarse_grain.cpp @@ -98,7 +98,9 @@ cl_int create_linked_lists_on_device(int ci, cl_command_queue cmdq, cl_mem alloc cl_int error = CL_SUCCESS; log_info("SVM: creating linked list on device: %d ", ci); - size_t *pAllocator = (size_t*) clEnqueueMapBuffer(cmdq, allocator, CL_TRUE, CL_MAP_READ | CL_MAP_WRITE, 0, sizeof(cl_int), 
0, NULL,NULL, &error); + size_t *pAllocator = (size_t *)clEnqueueMapBuffer( + cmdq, allocator, CL_TRUE, CL_MAP_READ | CL_MAP_WRITE, 0, sizeof(size_t), + 0, NULL, NULL, &error); test_error2(error, pAllocator, "clEnqueueMapBuffer failed"); // reset allocator index *pAllocator = numLists; // the first numLists elements of the nodes array are already allocated (they hold the head of each list). @@ -206,7 +208,9 @@ int shared_address_space_coarse_grain(cl_device_id deviceID, cl_context context2 } // this buffer holds an index into the nodes buffer, it is used for node allocation - clMemWrapper allocator = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(cl_int), NULL, &error); + clMemWrapper allocator = clCreateBuffer(context, CL_MEM_READ_WRITE, + sizeof(size_t), NULL, &error); + test_error(error, "clCreateBuffer failed."); error = clGetMemObjectInfo(allocator, CL_MEM_USES_SVM_POINTER, sizeof(cl_bool), &usesSVMpointer, 0); diff --git a/test_conformance/SVM/test_shared_address_space_fine_grain.cpp b/test_conformance/SVM/test_shared_address_space_fine_grain.cpp index a98a880c..3350972e 100644 --- a/test_conformance/SVM/test_shared_address_space_fine_grain.cpp +++ b/test_conformance/SVM/test_shared_address_space_fine_grain.cpp @@ -47,7 +47,7 @@ int test_svm_shared_address_space_fine_grain(cl_device_id deviceID, cl_context c test_error2(error, pNodes, "malloc failed"); // this allocation holds an index into the nodes buffer, it is used for node allocation - size_t* pAllocator = (size_t*) align_malloc(sizeof(cl_int), 128); + size_t *pAllocator = (size_t *)align_malloc(sizeof(size_t), 128); test_error2(error, pAllocator, "malloc failed"); // this allocation holds the count of correct nodes, which is computed by the verify kernel. 
diff --git a/test_conformance/SVM/test_shared_sub_buffers.cpp b/test_conformance/SVM/test_shared_sub_buffers.cpp index a79484c9..2532886e 100644 --- a/test_conformance/SVM/test_shared_sub_buffers.cpp +++ b/test_conformance/SVM/test_shared_sub_buffers.cpp @@ -182,7 +182,8 @@ int test_svm_shared_sub_buffers(cl_device_id deviceID, cl_context context2, cl_c // this buffer holds the index into the nodes buffer that is used for node allocation - clMemWrapper allocator = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(cl_int), NULL, &error); + clMemWrapper allocator = clCreateBuffer(context, CL_MEM_READ_WRITE, + sizeof(size_t), NULL, &error); test_error(error, "clCreateBuffer failed."); // this buffer holds the count of correct nodes which is computed by the verify kernel. diff --git a/test_conformance/allocations/allocation_fill.cpp b/test_conformance/allocations/allocation_fill.cpp index a7558942..b4ea3798 100644 --- a/test_conformance/allocations/allocation_fill.cpp +++ b/test_conformance/allocations/allocation_fill.cpp @@ -200,8 +200,10 @@ int fill_image_with_data(cl_context context, cl_device_id device_id, cl_command_ result = clFinish(*queue); if (result != SUCCEEDED) { - print_error(error, "clFinish failed after successful enquing filling buffer with data."); - return result; + print_error(error, + "clFinish failed after successful enqueuing filling " + "buffer with data."); + return result; } } else { error = clEnqueueWriteImage(*queue, mem, CL_FALSE, origin, region, 0, 0, data, 0, NULL, &event); diff --git a/test_conformance/allocations/allocation_functions.cpp b/test_conformance/allocations/allocation_functions.cpp index 7182c727..827ee104 100644 --- a/test_conformance/allocations/allocation_functions.cpp +++ b/test_conformance/allocations/allocation_functions.cpp @@ -37,8 +37,8 @@ int find_good_image_size(cl_device_id device_id, size_t size_to_allocate, size_t } if (size_to_allocate == 0) { - log_error("Trying to allcoate a zero sized image.\n"); - return 
FAILED_ABORT; + log_error("Trying to allocate a zero sized image.\n"); + return FAILED_ABORT; } error = clGetDeviceInfo( device_id, CL_DEVICE_IMAGE2D_MAX_WIDTH, sizeof( max_width ), &max_width, NULL ); diff --git a/test_conformance/allocations/main.cpp b/test_conformance/allocations/main.cpp index 0dec4c6d..43e81277 100644 --- a/test_conformance/allocations/main.cpp +++ b/test_conformance/allocations/main.cpp @@ -112,6 +112,8 @@ int doTest( cl_device_id device, cl_context context, cl_command_queue queue, All int number_of_mems_used; cl_ulong max_individual_allocation_size = g_max_individual_allocation_size; cl_ulong global_mem_size = g_global_mem_size ; + const bool allocate_image = + (alloc_type != BUFFER) && (alloc_type != BUFFER_NON_BLOCKING); static const char* alloc_description[] = { "buffer(s)", @@ -123,7 +125,7 @@ int doTest( cl_device_id device, cl_context context, cl_command_queue queue, All }; // Skip image tests if we don't support images on the device - if( alloc_type > BUFFER && checkForImageSupport( device ) ) + if (allocate_image && checkForImageSupport(device)) { log_info( "Can not test image allocation because device does not support images.\n" ); return 0; @@ -132,7 +134,7 @@ int doTest( cl_device_id device, cl_context context, cl_command_queue queue, All // This section was added in order to fix a bug in the test // If CL_DEVICE_MAX_MEM_ALLOC_SIZE is much grater than CL_DEVICE_IMAGE2D_MAX_WIDTH * CL_DEVICE_IMAGE2D_MAX_HEIGHT // The test will fail in image allocations as the size requested for the allocation will be much grater than the maximum size allowed for image - if( ( alloc_type != BUFFER ) && ( alloc_type != BUFFER_NON_BLOCKING ) ) + if (allocate_image) { size_t max_width, max_height; diff --git a/test_conformance/api/negative_platform.cpp b/test_conformance/api/negative_platform.cpp index 7d9de5df..861d4748 100644 --- a/test_conformance/api/negative_platform.cpp +++ b/test_conformance/api/negative_platform.cpp @@ -42,18 +42,9 @@ int 
test_negative_get_platform_info(cl_device_id deviceID, cl_context context, { cl_platform_id platform = getPlatformFromDevice(deviceID); - cl_int err = - clGetPlatformInfo(reinterpret_cast<cl_platform_id>(deviceID), - CL_PLATFORM_VERSION, sizeof(char*), nullptr, nullptr); - test_failure_error_ret( - err, CL_INVALID_PLATFORM, - "clGetPlatformInfo should return CL_INVALID_PLATFORM when: \"platform " - "is not a valid platform\" using a valid object which is NOT a " - "platform", - TEST_FAIL); - constexpr cl_platform_info INVALID_PARAM_VALUE = 0; - err = clGetPlatformInfo(platform, INVALID_PARAM_VALUE, 0, nullptr, nullptr); + cl_int err = + clGetPlatformInfo(platform, INVALID_PARAM_VALUE, 0, nullptr, nullptr); test_failure_error_ret( err, CL_INVALID_VALUE, "clGetPlatformInfo should return CL_INVALID_VALUE when: \"param_name " diff --git a/test_conformance/api/test_api_min_max.cpp b/test_conformance/api/test_api_min_max.cpp index 9e981cd3..086008d7 100644 --- a/test_conformance/api/test_api_min_max.cpp +++ b/test_conformance/api/test_api_min_max.cpp @@ -1,6 +1,6 @@ // // Copyright (c) 2017 The Khronos Group Inc. -// +// // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
// You may obtain a copy of the License at @@ -22,33 +22,32 @@ const char *sample_single_param_kernel[] = { "__kernel void sample_test(__global int *src)\n" "{\n" - " int tid = get_global_id(0);\n" + " size_t tid = get_global_id(0);\n" "\n" - "}\n" }; + "}\n" +}; -const char *sample_single_param_write_kernel[] = { - "__kernel void sample_test(__global int *src)\n" - "{\n" - " int tid = get_global_id(0);\n" - " src[tid] = tid;\n" - "\n" - "}\n" }; const char *sample_read_image_kernel_pattern[] = { - "__kernel void sample_test( __global float *result, ", " )\n" + "__kernel void sample_test( __global float *result, ", + " )\n" "{\n" - " sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;\n" - " int tid = get_global_id(0);\n" + " sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | " + "CLK_FILTER_NEAREST;\n" + " size_t tid = get_global_id(0);\n" " result[0] = 0.0f;\n", "\n" - "}\n" }; + "}\n" +}; const char *sample_write_image_kernel_pattern[] = { - "__kernel void sample_test( ", " )\n" + "__kernel void sample_test( ", + " )\n" "{\n" - " int tid = get_global_id(0);\n", + " size_t tid = get_global_id(0);\n", "\n" - "}\n" }; + "}\n" +}; const char *sample_large_parmam_kernel_pattern[] = { @@ -57,7 +56,8 @@ const char *sample_large_parmam_kernel_pattern[] = { "result[0] = 0;\n" "%s" "\n" - "}\n" }; + "}\n" +}; const char *sample_large_int_parmam_kernel_pattern[] = { "__kernel void sample_test(%s, __global int *result)\n" @@ -65,47 +65,55 @@ const char *sample_large_int_parmam_kernel_pattern[] = { "result[0] = 0;\n" "%s" "\n" - "}\n" }; + "}\n" +}; const char *sample_sampler_kernel_pattern[] = { - "__kernel void sample_test( read_only image2d_t src, __global int4 *dst", ", sampler_t sampler%d", ")\n" + "__kernel void sample_test( read_only image2d_t src, __global int4 *dst", + ", sampler_t sampler%d", + ")\n" "{\n" - " int tid = get_global_id(0);\n", - " dst[ 0 ] = read_imagei( src, sampler%d, (int2)( 0, 0 ) );\n", + 
" size_t tid = get_global_id(0);\n", + " dst[ 0 ] = read_imagei( src, sampler%d, (int2)( 0, 0 ) );\n", "\n" - "}\n" }; + "}\n" +}; const char *sample_const_arg_kernel[] = { "__kernel void sample_test(__constant int *src1, __global int *dst)\n" "{\n" - " int tid = get_global_id(0);\n" + " size_t tid = get_global_id(0);\n" "\n" " dst[tid] = src1[tid];\n" "\n" - "}\n" }; + "}\n" +}; const char *sample_local_arg_kernel[] = { - "__kernel void sample_test(__local int *src1, __global int *global_src, __global int *dst)\n" + "__kernel void sample_test(__local int *src1, __global int *global_src, " + "__global int *dst)\n" "{\n" - " int tid = get_global_id(0);\n" + " size_t tid = get_global_id(0);\n" "\n" " src1[tid] = global_src[tid];\n" " barrier(CLK_GLOBAL_MEM_FENCE);\n" " dst[tid] = src1[tid];\n" "\n" - "}\n" }; + "}\n" +}; const char *sample_const_max_arg_kernel_pattern = -"__kernel void sample_test(__constant int *src1 %s, __global int *dst)\n" -"{\n" -" int tid = get_global_id(0);\n" -"\n" -" dst[tid] = src1[tid];\n" -"%s" -"\n" -"}\n"; - -int test_min_max_thread_dimensions(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements) + "__kernel void sample_test(__constant int *src1 %s, __global int *dst)\n" + "{\n" + " int tid = get_global_id(0);\n" + "\n" + " dst[tid] = src1[tid];\n" + "%s" + "\n" + "}\n"; + +int test_min_max_thread_dimensions(cl_device_id deviceID, cl_context context, + cl_command_queue queue, int num_elements) { int error, retVal; unsigned int maxThreadDim, threadDim, i; @@ -118,19 +126,24 @@ int test_min_max_thread_dimensions(cl_device_id deviceID, cl_context context, cl /* Get the max thread dimensions */ - error = clGetDeviceInfo( deviceID, CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS, sizeof( maxThreadDim ), &maxThreadDim, NULL ); - test_error( error, "Unable to get max work item dimensions from device" ); + error = clGetDeviceInfo(deviceID, CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS, + sizeof(maxThreadDim), &maxThreadDim, NULL); + 
test_error(error, "Unable to get max work item dimensions from device"); - if( maxThreadDim < 3 ) + if (maxThreadDim < 3) { - log_error( "ERROR: Reported max work item dimensions is less than required! (%d)\n", maxThreadDim ); + log_error("ERROR: Reported max work item dimensions is less than " + "required! (%d)\n", + maxThreadDim); return -1; } log_info("Reported max thread dimensions of %d.\n", maxThreadDim); /* Create a kernel to test with */ - if( create_single_kernel_helper( context, &program, &kernel, 1, sample_single_param_kernel, "sample_test" ) != 0 ) + if (create_single_kernel_helper(context, &program, &kernel, 1, + sample_single_param_kernel, "sample_test") + != 0) { return -1; } @@ -138,105 +151,122 @@ int test_min_max_thread_dimensions(cl_device_id deviceID, cl_context context, cl /* Create some I/O streams */ streams[0] = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(cl_int) * 100, NULL, &error); - if( streams[0] == NULL ) + if (streams[0] == NULL) { log_error("ERROR: Creating test array failed!\n"); return -1; } /* Set the arguments */ - error = clSetKernelArg( kernel, 0, sizeof( streams[0] ), &streams[0] ); - test_error( error, "Unable to set kernel arguments" ); + error = clSetKernelArg(kernel, 0, sizeof(streams[0]), &streams[0]); + test_error(error, "Unable to set kernel arguments"); retVal = 0; /* Now try running the kernel with up to that many threads */ - for (threadDim=1; threadDim <= maxThreadDim; threadDim++) + for (threadDim = 1; threadDim <= maxThreadDim; threadDim++) { - threads = (size_t *)malloc( sizeof( size_t ) * maxThreadDim ); - localThreads = (size_t *)malloc( sizeof( size_t ) * maxThreadDim ); - for( i = 0; i < maxThreadDim; i++ ) + threads = (size_t *)malloc(sizeof(size_t) * maxThreadDim); + localThreads = (size_t *)malloc(sizeof(size_t) * maxThreadDim); + for (i = 0; i < maxThreadDim; i++) { - threads[ i ] = 1; + threads[i] = 1; localThreads[i] = 1; } - error = clEnqueueNDRangeKernel( queue, kernel, maxThreadDim, NULL, 
threads, localThreads, 0, NULL, &event ); - test_error( error, "Failed clEnqueueNDRangeKernel"); + error = clEnqueueNDRangeKernel(queue, kernel, maxThreadDim, NULL, + threads, localThreads, 0, NULL, &event); + test_error(error, "Failed clEnqueueNDRangeKernel"); // Verify that the event does not return an error from the execution error = clWaitForEvents(1, &event); - test_error( error, "clWaitForEvent failed"); - error = clGetEventInfo(event, CL_EVENT_COMMAND_EXECUTION_STATUS, sizeof(event_status), &event_status, NULL); - test_error( error, "clGetEventInfo for CL_EVENT_COMMAND_EXECUTION_STATUS failed"); + test_error(error, "clWaitForEvent failed"); + error = clGetEventInfo(event, CL_EVENT_COMMAND_EXECUTION_STATUS, + sizeof(event_status), &event_status, NULL); + test_error( + error, + "clGetEventInfo for CL_EVENT_COMMAND_EXECUTION_STATUS failed"); clReleaseEvent(event); if (event_status < 0) test_error(error, "Kernel execution event returned error"); /* All done */ - free( threads ); - free( localThreads ); + free(threads); + free(localThreads); } return retVal; } -int test_min_max_work_items_sizes(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements) +int test_min_max_work_items_sizes(cl_device_id deviceID, cl_context context, + cl_command_queue queue, int num_elements) { int error; size_t *deviceMaxWorkItemSize; unsigned int maxWorkItemDim; /* Get the max work item dimensions */ - error = clGetDeviceInfo( deviceID, CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS, sizeof( maxWorkItemDim ), &maxWorkItemDim, NULL ); - test_error( error, "Unable to get max work item dimensions from device" ); - - log_info("CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS returned %d\n", maxWorkItemDim); - deviceMaxWorkItemSize = (size_t*)malloc(sizeof(size_t)*maxWorkItemDim); - error = clGetDeviceInfo( deviceID, CL_DEVICE_MAX_WORK_ITEM_SIZES, sizeof(size_t)*maxWorkItemDim, deviceMaxWorkItemSize, NULL ); - test_error( error, "clDeviceInfo for CL_DEVICE_MAX_WORK_ITEM_SIZES failed" ); 
+ error = clGetDeviceInfo(deviceID, CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS, + sizeof(maxWorkItemDim), &maxWorkItemDim, NULL); + test_error(error, "Unable to get max work item dimensions from device"); + + log_info("CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS returned %d\n", + maxWorkItemDim); + deviceMaxWorkItemSize = (size_t *)malloc(sizeof(size_t) * maxWorkItemDim); + error = clGetDeviceInfo(deviceID, CL_DEVICE_MAX_WORK_ITEM_SIZES, + sizeof(size_t) * maxWorkItemDim, + deviceMaxWorkItemSize, NULL); + test_error(error, "clDeviceInfo for CL_DEVICE_MAX_WORK_ITEM_SIZES failed"); unsigned int i; int errors = 0; - for(i=0; i<maxWorkItemDim; i++) { - if (deviceMaxWorkItemSize[i]<1) { - log_error("MAX_WORK_ITEM_SIZE in dimension %d is invalid: %lu\n", i, deviceMaxWorkItemSize[i]); + for (i = 0; i < maxWorkItemDim; i++) + { + if (deviceMaxWorkItemSize[i] < 1) + { + log_error("MAX_WORK_ITEM_SIZE in dimension %d is invalid: %lu\n", i, + deviceMaxWorkItemSize[i]); errors++; - } else { - log_info("Dimension %d has max work item size %lu\n", i, deviceMaxWorkItemSize[i]); + } + else + { + log_info("Dimension %d has max work item size %lu\n", i, + deviceMaxWorkItemSize[i]); } } free(deviceMaxWorkItemSize); - if (errors) - return -1; + if (errors) return -1; return 0; } - -int test_min_max_work_group_size(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements) +int test_min_max_work_group_size(cl_device_id deviceID, cl_context context, + cl_command_queue queue, int num_elements) { int error; size_t deviceMaxThreadSize; /* Get the max thread dimensions */ - error = clGetDeviceInfo( deviceID, CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof( deviceMaxThreadSize ), &deviceMaxThreadSize, NULL ); - test_error( error, "Unable to get max work group size from device" ); + error = clGetDeviceInfo(deviceID, CL_DEVICE_MAX_WORK_GROUP_SIZE, + sizeof(deviceMaxThreadSize), &deviceMaxThreadSize, + NULL); + test_error(error, "Unable to get max work group size from device"); log_info("Reported 
%ld max device work group size.\n", deviceMaxThreadSize); - if( deviceMaxThreadSize == 0 ) + if (deviceMaxThreadSize == 0) { - log_error( "ERROR: Max work group size is reported as zero!\n" ); + log_error("ERROR: Max work group size is reported as zero!\n"); return -1; } return 0; } -int test_min_max_read_image_args(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements) +int test_min_max_read_image_args(cl_device_id deviceID, cl_context context, + cl_command_queue queue, int num_elements) { int error; unsigned int maxReadImages, i; @@ -245,48 +275,55 @@ int test_min_max_read_image_args(cl_device_id deviceID, cl_context context, cl_c char readArgLine[128], *programSrc; const char *readArgPattern = ", read_only image2d_t srcimg%d"; clKernelWrapper kernel; - clMemWrapper *streams, result; + clMemWrapper *streams, result; size_t threads[2]; - cl_image_format image_format_desc; + cl_image_format image_format_desc; size_t maxParameterSize; cl_event event; cl_int event_status; - cl_float image_data[4*4]; + cl_float image_data[4 * 4]; float image_result = 0.0f; float actual_image_result; cl_uint minRequiredReadImages = gIsEmbedded ? 8 : 128; cl_device_type deviceType; - PASSIVE_REQUIRE_IMAGE_SUPPORT( deviceID ) + PASSIVE_REQUIRE_IMAGE_SUPPORT(deviceID) image_format_desc.image_channel_order = CL_RGBA; image_format_desc.image_channel_data_type = CL_FLOAT; /* Get the max read image arg count */ - error = clGetDeviceInfo( deviceID, CL_DEVICE_MAX_READ_IMAGE_ARGS, sizeof( maxReadImages ), &maxReadImages, NULL ); - test_error( error, "Unable to get max read image arg count from device" ); + error = clGetDeviceInfo(deviceID, CL_DEVICE_MAX_READ_IMAGE_ARGS, + sizeof(maxReadImages), &maxReadImages, NULL); + test_error(error, "Unable to get max read image arg count from device"); - if( maxReadImages < minRequiredReadImages ) + if (maxReadImages < minRequiredReadImages) { - log_error( "ERROR: Reported max read image arg count is less than required! 
(%d)\n", maxReadImages ); + log_error("ERROR: Reported max read image arg count is less than " + "required! (%d)\n", + maxReadImages); return -1; } log_info("Reported %d max read image args.\n", maxReadImages); - error = clGetDeviceInfo( deviceID, CL_DEVICE_ADDRESS_BITS, sizeof( deviceAddressSize ), &deviceAddressSize, NULL ); - test_error( error, "Unable to query CL_DEVICE_ADDRESS_BITS for device" ); + error = + clGetDeviceInfo(deviceID, CL_DEVICE_ADDRESS_BITS, + sizeof(deviceAddressSize), &deviceAddressSize, NULL); + test_error(error, "Unable to query CL_DEVICE_ADDRESS_BITS for device"); deviceAddressSize /= 8; // convert from bits to bytes - error = clGetDeviceInfo( deviceID, CL_DEVICE_MAX_PARAMETER_SIZE, sizeof( maxParameterSize ), &maxParameterSize, NULL ); - test_error( error, "Unable to get max parameter size from device" ); + error = clGetDeviceInfo(deviceID, CL_DEVICE_MAX_PARAMETER_SIZE, + sizeof(maxParameterSize), &maxParameterSize, NULL); + test_error(error, "Unable to get max parameter size from device"); if (!gIsEmbedded && maxReadImages >= 128 && maxParameterSize == 1024) { - error = clGetDeviceInfo( deviceID, CL_DEVICE_TYPE, sizeof( deviceType ), &deviceType, NULL ); - test_error( error, "Unable to get device type from device" ); + error = clGetDeviceInfo(deviceID, CL_DEVICE_TYPE, sizeof(deviceType), + &deviceType, NULL); + test_error(error, "Unable to get device type from device"); - if(deviceType != CL_DEVICE_TYPE_CUSTOM) + if (deviceType != CL_DEVICE_TYPE_CUSTOM) { maxReadImages = 127; } @@ -295,85 +332,107 @@ int test_min_max_read_image_args(cl_device_id deviceID, cl_context context, cl_c maxParameterSize -= deviceAddressSize; // Calculate the number we can use - if (maxParameterSize/deviceAddressSize < maxReadImages) { - log_info("WARNING: Max parameter size of %d bytes limits test to %d max image arguments.\n", (int)maxParameterSize, (int)(maxParameterSize/deviceAddressSize)); - maxReadImages = (unsigned 
int)(maxParameterSize/deviceAddressSize); + if (maxParameterSize / deviceAddressSize < maxReadImages) + { + log_info("WARNING: Max parameter size of %d bytes limits test to %d " + "max image arguments.\n", + (int)maxParameterSize, + (int)(maxParameterSize / deviceAddressSize)); + maxReadImages = (unsigned int)(maxParameterSize / deviceAddressSize); } /* Create a program with that many read args */ - programSrc = (char *)malloc( strlen( sample_read_image_kernel_pattern[ 0 ] ) + ( strlen( readArgPattern ) + 6 ) * ( maxReadImages ) + - strlen( sample_read_image_kernel_pattern[ 1 ] ) + 1 + 40240); + programSrc = (char *)malloc(strlen(sample_read_image_kernel_pattern[0]) + + (strlen(readArgPattern) + 6) * (maxReadImages) + + strlen(sample_read_image_kernel_pattern[1]) + + 1 + 40240); - strcpy( programSrc, sample_read_image_kernel_pattern[ 0 ] ); - strcat( programSrc, "read_only image2d_t srcimg0" ); - for( i = 0; i < maxReadImages-1; i++ ) + strcpy(programSrc, sample_read_image_kernel_pattern[0]); + strcat(programSrc, "read_only image2d_t srcimg0"); + for (i = 0; i < maxReadImages - 1; i++) { - sprintf( readArgLine, readArgPattern, i+1 ); - strcat( programSrc, readArgLine ); + sprintf(readArgLine, readArgPattern, i + 1); + strcat(programSrc, readArgLine); } - strcat( programSrc, sample_read_image_kernel_pattern[ 1 ] ); - for ( i = 0; i < maxReadImages; i++) { - sprintf( readArgLine, "\tresult[0] += read_imagef( srcimg%d, sampler, (int2)(0,0)).x;\n", i); - strcat( programSrc, readArgLine ); + strcat(programSrc, sample_read_image_kernel_pattern[1]); + for (i = 0; i < maxReadImages; i++) + { + sprintf( + readArgLine, + "\tresult[0] += read_imagef( srcimg%d, sampler, (int2)(0,0)).x;\n", + i); + strcat(programSrc, readArgLine); } - strcat( programSrc, sample_read_image_kernel_pattern[ 2 ] ); + strcat(programSrc, sample_read_image_kernel_pattern[2]); - error = create_single_kernel_helper(context, &program, &kernel, 1, (const char **)&programSrc, "sample_test"); - test_error( 
error, "Failed to create the program and kernel."); - free( programSrc ); + error = + create_single_kernel_helper(context, &program, &kernel, 1, + (const char **)&programSrc, "sample_test"); + test_error(error, "Failed to create the program and kernel."); + free(programSrc); result = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(cl_float), NULL, &error); - test_error( error, "clCreateBufer failed"); + test_error(error, "clCreateBufer failed"); /* Create some I/O streams */ streams = new clMemWrapper[maxReadImages + 1]; - for( i = 0; i < maxReadImages; i++ ) + for (i = 0; i < maxReadImages; i++) { - image_data[0]=i; - image_result+= image_data[0]; - streams[i] = create_image_2d( context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, &image_format_desc, 4, 4, 0, image_data, &error ); - test_error( error, "Unable to allocate test image" ); + image_data[0] = i; + image_result += image_data[0]; + streams[i] = + create_image_2d(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, + &image_format_desc, 4, 4, 0, image_data, &error); + test_error(error, "Unable to allocate test image"); } - error = clSetKernelArg( kernel, 0, sizeof( result ), &result ); - test_error( error, "Unable to set kernel arguments" ); + error = clSetKernelArg(kernel, 0, sizeof(result), &result); + test_error(error, "Unable to set kernel arguments"); /* Set the arguments */ - for( i = 1; i < maxReadImages+1; i++ ) + for (i = 1; i < maxReadImages + 1; i++) { - error = clSetKernelArg( kernel, i, sizeof( streams[i-1] ), &streams[i-1] ); - test_error( error, "Unable to set kernel arguments" ); + error = + clSetKernelArg(kernel, i, sizeof(streams[i - 1]), &streams[i - 1]); + test_error(error, "Unable to set kernel arguments"); } /* Now try running the kernel */ threads[0] = threads[1] = 1; - error = clEnqueueNDRangeKernel( queue, kernel, 2, NULL, threads, NULL, 0, NULL, &event ); - test_error( error, "clEnqueueNDRangeKernel failed"); + error = clEnqueueNDRangeKernel(queue, kernel, 2, NULL, threads, NULL, 0, 
+ NULL, &event); + test_error(error, "clEnqueueNDRangeKernel failed"); // Verify that the event does not return an error from the execution error = clWaitForEvents(1, &event); - test_error( error, "clWaitForEvent failed"); - error = clGetEventInfo(event, CL_EVENT_COMMAND_EXECUTION_STATUS, sizeof(event_status), &event_status, NULL); - test_error( error, "clGetEventInfo for CL_EVENT_COMMAND_EXECUTION_STATUS failed"); + test_error(error, "clWaitForEvent failed"); + error = clGetEventInfo(event, CL_EVENT_COMMAND_EXECUTION_STATUS, + sizeof(event_status), &event_status, NULL); + test_error(error, + "clGetEventInfo for CL_EVENT_COMMAND_EXECUTION_STATUS failed"); clReleaseEvent(event); if (event_status < 0) test_error(error, "Kernel execution event returned error"); - error = clEnqueueReadBuffer(queue, result, CL_TRUE, 0, sizeof(cl_float), &actual_image_result, 0, NULL, NULL); + error = clEnqueueReadBuffer(queue, result, CL_TRUE, 0, sizeof(cl_float), + &actual_image_result, 0, NULL, NULL); test_error(error, "clEnqueueReadBuffer failed"); delete[] streams; - if (actual_image_result != image_result) { - log_error("Result failed to verify. Got %g, expected %g.\n", actual_image_result, image_result); + if (actual_image_result != image_result) + { + log_error("Result failed to verify. 
Got %g, expected %g.\n", + actual_image_result, image_result); return 1; } return 0; } -int test_min_max_write_image_args(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements) +int test_min_max_write_image_args(cl_device_id deviceID, cl_context context, + cl_command_queue queue, int num_elements) { int error; unsigned int maxWriteImages, i; @@ -381,94 +440,117 @@ int test_min_max_write_image_args(cl_device_id deviceID, cl_context context, cl_ char writeArgLine[128], *programSrc; const char *writeArgPattern = ", write_only image2d_t dstimg%d"; clKernelWrapper kernel; - clMemWrapper *streams; + clMemWrapper *streams; size_t threads[2]; - cl_image_format image_format_desc; + cl_image_format image_format_desc; size_t maxParameterSize; cl_event event; cl_int event_status; cl_uint minRequiredWriteImages = gIsEmbedded ? 1 : 8; - PASSIVE_REQUIRE_IMAGE_SUPPORT( deviceID ) + PASSIVE_REQUIRE_IMAGE_SUPPORT(deviceID) image_format_desc.image_channel_order = CL_RGBA; image_format_desc.image_channel_data_type = CL_UNORM_INT8; /* Get the max read image arg count */ - error = clGetDeviceInfo( deviceID, CL_DEVICE_MAX_WRITE_IMAGE_ARGS, sizeof( maxWriteImages ), &maxWriteImages, NULL ); - test_error( error, "Unable to get max write image arg count from device" ); + error = clGetDeviceInfo(deviceID, CL_DEVICE_MAX_WRITE_IMAGE_ARGS, + sizeof(maxWriteImages), &maxWriteImages, NULL); + test_error(error, "Unable to get max write image arg count from device"); - if( maxWriteImages == 0 ) + if (maxWriteImages == 0) { - log_info( "WARNING: Device reports 0 for a max write image arg count (write image arguments unsupported). Skipping test (implicitly passes). This is only valid if the number of image formats is also 0.\n" ); + log_info( + "WARNING: Device reports 0 for a max write image arg count (write " + "image arguments unsupported). Skipping test (implicitly passes). 
" + "This is only valid if the number of image formats is also 0.\n"); return 0; } - if( maxWriteImages < minRequiredWriteImages ) + if (maxWriteImages < minRequiredWriteImages) { - log_error( "ERROR: Reported max write image arg count is less than required! (%d)\n", maxWriteImages ); + log_error("ERROR: Reported max write image arg count is less than " + "required! (%d)\n", + maxWriteImages); return -1; } log_info("Reported %d max write image args.\n", maxWriteImages); - error = clGetDeviceInfo( deviceID, CL_DEVICE_MAX_PARAMETER_SIZE, sizeof( maxParameterSize ), &maxParameterSize, NULL ); - test_error( error, "Unable to get max parameter size from device" ); + error = clGetDeviceInfo(deviceID, CL_DEVICE_MAX_PARAMETER_SIZE, + sizeof(maxParameterSize), &maxParameterSize, NULL); + test_error(error, "Unable to get max parameter size from device"); // Calculate the number we can use - if (maxParameterSize/sizeof(cl_mem) < maxWriteImages) { - log_info("WARNING: Max parameter size of %d bytes limits test to %d max image arguments.\n", (int)maxParameterSize, (int)(maxParameterSize/sizeof(cl_mem))); - maxWriteImages = (unsigned int)(maxParameterSize/sizeof(cl_mem)); + if (maxParameterSize / sizeof(cl_mem) < maxWriteImages) + { + log_info("WARNING: Max parameter size of %d bytes limits test to %d " + "max image arguments.\n", + (int)maxParameterSize, + (int)(maxParameterSize / sizeof(cl_mem))); + maxWriteImages = (unsigned int)(maxParameterSize / sizeof(cl_mem)); } /* Create a program with that many write args + 1 */ - programSrc = (char *)malloc( strlen( sample_write_image_kernel_pattern[ 0 ] ) + ( strlen( writeArgPattern ) + 6 ) * ( maxWriteImages + 1 ) + - strlen( sample_write_image_kernel_pattern[ 1 ] ) + 1 + 40240 ); + programSrc = (char *)malloc( + strlen(sample_write_image_kernel_pattern[0]) + + (strlen(writeArgPattern) + 6) * (maxWriteImages + 1) + + strlen(sample_write_image_kernel_pattern[1]) + 1 + 40240); - strcpy( programSrc, sample_write_image_kernel_pattern[ 0 
] ); - strcat( programSrc, "write_only image2d_t dstimg0" ); - for( i = 1; i < maxWriteImages; i++ ) + strcpy(programSrc, sample_write_image_kernel_pattern[0]); + strcat(programSrc, "write_only image2d_t dstimg0"); + for (i = 1; i < maxWriteImages; i++) { - sprintf( writeArgLine, writeArgPattern, i ); - strcat( programSrc, writeArgLine ); + sprintf(writeArgLine, writeArgPattern, i); + strcat(programSrc, writeArgLine); } - strcat( programSrc, sample_write_image_kernel_pattern[ 1 ] ); - for ( i = 0; i < maxWriteImages; i++) { - sprintf( writeArgLine, "\twrite_imagef( dstimg%d, (int2)(0,0), (float4)(0,0,0,0));\n", i); - strcat( programSrc, writeArgLine ); + strcat(programSrc, sample_write_image_kernel_pattern[1]); + for (i = 0; i < maxWriteImages; i++) + { + sprintf(writeArgLine, + "\twrite_imagef( dstimg%d, (int2)(0,0), (float4)(0,0,0,0));\n", + i); + strcat(programSrc, writeArgLine); } - strcat( programSrc, sample_write_image_kernel_pattern[ 2 ] ); + strcat(programSrc, sample_write_image_kernel_pattern[2]); - error = create_single_kernel_helper(context, &program, &kernel, 1, (const char **)&programSrc, "sample_test"); - test_error( error, "Failed to create the program and kernel."); - free( programSrc ); + error = + create_single_kernel_helper(context, &program, &kernel, 1, + (const char **)&programSrc, "sample_test"); + test_error(error, "Failed to create the program and kernel."); + free(programSrc); /* Create some I/O streams */ streams = new clMemWrapper[maxWriteImages + 1]; - for( i = 0; i < maxWriteImages; i++ ) + for (i = 0; i < maxWriteImages; i++) { - streams[i] = create_image_2d( context, CL_MEM_READ_WRITE, &image_format_desc, 16, 16, 0, NULL, &error ); - test_error( error, "Unable to allocate test image" ); + streams[i] = + create_image_2d(context, CL_MEM_READ_WRITE, &image_format_desc, 16, + 16, 0, NULL, &error); + test_error(error, "Unable to allocate test image"); } /* Set the arguments */ - for( i = 0; i < maxWriteImages; i++ ) + for (i = 0; i < 
maxWriteImages; i++) { - error = clSetKernelArg( kernel, i, sizeof( streams[i] ), &streams[i] ); - test_error( error, "Unable to set kernel arguments" ); + error = clSetKernelArg(kernel, i, sizeof(streams[i]), &streams[i]); + test_error(error, "Unable to set kernel arguments"); } /* Now try running the kernel */ threads[0] = threads[1] = 16; - error = clEnqueueNDRangeKernel( queue, kernel, 2, NULL, threads, NULL, 0, NULL, &event ); - test_error( error, "clEnqueueNDRangeKernel failed."); + error = clEnqueueNDRangeKernel(queue, kernel, 2, NULL, threads, NULL, 0, + NULL, &event); + test_error(error, "clEnqueueNDRangeKernel failed."); // Verify that the event does not return an error from the execution error = clWaitForEvents(1, &event); - test_error( error, "clWaitForEvent failed"); - error = clGetEventInfo(event, CL_EVENT_COMMAND_EXECUTION_STATUS, sizeof(event_status), &event_status, NULL); - test_error( error, "clGetEventInfo for CL_EVENT_COMMAND_EXECUTION_STATUS failed"); + test_error(error, "clWaitForEvent failed"); + error = clGetEventInfo(event, CL_EVENT_COMMAND_EXECUTION_STATUS, + sizeof(event_status), &event_status, NULL); + test_error(error, + "clGetEventInfo for CL_EVENT_COMMAND_EXECUTION_STATUS failed"); clReleaseEvent(event); if (event_status < 0) test_error(error, "Kernel execution event returned error"); @@ -478,7 +560,8 @@ int test_min_max_write_image_args(cl_device_id deviceID, cl_context context, cl_ return 0; } -int test_min_max_mem_alloc_size(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements) +int test_min_max_mem_alloc_size(cl_device_id deviceID, cl_context context, + cl_command_queue queue, int num_elements) { int error; cl_ulong maxAllocSize, memSize, minSizeToTry; @@ -492,61 +575,89 @@ int test_min_max_mem_alloc_size(cl_device_id deviceID, cl_context context, cl_co requiredAllocSize = 128 * 1024 * 1024; /* Get the max mem alloc size */ - error = clGetDeviceInfo( deviceID, CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof( 
maxAllocSize ), &maxAllocSize, NULL ); - test_error( error, "Unable to get max mem alloc size from device" ); + error = clGetDeviceInfo(deviceID, CL_DEVICE_MAX_MEM_ALLOC_SIZE, + sizeof(maxAllocSize), &maxAllocSize, NULL); + test_error(error, "Unable to get max mem alloc size from device"); - error = clGetDeviceInfo( deviceID, CL_DEVICE_GLOBAL_MEM_SIZE, sizeof( memSize ), &memSize, NULL ); - test_error( error, "Unable to get global memory size from device" ); + error = clGetDeviceInfo(deviceID, CL_DEVICE_GLOBAL_MEM_SIZE, + sizeof(memSize), &memSize, NULL); + test_error(error, "Unable to get global memory size from device"); - if (memSize > (cl_ulong)SIZE_MAX) { - memSize = (cl_ulong)SIZE_MAX; + if (memSize > (cl_ulong)SIZE_MAX) + { + memSize = (cl_ulong)SIZE_MAX; } - if( maxAllocSize < requiredAllocSize) + if (maxAllocSize < requiredAllocSize) { - log_error( "ERROR: Reported max allocation size is less than required %lldMB! (%llu or %lluMB, from a total mem size of %lldMB)\n", (requiredAllocSize / 1024) / 1024, maxAllocSize, (maxAllocSize / 1024)/1024, (memSize / 1024)/1024 ); + log_error("ERROR: Reported max allocation size is less than required " + "%lldMB! (%llu or %lluMB, from a total mem size of %lldMB)\n", + (requiredAllocSize / 1024) / 1024, maxAllocSize, + (maxAllocSize / 1024) / 1024, (memSize / 1024) / 1024); return -1; } - requiredAllocSize = ((memSize / 4) > (1024 * 1024 * 1024)) ? 1024 * 1024 * 1024 : memSize / 4; + requiredAllocSize = ((memSize / 4) > (1024 * 1024 * 1024)) + ? 1024 * 1024 * 1024 + : memSize / 4; if (gIsEmbedded) - requiredAllocSize = (requiredAllocSize < 1 * 1024 * 1024) ? 1 * 1024 * 1024 : requiredAllocSize; + requiredAllocSize = (requiredAllocSize < 1 * 1024 * 1024) + ? 1 * 1024 * 1024 + : requiredAllocSize; else - requiredAllocSize = (requiredAllocSize < 128 * 1024 * 1024) ? 128 * 1024 * 1024 : requiredAllocSize; + requiredAllocSize = (requiredAllocSize < 128 * 1024 * 1024) + ? 
128 * 1024 * 1024 + : requiredAllocSize; - if( maxAllocSize < requiredAllocSize ) + if (maxAllocSize < requiredAllocSize) { - log_error( "ERROR: Reported max allocation size is less than required of total memory! (%llu or %lluMB, from a total mem size of %lluMB)\n", maxAllocSize, (maxAllocSize / 1024)/1024, (requiredAllocSize / 1024)/1024 ); + log_error( + "ERROR: Reported max allocation size is less than required of " + "total memory! (%llu or %lluMB, from a total mem size of %lluMB)\n", + maxAllocSize, (maxAllocSize / 1024) / 1024, + (requiredAllocSize / 1024) / 1024); return -1; } - log_info("Reported max allocation size of %lld bytes (%gMB) and global mem size of %lld bytes (%gMB).\n", - maxAllocSize, maxAllocSize/(1024.0*1024.0), requiredAllocSize, requiredAllocSize/(1024.0*1024.0)); + log_info("Reported max allocation size of %lld bytes (%gMB) and global mem " + "size of %lld bytes (%gMB).\n", + maxAllocSize, maxAllocSize / (1024.0 * 1024.0), requiredAllocSize, + requiredAllocSize / (1024.0 * 1024.0)); - if ( memSize < maxAllocSize ) { - log_info("Global memory size is less than max allocation size, using that.\n"); + if (memSize < maxAllocSize) + { + log_info("Global memory size is less than max allocation size, using " + "that.\n"); maxAllocSize = memSize; } - minSizeToTry = maxAllocSize/16; - while (maxAllocSize > (maxAllocSize/4)) { + minSizeToTry = maxAllocSize / 16; + while (maxAllocSize > (maxAllocSize / 4)) + { - log_info("Trying to create a buffer of size of %lld bytes (%gMB).\n", maxAllocSize, (double)maxAllocSize/(1024.0*1024.0)); - memHdl = clCreateBuffer( context, CL_MEM_READ_ONLY, (size_t)maxAllocSize, NULL, &error ); - if (error == CL_MEM_OBJECT_ALLOCATION_FAILURE || error == CL_OUT_OF_RESOURCES || error == CL_OUT_OF_HOST_MEMORY) { - log_info("\tAllocation failed at size of %lld bytes (%gMB).\n", maxAllocSize, (double)maxAllocSize/(1024.0*1024.0)); + log_info("Trying to create a buffer of size of %lld bytes (%gMB).\n", + maxAllocSize, 
(double)maxAllocSize / (1024.0 * 1024.0)); + memHdl = clCreateBuffer(context, CL_MEM_READ_ONLY, (size_t)maxAllocSize, + NULL, &error); + if (error == CL_MEM_OBJECT_ALLOCATION_FAILURE + || error == CL_OUT_OF_RESOURCES || error == CL_OUT_OF_HOST_MEMORY) + { + log_info("\tAllocation failed at size of %lld bytes (%gMB).\n", + maxAllocSize, (double)maxAllocSize / (1024.0 * 1024.0)); maxAllocSize -= minSizeToTry; continue; } - test_error( error, "clCreateBuffer failed for maximum sized buffer."); + test_error(error, "clCreateBuffer failed for maximum sized buffer."); return 0; } - log_error("Failed to allocate even %lld bytes (%gMB).\n", maxAllocSize, (double)maxAllocSize/(1024.0*1024.0)); + log_error("Failed to allocate even %lld bytes (%gMB).\n", maxAllocSize, + (double)maxAllocSize / (1024.0 * 1024.0)); return -1; } -int test_min_max_image_2d_width(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements) +int test_min_max_image_2d_width(cl_device_id deviceID, cl_context context, + cl_command_queue queue, int num_elements) { int error; size_t maxDimension; @@ -554,10 +665,8 @@ int test_min_max_image_2d_width(cl_device_id deviceID, cl_context context, cl_co cl_image_format image_format_desc; cl_ulong maxAllocSize; cl_uint minRequiredDimension; - size_t length; - - PASSIVE_REQUIRE_IMAGE_SUPPORT( deviceID ) + PASSIVE_REQUIRE_IMAGE_SUPPORT(deviceID) auto version = get_device_cl_version(deviceID); if (version == Version(1, 0)) @@ -571,16 +680,20 @@ int test_min_max_image_2d_width(cl_device_id deviceID, cl_context context, cl_co /* Just get any ol format to test with */ - error = get_8_bit_image_format( context, CL_MEM_OBJECT_IMAGE2D, CL_MEM_READ_WRITE, 0, &image_format_desc ); - test_error( error, "Unable to obtain suitable image format to test with!" 
); + error = get_8_bit_image_format(context, CL_MEM_OBJECT_IMAGE2D, + CL_MEM_READ_WRITE, 0, &image_format_desc); + test_error(error, "Unable to obtain suitable image format to test with!"); /* Get the max 2d image width */ - error = clGetDeviceInfo( deviceID, CL_DEVICE_IMAGE2D_MAX_WIDTH, sizeof( maxDimension ), &maxDimension, NULL ); - test_error( error, "Unable to get max image 2d width from device" ); + error = clGetDeviceInfo(deviceID, CL_DEVICE_IMAGE2D_MAX_WIDTH, + sizeof(maxDimension), &maxDimension, NULL); + test_error(error, "Unable to get max image 2d width from device"); - if( maxDimension < minRequiredDimension ) + if (maxDimension < minRequiredDimension) { - log_error( "ERROR: Reported max image 2d width is less than required! (%d)\n", (int)maxDimension ); + log_error( + "ERROR: Reported max image 2d width is less than required! (%d)\n", + (int)maxDimension); return -1; } log_info("Max reported width is %ld.\n", maxDimension); @@ -588,34 +701,42 @@ int test_min_max_image_2d_width(cl_device_id deviceID, cl_context context, cl_co /* Verify we can use the format */ image_format_desc.image_channel_data_type = CL_UNORM_INT8; image_format_desc.image_channel_order = CL_RGBA; - if (!is_image_format_supported( context, CL_MEM_READ_ONLY, CL_MEM_OBJECT_IMAGE2D, &image_format_desc)) { + if (!is_image_format_supported(context, CL_MEM_READ_ONLY, + CL_MEM_OBJECT_IMAGE2D, &image_format_desc)) + { log_error("CL_UNORM_INT8 CL_RGBA not supported. Can not test."); return -1; } /* Verify that we can actually allocate an image that large */ - error = clGetDeviceInfo( deviceID, CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof ( maxAllocSize ), &maxAllocSize, NULL ); - test_error( error, "Unable to get CL_DEVICE_MAX_MEM_ALLOC_SIZE." 
); - if ( (cl_ulong)maxDimension*1*4 > maxAllocSize ) { - log_error("Can not allocate a large enough image (min size: %lld bytes, max allowed: %lld bytes) to test.\n", - (cl_ulong)maxDimension*1*4, maxAllocSize); + error = clGetDeviceInfo(deviceID, CL_DEVICE_MAX_MEM_ALLOC_SIZE, + sizeof(maxAllocSize), &maxAllocSize, NULL); + test_error(error, "Unable to get CL_DEVICE_MAX_MEM_ALLOC_SIZE."); + if ((cl_ulong)maxDimension * 1 * 4 > maxAllocSize) + { + log_error("Can not allocate a large enough image (min size: %lld " + "bytes, max allowed: %lld bytes) to test.\n", + (cl_ulong)maxDimension * 1 * 4, maxAllocSize); return -1; } - log_info("Attempting to create an image of size %d x 1 = %gMB.\n", (int)maxDimension, ((float)maxDimension*4/1024.0/1024.0)); + log_info("Attempting to create an image of size %d x 1 = %gMB.\n", + (int)maxDimension, ((float)maxDimension * 4 / 1024.0 / 1024.0)); /* Try to allocate a very big image */ - streams[0] = create_image_2d( context, CL_MEM_READ_ONLY, &image_format_desc, maxDimension, 1, 0, NULL, &error ); - if( ( streams[0] == NULL ) || ( error != CL_SUCCESS )) + streams[0] = create_image_2d(context, CL_MEM_READ_ONLY, &image_format_desc, + maxDimension, 1, 0, NULL, &error); + if ((streams[0] == NULL) || (error != CL_SUCCESS)) { - print_error( error, "Image 2D creation failed for maximum width" ); + print_error(error, "Image 2D creation failed for maximum width"); return -1; } return 0; } -int test_min_max_image_2d_height(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements) +int test_min_max_image_2d_height(cl_device_id deviceID, cl_context context, + cl_command_queue queue, int num_elements) { int error; size_t maxDimension; @@ -623,9 +744,8 @@ int test_min_max_image_2d_height(cl_device_id deviceID, cl_context context, cl_c cl_image_format image_format_desc; cl_ulong maxAllocSize; cl_uint minRequiredDimension; - size_t length; - PASSIVE_REQUIRE_IMAGE_SUPPORT( deviceID ) + 
PASSIVE_REQUIRE_IMAGE_SUPPORT(deviceID) auto version = get_device_cl_version(deviceID); if (version == Version(1, 0)) @@ -638,16 +758,20 @@ int test_min_max_image_2d_height(cl_device_id deviceID, cl_context context, cl_c } /* Just get any ol format to test with */ - error = get_8_bit_image_format( context, CL_MEM_OBJECT_IMAGE2D, CL_MEM_READ_WRITE, 0, &image_format_desc ); - test_error( error, "Unable to obtain suitable image format to test with!" ); + error = get_8_bit_image_format(context, CL_MEM_OBJECT_IMAGE2D, + CL_MEM_READ_WRITE, 0, &image_format_desc); + test_error(error, "Unable to obtain suitable image format to test with!"); /* Get the max 2d image width */ - error = clGetDeviceInfo( deviceID, CL_DEVICE_IMAGE2D_MAX_HEIGHT, sizeof( maxDimension ), &maxDimension, NULL ); - test_error( error, "Unable to get max image 2d height from device" ); + error = clGetDeviceInfo(deviceID, CL_DEVICE_IMAGE2D_MAX_HEIGHT, + sizeof(maxDimension), &maxDimension, NULL); + test_error(error, "Unable to get max image 2d height from device"); - if( maxDimension < minRequiredDimension ) + if (maxDimension < minRequiredDimension) { - log_error( "ERROR: Reported max image 2d height is less than required! (%d)\n", (int)maxDimension ); + log_error( + "ERROR: Reported max image 2d height is less than required! (%d)\n", + (int)maxDimension); return -1; } log_info("Max reported height is %ld.\n", maxDimension); @@ -655,56 +779,67 @@ int test_min_max_image_2d_height(cl_device_id deviceID, cl_context context, cl_c /* Verify we can use the format */ image_format_desc.image_channel_data_type = CL_UNORM_INT8; image_format_desc.image_channel_order = CL_RGBA; - if (!is_image_format_supported( context, CL_MEM_READ_ONLY, CL_MEM_OBJECT_IMAGE2D, &image_format_desc)) { + if (!is_image_format_supported(context, CL_MEM_READ_ONLY, + CL_MEM_OBJECT_IMAGE2D, &image_format_desc)) + { log_error("CL_UNORM_INT8 CL_RGBA not supported. 
Can not test."); return -1; } /* Verify that we can actually allocate an image that large */ - error = clGetDeviceInfo( deviceID, CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof ( maxAllocSize ), &maxAllocSize, NULL ); - test_error( error, "Unable to get CL_DEVICE_MAX_MEM_ALLOC_SIZE." ); - if ( (cl_ulong)maxDimension*1*4 > maxAllocSize ) { - log_error("Can not allocate a large enough image (min size: %lld bytes, max allowed: %lld bytes) to test.\n", - (cl_ulong)maxDimension*1*4, maxAllocSize); + error = clGetDeviceInfo(deviceID, CL_DEVICE_MAX_MEM_ALLOC_SIZE, + sizeof(maxAllocSize), &maxAllocSize, NULL); + test_error(error, "Unable to get CL_DEVICE_MAX_MEM_ALLOC_SIZE."); + if ((cl_ulong)maxDimension * 1 * 4 > maxAllocSize) + { + log_error("Can not allocate a large enough image (min size: %lld " + "bytes, max allowed: %lld bytes) to test.\n", + (cl_ulong)maxDimension * 1 * 4, maxAllocSize); return -1; } - log_info("Attempting to create an image of size 1 x %d = %gMB.\n", (int)maxDimension, ((float)maxDimension*4/1024.0/1024.0)); + log_info("Attempting to create an image of size 1 x %d = %gMB.\n", + (int)maxDimension, ((float)maxDimension * 4 / 1024.0 / 1024.0)); /* Try to allocate a very big image */ - streams[0] = create_image_2d( context, CL_MEM_READ_ONLY, &image_format_desc, 1, maxDimension, 0, NULL, &error ); - if( ( streams[0] == NULL ) || ( error != CL_SUCCESS )) + streams[0] = create_image_2d(context, CL_MEM_READ_ONLY, &image_format_desc, + 1, maxDimension, 0, NULL, &error); + if ((streams[0] == NULL) || (error != CL_SUCCESS)) { - print_error( error, "Image 2D creation failed for maximum height" ); + print_error(error, "Image 2D creation failed for maximum height"); return -1; } return 0; } -int test_min_max_image_3d_width(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements) +int test_min_max_image_3d_width(cl_device_id deviceID, cl_context context, + cl_command_queue queue, int num_elements) { int error; size_t maxDimension; clMemWrapper 
streams[1]; - cl_image_format image_format_desc; + cl_image_format image_format_desc; cl_ulong maxAllocSize; - PASSIVE_REQUIRE_3D_IMAGE_SUPPORT( deviceID ) + PASSIVE_REQUIRE_3D_IMAGE_SUPPORT(deviceID) /* Just get any ol format to test with */ error = get_8_bit_image_format(context, CL_MEM_OBJECT_IMAGE3D, CL_MEM_READ_ONLY, 0, &image_format_desc); - test_error( error, "Unable to obtain suitable image format to test with!" ); + test_error(error, "Unable to obtain suitable image format to test with!"); /* Get the max 2d image width */ - error = clGetDeviceInfo( deviceID, CL_DEVICE_IMAGE3D_MAX_WIDTH, sizeof( maxDimension ), &maxDimension, NULL ); - test_error( error, "Unable to get max image 3d width from device" ); + error = clGetDeviceInfo(deviceID, CL_DEVICE_IMAGE3D_MAX_WIDTH, + sizeof(maxDimension), &maxDimension, NULL); + test_error(error, "Unable to get max image 3d width from device"); - if( maxDimension < 2048 ) + if (maxDimension < 2048) { - log_error( "ERROR: Reported max image 3d width is less than required! (%d)\n", (int)maxDimension ); + log_error( + "ERROR: Reported max image 3d width is less than required! (%d)\n", + (int)maxDimension); return -1; } log_info("Max reported width is %ld.\n", maxDimension); @@ -712,56 +847,68 @@ int test_min_max_image_3d_width(cl_device_id deviceID, cl_context context, cl_co /* Verify we can use the format */ image_format_desc.image_channel_data_type = CL_UNORM_INT8; image_format_desc.image_channel_order = CL_RGBA; - if (!is_image_format_supported( context, CL_MEM_READ_ONLY, CL_MEM_OBJECT_IMAGE3D, &image_format_desc)) { + if (!is_image_format_supported(context, CL_MEM_READ_ONLY, + CL_MEM_OBJECT_IMAGE3D, &image_format_desc)) + { log_error("CL_UNORM_INT8 CL_RGBA not supported. 
Can not test."); return -1; } /* Verify that we can actually allocate an image that large */ - error = clGetDeviceInfo( deviceID, CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof ( maxAllocSize ), &maxAllocSize, NULL ); - test_error( error, "Unable to get CL_DEVICE_MAX_MEM_ALLOC_SIZE." ); - if ( (cl_ulong)maxDimension*2*4 > maxAllocSize ) { - log_error("Can not allocate a large enough image (min size: %lld bytes, max allowed: %lld bytes) to test.\n", - (cl_ulong)maxDimension*2*4, maxAllocSize); + error = clGetDeviceInfo(deviceID, CL_DEVICE_MAX_MEM_ALLOC_SIZE, + sizeof(maxAllocSize), &maxAllocSize, NULL); + test_error(error, "Unable to get CL_DEVICE_MAX_MEM_ALLOC_SIZE."); + if ((cl_ulong)maxDimension * 2 * 4 > maxAllocSize) + { + log_error("Can not allocate a large enough image (min size: %lld " + "bytes, max allowed: %lld bytes) to test.\n", + (cl_ulong)maxDimension * 2 * 4, maxAllocSize); return -1; } - log_info("Attempting to create an image of size %d x 1 x 2 = %gMB.\n", (int)maxDimension, (2*(float)maxDimension*4/1024.0/1024.0)); + log_info("Attempting to create an image of size %d x 1 x 2 = %gMB.\n", + (int)maxDimension, + (2 * (float)maxDimension * 4 / 1024.0 / 1024.0)); /* Try to allocate a very big image */ - streams[0] = create_image_3d( context, CL_MEM_READ_ONLY, &image_format_desc, maxDimension, 1, 2, 0, 0, NULL, &error ); - if( ( streams[0] == NULL ) || ( error != CL_SUCCESS )) + streams[0] = create_image_3d(context, CL_MEM_READ_ONLY, &image_format_desc, + maxDimension, 1, 2, 0, 0, NULL, &error); + if ((streams[0] == NULL) || (error != CL_SUCCESS)) { - print_error( error, "Image 3D creation failed for maximum width" ); + print_error(error, "Image 3D creation failed for maximum width"); return -1; } return 0; } -int test_min_max_image_3d_height(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements) +int test_min_max_image_3d_height(cl_device_id deviceID, cl_context context, + cl_command_queue queue, int num_elements) { int error; size_t 
maxDimension; clMemWrapper streams[1]; - cl_image_format image_format_desc; + cl_image_format image_format_desc; cl_ulong maxAllocSize; - PASSIVE_REQUIRE_3D_IMAGE_SUPPORT( deviceID ) + PASSIVE_REQUIRE_3D_IMAGE_SUPPORT(deviceID) /* Just get any ol format to test with */ error = get_8_bit_image_format(context, CL_MEM_OBJECT_IMAGE3D, CL_MEM_READ_ONLY, 0, &image_format_desc); - test_error( error, "Unable to obtain suitable image format to test with!" ); + test_error(error, "Unable to obtain suitable image format to test with!"); /* Get the max 2d image width */ - error = clGetDeviceInfo( deviceID, CL_DEVICE_IMAGE3D_MAX_HEIGHT, sizeof( maxDimension ), &maxDimension, NULL ); - test_error( error, "Unable to get max image 3d height from device" ); + error = clGetDeviceInfo(deviceID, CL_DEVICE_IMAGE3D_MAX_HEIGHT, + sizeof(maxDimension), &maxDimension, NULL); + test_error(error, "Unable to get max image 3d height from device"); - if( maxDimension < 2048 ) + if (maxDimension < 2048) { - log_error( "ERROR: Reported max image 3d height is less than required! (%d)\n", (int)maxDimension ); + log_error( + "ERROR: Reported max image 3d height is less than required! (%d)\n", + (int)maxDimension); return -1; } log_info("Max reported height is %ld.\n", maxDimension); @@ -769,27 +916,35 @@ int test_min_max_image_3d_height(cl_device_id deviceID, cl_context context, cl_c /* Verify we can use the format */ image_format_desc.image_channel_data_type = CL_UNORM_INT8; image_format_desc.image_channel_order = CL_RGBA; - if (!is_image_format_supported( context, CL_MEM_READ_ONLY, CL_MEM_OBJECT_IMAGE3D, &image_format_desc)) { + if (!is_image_format_supported(context, CL_MEM_READ_ONLY, + CL_MEM_OBJECT_IMAGE3D, &image_format_desc)) + { log_error("CL_UNORM_INT8 CL_RGBA not supported. 
Can not test."); return -1; } /* Verify that we can actually allocate an image that large */ - error = clGetDeviceInfo( deviceID, CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof ( maxAllocSize ), &maxAllocSize, NULL ); - test_error( error, "Unable to get CL_DEVICE_MAX_MEM_ALLOC_SIZE." ); - if ( (cl_ulong)maxDimension*2*4 > maxAllocSize ) { - log_error("Can not allocate a large enough image (min size: %lld bytes, max allowed: %lld bytes) to test.\n", - (cl_ulong)maxDimension*2*4, maxAllocSize); + error = clGetDeviceInfo(deviceID, CL_DEVICE_MAX_MEM_ALLOC_SIZE, + sizeof(maxAllocSize), &maxAllocSize, NULL); + test_error(error, "Unable to get CL_DEVICE_MAX_MEM_ALLOC_SIZE."); + if ((cl_ulong)maxDimension * 2 * 4 > maxAllocSize) + { + log_error("Can not allocate a large enough image (min size: %lld " + "bytes, max allowed: %lld bytes) to test.\n", + (cl_ulong)maxDimension * 2 * 4, maxAllocSize); return -1; } - log_info("Attempting to create an image of size 1 x %d x 2 = %gMB.\n", (int)maxDimension, (2*(float)maxDimension*4/1024.0/1024.0)); + log_info("Attempting to create an image of size 1 x %d x 2 = %gMB.\n", + (int)maxDimension, + (2 * (float)maxDimension * 4 / 1024.0 / 1024.0)); /* Try to allocate a very big image */ - streams[0] = create_image_3d( context, CL_MEM_READ_ONLY, &image_format_desc, 1, maxDimension, 2, 0, 0, NULL, &error ); - if( ( streams[0] == NULL ) || ( error != CL_SUCCESS )) + streams[0] = create_image_3d(context, CL_MEM_READ_ONLY, &image_format_desc, + 1, maxDimension, 2, 0, 0, NULL, &error); + if ((streams[0] == NULL) || (error != CL_SUCCESS)) { - print_error( error, "Image 3D creation failed for maximum height" ); + print_error(error, "Image 3D creation failed for maximum height"); return -1; } @@ -797,29 +952,33 @@ int test_min_max_image_3d_height(cl_device_id deviceID, cl_context context, cl_c } -int test_min_max_image_3d_depth(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements) +int test_min_max_image_3d_depth(cl_device_id 
deviceID, cl_context context, + cl_command_queue queue, int num_elements) { int error; size_t maxDimension; clMemWrapper streams[1]; - cl_image_format image_format_desc; + cl_image_format image_format_desc; cl_ulong maxAllocSize; - PASSIVE_REQUIRE_3D_IMAGE_SUPPORT( deviceID ) + PASSIVE_REQUIRE_3D_IMAGE_SUPPORT(deviceID) /* Just get any ol format to test with */ error = get_8_bit_image_format(context, CL_MEM_OBJECT_IMAGE3D, CL_MEM_READ_ONLY, 0, &image_format_desc); - test_error( error, "Unable to obtain suitable image format to test with!" ); + test_error(error, "Unable to obtain suitable image format to test with!"); /* Get the max 2d image width */ - error = clGetDeviceInfo( deviceID, CL_DEVICE_IMAGE3D_MAX_DEPTH, sizeof( maxDimension ), &maxDimension, NULL ); - test_error( error, "Unable to get max image 3d depth from device" ); + error = clGetDeviceInfo(deviceID, CL_DEVICE_IMAGE3D_MAX_DEPTH, + sizeof(maxDimension), &maxDimension, NULL); + test_error(error, "Unable to get max image 3d depth from device"); - if( maxDimension < 2048 ) + if (maxDimension < 2048) { - log_error( "ERROR: Reported max image 3d depth is less than required! (%d)\n", (int)maxDimension ); + log_error( + "ERROR: Reported max image 3d depth is less than required! (%d)\n", + (int)maxDimension); return -1; } log_info("Max reported depth is %ld.\n", maxDimension); @@ -827,55 +986,67 @@ int test_min_max_image_3d_depth(cl_device_id deviceID, cl_context context, cl_co /* Verify we can use the format */ image_format_desc.image_channel_data_type = CL_UNORM_INT8; image_format_desc.image_channel_order = CL_RGBA; - if (!is_image_format_supported( context, CL_MEM_READ_ONLY, CL_MEM_OBJECT_IMAGE3D, &image_format_desc)) { + if (!is_image_format_supported(context, CL_MEM_READ_ONLY, + CL_MEM_OBJECT_IMAGE3D, &image_format_desc)) + { log_error("CL_UNORM_INT8 CL_RGBA not supported. 
Can not test."); return -1; } /* Verify that we can actually allocate an image that large */ - error = clGetDeviceInfo( deviceID, CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof ( maxAllocSize ), &maxAllocSize, NULL ); - test_error( error, "Unable to get CL_DEVICE_MAX_MEM_ALLOC_SIZE." ); - if ( (cl_ulong)maxDimension*1*4 > maxAllocSize ) { - log_error("Can not allocate a large enough image (min size: %lld bytes, max allowed: %lld bytes) to test.\n", - (cl_ulong)maxDimension*1*4, maxAllocSize); + error = clGetDeviceInfo(deviceID, CL_DEVICE_MAX_MEM_ALLOC_SIZE, + sizeof(maxAllocSize), &maxAllocSize, NULL); + test_error(error, "Unable to get CL_DEVICE_MAX_MEM_ALLOC_SIZE."); + if ((cl_ulong)maxDimension * 1 * 4 > maxAllocSize) + { + log_error("Can not allocate a large enough image (min size: %lld " + "bytes, max allowed: %lld bytes) to test.\n", + (cl_ulong)maxDimension * 1 * 4, maxAllocSize); return -1; } - log_info("Attempting to create an image of size 1 x 1 x %d = %gMB.\n", (int)maxDimension, ((float)maxDimension*4/1024.0/1024.0)); + log_info("Attempting to create an image of size 1 x 1 x %d = %gMB.\n", + (int)maxDimension, ((float)maxDimension * 4 / 1024.0 / 1024.0)); /* Try to allocate a very big image */ - streams[0] = create_image_3d( context, CL_MEM_READ_ONLY, &image_format_desc, 1, 1, maxDimension, 0, 0, NULL, &error ); - if( ( streams[0] == NULL ) || ( error != CL_SUCCESS )) + streams[0] = create_image_3d(context, CL_MEM_READ_ONLY, &image_format_desc, + 1, 1, maxDimension, 0, 0, NULL, &error); + if ((streams[0] == NULL) || (error != CL_SUCCESS)) { - print_error( error, "Image 3D creation failed for maximum depth" ); + print_error(error, "Image 3D creation failed for maximum depth"); return -1; } return 0; } -int test_min_max_image_array_size(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements) +int test_min_max_image_array_size(cl_device_id deviceID, cl_context context, + cl_command_queue queue, int num_elements) { int error; size_t 
maxDimension; clMemWrapper streams[1]; - cl_image_format image_format_desc; + cl_image_format image_format_desc; cl_ulong maxAllocSize; size_t minRequiredDimension = gIsEmbedded ? 256 : 2048; - PASSIVE_REQUIRE_IMAGE_SUPPORT( deviceID ); + PASSIVE_REQUIRE_IMAGE_SUPPORT(deviceID); /* Just get any ol format to test with */ - error = get_8_bit_image_format( context, CL_MEM_OBJECT_IMAGE2D_ARRAY, CL_MEM_READ_WRITE, 0, &image_format_desc ); - test_error( error, "Unable to obtain suitable image format to test with!" ); + error = get_8_bit_image_format(context, CL_MEM_OBJECT_IMAGE2D_ARRAY, + CL_MEM_READ_WRITE, 0, &image_format_desc); + test_error(error, "Unable to obtain suitable image format to test with!"); /* Get the max image array width */ - error = clGetDeviceInfo( deviceID, CL_DEVICE_IMAGE_MAX_ARRAY_SIZE, sizeof( maxDimension ), &maxDimension, NULL ); - test_error( error, "Unable to get max image array size from device" ); + error = clGetDeviceInfo(deviceID, CL_DEVICE_IMAGE_MAX_ARRAY_SIZE, + sizeof(maxDimension), &maxDimension, NULL); + test_error(error, "Unable to get max image array size from device"); - if( maxDimension < minRequiredDimension ) + if (maxDimension < minRequiredDimension) { - log_error( "ERROR: Reported max image array size is less than required! (%d)\n", (int)maxDimension ); + log_error("ERROR: Reported max image array size is less than required! 
" + "(%d)\n", + (int)maxDimension); return -1; } log_info("Max reported image array size is %ld.\n", maxDimension); @@ -883,96 +1054,127 @@ int test_min_max_image_array_size(cl_device_id deviceID, cl_context context, cl_ /* Verify we can use the format */ image_format_desc.image_channel_data_type = CL_UNORM_INT8; image_format_desc.image_channel_order = CL_RGBA; - if (!is_image_format_supported( context, CL_MEM_READ_ONLY, CL_MEM_OBJECT_IMAGE2D_ARRAY, &image_format_desc)) { + if (!is_image_format_supported(context, CL_MEM_READ_ONLY, + CL_MEM_OBJECT_IMAGE2D_ARRAY, + &image_format_desc)) + { log_error("CL_UNORM_INT8 CL_RGBA not supported. Can not test."); return -1; } /* Verify that we can actually allocate an image that large */ - error = clGetDeviceInfo( deviceID, CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof ( maxAllocSize ), &maxAllocSize, NULL ); - test_error( error, "Unable to get CL_DEVICE_MAX_MEM_ALLOC_SIZE." ); - if ( (cl_ulong)maxDimension*1*4 > maxAllocSize ) { - log_error("Can not allocate a large enough image (min size: %lld bytes, max allowed: %lld bytes) to test.\n", - (cl_ulong)maxDimension*1*4, maxAllocSize); + error = clGetDeviceInfo(deviceID, CL_DEVICE_MAX_MEM_ALLOC_SIZE, + sizeof(maxAllocSize), &maxAllocSize, NULL); + test_error(error, "Unable to get CL_DEVICE_MAX_MEM_ALLOC_SIZE."); + if ((cl_ulong)maxDimension * 1 * 4 > maxAllocSize) + { + log_error("Can not allocate a large enough image (min size: %lld " + "bytes, max allowed: %lld bytes) to test.\n", + (cl_ulong)maxDimension * 1 * 4, maxAllocSize); return -1; } - log_info("Attempting to create an image of size 1 x 1 x %d = %gMB.\n", (int)maxDimension, ((float)maxDimension*4/1024.0/1024.0)); + log_info("Attempting to create an image of size 1 x 1 x %d = %gMB.\n", + (int)maxDimension, ((float)maxDimension * 4 / 1024.0 / 1024.0)); /* Try to allocate a very big image */ - streams[0] = create_image_2d_array( context, CL_MEM_READ_ONLY, &image_format_desc, 1, 1, maxDimension, 0, 0, NULL, &error ); - if( ( 
streams[0] == NULL ) || ( error != CL_SUCCESS )) + streams[0] = + create_image_2d_array(context, CL_MEM_READ_ONLY, &image_format_desc, 1, + 1, maxDimension, 0, 0, NULL, &error); + if ((streams[0] == NULL) || (error != CL_SUCCESS)) { - print_error( error, "2D Image Array creation failed for maximum array size" ); + print_error(error, + "2D Image Array creation failed for maximum array size"); return -1; } return 0; } -int test_min_max_image_buffer_size(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements) +int test_min_max_image_buffer_size(cl_device_id deviceID, cl_context context, + cl_command_queue queue, int num_elements) { int error; size_t maxDimensionPixels; clMemWrapper streams[2]; - cl_image_format image_format_desc = {0}; + cl_image_format image_format_desc = { 0 }; cl_ulong maxAllocSize; size_t minRequiredDimension = gIsEmbedded ? 2048 : 65536; unsigned int i = 0; size_t pixelBytes = 0; - PASSIVE_REQUIRE_IMAGE_SUPPORT( deviceID ); + PASSIVE_REQUIRE_IMAGE_SUPPORT(deviceID); /* Get the max memory allocation size */ - error = clGetDeviceInfo( deviceID, CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof ( maxAllocSize ), &maxAllocSize, NULL ); - test_error( error, "Unable to get CL_DEVICE_MAX_MEM_ALLOC_SIZE." 
); + error = clGetDeviceInfo(deviceID, CL_DEVICE_MAX_MEM_ALLOC_SIZE, + sizeof(maxAllocSize), &maxAllocSize, NULL); + test_error(error, "Unable to get CL_DEVICE_MAX_MEM_ALLOC_SIZE."); /* Get the max image array width */ - error = clGetDeviceInfo( deviceID, CL_DEVICE_IMAGE_MAX_BUFFER_SIZE, sizeof( maxDimensionPixels ), &maxDimensionPixels, NULL ); - test_error( error, "Unable to get max image buffer size from device" ); + error = + clGetDeviceInfo(deviceID, CL_DEVICE_IMAGE_MAX_BUFFER_SIZE, + sizeof(maxDimensionPixels), &maxDimensionPixels, NULL); + test_error(error, "Unable to get max image buffer size from device"); - if( maxDimensionPixels < minRequiredDimension ) + if (maxDimensionPixels < minRequiredDimension) { - log_error( "ERROR: Reported max image buffer size is less than required! (%d)\n", (int)maxDimensionPixels ); + log_error("ERROR: Reported max image buffer size is less than " + "required! (%d)\n", + (int)maxDimensionPixels); return -1; } - log_info("Max reported image buffer size is %ld pixels.\n", maxDimensionPixels); + log_info("Max reported image buffer size is %ld pixels.\n", + maxDimensionPixels); pixelBytes = maxAllocSize / maxDimensionPixels; - if ( pixelBytes == 0 ) + if (pixelBytes == 0) { - log_error( "Value of CL_DEVICE_IMAGE_MAX_BUFFER_SIZE is greater than CL_MAX_MEM_ALLOC_SIZE so there is no way to allocate image of maximum size!\n" ); + log_error("Value of CL_DEVICE_IMAGE_MAX_BUFFER_SIZE is greater than " + "CL_MAX_MEM_ALLOC_SIZE so there is no way to allocate image " + "of maximum size!\n"); return -1; } error = -1; - for ( i = pixelBytes; i > 0; --i ) + for (i = pixelBytes; i > 0; --i) { - error = get_8_bit_image_format( context, CL_MEM_OBJECT_IMAGE1D, CL_MEM_READ_ONLY, i, &image_format_desc ); - if ( error == CL_SUCCESS ) + error = get_8_bit_image_format(context, CL_MEM_OBJECT_IMAGE1D, + CL_MEM_READ_ONLY, i, &image_format_desc); + if (error == CL_SUCCESS) { pixelBytes = i; break; } } - test_error( error, "Device does not support format 
to be used to allocate image of CL_DEVICE_IMAGE_MAX_BUFFER_SIZE\n" ); + test_error(error, + "Device does not support format to be used to allocate image of " + "CL_DEVICE_IMAGE_MAX_BUFFER_SIZE\n"); - log_info("Attempting to create an 1D image with channel order %s from buffer of size %d = %gMB.\n", - GetChannelOrderName( image_format_desc.image_channel_order ), (int)maxDimensionPixels, ((float)maxDimensionPixels*pixelBytes/1024.0/1024.0)); + log_info("Attempting to create an 1D image with channel order %s from " + "buffer of size %d = %gMB.\n", + GetChannelOrderName(image_format_desc.image_channel_order), + (int)maxDimensionPixels, + ((float)maxDimensionPixels * pixelBytes / 1024.0 / 1024.0)); /* Try to allocate a buffer */ - streams[0] = clCreateBuffer( context, CL_MEM_READ_ONLY, maxDimensionPixels*pixelBytes, NULL, &error ); - if( ( streams[0] == NULL ) || ( error != CL_SUCCESS )) + streams[0] = clCreateBuffer(context, CL_MEM_READ_ONLY, + maxDimensionPixels * pixelBytes, NULL, &error); + if ((streams[0] == NULL) || (error != CL_SUCCESS)) { - print_error( error, "Buffer creation failed for maximum image buffer size" ); + print_error(error, + "Buffer creation failed for maximum image buffer size"); return -1; } /* Try to allocate a 1D image array from buffer */ - streams[1] = create_image_1d( context, CL_MEM_READ_ONLY, &image_format_desc, maxDimensionPixels, 0, NULL, streams[0], &error ); - if( ( streams[0] == NULL ) || ( error != CL_SUCCESS )) - { - print_error( error, "1D Image from buffer creation failed for maximum image buffer size" ); + streams[1] = + create_image_1d(context, CL_MEM_READ_ONLY, &image_format_desc, + maxDimensionPixels, 0, NULL, streams[0], &error); + if ((streams[0] == NULL) || (error != CL_SUCCESS)) + { + print_error(error, + "1D Image from buffer creation failed for maximum image " + "buffer size"); return -1; } @@ -980,8 +1182,8 @@ int test_min_max_image_buffer_size(cl_device_id deviceID, cl_context context, cl } - -int 
test_min_max_parameter_size(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements) +int test_min_max_parameter_size(cl_device_id deviceID, cl_context context, + cl_command_queue queue, int num_elements) { int error, retVal, i; size_t maxSize; @@ -1000,62 +1202,78 @@ int test_min_max_parameter_size(cl_device_id deviceID, cl_context context, cl_co /* Get the max param size */ - error = clGetDeviceInfo( deviceID, CL_DEVICE_MAX_PARAMETER_SIZE, sizeof( maxSize ), &maxSize, NULL ); - test_error( error, "Unable to get max parameter size from device" ); + error = clGetDeviceInfo(deviceID, CL_DEVICE_MAX_PARAMETER_SIZE, + sizeof(maxSize), &maxSize, NULL); + test_error(error, "Unable to get max parameter size from device"); - if( ((!gIsEmbedded) && (maxSize < 1024)) || ((gIsEmbedded) && (maxSize < 256)) ) + if (((!gIsEmbedded) && (maxSize < 1024)) + || ((gIsEmbedded) && (maxSize < 256))) { - log_error( "ERROR: Reported max parameter size is less than required! (%d)\n", (int)maxSize ); + log_error( + "ERROR: Reported max parameter size is less than required! 
(%d)\n", + (int)maxSize); return -1; } /* The embedded profile without cles_khr_int64 extension does not require * longs, so use ints */ if (embeddedNoLong) - numberOfIntParametersToTry = numberExpected = (maxSize-sizeof(cl_mem))/sizeof(cl_int); + numberOfIntParametersToTry = numberExpected = + (maxSize - sizeof(cl_mem)) / sizeof(cl_int); else - numberOfIntParametersToTry = numberExpected = (maxSize-sizeof(cl_mem))/sizeof(cl_long); + numberOfIntParametersToTry = numberExpected = + (maxSize - sizeof(cl_mem)) / sizeof(cl_long); - decrement = (size_t)(numberOfIntParametersToTry/8); - if (decrement < 1) - decrement = 1; + decrement = (size_t)(numberOfIntParametersToTry / 8); + if (decrement < 1) decrement = 1; log_info("Reported max parameter size of %d bytes.\n", (int)maxSize); - while (numberOfIntParametersToTry > 0) { - // These need to be inside to be deallocated automatically on each loop iteration. + while (numberOfIntParametersToTry > 0) + { + // These need to be inside to be deallocated automatically on each loop + // iteration. 
clProgramWrapper program; clMemWrapper mem; clKernelWrapper kernel; if (embeddedNoLong) { - log_info("Trying a kernel with %ld int arguments (%ld bytes) and one cl_mem (%ld bytes) for %ld bytes total.\n", - numberOfIntParametersToTry, sizeof(cl_int)*numberOfIntParametersToTry, sizeof(cl_mem), - sizeof(cl_mem)+numberOfIntParametersToTry*sizeof(cl_int)); + log_info( + "Trying a kernel with %ld int arguments (%ld bytes) and one " + "cl_mem (%ld bytes) for %ld bytes total.\n", + numberOfIntParametersToTry, + sizeof(cl_int) * numberOfIntParametersToTry, sizeof(cl_mem), + sizeof(cl_mem) + numberOfIntParametersToTry * sizeof(cl_int)); } else { - log_info("Trying a kernel with %ld long arguments (%ld bytes) and one cl_mem (%ld bytes) for %ld bytes total.\n", - numberOfIntParametersToTry, sizeof(cl_long)*numberOfIntParametersToTry, sizeof(cl_mem), - sizeof(cl_mem)+numberOfIntParametersToTry*sizeof(cl_long)); + log_info( + "Trying a kernel with %ld long arguments (%ld bytes) and one " + "cl_mem (%ld bytes) for %ld bytes total.\n", + numberOfIntParametersToTry, + sizeof(cl_long) * numberOfIntParametersToTry, sizeof(cl_mem), + sizeof(cl_mem) + numberOfIntParametersToTry * sizeof(cl_long)); } // Allocate memory for the program storage - data = malloc(sizeof(cl_long)*numberOfIntParametersToTry); - - argumentLine = (char*)malloc(sizeof(char)*numberOfIntParametersToTry*32); - codeLines = (char*)malloc(sizeof(char)*numberOfIntParametersToTry*32); - programSrc = (char*)malloc(sizeof(char)*(numberOfIntParametersToTry*64+1024)); + data = malloc(sizeof(cl_long) * numberOfIntParametersToTry); + + argumentLine = + (char *)malloc(sizeof(char) * numberOfIntParametersToTry * 32); + codeLines = + (char *)malloc(sizeof(char) * numberOfIntParametersToTry * 32); + programSrc = (char *)malloc(sizeof(char) + * (numberOfIntParametersToTry * 64 + 1024)); argumentLine[0] = '\0'; codeLines[0] = '\0'; programSrc[0] = '\0'; // Generate our results expectedResult = 0; - for (i=0; 
i<(int)numberOfIntParametersToTry; i++) - { - if( gHasLong ) + for (i = 0; i < (int)numberOfIntParametersToTry; i++) + { + if (gHasLong) { ((cl_long *)data)[i] = i; expectedResult += i; @@ -1068,30 +1286,35 @@ int test_min_max_parameter_size(cl_device_id deviceID, cl_context context, cl_co } // Build the program - if( gHasLong) + if (gHasLong) sprintf(argumentLine, "%s", "long arg0"); else sprintf(argumentLine, "%s", "int arg0"); sprintf(codeLines, "%s", "result[0] += arg0;"); - for (i=1; i<(int)numberOfIntParametersToTry; i++) + for (i = 1; i < (int)numberOfIntParametersToTry; i++) { - if( gHasLong) - sprintf(argumentLine + strlen( argumentLine), ", long arg%d", i); + if (gHasLong) + sprintf(argumentLine + strlen(argumentLine), ", long arg%d", i); else - sprintf(argumentLine + strlen( argumentLine), ", int arg%d", i); + sprintf(argumentLine + strlen(argumentLine), ", int arg%d", i); - sprintf(codeLines + strlen( codeLines), "\nresult[0] += arg%d;", i); + sprintf(codeLines + strlen(codeLines), "\nresult[0] += arg%d;", i); } /* Create a kernel to test with */ - sprintf( programSrc, gHasLong ? sample_large_parmam_kernel_pattern[0]: - sample_large_int_parmam_kernel_pattern[0], argumentLine, codeLines); + sprintf(programSrc, + gHasLong ? 
sample_large_parmam_kernel_pattern[0] + : sample_large_int_parmam_kernel_pattern[0], + argumentLine, codeLines); ptr = programSrc; - if( create_single_kernel_helper( context, &program, &kernel, 1, (const char **)&ptr, "sample_test" ) != 0 ) + if (create_single_kernel_helper(context, &program, &kernel, 1, + (const char **)&ptr, "sample_test") + != 0) { - log_info("Create program failed, decrementing number of parameters to try.\n"); + log_info("Create program failed, decrementing number of parameters " + "to try.\n"); numberOfIntParametersToTry -= decrement; continue; } @@ -1103,88 +1326,119 @@ int test_min_max_parameter_size(cl_device_id deviceID, cl_context context, cl_co &error); test_error(error, "clCreateBuffer failed"); - for (i=0; i<(int)numberOfIntParametersToTry; i++) { - if(gHasLong) - error = clSetKernelArg(kernel, i, sizeof(cl_long), &(((cl_long*)data)[i])); + for (i = 0; i < (int)numberOfIntParametersToTry; i++) + { + if (gHasLong) + error = clSetKernelArg(kernel, i, sizeof(cl_long), + &(((cl_long *)data)[i])); else - error = clSetKernelArg(kernel, i, sizeof(cl_int), &(((cl_int*)data)[i])); + error = clSetKernelArg(kernel, i, sizeof(cl_int), + &(((cl_int *)data)[i])); - if (error != CL_SUCCESS) { - log_info( "clSetKernelArg failed (%s), decrementing number of parameters to try.\n", IGetErrorString(error)); + if (error != CL_SUCCESS) + { + log_info("clSetKernelArg failed (%s), decrementing number of " + "parameters to try.\n", + IGetErrorString(error)); numberOfIntParametersToTry -= decrement; break; } } - if (error != CL_SUCCESS) - continue; + if (error != CL_SUCCESS) continue; error = clSetKernelArg(kernel, i, sizeof(cl_mem), &mem); - if (error != CL_SUCCESS) { - log_info( "clSetKernelArg failed (%s), decrementing number of parameters to try.\n", IGetErrorString(error)); + if (error != CL_SUCCESS) + { + log_info("clSetKernelArg failed (%s), decrementing number of " + "parameters to try.\n", + IGetErrorString(error)); numberOfIntParametersToTry -= 
decrement; continue; } - size_t globalDim[3]={1,1,1}, localDim[3]={1,1,1}; - error = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, globalDim, localDim, 0, NULL, &event); - if (error != CL_SUCCESS) { - log_info( "clEnqueueNDRangeKernel failed (%s), decrementing number of parameters to try.\n", IGetErrorString(error)); + size_t globalDim[3] = { 1, 1, 1 }, localDim[3] = { 1, 1, 1 }; + error = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, globalDim, + localDim, 0, NULL, &event); + if (error != CL_SUCCESS) + { + log_info("clEnqueueNDRangeKernel failed (%s), decrementing number " + "of parameters to try.\n", + IGetErrorString(error)); numberOfIntParametersToTry -= decrement; continue; } // Verify that the event does not return an error from the execution error = clWaitForEvents(1, &event); - test_error( error, "clWaitForEvent failed"); - error = clGetEventInfo(event, CL_EVENT_COMMAND_EXECUTION_STATUS, sizeof(event_status), &event_status, NULL); - test_error( error, "clGetEventInfo for CL_EVENT_COMMAND_EXECUTION_STATUS failed"); + test_error(error, "clWaitForEvent failed"); + error = clGetEventInfo(event, CL_EVENT_COMMAND_EXECUTION_STATUS, + sizeof(event_status), &event_status, NULL); + test_error( + error, + "clGetEventInfo for CL_EVENT_COMMAND_EXECUTION_STATUS failed"); clReleaseEvent(event); if (event_status < 0) test_error(error, "Kernel execution event returned error"); - if(gHasLong) - error = clEnqueueReadBuffer(queue, mem, CL_TRUE, 0, sizeof(cl_long), &long_result, 0, NULL, NULL); + if (gHasLong) + error = clEnqueueReadBuffer(queue, mem, CL_TRUE, 0, sizeof(cl_long), + &long_result, 0, NULL, NULL); else - error = clEnqueueReadBuffer(queue, mem, CL_TRUE, 0, sizeof(cl_int), &int_result, 0, NULL, NULL); + error = clEnqueueReadBuffer(queue, mem, CL_TRUE, 0, sizeof(cl_int), + &int_result, 0, NULL, NULL); test_error(error, "clEnqueueReadBuffer failed") - free(data); + free(data); free(argumentLine); free(codeLines); free(programSrc); - if(gHasLong) + if (gHasLong) { - 
if (long_result != expectedResult) { - log_error("Expected result (%lld) does not equal actual result (%lld).\n", expectedResult, long_result); + if (long_result != expectedResult) + { + log_error("Expected result (%lld) does not equal actual result " + "(%lld).\n", + expectedResult, long_result); numberOfIntParametersToTry -= decrement; continue; - } else { - log_info("Results verified at %ld bytes of arguments.\n", sizeof(cl_mem)+numberOfIntParametersToTry*sizeof(cl_long)); + } + else + { + log_info("Results verified at %ld bytes of arguments.\n", + sizeof(cl_mem) + + numberOfIntParametersToTry * sizeof(cl_long)); break; } } else { - if (int_result != expectedResult) { - log_error("Expected result (%lld) does not equal actual result (%d).\n", expectedResult, int_result); + if (int_result != expectedResult) + { + log_error("Expected result (%lld) does not equal actual result " + "(%d).\n", + expectedResult, int_result); numberOfIntParametersToTry -= decrement; continue; - } else { - log_info("Results verified at %ld bytes of arguments.\n", sizeof(cl_mem)+numberOfIntParametersToTry*sizeof(cl_int)); + } + else + { + log_info("Results verified at %ld bytes of arguments.\n", + sizeof(cl_mem) + + numberOfIntParametersToTry * sizeof(cl_int)); break; } } } - if (numberOfIntParametersToTry == (long)numberExpected) - return 0; + if (numberOfIntParametersToTry == (long)numberExpected) return 0; return -1; } -int test_min_max_samplers(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements) +int test_min_max_samplers(cl_device_id deviceID, cl_context context, + cl_command_queue queue, int num_elements) { int error; cl_uint maxSamplers, i; @@ -1197,104 +1451,124 @@ int test_min_max_samplers(cl_device_id deviceID, cl_context context, cl_command_ cl_uint minRequiredSamplers = gIsEmbedded ? 
8 : 16; - PASSIVE_REQUIRE_IMAGE_SUPPORT( deviceID ) + PASSIVE_REQUIRE_IMAGE_SUPPORT(deviceID) /* Get the max value */ - error = clGetDeviceInfo( deviceID, CL_DEVICE_MAX_SAMPLERS, sizeof( maxSamplers ), &maxSamplers, NULL ); - test_error( error, "Unable to get max sampler count from device" ); + error = clGetDeviceInfo(deviceID, CL_DEVICE_MAX_SAMPLERS, + sizeof(maxSamplers), &maxSamplers, NULL); + test_error(error, "Unable to get max sampler count from device"); - if( maxSamplers < minRequiredSamplers ) + if (maxSamplers < minRequiredSamplers) { - log_error( "ERROR: Reported max sampler count is less than required! (%d)\n", (int)maxSamplers ); + log_error( + "ERROR: Reported max sampler count is less than required! (%d)\n", + (int)maxSamplers); return -1; } log_info("Reported max %d samplers.\n", maxSamplers); - error = clGetDeviceInfo( deviceID, CL_DEVICE_MAX_PARAMETER_SIZE, sizeof( maxParameterSize ), &maxParameterSize, NULL ); - test_error( error, "Unable to get max parameter size from device" ); + error = clGetDeviceInfo(deviceID, CL_DEVICE_MAX_PARAMETER_SIZE, + sizeof(maxParameterSize), &maxParameterSize, NULL); + test_error(error, "Unable to get max parameter size from device"); // Subtract the size of the result - maxParameterSize -= 2*sizeof(cl_mem); + maxParameterSize -= 2 * sizeof(cl_mem); // Calculate the number we can use - if (maxParameterSize/sizeof(cl_sampler) < maxSamplers) { - log_info("WARNING: Max parameter size of %d bytes limits test to %d max sampler arguments.\n", (int)maxParameterSize, (int)(maxParameterSize/sizeof(cl_sampler))); - maxSamplers = (unsigned int)(maxParameterSize/sizeof(cl_sampler)); + if (maxParameterSize / sizeof(cl_sampler) < maxSamplers) + { + log_info("WARNING: Max parameter size of %d bytes limits test to %d " + "max sampler arguments.\n", + (int)maxParameterSize, + (int)(maxParameterSize / sizeof(cl_sampler))); + maxSamplers = (unsigned int)(maxParameterSize / sizeof(cl_sampler)); } /* Create a kernel to test with */ - 
programSrc = (char *)malloc( ( strlen( sample_sampler_kernel_pattern[ 1 ] ) + 8 ) * ( maxSamplers ) + - strlen( sample_sampler_kernel_pattern[ 0 ] ) + strlen( sample_sampler_kernel_pattern[ 2 ] ) + - ( strlen( sample_sampler_kernel_pattern[ 3 ] ) + 8 ) * maxSamplers + - strlen( sample_sampler_kernel_pattern[ 4 ] ) ); - strcpy( programSrc, sample_sampler_kernel_pattern[ 0 ] ); - for( i = 0; i < maxSamplers; i++ ) + programSrc = (char *)malloc( + (strlen(sample_sampler_kernel_pattern[1]) + 8) * (maxSamplers) + + strlen(sample_sampler_kernel_pattern[0]) + + strlen(sample_sampler_kernel_pattern[2]) + + (strlen(sample_sampler_kernel_pattern[3]) + 8) * maxSamplers + + strlen(sample_sampler_kernel_pattern[4])); + strcpy(programSrc, sample_sampler_kernel_pattern[0]); + for (i = 0; i < maxSamplers; i++) { - sprintf( samplerLine, sample_sampler_kernel_pattern[ 1 ], i ); - strcat( programSrc, samplerLine ); + sprintf(samplerLine, sample_sampler_kernel_pattern[1], i); + strcat(programSrc, samplerLine); } - strcat( programSrc, sample_sampler_kernel_pattern[ 2 ] ); - for( i = 0; i < maxSamplers; i++ ) + strcat(programSrc, sample_sampler_kernel_pattern[2]); + for (i = 0; i < maxSamplers; i++) { - sprintf( samplerLine, sample_sampler_kernel_pattern[ 3 ], i ); - strcat( programSrc, samplerLine ); + sprintf(samplerLine, sample_sampler_kernel_pattern[3], i); + strcat(programSrc, samplerLine); } - strcat( programSrc, sample_sampler_kernel_pattern[ 4 ] ); + strcat(programSrc, sample_sampler_kernel_pattern[4]); - error = create_single_kernel_helper(context, &program, &kernel, 1, (const char **)&programSrc, "sample_test"); - test_error( error, "Failed to create the program and kernel."); + error = + create_single_kernel_helper(context, &program, &kernel, 1, + (const char **)&programSrc, "sample_test"); + test_error(error, "Failed to create the program and kernel."); // We have to set up some fake parameters so it'll work clSamplerWrapper *samplers = new clSamplerWrapper[maxSamplers]; 
cl_image_format format = { CL_RGBA, CL_SIGNED_INT8 }; - clMemWrapper image = create_image_2d( context, CL_MEM_READ_WRITE, &format, 16, 16, 0, NULL, &error ); - test_error( error, "Unable to create a test image" ); + clMemWrapper image = create_image_2d(context, CL_MEM_READ_WRITE, &format, + 16, 16, 0, NULL, &error); + test_error(error, "Unable to create a test image"); clMemWrapper stream = clCreateBuffer(context, CL_MEM_READ_WRITE, 16, NULL, &error); - test_error( error, "Unable to create test buffer" ); + test_error(error, "Unable to create test buffer"); - error = clSetKernelArg( kernel, 0, sizeof( cl_mem ), &image ); - error |= clSetKernelArg( kernel, 1, sizeof( cl_mem ), &stream ); - test_error( error, "Unable to set kernel arguments" ); - for( i = 0; i < maxSamplers; i++ ) + error = clSetKernelArg(kernel, 0, sizeof(cl_mem), &image); + error |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &stream); + test_error(error, "Unable to set kernel arguments"); + for (i = 0; i < maxSamplers; i++) { - samplers[ i ] = clCreateSampler( context, CL_FALSE, CL_ADDRESS_NONE, CL_FILTER_NEAREST, &error ); - test_error( error, "Unable to create sampler" ); + samplers[i] = clCreateSampler(context, CL_FALSE, CL_ADDRESS_NONE, + CL_FILTER_NEAREST, &error); + test_error(error, "Unable to create sampler"); - error = clSetKernelArg( kernel, 2 + i, sizeof( cl_sampler ), &samplers[ i ] ); - test_error( error, "Unable to set sampler argument" ); + error = clSetKernelArg(kernel, 2 + i, sizeof(cl_sampler), &samplers[i]); + test_error(error, "Unable to set sampler argument"); } - size_t globalDim[3]={1,1,1}, localDim[3]={1,1,1}; - error = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, globalDim, localDim, 0, NULL, &event); - test_error(error, "clEnqueueNDRangeKernel failed with maximum number of samplers."); + size_t globalDim[3] = { 1, 1, 1 }, localDim[3] = { 1, 1, 1 }; + error = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, globalDim, localDim, + 0, NULL, &event); + test_error( + error, + 
"clEnqueueNDRangeKernel failed with maximum number of samplers."); // Verify that the event does not return an error from the execution error = clWaitForEvents(1, &event); - test_error( error, "clWaitForEvent failed"); - error = clGetEventInfo(event, CL_EVENT_COMMAND_EXECUTION_STATUS, sizeof(event_status), &event_status, NULL); - test_error( error, "clGetEventInfo for CL_EVENT_COMMAND_EXECUTION_STATUS failed"); + test_error(error, "clWaitForEvent failed"); + error = clGetEventInfo(event, CL_EVENT_COMMAND_EXECUTION_STATUS, + sizeof(event_status), &event_status, NULL); + test_error(error, + "clGetEventInfo for CL_EVENT_COMMAND_EXECUTION_STATUS failed"); clReleaseEvent(event); if (event_status < 0) test_error(error, "Kernel execution event returned error"); - free( programSrc ); + free(programSrc); delete[] samplers; return 0; } #define PASSING_FRACTION 4 -int test_min_max_constant_buffer_size(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements) +int test_min_max_constant_buffer_size(cl_device_id deviceID, cl_context context, + cl_command_queue queue, int num_elements) { int error; clProgramWrapper program; clKernelWrapper kernel; - size_t threads[1], localThreads[1]; + size_t threads[1], localThreads[1]; cl_int *constantData, *resultData; cl_ulong maxSize, stepSize, currentSize, maxGlobalSize, maxAllocSize; int i; @@ -1303,48 +1577,56 @@ int test_min_max_constant_buffer_size(cl_device_id deviceID, cl_context context, MTdata d; /* Verify our test buffer won't be bigger than allowed */ - error = clGetDeviceInfo( deviceID, CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE, sizeof( maxSize ), &maxSize, 0 ); - test_error( error, "Unable to get max constant buffer size" ); + error = clGetDeviceInfo(deviceID, CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE, + sizeof(maxSize), &maxSize, 0); + test_error(error, "Unable to get max constant buffer size"); - if( ( 0 == gIsEmbedded && maxSize < 64L * 1024L ) || maxSize < 1L * 1024L ) + if ((0 == gIsEmbedded && maxSize < 64L * 
1024L) || maxSize < 1L * 1024L) { - log_error( "ERROR: Reported max constant buffer size less than required by OpenCL 1.0 (reported %d KB)\n", (int)( maxSize / 1024L ) ); + log_error("ERROR: Reported max constant buffer size less than required " + "by OpenCL 1.0 (reported %d KB)\n", + (int)(maxSize / 1024L)); return -1; } log_info("Reported max constant buffer size of %lld bytes.\n", maxSize); // Limit test buffer size to 1/8 of CL_DEVICE_GLOBAL_MEM_SIZE - error = clGetDeviceInfo(deviceID, CL_DEVICE_GLOBAL_MEM_SIZE, sizeof(maxGlobalSize), &maxGlobalSize, 0); + error = clGetDeviceInfo(deviceID, CL_DEVICE_GLOBAL_MEM_SIZE, + sizeof(maxGlobalSize), &maxGlobalSize, 0); test_error(error, "Unable to get CL_DEVICE_GLOBAL_MEM_SIZE"); - if (maxSize > maxGlobalSize / 8) - maxSize = maxGlobalSize / 8; + if (maxSize > maxGlobalSize / 8) maxSize = maxGlobalSize / 8; - error = clGetDeviceInfo(deviceID, CL_DEVICE_MAX_MEM_ALLOC_SIZE , sizeof(maxAllocSize), &maxAllocSize, 0); + error = clGetDeviceInfo(deviceID, CL_DEVICE_MAX_MEM_ALLOC_SIZE, + sizeof(maxAllocSize), &maxAllocSize, 0); test_error(error, "Unable to get CL_DEVICE_MAX_MEM_ALLOC_SIZE "); - - if (maxSize > maxAllocSize) - maxSize = maxAllocSize; - + + if (maxSize > maxAllocSize) maxSize = maxAllocSize; + /* Create a kernel to test with */ - if( create_single_kernel_helper( context, &program, &kernel, 1, sample_const_arg_kernel, "sample_test" ) != 0 ) + if (create_single_kernel_helper(context, &program, &kernel, 1, + sample_const_arg_kernel, "sample_test") + != 0) { return -1; } /* Try the returned max size and decrease it until we get one that works. 
*/ - stepSize = maxSize/16; + stepSize = maxSize / 16; currentSize = maxSize; int allocPassed = 0; - d = init_genrand( gRandomSeed ); - while (!allocPassed && currentSize >= maxSize/PASSING_FRACTION) { - log_info("Attempting to allocate constant buffer of size %lld bytes\n", maxSize); + d = init_genrand(gRandomSeed); + while (!allocPassed && currentSize >= maxSize / PASSING_FRACTION) + { + log_info("Attempting to allocate constant buffer of size %lld bytes\n", + maxSize); /* Create some I/O streams */ - size_t sizeToAllocate = ((size_t)currentSize/sizeof( cl_int ))*sizeof(cl_int); - size_t numberOfInts = sizeToAllocate/sizeof(cl_int); - constantData = (cl_int *)malloc( sizeToAllocate); + size_t sizeToAllocate = + ((size_t)currentSize / sizeof(cl_int)) * sizeof(cl_int); + size_t numberOfInts = sizeToAllocate / sizeof(cl_int); + constantData = (cl_int *)malloc(sizeToAllocate); if (constantData == NULL) { log_error("Failed to allocate memory for constantData!\n"); @@ -1352,53 +1634,74 @@ int test_min_max_constant_buffer_size(cl_device_id deviceID, cl_context context, return EXIT_FAILURE; } - for(i=0; i<(int)(numberOfInts); i++) + for (i = 0; i < (int)(numberOfInts); i++) constantData[i] = (int)genrand_int32(d); clMemWrapper streams[3]; streams[0] = clCreateBuffer(context, CL_MEM_COPY_HOST_PTR, sizeToAllocate, constantData, &error); - test_error( error, "Creating test array failed" ); + test_error(error, "Creating test array failed"); streams[1] = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeToAllocate, NULL, &error); - test_error( error, "Creating test array failed" ); + test_error(error, "Creating test array failed"); /* Set the arguments */ - error = clSetKernelArg(kernel, 0, sizeof( streams[0] ), &streams[0]); - test_error( error, "Unable to set indexed kernel arguments" ); - error = clSetKernelArg(kernel, 1, sizeof( streams[1] ), &streams[1]); - test_error( error, "Unable to set indexed kernel arguments" ); + error = clSetKernelArg(kernel, 0, sizeof(streams[0]), 
&streams[0]); + test_error(error, "Unable to set indexed kernel arguments"); + error = clSetKernelArg(kernel, 1, sizeof(streams[1]), &streams[1]); + test_error(error, "Unable to set indexed kernel arguments"); /* Test running the kernel and verifying it */ threads[0] = numberOfInts; localThreads[0] = 1; - log_info("Filling constant buffer with %d cl_ints (%d bytes).\n", (int)threads[0], (int)(threads[0]*sizeof(cl_int))); - - error = clEnqueueNDRangeKernel( queue, kernel, 1, NULL, threads, localThreads, 0, NULL, &event ); - /* If we failed due to a resource issue, reduce the size and try again. */ - if ((error == CL_OUT_OF_RESOURCES) || (error == CL_MEM_OBJECT_ALLOCATION_FAILURE) || (error == CL_OUT_OF_HOST_MEMORY)) { - log_info("Kernel enqueue failed at size %lld, trying at a reduced size.\n", currentSize); + log_info("Filling constant buffer with %d cl_ints (%d bytes).\n", + (int)threads[0], (int)(threads[0] * sizeof(cl_int))); + + error = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, threads, + localThreads, 0, NULL, &event); + /* If we failed due to a resource issue, reduce the size and try again. 
+ */ + if ((error == CL_OUT_OF_RESOURCES) + || (error == CL_MEM_OBJECT_ALLOCATION_FAILURE) + || (error == CL_OUT_OF_HOST_MEMORY)) + { + log_info("Kernel enqueue failed at size %lld, trying at a reduced " + "size.\n", + currentSize); currentSize -= stepSize; free(constantData); continue; } - test_error( error, "clEnqueueNDRangeKernel with maximum constant buffer size failed."); + test_error( + error, + "clEnqueueNDRangeKernel with maximum constant buffer size failed."); // Verify that the event does not return an error from the execution error = clWaitForEvents(1, &event); - test_error( error, "clWaitForEvent failed"); - error = clGetEventInfo(event, CL_EVENT_COMMAND_EXECUTION_STATUS, sizeof(event_status), &event_status, NULL); - test_error( error, "clGetEventInfo for CL_EVENT_COMMAND_EXECUTION_STATUS failed"); + test_error(error, "clWaitForEvent failed"); + error = clGetEventInfo(event, CL_EVENT_COMMAND_EXECUTION_STATUS, + sizeof(event_status), &event_status, NULL); + test_error( + error, + "clGetEventInfo for CL_EVENT_COMMAND_EXECUTION_STATUS failed"); clReleaseEvent(event); - if (event_status < 0) { - if ((event_status == CL_OUT_OF_RESOURCES) || (event_status == CL_MEM_OBJECT_ALLOCATION_FAILURE) || (event_status == CL_OUT_OF_HOST_MEMORY)) { - log_info("Kernel event indicates failure at size %lld, trying at a reduced size.\n", currentSize); + if (event_status < 0) + { + if ((event_status == CL_OUT_OF_RESOURCES) + || (event_status == CL_MEM_OBJECT_ALLOCATION_FAILURE) + || (event_status == CL_OUT_OF_HOST_MEMORY)) + { + log_info("Kernel event indicates failure at size %lld, trying " + "at a reduced size.\n", + currentSize); currentSize -= stepSize; free(constantData); continue; - } else { + } + else + { test_error(error, "Kernel execution event returned error"); } } @@ -1415,30 +1718,41 @@ int test_min_max_constant_buffer_size(cl_device_id deviceID, cl_context context, return EXIT_FAILURE; } - error = clEnqueueReadBuffer(queue, streams[1], CL_TRUE, 0, sizeToAllocate, 
resultData, 0, NULL, NULL); - test_error( error, "clEnqueueReadBuffer failed"); + error = clEnqueueReadBuffer(queue, streams[1], CL_TRUE, 0, + sizeToAllocate, resultData, 0, NULL, NULL); + test_error(error, "clEnqueueReadBuffer failed"); - for(i=0; i<(int)(numberOfInts); i++) - if (constantData[i] != resultData[i]) { - log_error("Data failed to verify: constantData[%d]=%d != resultData[%d]=%d\n", + for (i = 0; i < (int)(numberOfInts); i++) + if (constantData[i] != resultData[i]) + { + log_error("Data failed to verify: constantData[%d]=%d != " + "resultData[%d]=%d\n", i, constantData[i], i, resultData[i]); - free( constantData ); + free(constantData); free(resultData); - free_mtdata(d); d = NULL; + free_mtdata(d); + d = NULL; return -1; } - free( constantData ); + free(constantData); free(resultData); } - free_mtdata(d); d = NULL; + free_mtdata(d); + d = NULL; - if (allocPassed) { - if (currentSize < maxSize/PASSING_FRACTION) { - log_error("Failed to allocate at least 1/8 of the reported constant size.\n"); + if (allocPassed) + { + if (currentSize < maxSize / PASSING_FRACTION) + { + log_error("Failed to allocate at least 1/8 of the reported " + "constant size.\n"); return -1; - } else if (currentSize != maxSize) { - log_info("Passed at reduced size. (%lld of %lld bytes)\n", currentSize, maxSize); + } + else if (currentSize != maxSize) + { + log_info("Passed at reduced size. 
(%lld of %lld bytes)\n", + currentSize, maxSize); return 0; } return 0; @@ -1446,13 +1760,14 @@ int test_min_max_constant_buffer_size(cl_device_id deviceID, cl_context context, return -1; } -int test_min_max_constant_args(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements) +int test_min_max_constant_args(cl_device_id deviceID, cl_context context, + cl_command_queue queue, int num_elements) { int error; clProgramWrapper program; clKernelWrapper kernel; - clMemWrapper *streams; - size_t threads[1], localThreads[1]; + clMemWrapper *streams; + size_t threads[1], localThreads[1]; cl_uint i, maxArgs; cl_ulong maxSize; cl_ulong maxParameterSize; @@ -1465,119 +1780,145 @@ int test_min_max_constant_args(cl_device_id deviceID, cl_context context, cl_com /* Verify our test buffer won't be bigger than allowed */ - error = clGetDeviceInfo( deviceID, CL_DEVICE_MAX_CONSTANT_ARGS, sizeof( maxArgs ), &maxArgs, 0 ); - test_error( error, "Unable to get max constant arg count" ); + error = clGetDeviceInfo(deviceID, CL_DEVICE_MAX_CONSTANT_ARGS, + sizeof(maxArgs), &maxArgs, 0); + test_error(error, "Unable to get max constant arg count"); - error = clGetDeviceInfo( deviceID, CL_DEVICE_MAX_PARAMETER_SIZE, sizeof( maxParameterSize ), &maxParameterSize, NULL ); - test_error( error, "Unable to get max parameter size from device" ); + error = clGetDeviceInfo(deviceID, CL_DEVICE_MAX_PARAMETER_SIZE, + sizeof(maxParameterSize), &maxParameterSize, NULL); + test_error(error, "Unable to get max parameter size from device"); // Subtract the size of the result maxParameterSize -= sizeof(cl_mem); // Calculate the number we can use - if (maxParameterSize/sizeof(cl_mem) < maxArgs) { - log_info("WARNING: Max parameter size of %d bytes limits test to %d max image arguments.\n", (int)maxParameterSize, (int)(maxParameterSize/sizeof(cl_mem))); - maxArgs = (unsigned int)(maxParameterSize/sizeof(cl_mem)); + if (maxParameterSize / sizeof(cl_mem) < maxArgs) + { + 
log_info("WARNING: Max parameter size of %d bytes limits test to %d " + "max image arguments.\n", + (int)maxParameterSize, + (int)(maxParameterSize / sizeof(cl_mem))); + maxArgs = (unsigned int)(maxParameterSize / sizeof(cl_mem)); } - if( maxArgs < (gIsEmbedded ? 4 : 8) ) + if (maxArgs < (gIsEmbedded ? 4 : 8)) { - log_error( "ERROR: Reported max constant arg count less than required by OpenCL 1.0 (reported %d)\n", (int)maxArgs ); + log_error("ERROR: Reported max constant arg count less than required " + "by OpenCL 1.0 (reported %d)\n", + (int)maxArgs); return -1; } - error = clGetDeviceInfo( deviceID, CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE, sizeof( maxSize ), &maxSize, 0 ); - test_error( error, "Unable to get max constant buffer size" ); - individualBufferSize = ((int)maxSize/2)/maxArgs; + error = clGetDeviceInfo(deviceID, CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE, + sizeof(maxSize), &maxSize, 0); + test_error(error, "Unable to get max constant buffer size"); + individualBufferSize = (maxSize / 2) / maxArgs; - log_info("Reported max constant arg count of %d and max constant buffer size of %d. Test will attempt to allocate half of that, or %d buffers of size %d.\n", - (int)maxArgs, (int)maxSize, (int)maxArgs, (int)individualBufferSize); + log_info( + "Reported max constant arg count of %u and max constant buffer " + "size of %llu. 
Test will attempt to allocate half of that, or %llu " + "buffers of size %zu.\n", + maxArgs, maxSize, maxArgs, individualBufferSize); - str2 = (char*)malloc(sizeof(char)*32*(maxArgs+2)); - constArgs = (char*)malloc(sizeof(char)*32*(maxArgs+2)); - programSrc = (char*)malloc(sizeof(char)*32*2*(maxArgs+2)+1024); + str2 = (char *)malloc(sizeof(char) * 32 * (maxArgs + 2)); + constArgs = (char *)malloc(sizeof(char) * 32 * (maxArgs + 2)); + programSrc = (char *)malloc(sizeof(char) * 32 * 2 * (maxArgs + 2) + 1024); /* Create a test program */ constArgs[0] = 0; str2[0] = 0; - for( i = 0; i < maxArgs-1; i++ ) - { - sprintf( str, ", __constant int *src%d", (int)( i + 2 ) ); - strcat( constArgs, str ); - sprintf( str2 + strlen( str2), "\tdst[tid] += src%d[tid];\n", (int)(i+2)); - if (strlen(str2) > (sizeof(char)*32*(maxArgs+2)-32) || strlen(constArgs) > (sizeof(char)*32*(maxArgs+2)-32)) { - log_info("Limiting number of arguments tested to %d due to test program allocation size.\n", i); + for (i = 0; i < maxArgs - 1; i++) + { + sprintf(str, ", __constant int *src%d", (int)(i + 2)); + strcat(constArgs, str); + sprintf(str2 + strlen(str2), "\tdst[tid] += src%d[tid];\n", + (int)(i + 2)); + if (strlen(str2) > (sizeof(char) * 32 * (maxArgs + 2) - 32) + || strlen(constArgs) > (sizeof(char) * 32 * (maxArgs + 2) - 32)) + { + log_info("Limiting number of arguments tested to %d due to test " + "program allocation size.\n", + i); break; } } - sprintf( programSrc, sample_const_max_arg_kernel_pattern, constArgs, str2 ); + sprintf(programSrc, sample_const_max_arg_kernel_pattern, constArgs, str2); /* Create a kernel to test with */ ptr = programSrc; - if( create_single_kernel_helper( context, &program, &kernel, 1, &ptr, "sample_test" ) != 0 ) + if (create_single_kernel_helper(context, &program, &kernel, 1, &ptr, + "sample_test") + != 0) { return -1; } /* Create some I/O streams */ - streams = new clMemWrapper[ maxArgs + 1 ]; - for( i = 0; i < maxArgs + 1; i++ ) + streams = new 
clMemWrapper[maxArgs + 1]; + for (i = 0; i < maxArgs + 1; i++) { streams[i] = clCreateBuffer(context, CL_MEM_READ_WRITE, individualBufferSize, NULL, &error); - test_error( error, "Creating test array failed" ); + test_error(error, "Creating test array failed"); } /* Set the arguments */ - for( i = 0; i < maxArgs + 1; i++ ) + for (i = 0; i < maxArgs + 1; i++) { - error = clSetKernelArg(kernel, i, sizeof( streams[i] ), &streams[i]); - test_error( error, "Unable to set kernel argument" ); + error = clSetKernelArg(kernel, i, sizeof(streams[i]), &streams[i]); + test_error(error, "Unable to set kernel argument"); } /* Test running the kernel and verifying it */ threads[0] = (size_t)10; - while (threads[0]*sizeof(cl_int) > individualBufferSize) - threads[0]--; + while (threads[0] * sizeof(cl_int) > individualBufferSize) threads[0]--; - error = get_max_common_work_group_size( context, kernel, threads[0], &localThreads[0] ); - test_error( error, "Unable to get work group size to use" ); + error = get_max_common_work_group_size(context, kernel, threads[0], + &localThreads[0]); + test_error(error, "Unable to get work group size to use"); - error = clEnqueueNDRangeKernel( queue, kernel, 1, NULL, threads, localThreads, 0, NULL, &event ); - test_error( error, "clEnqueueNDRangeKernel failed"); + error = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, threads, + localThreads, 0, NULL, &event); + test_error(error, "clEnqueueNDRangeKernel failed"); // Verify that the event does not return an error from the execution error = clWaitForEvents(1, &event); - test_error( error, "clWaitForEvent failed"); - error = clGetEventInfo(event, CL_EVENT_COMMAND_EXECUTION_STATUS, sizeof(event_status), &event_status, NULL); - test_error( error, "clGetEventInfo for CL_EVENT_COMMAND_EXECUTION_STATUS failed"); + test_error(error, "clWaitForEvent failed"); + error = clGetEventInfo(event, CL_EVENT_COMMAND_EXECUTION_STATUS, + sizeof(event_status), &event_status, NULL); + test_error(error, + "clGetEventInfo 
for CL_EVENT_COMMAND_EXECUTION_STATUS failed"); clReleaseEvent(event); if (event_status < 0) test_error(error, "Kernel execution event returned error"); error = clFinish(queue); - test_error( error, "clFinish failed."); + test_error(error, "clFinish failed."); - delete [] streams; + delete[] streams; free(str2); free(constArgs); free(programSrc); return 0; } -int test_min_max_compute_units(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements) +int test_min_max_compute_units(cl_device_id deviceID, cl_context context, + cl_command_queue queue, int num_elements) { int error; cl_uint value; - error = clGetDeviceInfo( deviceID, CL_DEVICE_MAX_COMPUTE_UNITS, sizeof( value ), &value, 0 ); - test_error( error, "Unable to get compute unit count" ); + error = clGetDeviceInfo(deviceID, CL_DEVICE_MAX_COMPUTE_UNITS, + sizeof(value), &value, 0); + test_error(error, "Unable to get compute unit count"); - if( value < 1 ) + if (value < 1) { - log_error( "ERROR: Reported compute unit count less than required by OpenCL 1.0 (reported %d)\n", (int)value ); + log_error("ERROR: Reported compute unit count less than required by " + "OpenCL 1.0 (reported %d)\n", + (int)value); return -1; } @@ -1586,18 +1927,22 @@ int test_min_max_compute_units(cl_device_id deviceID, cl_context context, cl_com return 0; } -int test_min_max_address_bits(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements) +int test_min_max_address_bits(cl_device_id deviceID, cl_context context, + cl_command_queue queue, int num_elements) { int error; cl_uint value; - error = clGetDeviceInfo( deviceID, CL_DEVICE_ADDRESS_BITS, sizeof( value ), &value, 0 ); - test_error( error, "Unable to get address bit count" ); + error = clGetDeviceInfo(deviceID, CL_DEVICE_ADDRESS_BITS, sizeof(value), + &value, 0); + test_error(error, "Unable to get address bit count"); - if( value != 32 && value != 64 ) + if (value != 32 && value != 64) { - log_error( "ERROR: Reported address bit 
count not valid by OpenCL 1.0 (reported %d)\n", (int)value ); + log_error("ERROR: Reported address bit count not valid by OpenCL 1.0 " + "(reported %d)\n", + (int)value); return -1; } @@ -1606,68 +1951,84 @@ int test_min_max_address_bits(cl_device_id deviceID, cl_context context, cl_comm return 0; } -int test_min_max_single_fp_config(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements) +int test_min_max_single_fp_config(cl_device_id deviceID, cl_context context, + cl_command_queue queue, int num_elements) { int error; cl_device_fp_config value; char profile[128] = ""; - error = clGetDeviceInfo( deviceID, CL_DEVICE_SINGLE_FP_CONFIG, sizeof( value ), &value, 0 ); - test_error( error, "Unable to get device single fp config" ); + error = clGetDeviceInfo(deviceID, CL_DEVICE_SINGLE_FP_CONFIG, sizeof(value), + &value, 0); + test_error(error, "Unable to get device single fp config"); - //Check to see if we are an embedded profile device - if((error = clGetDeviceInfo( deviceID, CL_DEVICE_PROFILE, sizeof(profile), profile, NULL ))) + // Check to see if we are an embedded profile device + if ((error = clGetDeviceInfo(deviceID, CL_DEVICE_PROFILE, sizeof(profile), + profile, NULL))) { - log_error( "FAILURE: Unable to get CL_DEVICE_PROFILE: error %d\n", error ); + log_error("FAILURE: Unable to get CL_DEVICE_PROFILE: error %d\n", + error); return error; } - if( 0 == strcmp( profile, "EMBEDDED_PROFILE" )) + if (0 == strcmp(profile, "EMBEDDED_PROFILE")) { // embedded device - if( 0 == (value & (CL_FP_ROUND_TO_NEAREST | CL_FP_ROUND_TO_ZERO))) + if (0 == (value & (CL_FP_ROUND_TO_NEAREST | CL_FP_ROUND_TO_ZERO))) { - log_error( "FAILURE: embedded device supports neither CL_FP_ROUND_TO_NEAREST or CL_FP_ROUND_TO_ZERO\n" ); + log_error("FAILURE: embedded device supports neither " + "CL_FP_ROUND_TO_NEAREST or CL_FP_ROUND_TO_ZERO\n"); return -1; } } else { // Full profile - if( ( value & ( CL_FP_ROUND_TO_NEAREST | CL_FP_INF_NAN )) != ( CL_FP_ROUND_TO_NEAREST 
| CL_FP_INF_NAN ) ) + if ((value & (CL_FP_ROUND_TO_NEAREST | CL_FP_INF_NAN)) + != (CL_FP_ROUND_TO_NEAREST | CL_FP_INF_NAN)) { - log_error( "ERROR: Reported single fp config doesn't meet minimum set by OpenCL 1.0 (reported 0x%08x)\n", (int)value ); + log_error("ERROR: Reported single fp config doesn't meet minimum " + "set by OpenCL 1.0 (reported 0x%08x)\n", + (int)value); return -1; } } return 0; } -int test_min_max_double_fp_config(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements) +int test_min_max_double_fp_config(cl_device_id deviceID, cl_context context, + cl_command_queue queue, int num_elements) { int error; cl_device_fp_config value; - error = clGetDeviceInfo( deviceID, CL_DEVICE_DOUBLE_FP_CONFIG, sizeof( value ), &value, 0 ); - test_error( error, "Unable to get device double fp config" ); + error = clGetDeviceInfo(deviceID, CL_DEVICE_DOUBLE_FP_CONFIG, sizeof(value), + &value, 0); + test_error(error, "Unable to get device double fp config"); - if (value == 0) - return 0; + if (value == 0) return 0; - if( ( value & (CL_FP_FMA | CL_FP_ROUND_TO_NEAREST | CL_FP_ROUND_TO_ZERO | CL_FP_ROUND_TO_INF | CL_FP_INF_NAN | CL_FP_DENORM)) != ( CL_FP_FMA | CL_FP_ROUND_TO_NEAREST | CL_FP_ROUND_TO_ZERO | CL_FP_ROUND_TO_INF | CL_FP_INF_NAN | CL_FP_DENORM) ) + if ((value + & (CL_FP_FMA | CL_FP_ROUND_TO_NEAREST | CL_FP_ROUND_TO_ZERO + | CL_FP_ROUND_TO_INF | CL_FP_INF_NAN | CL_FP_DENORM)) + != (CL_FP_FMA | CL_FP_ROUND_TO_NEAREST | CL_FP_ROUND_TO_ZERO + | CL_FP_ROUND_TO_INF | CL_FP_INF_NAN | CL_FP_DENORM)) { - log_error( "ERROR: Reported double fp config doesn't meet minimum set by OpenCL 1.0 (reported 0x%08x)\n", (int)value ); + log_error("ERROR: Reported double fp config doesn't meet minimum set " + "by OpenCL 1.0 (reported 0x%08x)\n", + (int)value); return -1; } return 0; } -int test_min_max_local_mem_size(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements) +int test_min_max_local_mem_size(cl_device_id 
deviceID, cl_context context, + cl_command_queue queue, int num_elements) { int error; clProgramWrapper program; clKernelWrapper kernel; - clMemWrapper streams[3]; - size_t threads[1], localThreads[1]; + clMemWrapper streams[3]; + size_t threads[1], localThreads[1]; cl_int *localData, *resultData; cl_ulong maxSize, kernelLocalUsage, min_max_local_mem_size; Version device_version; @@ -1676,8 +2037,9 @@ int test_min_max_local_mem_size(cl_device_id deviceID, cl_context context, cl_co MTdata d; /* Verify our test buffer won't be bigger than allowed */ - error = clGetDeviceInfo( deviceID, CL_DEVICE_LOCAL_MEM_SIZE, sizeof( maxSize ), &maxSize, 0 ); - test_error( error, "Unable to get max local buffer size" ); + error = clGetDeviceInfo(deviceID, CL_DEVICE_LOCAL_MEM_SIZE, sizeof(maxSize), + &maxSize, 0); + test_error(error, "Unable to get max local buffer size"); try { @@ -1709,65 +2071,80 @@ int test_min_max_local_mem_size(cl_device_id deviceID, cl_context context, cl_co return -1; } - log_info("Reported max local buffer size for device: %lld bytes.\n", maxSize); + log_info("Reported max local buffer size for device: %lld bytes.\n", + maxSize); /* Create a kernel to test with */ - if( create_single_kernel_helper( context, &program, &kernel, 1, sample_local_arg_kernel, "sample_test" ) != 0 ) + if (create_single_kernel_helper(context, &program, &kernel, 1, + sample_local_arg_kernel, "sample_test") + != 0) { return -1; } - error = clGetKernelWorkGroupInfo(kernel, deviceID, CL_KERNEL_LOCAL_MEM_SIZE, sizeof(kernelLocalUsage), &kernelLocalUsage, NULL); - test_error(error, "clGetKernelWorkGroupInfo for CL_KERNEL_LOCAL_MEM_SIZE failed"); + error = clGetKernelWorkGroupInfo(kernel, deviceID, CL_KERNEL_LOCAL_MEM_SIZE, + sizeof(kernelLocalUsage), + &kernelLocalUsage, NULL); + test_error(error, + "clGetKernelWorkGroupInfo for CL_KERNEL_LOCAL_MEM_SIZE failed"); - log_info("Reported local buffer usage for kernel (CL_KERNEL_LOCAL_MEM_SIZE): %lld bytes.\n", kernelLocalUsage); + 
log_info("Reported local buffer usage for kernel " + "(CL_KERNEL_LOCAL_MEM_SIZE): %lld bytes.\n", + kernelLocalUsage); /* Create some I/O streams */ - size_t sizeToAllocate = ((size_t)(maxSize-kernelLocalUsage)/sizeof( cl_int ))*sizeof(cl_int); - size_t numberOfInts = sizeToAllocate/sizeof(cl_int); + size_t sizeToAllocate = + ((size_t)(maxSize - kernelLocalUsage) / sizeof(cl_int)) + * sizeof(cl_int); + size_t numberOfInts = sizeToAllocate / sizeof(cl_int); - log_info("Attempting to use %lld bytes of local memory.\n", (cl_ulong)sizeToAllocate); + log_info("Attempting to use %zu bytes of local memory.\n", sizeToAllocate); - localData = (cl_int *)malloc( sizeToAllocate ); - d = init_genrand( gRandomSeed ); - for(i=0; i<(int)(numberOfInts); i++) + localData = (cl_int *)malloc(sizeToAllocate); + d = init_genrand(gRandomSeed); + for (i = 0; i < (int)(numberOfInts); i++) localData[i] = (int)genrand_int32(d); - free_mtdata(d); d = NULL; + free_mtdata(d); + d = NULL; streams[0] = clCreateBuffer(context, CL_MEM_COPY_HOST_PTR, sizeToAllocate, localData, &error); - test_error( error, "Creating test array failed" ); + test_error(error, "Creating test array failed"); streams[1] = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeToAllocate, NULL, &error); - test_error( error, "Creating test array failed" ); + test_error(error, "Creating test array failed"); /* Set the arguments */ error = clSetKernelArg(kernel, 0, sizeToAllocate, NULL); - test_error( error, "Unable to set indexed kernel arguments" ); - error = clSetKernelArg(kernel, 1, sizeof( streams[0] ), &streams[0]); - test_error( error, "Unable to set indexed kernel arguments" ); - error = clSetKernelArg(kernel, 2, sizeof( streams[1] ), &streams[1]); - test_error( error, "Unable to set indexed kernel arguments" ); + test_error(error, "Unable to set indexed kernel arguments"); + error = clSetKernelArg(kernel, 1, sizeof(streams[0]), &streams[0]); + test_error(error, "Unable to set indexed kernel arguments"); + error = 
clSetKernelArg(kernel, 2, sizeof(streams[1]), &streams[1]); + test_error(error, "Unable to set indexed kernel arguments"); /* Test running the kernel and verifying it */ threads[0] = numberOfInts; localThreads[0] = 1; - log_info("Creating local buffer with %d cl_ints (%d bytes).\n", (int)numberOfInts, (int)sizeToAllocate); + log_info("Creating local buffer with %zu cl_ints (%zu bytes).\n", + numberOfInts, sizeToAllocate); cl_event evt; - cl_int evt_err; - error = clEnqueueNDRangeKernel( queue, kernel, 1, NULL, threads, localThreads, 0, NULL, &evt ); + cl_int evt_err; + error = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, threads, + localThreads, 0, NULL, &evt); test_error(error, "clEnqueueNDRangeKernel failed"); error = clFinish(queue); - test_error( error, "clFinish failed"); + test_error(error, "clFinish failed"); - error = clGetEventInfo(evt, CL_EVENT_COMMAND_EXECUTION_STATUS, sizeof evt_err, &evt_err, NULL); - test_error( error, "clGetEventInfo with maximum local buffer size failed."); + error = clGetEventInfo(evt, CL_EVENT_COMMAND_EXECUTION_STATUS, + sizeof evt_err, &evt_err, NULL); + test_error(error, "clGetEventInfo with maximum local buffer size failed."); - if (evt_err != CL_COMPLETE) { + if (evt_err != CL_COMPLETE) + { print_error(evt_err, "Kernel event returned error"); clReleaseEvent(evt); return -1; @@ -1775,95 +2152,118 @@ int test_min_max_local_mem_size(cl_device_id deviceID, cl_context context, cl_co resultData = (cl_int *)malloc(sizeToAllocate); - error = clEnqueueReadBuffer(queue, streams[1], CL_TRUE, 0, sizeToAllocate, resultData, 0, NULL, NULL); - test_error( error, "clEnqueueReadBuffer failed"); + error = clEnqueueReadBuffer(queue, streams[1], CL_TRUE, 0, sizeToAllocate, + resultData, 0, NULL, NULL); + test_error(error, "clEnqueueReadBuffer failed"); - for(i=0; i<(int)(numberOfInts); i++) - if (localData[i] != resultData[i]) { + for (i = 0; i < (int)(numberOfInts); i++) + if (localData[i] != resultData[i]) + { clReleaseEvent(evt); - free( 
localData ); + free(localData); free(resultData); log_error("Results failed to verify.\n"); return -1; } clReleaseEvent(evt); - free( localData ); + free(localData); free(resultData); return err; } -int test_min_max_kernel_preferred_work_group_size_multiple(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements) +int test_min_max_kernel_preferred_work_group_size_multiple( + cl_device_id deviceID, cl_context context, cl_command_queue queue, + int num_elements) { - int err; + int err; clProgramWrapper program; clKernelWrapper kernel; size_t max_local_workgroup_size[3]; size_t max_workgroup_size = 0, preferred_workgroup_size = 0; - err = create_single_kernel_helper(context, &program, &kernel, 1, sample_local_arg_kernel, "sample_test" ); + err = create_single_kernel_helper(context, &program, &kernel, 1, + sample_local_arg_kernel, "sample_test"); test_error(err, "Failed to build kernel/program."); err = clGetKernelWorkGroupInfo(kernel, deviceID, CL_KERNEL_WORK_GROUP_SIZE, - sizeof(max_workgroup_size), &max_workgroup_size, NULL); + sizeof(max_workgroup_size), + &max_workgroup_size, NULL); test_error(err, "clGetKernelWorkgroupInfo failed."); - err = clGetKernelWorkGroupInfo(kernel, deviceID, CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE, - sizeof(preferred_workgroup_size), &preferred_workgroup_size, NULL); + err = clGetKernelWorkGroupInfo( + kernel, deviceID, CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE, + sizeof(preferred_workgroup_size), &preferred_workgroup_size, NULL); test_error(err, "clGetKernelWorkgroupInfo failed."); - err = clGetDeviceInfo(deviceID, CL_DEVICE_MAX_WORK_ITEM_SIZES, sizeof(max_local_workgroup_size), max_local_workgroup_size, NULL); + err = clGetDeviceInfo(deviceID, CL_DEVICE_MAX_WORK_ITEM_SIZES, + sizeof(max_local_workgroup_size), + max_local_workgroup_size, NULL); test_error(err, "clGetDeviceInfo failed for CL_DEVICE_MAX_WORK_ITEM_SIZES"); - // Since the preferred size is only a performance hint, we can only really check 
that we get a sane value - // back - log_info( "size: %ld preferred: %ld max: %ld\n", max_workgroup_size, preferred_workgroup_size, max_local_workgroup_size[0] ); + // Since the preferred size is only a performance hint, we can only really + // check that we get a sane value back + log_info("size: %ld preferred: %ld max: %ld\n", max_workgroup_size, + preferred_workgroup_size, max_local_workgroup_size[0]); - if( preferred_workgroup_size > max_workgroup_size ) + if (preferred_workgroup_size > max_workgroup_size) { - log_error( "ERROR: Reported preferred workgroup multiple larger than max workgroup size (preferred %ld, max %ld)\n", preferred_workgroup_size, max_workgroup_size ); + log_error("ERROR: Reported preferred workgroup multiple larger than " + "max workgroup size (preferred %ld, max %ld)\n", + preferred_workgroup_size, max_workgroup_size); return -1; } return 0; } -int test_min_max_execution_capabilities(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements) +int test_min_max_execution_capabilities(cl_device_id deviceID, + cl_context context, + cl_command_queue queue, + int num_elements) { int error; cl_device_exec_capabilities value; - error = clGetDeviceInfo( deviceID, CL_DEVICE_EXECUTION_CAPABILITIES, sizeof( value ), &value, 0 ); - test_error( error, "Unable to get execution capabilities" ); + error = clGetDeviceInfo(deviceID, CL_DEVICE_EXECUTION_CAPABILITIES, + sizeof(value), &value, 0); + test_error(error, "Unable to get execution capabilities"); - if( ( value & CL_EXEC_KERNEL ) != CL_EXEC_KERNEL ) + if ((value & CL_EXEC_KERNEL) != CL_EXEC_KERNEL) { - log_error( "ERROR: Reported execution capabilities less than required by OpenCL 1.0 (reported 0x%08x)\n", (int)value ); + log_error("ERROR: Reported execution capabilities less than required " + "by OpenCL 1.0 (reported 0x%08x)\n", + (int)value); return -1; } return 0; } -int test_min_max_queue_properties(cl_device_id deviceID, cl_context context, cl_command_queue queue, int 
num_elements) +int test_min_max_queue_properties(cl_device_id deviceID, cl_context context, + cl_command_queue queue, int num_elements) { int error; cl_command_queue_properties value; - error = clGetDeviceInfo( deviceID, CL_DEVICE_QUEUE_ON_HOST_PROPERTIES, sizeof( value ), &value, 0 ); - test_error( error, "Unable to get queue properties" ); + error = clGetDeviceInfo(deviceID, CL_DEVICE_QUEUE_ON_HOST_PROPERTIES, + sizeof(value), &value, 0); + test_error(error, "Unable to get queue properties"); - if( ( value & CL_QUEUE_PROFILING_ENABLE ) != CL_QUEUE_PROFILING_ENABLE ) + if ((value & CL_QUEUE_PROFILING_ENABLE) != CL_QUEUE_PROFILING_ENABLE) { - log_error( "ERROR: Reported queue properties less than required by OpenCL 1.0 (reported 0x%08x)\n", (int)value ); + log_error("ERROR: Reported queue properties less than required by " + "OpenCL 1.0 (reported 0x%08x)\n", + (int)value); return -1; } return 0; } -int test_min_max_device_version(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements) +int test_min_max_device_version(cl_device_id deviceID, cl_context context, + cl_command_queue queue, int num_elements) { // Query for the device version. 
Version device_cl_version = get_device_cl_version(deviceID); @@ -1959,84 +2359,101 @@ int test_min_max_device_version(cl_device_id deviceID, cl_context context, cl_co return 0; } -int test_min_max_language_version(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements) +int test_min_max_language_version(cl_device_id deviceID, cl_context context, + cl_command_queue queue, int num_elements) { cl_int error; - cl_char buffer[ 4098 ]; + cl_char buffer[4098]; size_t length; // Device version should fit the regex "OpenCL [0-9]+\.[0-9]+ *.*" - error = clGetDeviceInfo( deviceID, CL_DEVICE_OPENCL_C_VERSION, sizeof( buffer ), buffer, &length ); - test_error( error, "Unable to get device opencl c version string" ); - if( memcmp( buffer, "OpenCL C ", strlen( "OpenCL C " ) ) != 0 ) - { - log_error( "ERROR: Initial part of device language version string does not match required format! (returned: \"%s\")\n", (char *)buffer ); + error = clGetDeviceInfo(deviceID, CL_DEVICE_OPENCL_C_VERSION, + sizeof(buffer), buffer, &length); + test_error(error, "Unable to get device opencl c version string"); + if (memcmp(buffer, "OpenCL C ", strlen("OpenCL C ")) != 0) + { + log_error("ERROR: Initial part of device language version string does " + "not match required format! (returned: \"%s\")\n", + (char *)buffer); return -1; } log_info("Returned version \"%s\".\n", buffer); - char *p1 = (char *)buffer + strlen( "OpenCL C " ); - while( *p1 == ' ' ) - p1++; + char *p1 = (char *)buffer + strlen("OpenCL C "); + while (*p1 == ' ') p1++; char *p2 = p1; - if( ! isdigit(*p2) ) + if (!isdigit(*p2)) { - log_error( "ERROR: Major revision number must follow space behind OpenCL C! (returned %s)\n", (char*) buffer ); + log_error("ERROR: Major revision number must follow space behind " + "OpenCL C! (returned %s)\n", + (char *)buffer); return -1; } - while( isdigit( *p2 ) ) - p2++; - if( *p2 != '.' 
) + while (isdigit(*p2)) p2++; + if (*p2 != '.') { - log_error( "ERROR: Version number must contain a decimal point! (returned: %s)\n", (char *)buffer ); + log_error("ERROR: Version number must contain a decimal point! " + "(returned: %s)\n", + (char *)buffer); return -1; } char *p3 = p2 + 1; - if( ! isdigit(*p3) ) + if (!isdigit(*p3)) { - log_error( "ERROR: Minor revision number is missing or does not abut the decimal point! (returned %s)\n", (char*) buffer ); + log_error("ERROR: Minor revision number is missing or does not abut " + "the decimal point! (returned %s)\n", + (char *)buffer); return -1; } - while( isdigit( *p3 ) ) - p3++; - if( *p3 != ' ' ) + while (isdigit(*p3)) p3++; + if (*p3 != ' ') { - log_error( "ERROR: A space must appear after the minor version! (returned: %s)\n", (char *)buffer ); + log_error("ERROR: A space must appear after the minor version! " + "(returned: %s)\n", + (char *)buffer); return -1; } *p2 = ' '; // Put in a space for atoi below. p2++; - int major = atoi( p1 ); - int minor = atoi( p2 ); + int major = atoi(p1); + int minor = atoi(p2); int minor_revision = 2; - if( major * 10 + minor < 10 + minor_revision ) + if (major * 10 + minor < 10 + minor_revision) { - // If the language version did not match, check to see if OPENCL_1_0_DEVICE is set. - if( getenv("OPENCL_1_0_DEVICE")) + // If the language version did not match, check to see if + // OPENCL_1_0_DEVICE is set. + if (getenv("OPENCL_1_0_DEVICE")) { - log_info( "WARNING: This test was run with OPENCL_1_0_DEVICE defined! This is not a OpenCL 1.1 or OpenCL 1.2 compatible device!!!\n" ); + log_info("WARNING: This test was run with OPENCL_1_0_DEVICE " + "defined! This is not a OpenCL 1.1 or OpenCL 1.2 " + "compatible device!!!\n"); } - else if( getenv("OPENCL_1_1_DEVICE")) + else if (getenv("OPENCL_1_1_DEVICE")) { - log_info( "WARNING: This test was run with OPENCL_1_1_DEVICE defined! 
This is not a OpenCL 1.2 compatible device!!!\n" ); + log_info( + "WARNING: This test was run with OPENCL_1_1_DEVICE defined! " + "This is not a OpenCL 1.2 compatible device!!!\n"); } else { - log_error( "ERROR: OpenCL device language version returned is less than 1.%d! (Returned: %s)\n", minor_revision, (char *)buffer ); - return -1; + log_error("ERROR: OpenCL device language version returned is less " + "than 1.%d! (Returned: %s)\n", + minor_revision, (char *)buffer); + return -1; } } // Sanity checks on the returned values - if( length != (strlen( (char *)buffer ) + 1 )) + if (length != (strlen((char *)buffer) + 1)) { - log_error( "ERROR: Returned length of version string does not match actual length (actual: %d, returned: %d)\n", (int)strlen( (char *)buffer ), (int)length ); + log_error("ERROR: Returned length of version string does not match " + "actual length (actual: %d, returned: %d)\n", + (int)strlen((char *)buffer), (int)length); return -1; } return 0; } - diff --git a/test_conformance/api/test_context_destructor_callback.cpp b/test_conformance/api/test_context_destructor_callback.cpp index 1d73a3c4..d29d9039 100644 --- a/test_conformance/api/test_context_destructor_callback.cpp +++ b/test_conformance/api/test_context_destructor_callback.cpp @@ -52,12 +52,7 @@ int test_context_destructor_callback(cl_device_id deviceID, cl_context context, test_error(error, "Unable to set destructor callback"); // Now release the context, which SHOULD call the callbacks - error = clReleaseContext(localContext); - test_error(error, "Unable to release local context"); - - // Note: since we manually released the context, we need to set it to NULL - // to prevent a double-release - localContext = NULL; + localContext.reset(); // At this point, all three callbacks should have already been called int numErrors = 0; diff --git a/test_conformance/api/test_kernel_arg_info.cpp b/test_conformance/api/test_kernel_arg_info.cpp index 8073e0de..d0681dfd 100644 --- 
a/test_conformance/api/test_kernel_arg_info.cpp +++ b/test_conformance/api/test_kernel_arg_info.cpp @@ -22,11 +22,8 @@ #define MINIMUM_OPENCL_PIPE_VERSION Version(2, 0) -static constexpr size_t CL_VERSION_LENGTH = 128; static constexpr size_t KERNEL_ARGUMENT_LENGTH = 128; static constexpr char KERNEL_ARGUMENT_NAME[] = "argument"; -static constexpr size_t KERNEL_ARGUMENT_NAME_LENGTH = - sizeof(KERNEL_ARGUMENT_NAME) + 1; static constexpr int SINGLE_KERNEL_ARG_NUMBER = 0; static constexpr int MAX_NUMBER_OF_KERNEL_ARGS = 128; @@ -167,7 +164,8 @@ static std::string generate_argument(const KernelArgInfo& kernel_arg) /* This function generates a kernel source and allows for multiple arguments to * be passed in and subsequently queried. */ static std::string generate_kernel(const std::vector<KernelArgInfo>& all_args, - const bool supports_3d_image_writes = false) + const bool supports_3d_image_writes = false, + const bool kernel_uses_half_type = false) { std::string ret; @@ -175,10 +173,13 @@ static std::string generate_kernel(const std::vector<KernelArgInfo>& all_args, { ret += "#pragma OPENCL EXTENSION cl_khr_3d_image_writes: enable\n"; } + if (kernel_uses_half_type) + { + ret += "#pragma OPENCL EXTENSION cl_khr_fp16 : enable\n"; + } ret += "kernel void get_kernel_arg_info(\n"; for (int i = 0; i < all_args.size(); ++i) { - const KernelArgInfo& arg = all_args[i]; ret += generate_argument(all_args[i]); if (i == all_args.size() - 1) { @@ -537,6 +538,7 @@ size_t get_param_size(const std::string& arg_type, cl_device_id deviceID, cl_int err = clGetDeviceInfo(deviceID, CL_DEVICE_ADDRESS_BITS, sizeof(device_address_bits), &device_address_bits, NULL); + test_error_ret(err, "clGetDeviceInfo", 0); return (device_address_bits / 8); } @@ -673,8 +675,8 @@ static int run_scalar_vector_tests(cl_context context, cl_device_id deviceID) if (param_size + total_param_size >= max_param_size || all_args.size() == MAX_NUMBER_OF_KERNEL_ARGS) { - const std::string kernel_src = - 
generate_kernel(all_args); + const std::string kernel_src = generate_kernel( + all_args, false, device_supports_half(deviceID)); failed_tests += compare_kernel_with_expected( context, deviceID, kernel_src.c_str(), expected_args); @@ -696,7 +698,8 @@ static int run_scalar_vector_tests(cl_context context, cl_device_id deviceID) } } } - const std::string kernel_src = generate_kernel(all_args); + const std::string kernel_src = + generate_kernel(all_args, false, device_supports_half(deviceID)); failed_tests += compare_kernel_with_expected( context, deviceID, kernel_src.c_str(), expected_args); return failed_tests; @@ -808,8 +811,34 @@ static int run_image_tests(cl_context context, cl_device_id deviceID) cl_kernel_arg_address_qualifier address_qualifier = CL_KERNEL_ARG_ADDRESS_GLOBAL; + Version version = get_device_cl_version(deviceID); + bool supports_read_write_images = false; + if (version >= Version(3, 0)) + { + cl_uint maxReadWriteImageArgs = 0; + cl_int error = clGetDeviceInfo( + deviceID, CL_DEVICE_MAX_READ_WRITE_IMAGE_ARGS, + sizeof(maxReadWriteImageArgs), &maxReadWriteImageArgs, NULL); + test_error(error, + "Unable to query " + "CL_DEVICE_MAX_READ_WRITE_IMAGE_ARGS"); + + // read-write images are supported if MAX_READ_WRITE_IMAGE_ARGS is + // nonzero + supports_read_write_images = maxReadWriteImageArgs != 0; + } + else if (version >= Version(2, 0)) + { + // read-write images are required for OpenCL 2.x + supports_read_write_images = true; + } + for (auto access_qualifier : access_qualifiers) { + if (access_qualifier == CL_KERNEL_ARG_ACCESS_READ_WRITE + && !supports_read_write_images) + continue; + bool is_write = (access_qualifier == CL_KERNEL_ARG_ACCESS_WRITE_ONLY || access_qualifier == CL_KERNEL_ARG_ACCESS_READ_WRITE); diff --git a/test_conformance/api/test_kernel_attributes.cpp b/test_conformance/api/test_kernel_attributes.cpp index 2e4e0a7f..ad4baa0f 100644 --- a/test_conformance/api/test_kernel_attributes.cpp +++ 
b/test_conformance/api/test_kernel_attributes.cpp @@ -275,16 +275,16 @@ static bool run_test(cl_context context, cl_device_id deviceID, clKernelWrapper kernel; cl_int err = create_single_kernel_helper(context, &program, &kernel, 1, &kernel_src, "test_kernel"); - test_error(err, "create_single_kernel_helper"); + test_error_ret(err, "create_single_kernel_helper", false); // Get the size of the kernel attribute string returned size_t size = 0; err = clGetKernelInfo(kernel, CL_KERNEL_ATTRIBUTES, 0, nullptr, &size); - test_error(err, "clGetKernelInfo"); + test_error_ret(err, "clGetKernelInfo", false); std::vector<char> attributes(size); err = clGetKernelInfo(kernel, CL_KERNEL_ATTRIBUTES, attributes.size(), attributes.data(), nullptr); - test_error(err, "clGetKernelInfo"); + test_error_ret(err, "clGetKernelInfo", false); std::string attribute_string(attributes.data()); attribute_string.erase( std::remove(attribute_string.begin(), attribute_string.end(), ' '), diff --git a/test_conformance/api/test_mem_object_info.cpp b/test_conformance/api/test_mem_object_info.cpp index ccfeaafa..8dc8f6cf 100644 --- a/test_conformance/api/test_mem_object_info.cpp +++ b/test_conformance/api/test_mem_object_info.cpp @@ -348,14 +348,7 @@ int test_get_buffer_info( cl_device_id deviceID, cl_context context, cl_command_ TEST_MEM_OBJECT_PARAM( subBufferObject, CL_MEM_ASSOCIATED_MEMOBJECT, origObj, (cl_mem)bufferObject, "associated mem object", "%p", void * ) TEST_MEM_OBJECT_PARAM( subBufferObject, CL_MEM_OFFSET, offset, (size_t)( addressAlign ), "offset", "%ld", size_t ) - - clReleaseMemObject( subBufferObject ); - subBufferObject = NULL; - } - - clReleaseMemObject( bufferObject ); - bufferObject = NULL; } return CL_SUCCESS; @@ -370,8 +363,6 @@ int test_get_imageObject_info( cl_mem * image, cl_mem_flags objectFlags, cl_imag cl_mem_flags flags; cl_uint mapCount; cl_uint refCount; - size_t rowPitchMultiplier; - size_t slicePitchMultiplier; cl_context otherCtx; size_t offset; size_t sz; diff --git 
a/test_conformance/api/test_mem_objects.cpp b/test_conformance/api/test_mem_objects.cpp index c29613f9..f1a4e993 100644 --- a/test_conformance/api/test_mem_objects.cpp +++ b/test_conformance/api/test_mem_objects.cpp @@ -48,12 +48,7 @@ int test_mem_object_destructor_callback_single(clMemWrapper &memObject) test_error(error, "Unable to set destructor callback"); // Now release the buffer, which SHOULD call the callbacks - error = clReleaseMemObject(memObject); - test_error(error, "Unable to release test buffer"); - - // Note: since we manually released the mem wrapper, we need to set it to - // NULL to prevent a double-release - memObject = NULL; + memObject.reset(); // At this point, all three callbacks should have already been called int numErrors = 0; diff --git a/test_conformance/api/test_null_buffer_arg.cpp b/test_conformance/api/test_null_buffer_arg.cpp index d412d4ea..75bdd479 100644 --- a/test_conformance/api/test_null_buffer_arg.cpp +++ b/test_conformance/api/test_null_buffer_arg.cpp @@ -149,7 +149,6 @@ int test_null_buffer_arg(cl_device_id device, cl_context context, cl_command_queue queue, int num_elements) { unsigned int test_success = 0; - unsigned int i; unsigned int buffer_size; cl_int status; cl_program program; diff --git a/test_conformance/api/test_queries.cpp b/test_conformance/api/test_queries.cpp index 469a1934..a7703a76 100644 --- a/test_conformance/api/test_queries.cpp +++ b/test_conformance/api/test_queries.cpp @@ -1,6 +1,6 @@ // // Copyright (c) 2017 The Khronos Group Inc. -// +// // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
// You may obtain a copy of the License at @@ -19,6 +19,7 @@ #include <stdlib.h> #include <ctype.h> #include <algorithm> +#include <vector> int test_get_platform_info(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements) { @@ -345,87 +346,100 @@ int command_queue_param_test(cl_command_queue queue, return 0; } -#define MIN_NUM_COMMAND_QUEUE_PROPERTIES 2 -#define OOO_NUM_COMMAND_QUEUE_PROPERTIES 4 -static cl_command_queue_properties property_options[] = { - 0, - - CL_QUEUE_PROFILING_ENABLE, - - CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, - - CL_QUEUE_PROFILING_ENABLE | CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, - - CL_QUEUE_ON_DEVICE | CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, - - CL_QUEUE_PROFILING_ENABLE | CL_QUEUE_ON_DEVICE - | CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, - - CL_QUEUE_ON_DEVICE | CL_QUEUE_ON_DEVICE_DEFAULT - | CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, - - CL_QUEUE_PROFILING_ENABLE | CL_QUEUE_ON_DEVICE | CL_QUEUE_ON_DEVICE_DEFAULT - | CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE -}; - int check_get_command_queue_info_params(cl_device_id deviceID, cl_context context, bool is_compatibility) { - int error; - size_t size; + const cl_command_queue_properties host_optional[] = { + CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, + CL_QUEUE_PROFILING_ENABLE | CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE + }; + + const cl_command_queue_properties device_required[] = { + CL_QUEUE_ON_DEVICE | CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, + CL_QUEUE_PROFILING_ENABLE | CL_QUEUE_ON_DEVICE + | CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, + CL_QUEUE_ON_DEVICE | CL_QUEUE_ON_DEVICE_DEFAULT + | CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, + CL_QUEUE_PROFILING_ENABLE | CL_QUEUE_ON_DEVICE + | CL_QUEUE_ON_DEVICE_DEFAULT + | CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE + }; + + const size_t host_optional_size = ARRAY_SIZE(host_optional); + const size_t device_required_size = ARRAY_SIZE(device_required); + + Version version = get_device_cl_version(deviceID); + + const cl_device_info 
host_queue_query = version >= Version(2, 0) + ? CL_DEVICE_QUEUE_ON_HOST_PROPERTIES + : CL_DEVICE_QUEUE_PROPERTIES; - cl_queue_properties host_queue_props, device_queue_props; - cl_queue_properties queue_props[] = { CL_QUEUE_PROPERTIES, 0, 0 }; + cl_queue_properties host_queue_props = 0; + int error = + clGetDeviceInfo(deviceID, host_queue_query, sizeof(host_queue_props), + &host_queue_props, NULL); + test_error(error, "clGetDeviceInfo failed"); + log_info("CL_DEVICE_QUEUE_ON_HOST_PROPERTIES is %d\n", host_queue_props); - clGetDeviceInfo(deviceID, CL_DEVICE_QUEUE_ON_HOST_PROPERTIES, - sizeof(host_queue_props), &host_queue_props, NULL); - log_info("CL_DEVICE_QUEUE_ON_HOST_PROPERTIES is %d\n", - (int)host_queue_props); - clGetDeviceInfo(deviceID, CL_DEVICE_QUEUE_ON_DEVICE_PROPERTIES, - sizeof(device_queue_props), &device_queue_props, NULL); - log_info("CL_DEVICE_QUEUE_ON_HOST_PROPERTIES is %d\n", - (int)device_queue_props); + cl_queue_properties device_queue_props = 0; + if (version >= Version(2, 0)) + { + error = clGetDeviceInfo(deviceID, CL_DEVICE_QUEUE_ON_DEVICE_PROPERTIES, + sizeof(device_queue_props), &device_queue_props, + NULL); + test_error(error, "clGetDeviceInfo failed"); + log_info("CL_DEVICE_QUEUE_ON_DEVICE_PROPERTIES is %d\n", + device_queue_props); + } - auto version = get_device_cl_version(deviceID); + bool out_of_order_supported = + host_queue_props & CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE; - // Are on device queues supported bool on_device_supported = (version >= Version(2, 0) && version < Version(3, 0)) || (version >= Version(3, 0) && device_queue_props != 0); - int num_test_options = MIN_NUM_COMMAND_QUEUE_PROPERTIES; - if (host_queue_props & CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE) + // test device queues if the device and the API under test support it + bool test_on_device = on_device_supported && !is_compatibility; + + std::vector<cl_queue_properties> queue_props{ 0, + CL_QUEUE_PROFILING_ENABLE }; + + if (out_of_order_supported) { - // Test 
out-of-order queues properties if supported - num_test_options = OOO_NUM_COMMAND_QUEUE_PROPERTIES; - } - if (on_device_supported && !is_compatibility) + queue_props.insert(queue_props.end(), &host_optional[0], + &host_optional[host_optional_size]); + }; + + cl_queue_properties queue_props_arg[] = { CL_QUEUE_PROPERTIES, 0, 0 }; + + if (test_on_device) { - // Test queue on device if supported (in this case out-of-order must - // also be supported) - num_test_options = ARRAY_SIZE(property_options); - } + queue_props.insert(queue_props.end(), &device_required[0], + &device_required[device_required_size]); + }; - for (int i = 0; i < num_test_options; i++) + for (cl_queue_properties props : queue_props) { - queue_props[1] = property_options[i]; - clCommandQueueWrapper queue; + queue_props_arg[1] = props; + + clCommandQueueWrapper queue; if (is_compatibility) { - queue = - clCreateCommandQueue(context, deviceID, queue_props[1], &error); + queue = clCreateCommandQueue(context, deviceID, props, &error); test_error(error, "Unable to create command queue to test with"); } else { queue = clCreateCommandQueueWithProperties(context, deviceID, - &queue_props[0], &error); + queue_props_arg, &error); test_error(error, "Unable to create command queue to test with"); } cl_uint refCount; + size_t size; error = clGetCommandQueueInfo(queue, CL_QUEUE_REFERENCE_COUNT, sizeof(refCount), &refCount, &size); test_error(error, "Unable to get command queue reference count"); @@ -442,11 +456,12 @@ int check_get_command_queue_info_params(cl_device_id deviceID, test_error(error, "param checking failed"); error = command_queue_param_test(queue, CL_QUEUE_PROPERTIES, - queue_props[1], "properties"); + queue_props_arg[1], "properties"); test_error(error, "param checking failed"); } return 0; } + int test_get_command_queue_info(cl_device_id deviceID, cl_context context, cl_command_queue ignoreQueue, int num_elements) { @@ -511,26 +526,6 @@ void CL_CALLBACK mem_obj_destructor_callback( cl_mem, void 
*data ) free( data ); } -// All possible combinations of valid cl_mem_flags. -static cl_mem_flags all_flags[16] = { - 0, - CL_MEM_READ_WRITE, - CL_MEM_READ_ONLY, - CL_MEM_WRITE_ONLY, - CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, - CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, - CL_MEM_WRITE_ONLY | CL_MEM_COPY_HOST_PTR, - CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, - CL_MEM_READ_ONLY | CL_MEM_ALLOC_HOST_PTR, - CL_MEM_WRITE_ONLY | CL_MEM_ALLOC_HOST_PTR, - CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR | CL_MEM_COPY_HOST_PTR, - CL_MEM_READ_ONLY | CL_MEM_ALLOC_HOST_PTR | CL_MEM_COPY_HOST_PTR, - CL_MEM_WRITE_ONLY | CL_MEM_ALLOC_HOST_PTR | CL_MEM_COPY_HOST_PTR, - CL_MEM_READ_WRITE | CL_MEM_USE_HOST_PTR, - CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR, - CL_MEM_WRITE_ONLY | CL_MEM_USE_HOST_PTR, -}; - #define TEST_DEVICE_PARAM( device, paramName, val, name, type, cast ) \ error = clGetDeviceInfo( device, paramName, sizeof( val ), &val, &size ); \ test_error( error, "Unable to get device " name ); \ @@ -824,5 +819,3 @@ int test_kernel_required_group_size(cl_device_id deviceID, cl_context context, c return 0; } - - diff --git a/test_conformance/api/test_sub_group_dispatch.cpp b/test_conformance/api/test_sub_group_dispatch.cpp index 01d0ffa3..61d9a524 100644 --- a/test_conformance/api/test_sub_group_dispatch.cpp +++ b/test_conformance/api/test_sub_group_dispatch.cpp @@ -56,11 +56,9 @@ cl_int get_sub_group_num(cl_command_queue queue, cl_kernel kernel, clMemWrapper& int test_sub_group_dispatch(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements) { - static const size_t gsize0 = 80; - int i, error; + int error; size_t realSize; size_t kernel_max_subgroup_size, kernel_subgroup_count; - size_t global[] = {1,1,1}; size_t max_local; cl_platform_id platform; diff --git a/test_conformance/atomics/main.cpp b/test_conformance/atomics/main.cpp index afdea376..987d6bfa 100644 --- a/test_conformance/atomics/main.cpp +++ b/test_conformance/atomics/main.cpp @@ -1,6 +1,6 @@ // // 
Copyright (c) 2017 The Khronos Group Inc. -// +// // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at @@ -24,6 +24,7 @@ #include <unistd.h> #endif +// clang-format off test_definition test_list[] = { ADD_TEST( atomic_add ), ADD_TEST( atomic_sub ), @@ -40,11 +41,11 @@ test_definition test_list[] = { ADD_TEST( atomic_add_index ), ADD_TEST( atomic_add_index_bin ), }; +// clang-format on -const int test_num = ARRAY_SIZE( test_list ); +const int test_num = ARRAY_SIZE(test_list); int main(int argc, const char *argv[]) { return runTestHarness(argc, argv, test_num, test_list, false, 0); } - diff --git a/test_conformance/atomics/procs.h b/test_conformance/atomics/procs.h index bf053f25..fa85aad5 100644 --- a/test_conformance/atomics/procs.h +++ b/test_conformance/atomics/procs.h @@ -1,6 +1,6 @@ // // Copyright (c) 2017 The Khronos Group Inc. -// +// // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
// You may obtain a copy of the License at @@ -18,22 +18,35 @@ #include "harness/threadTesting.h" #include "harness/typeWrappers.h" -extern int create_program_and_kernel(const char *source, const char *kernel_name, cl_program *program_ret, cl_kernel *kernel_ret); - -extern int test_atomic_add(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements); -extern int test_atomic_sub(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements); -extern int test_atomic_xchg(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements); -extern int test_atomic_min(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements); -extern int test_atomic_max(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements); -extern int test_atomic_inc(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements); -extern int test_atomic_dec(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements); -extern int test_atomic_cmpxchg(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements); -extern int test_atomic_and(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements); -extern int test_atomic_or(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements); -extern int test_atomic_xor(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements); - -extern int test_atomic_add_index(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements); -extern int test_atomic_add_index_bin(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements); - +extern int create_program_and_kernel(const char *source, + const char *kernel_name, + cl_program *program_ret, + cl_kernel *kernel_ret); +extern int test_atomic_add(cl_device_id deviceID, cl_context context, + cl_command_queue 
queue, int num_elements); +extern int test_atomic_sub(cl_device_id deviceID, cl_context context, + cl_command_queue queue, int num_elements); +extern int test_atomic_xchg(cl_device_id deviceID, cl_context context, + cl_command_queue queue, int num_elements); +extern int test_atomic_min(cl_device_id deviceID, cl_context context, + cl_command_queue queue, int num_elements); +extern int test_atomic_max(cl_device_id deviceID, cl_context context, + cl_command_queue queue, int num_elements); +extern int test_atomic_inc(cl_device_id deviceID, cl_context context, + cl_command_queue queue, int num_elements); +extern int test_atomic_dec(cl_device_id deviceID, cl_context context, + cl_command_queue queue, int num_elements); +extern int test_atomic_cmpxchg(cl_device_id deviceID, cl_context context, + cl_command_queue queue, int num_elements); +extern int test_atomic_and(cl_device_id deviceID, cl_context context, + cl_command_queue queue, int num_elements); +extern int test_atomic_or(cl_device_id deviceID, cl_context context, + cl_command_queue queue, int num_elements); +extern int test_atomic_xor(cl_device_id deviceID, cl_context context, + cl_command_queue queue, int num_elements); +extern int test_atomic_add_index(cl_device_id deviceID, cl_context context, + cl_command_queue queue, int num_elements); +extern int test_atomic_add_index_bin(cl_device_id deviceID, cl_context context, + cl_command_queue queue, int num_elements); diff --git a/test_conformance/atomics/testBase.h b/test_conformance/atomics/testBase.h index ba67d140..22bce1d2 100644 --- a/test_conformance/atomics/testBase.h +++ b/test_conformance/atomics/testBase.h @@ -1,6 +1,6 @@ // // Copyright (c) 2017 The Khronos Group Inc. -// +// // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
// You may obtain a copy of the License at @@ -26,6 +26,3 @@ #include "procs.h" #endif // _testBase_h - - - diff --git a/test_conformance/atomics/test_atomics.cpp b/test_conformance/atomics/test_atomics.cpp index 34b34ed3..caa4b78f 100644 --- a/test_conformance/atomics/test_atomics.cpp +++ b/test_conformance/atomics/test_atomics.cpp @@ -1,6 +1,6 @@ // // Copyright (c) 2017 The Khronos Group Inc. -// +// // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at @@ -19,10 +19,12 @@ #include <unistd.h> #endif +#include <cinttypes> + #define INT_TEST_VALUE 402258822 #define LONG_TEST_VALUE 515154531254381446LL - +// clang-format off const char *atomic_global_pattern[] = { "__kernel void test_atomic_fn(volatile __global %s *destMemory, __global %s *oldValues)\n" "{\n" @@ -36,19 +38,20 @@ const char *atomic_local_pattern[] = { "__kernel void test_atomic_fn(__global %s *finalDest, __global %s *oldValues, volatile __local %s *destMemory, int numDestItems )\n" "{\n" " int tid = get_global_id(0);\n" - " int dstItemIdx;\n" + " int dstItemIdx;\n" "\n" " // Everybody does the following line(s), but it all has the same result. We still need to ensure we sync before the atomic op, though\n" - " for( dstItemIdx = 0; dstItemIdx < numDestItems; dstItemIdx++ )\n" + " for( dstItemIdx = 0; dstItemIdx < numDestItems; dstItemIdx++ )\n" " destMemory[ dstItemIdx ] = finalDest[ dstItemIdx ];\n" " barrier( CLK_LOCAL_MEM_FENCE );\n" "\n" , " barrier( CLK_LOCAL_MEM_FENCE );\n" " // Finally, write out the last value. 
Again, we're synced, so everyone will be writing the same value\n" - " for( dstItemIdx = 0; dstItemIdx < numDestItems; dstItemIdx++ )\n" + " for( dstItemIdx = 0; dstItemIdx < numDestItems; dstItemIdx++ )\n" " finalDest[ dstItemIdx ] = destMemory[ dstItemIdx ];\n" "}\n" }; +// clang-format on #define TEST_COUNT 128 * 1024 @@ -56,41 +59,48 @@ const char *atomic_local_pattern[] = { struct TestFns { - cl_int mIntStartValue; - cl_long mLongStartValue; + cl_int mIntStartValue; + cl_long mLongStartValue; - size_t (*NumResultsFn)( size_t threadSize, ExplicitType dataType ); + size_t (*NumResultsFn)(size_t threadSize, ExplicitType dataType); // Integer versions - cl_int (*ExpectedValueIntFn)( size_t size, cl_int *startRefValues, size_t whichDestValue ); - void (*GenerateRefsIntFn)( size_t size, cl_int *startRefValues, MTdata d ); - bool (*VerifyRefsIntFn)( size_t size, cl_int *refValues, cl_int finalValue ); + cl_int (*ExpectedValueIntFn)(size_t size, cl_int *startRefValues, + size_t whichDestValue); + void (*GenerateRefsIntFn)(size_t size, cl_int *startRefValues, MTdata d); + bool (*VerifyRefsIntFn)(size_t size, cl_int *refValues, cl_int finalValue); // Long versions - cl_long (*ExpectedValueLongFn)( size_t size, cl_long *startRefValues, size_t whichDestValue ); - void (*GenerateRefsLongFn)( size_t size, cl_long *startRefValues, MTdata d ); - bool (*VerifyRefsLongFn)( size_t size, cl_long *refValues, cl_long finalValue ); + cl_long (*ExpectedValueLongFn)(size_t size, cl_long *startRefValues, + size_t whichDestValue); + void (*GenerateRefsLongFn)(size_t size, cl_long *startRefValues, MTdata d); + bool (*VerifyRefsLongFn)(size_t size, cl_long *refValues, + cl_long finalValue); // Float versions - cl_float (*ExpectedValueFloatFn)( size_t size, cl_float *startRefValues, size_t whichDestValue ); - void (*GenerateRefsFloatFn)( size_t size, cl_float *startRefValues, MTdata d ); - bool (*VerifyRefsFloatFn)( size_t size, cl_float *refValues, cl_float finalValue ); + cl_float 
(*ExpectedValueFloatFn)(size_t size, cl_float *startRefValues, + size_t whichDestValue); + void (*GenerateRefsFloatFn)(size_t size, cl_float *startRefValues, + MTdata d); + bool (*VerifyRefsFloatFn)(size_t size, cl_float *refValues, + cl_float finalValue); }; -bool check_atomic_support( cl_device_id device, bool extended, bool isLocal, ExplicitType dataType ) +bool check_atomic_support(cl_device_id device, bool extended, bool isLocal, + ExplicitType dataType) { + // clang-format off const char *extensionNames[8] = { "cl_khr_global_int32_base_atomics", "cl_khr_global_int32_extended_atomics", "cl_khr_local_int32_base_atomics", "cl_khr_local_int32_extended_atomics", "cl_khr_int64_base_atomics", "cl_khr_int64_extended_atomics", "cl_khr_int64_base_atomics", "cl_khr_int64_extended_atomics" // this line intended to be the same as the last one }; + // clang-format on size_t index = 0; - if( extended ) - index += 1; - if( isLocal ) - index += 2; + if (extended) index += 1; + if (isLocal) index += 2; Version version = get_device_cl_version(device); @@ -98,26 +108,28 @@ bool check_atomic_support( cl_device_id device, bool extended, bool isLocal, Exp { case kInt: case kUInt: - if( version >= Version(1,1) ) - return 1; + if (version >= Version(1, 1)) return 1; break; case kLong: - case kULong: - index += 4; - break; - case kFloat: // this has to stay separate since the float atomics arent in the 1.0 extensions - return version >= Version(1,1); + case kULong: index += 4; break; + case kFloat: // this has to stay separate since the float atomics arent + // in the 1.0 extensions + return version >= Version(1, 1); default: - log_error( "ERROR: Unsupported data type (%d) in check_atomic_support\n", dataType ); + log_error( + "ERROR: Unsupported data type (%d) in check_atomic_support\n", + dataType); return 0; } - return is_extension_available( device, extensionNames[index] ); + return is_extension_available(device, extensionNames[index]); } -int test_atomic_function(cl_device_id 
deviceID, cl_context context, cl_command_queue queue, int num_elements, const char *programCore, - TestFns testFns, - bool extended, bool isLocal, ExplicitType dataType, bool matchGroupSize ) +int test_atomic_function(cl_device_id deviceID, cl_context context, + cl_command_queue queue, int num_elements, + const char *programCore, TestFns testFns, + bool extended, bool isLocal, ExplicitType dataType, + bool matchGroupSize) { clProgramWrapper program; clKernelWrapper kernel; @@ -127,55 +139,65 @@ int test_atomic_function(cl_device_id deviceID, cl_context context, cl_command_q void *refValues, *startRefValues; size_t threadSize, groupSize; const char *programLines[4]; - char pragma[ 512 ]; - char programHeader[ 512 ]; + char pragma[512]; + char programHeader[512]; MTdata d; - size_t typeSize = get_explicit_type_size( dataType ); + size_t typeSize = get_explicit_type_size(dataType); // Verify we can run first - bool isUnsigned = ( dataType == kULong ) || ( dataType == kUInt ); - if( !check_atomic_support( deviceID, extended, isLocal, dataType ) ) + bool isUnsigned = (dataType == kULong) || (dataType == kUInt); + if (!check_atomic_support(deviceID, extended, isLocal, dataType)) { - // Only print for the signed (unsigned comes right after, and if signed isn't supported, unsigned isn't either) - if( dataType == kFloat ) - log_info( "\t%s float not supported\n", isLocal ? "Local" : "Global" ); - else if( !isUnsigned ) - log_info( "\t%s %sint%d not supported\n", isLocal ? "Local" : "Global", isUnsigned ? "u" : "", (int)typeSize * 8 ); + // Only print for the signed (unsigned comes right after, and if signed + // isn't supported, unsigned isn't either) + if (dataType == kFloat) + log_info("\t%s float not supported\n", + isLocal ? "Local" : "Global"); + else if (!isUnsigned) + log_info("\t%s %sint%d not supported\n", + isLocal ? "Local" : "Global", isUnsigned ? 
"u" : "", + (int)typeSize * 8); // Since we don't support the operation, they implicitly pass return 0; } else { - if( dataType == kFloat ) - log_info( "\t%s float%s...", isLocal ? "local" : "global", isLocal ? " " : "" ); + if (dataType == kFloat) + log_info("\t%s float%s...", isLocal ? "local" : "global", + isLocal ? " " : ""); else - log_info( "\t%s %sint%d%s%s...", isLocal ? "local" : "global", isUnsigned ? "u" : "", - (int)typeSize * 8, isUnsigned ? "" : " ", isLocal ? " " : "" ); + log_info("\t%s %sint%d%s%s...", isLocal ? "local" : "global", + isUnsigned ? "u" : "", (int)typeSize * 8, + isUnsigned ? "" : " ", isLocal ? " " : ""); } //// Set up the kernel code // Create the pragma line for this kernel - bool isLong = ( dataType == kLong || dataType == kULong ); - sprintf( pragma, "#pragma OPENCL EXTENSION cl_khr%s_int%s_%s_atomics : enable\n", - isLong ? "" : (isLocal ? "_local" : "_global"), isLong ? "64" : "32", - extended ? "extended" : "base" ); + bool isLong = (dataType == kLong || dataType == kULong); + sprintf(pragma, + "#pragma OPENCL EXTENSION cl_khr%s_int%s_%s_atomics : enable\n", + isLong ? "" : (isLocal ? "_local" : "_global"), + isLong ? "64" : "32", extended ? "extended" : "base"); // Now create the program header - const char *typeName = get_explicit_type_name( dataType ); - if( isLocal ) - sprintf( programHeader, atomic_local_pattern[ 0 ], typeName, typeName, typeName ); + const char *typeName = get_explicit_type_name(dataType); + if (isLocal) + sprintf(programHeader, atomic_local_pattern[0], typeName, typeName, + typeName); else - sprintf( programHeader, atomic_global_pattern[ 0 ], typeName, typeName ); + sprintf(programHeader, atomic_global_pattern[0], typeName, typeName); // Set up our entire program now - programLines[ 0 ] = pragma; - programLines[ 1 ] = programHeader; - programLines[ 2 ] = programCore; - programLines[ 3 ] = ( isLocal ) ? 
atomic_local_pattern[ 1 ] : atomic_global_pattern[ 1 ]; - - if( create_single_kernel_helper( context, &program, &kernel, 4, programLines, "test_atomic_fn" ) ) + programLines[0] = pragma; + programLines[1] = programHeader; + programLines[2] = programCore; + programLines[3] = + (isLocal) ? atomic_local_pattern[1] : atomic_global_pattern[1]; + + if (create_single_kernel_helper(context, &program, &kernel, 4, programLines, + "test_atomic_fn")) { return -1; } @@ -183,25 +205,37 @@ int test_atomic_function(cl_device_id deviceID, cl_context context, cl_command_q //// Set up to actually run threadSize = num_elements; - error = get_max_common_work_group_size( context, kernel, threadSize, &groupSize ); - test_error( error, "Unable to get thread group max size" ); + error = + get_max_common_work_group_size(context, kernel, threadSize, &groupSize); + test_error(error, "Unable to get thread group max size"); - if( matchGroupSize ) + if (matchGroupSize) // HACK because xchg and cmpxchg apparently are limited by hardware threadSize = groupSize; - if( isLocal ) + if (isLocal) { - size_t maxSizes[3] = {0, 0, 0}; - error = clGetDeviceInfo(deviceID, CL_DEVICE_MAX_WORK_ITEM_SIZES, 3*sizeof(size_t), maxSizes, 0); - test_error( error, "Unable to obtain max work item sizes for the device" ); + size_t maxSizes[3] = { 0, 0, 0 }; + error = clGetDeviceInfo(deviceID, CL_DEVICE_MAX_WORK_ITEM_SIZES, + 3 * sizeof(size_t), maxSizes, 0); + test_error(error, + "Unable to obtain max work item sizes for the device"); size_t workSize; - error = clGetKernelWorkGroupInfo( kernel, deviceID, CL_KERNEL_WORK_GROUP_SIZE, sizeof( workSize ), &workSize, NULL ); - test_error( error, "Unable to obtain max work group size for device and kernel combo" ); - - // "workSize" is limited to that of the first dimension as only a 1DRange is executed. 
- if( maxSizes[0] < workSize ) + error = clGetKernelWorkGroupInfo(kernel, deviceID, + CL_KERNEL_WORK_GROUP_SIZE, + sizeof(workSize), &workSize, NULL); + test_error( + error, + "Unable to obtain max work group size for device and kernel combo"); + + // Limit workSize to avoid extremely large local buffer size and slow + // run. + if (workSize > 65536) workSize = 65536; + + // "workSize" is limited to that of the first dimension as only a + // 1DRange is executed. + if (maxSizes[0] < workSize) { workSize = maxSizes[0]; } @@ -210,38 +244,43 @@ int test_atomic_function(cl_device_id deviceID, cl_context context, cl_command_q } - log_info( "\t(thread count %d, group size %d)\n", (int)threadSize, (int)groupSize ); + log_info("\t(thread count %d, group size %d)\n", (int)threadSize, + (int)groupSize); - refValues = (cl_int *)malloc( typeSize * threadSize ); + refValues = (cl_int *)malloc(typeSize * threadSize); - if( testFns.GenerateRefsIntFn != NULL ) + if (testFns.GenerateRefsIntFn != NULL) { // We have a ref generator provided - d = init_genrand( gRandomSeed ); - startRefValues = malloc( typeSize * threadSize ); - if( typeSize == 4 ) - testFns.GenerateRefsIntFn( threadSize, (cl_int *)startRefValues, d ); + d = init_genrand(gRandomSeed); + startRefValues = malloc(typeSize * threadSize); + if (typeSize == 4) + testFns.GenerateRefsIntFn(threadSize, (cl_int *)startRefValues, d); else - testFns.GenerateRefsLongFn( threadSize, (cl_long *)startRefValues, d ); + testFns.GenerateRefsLongFn(threadSize, (cl_long *)startRefValues, + d); free_mtdata(d); d = NULL; } else startRefValues = NULL; - // If we're given a num_results function, we need to determine how many result objects we need. If - // we don't have it, we assume it's just 1 - size_t numDestItems = ( testFns.NumResultsFn != NULL ) ? testFns.NumResultsFn( threadSize, dataType ) : 1; + // If we're given a num_results function, we need to determine how many + // result objects we need. 
If we don't have it, we assume it's just 1 + size_t numDestItems = (testFns.NumResultsFn != NULL) + ? testFns.NumResultsFn(threadSize, dataType) + : 1; - char * destItems = new char[ typeSize * numDestItems ]; - if( destItems == NULL ) + char *destItems = new char[typeSize * numDestItems]; + if (destItems == NULL) { - log_error( "ERROR: Unable to allocate memory!\n" ); + log_error("ERROR: Unable to allocate memory!\n"); return -1; } - void * startValue = ( typeSize == 4 ) ? (void *)&testFns.mIntStartValue : (void *)&testFns.mLongStartValue; - for( size_t i = 0; i < numDestItems; i++ ) - memcpy( destItems + i * typeSize, startValue, typeSize ); + void *startValue = (typeSize == 4) ? (void *)&testFns.mIntStartValue + : (void *)&testFns.mLongStartValue; + for (size_t i = 0; i < numDestItems; i++) + memcpy(destItems + i * typeSize, startValue, typeSize); streams[0] = clCreateBuffer(context, CL_MEM_COPY_HOST_PTR, typeSize * numDestItems, destItems, NULL); @@ -261,82 +300,97 @@ int test_atomic_function(cl_device_id deviceID, cl_context context, cl_command_q } /* Set the arguments */ - error = clSetKernelArg( kernel, 0, sizeof( streams[0] ), &streams[0] ); - test_error( error, "Unable to set indexed kernel arguments" ); - error = clSetKernelArg( kernel, 1, sizeof( streams[1] ), &streams[1] ); - test_error( error, "Unable to set indexed kernel arguments" ); + error = clSetKernelArg(kernel, 0, sizeof(streams[0]), &streams[0]); + test_error(error, "Unable to set indexed kernel arguments"); + error = clSetKernelArg(kernel, 1, sizeof(streams[1]), &streams[1]); + test_error(error, "Unable to set indexed kernel arguments"); - if( isLocal ) + if (isLocal) { - error = clSetKernelArg( kernel, 2, typeSize * numDestItems, NULL ); - test_error( error, "Unable to set indexed local kernel argument" ); + error = clSetKernelArg(kernel, 2, typeSize * numDestItems, NULL); + test_error(error, "Unable to set indexed local kernel argument"); cl_int numDestItemsInt = (cl_int)numDestItems; - 
error = clSetKernelArg( kernel, 3, sizeof( cl_int ), &numDestItemsInt ); - test_error( error, "Unable to set indexed kernel argument" ); + error = clSetKernelArg(kernel, 3, sizeof(cl_int), &numDestItemsInt); + test_error(error, "Unable to set indexed kernel argument"); } /* Run the kernel */ threads[0] = threadSize; - error = clEnqueueNDRangeKernel( queue, kernel, 1, NULL, threads, &groupSize, 0, NULL, NULL ); - test_error( error, "Unable to execute test kernel" ); - - error = clEnqueueReadBuffer( queue, streams[0], true, 0, typeSize * numDestItems, destItems, 0, NULL, NULL ); - test_error( error, "Unable to read result value!" ); - - error = clEnqueueReadBuffer( queue, streams[1], true, 0, typeSize * threadSize, refValues, 0, NULL, NULL ); - test_error( error, "Unable to read reference values!" ); - - // If we have an expectedFn, then we need to generate a final value to compare against. If we don't - // have one, it's because we're comparing ref values only - if( testFns.ExpectedValueIntFn != NULL ) + error = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, threads, &groupSize, + 0, NULL, NULL); + test_error(error, "Unable to execute test kernel"); + + error = + clEnqueueReadBuffer(queue, streams[0], true, 0, typeSize * numDestItems, + destItems, 0, NULL, NULL); + test_error(error, "Unable to read result value!"); + + error = + clEnqueueReadBuffer(queue, streams[1], true, 0, typeSize * threadSize, + refValues, 0, NULL, NULL); + test_error(error, "Unable to read reference values!"); + + // If we have an expectedFn, then we need to generate a final value to + // compare against. 
If we don't have one, it's because we're comparing ref + // values only + if (testFns.ExpectedValueIntFn != NULL) { - for( size_t i = 0; i < numDestItems; i++ ) + for (size_t i = 0; i < numDestItems; i++) { - char expected[ 8 ]; + char expected[8]; cl_int intVal; cl_long longVal; - if( typeSize == 4 ) + if (typeSize == 4) { // Int version - intVal = testFns.ExpectedValueIntFn( threadSize, (cl_int *)startRefValues, i ); - memcpy( expected, &intVal, sizeof( intVal ) ); + intVal = testFns.ExpectedValueIntFn( + threadSize, (cl_int *)startRefValues, i); + memcpy(expected, &intVal, sizeof(intVal)); } else { // Long version - longVal = testFns.ExpectedValueLongFn( threadSize, (cl_long *)startRefValues, i ); - memcpy( expected, &longVal, sizeof( longVal ) ); + longVal = testFns.ExpectedValueLongFn( + threadSize, (cl_long *)startRefValues, i); + memcpy(expected, &longVal, sizeof(longVal)); } - if( memcmp( expected, destItems + i * typeSize, typeSize ) != 0 ) + if (memcmp(expected, destItems + i * typeSize, typeSize) != 0) { - if( typeSize == 4 ) + if (typeSize == 4) { - cl_int *outValue = (cl_int *)( destItems + i * typeSize ); - log_error( "ERROR: Result %ld from kernel does not validate! (should be %d, was %d)\n", i, intVal, *outValue ); + cl_int *outValue = (cl_int *)(destItems + i * typeSize); + log_error("ERROR: Result %zu from kernel does not " + "validate! 
(should be %d, was %d)\n", + i, intVal, *outValue); cl_int *startRefs = (cl_int *)startRefValues; cl_int *refs = (cl_int *)refValues; - for( i = 0; i < threadSize; i++ ) + for (i = 0; i < threadSize; i++) { - if( startRefs != NULL ) - log_info( " --- %ld - %d --- %d\n", i, startRefs[i], refs[i] ); + if (startRefs != NULL) + log_info(" --- %zu - %d --- %d\n", i, startRefs[i], + refs[i]); else - log_info( " --- %ld --- %d\n", i, refs[i] ); + log_info(" --- %zu --- %d\n", i, refs[i]); } } else { - cl_long *outValue = (cl_long *)( destItems + i * typeSize ); - log_error( "ERROR: Result %ld from kernel does not validate! (should be %lld, was %lld)\n", i, longVal, *outValue ); + cl_long *outValue = (cl_long *)(destItems + i * typeSize); + log_error("ERROR: Result %zu from kernel does not " + "validate! (should be %" PRId64 ", was %" PRId64 + ")\n", + i, longVal, *outValue); cl_long *startRefs = (cl_long *)startRefValues; cl_long *refs = (cl_long *)refValues; - for( i = 0; i < threadSize; i++ ) + for (i = 0; i < threadSize; i++) { - if( startRefs != NULL ) - log_info( " --- %ld - %lld --- %lld\n", i, startRefs[i], refs[i] ); + if (startRefs != NULL) + log_info(" --- %zu - %" PRId64 " --- %" PRId64 "\n", + i, startRefs[i], refs[i]); else - log_info( " --- %ld --- %lld\n", i, refs[i] ); + log_info(" --- %zu --- %" PRId64 "\n", i, refs[i]); } } return -1; @@ -344,104 +398,141 @@ int test_atomic_function(cl_device_id deviceID, cl_context context, cl_command_q } } - if( testFns.VerifyRefsIntFn != NULL ) + if (testFns.VerifyRefsIntFn != NULL) { /* Use the verify function to also check the results */ - if( dataType == kFloat ) + if (dataType == kFloat) { cl_float *outValue = (cl_float *)destItems; - if( !testFns.VerifyRefsFloatFn( threadSize, (cl_float *)refValues, *outValue ) != 0 ) + if (!testFns.VerifyRefsFloatFn(threadSize, (cl_float *)refValues, + *outValue) + != 0) { - log_error( "ERROR: Reference values did not validate!\n" ); + log_error("ERROR: Reference values did not 
validate!\n"); return -1; } } - else if( typeSize == 4 ) + else if (typeSize == 4) { cl_int *outValue = (cl_int *)destItems; - if( !testFns.VerifyRefsIntFn( threadSize, (cl_int *)refValues, *outValue ) != 0 ) + if (!testFns.VerifyRefsIntFn(threadSize, (cl_int *)refValues, + *outValue) + != 0) { - log_error( "ERROR: Reference values did not validate!\n" ); + log_error("ERROR: Reference values did not validate!\n"); return -1; } } else { cl_long *outValue = (cl_long *)destItems; - if( !testFns.VerifyRefsLongFn( threadSize, (cl_long *)refValues, *outValue ) != 0 ) + if (!testFns.VerifyRefsLongFn(threadSize, (cl_long *)refValues, + *outValue) + != 0) { - log_error( "ERROR: Reference values did not validate!\n" ); + log_error("ERROR: Reference values did not validate!\n"); return -1; } } } - else if( testFns.ExpectedValueIntFn == NULL ) + else if (testFns.ExpectedValueIntFn == NULL) { - log_error( "ERROR: Test doesn't check total or refs; no values are verified!\n" ); + log_error("ERROR: Test doesn't check total or refs; no values are " + "verified!\n"); return -1; } /* Re-write the starting value */ - for( size_t i = 0; i < numDestItems; i++ ) - memcpy( destItems + i * typeSize, startValue, typeSize ); - error = clEnqueueWriteBuffer( queue, streams[0], true, 0, typeSize * numDestItems, destItems, 0, NULL, NULL ); - test_error( error, "Unable to write starting values!" 
); - - /* Run the kernel once for a single thread, so we can verify that the returned value is the original one */ + for (size_t i = 0; i < numDestItems; i++) + memcpy(destItems + i * typeSize, startValue, typeSize); + error = + clEnqueueWriteBuffer(queue, streams[0], true, 0, + typeSize * numDestItems, destItems, 0, NULL, NULL); + test_error(error, "Unable to write starting values!"); + + /* Run the kernel once for a single thread, so we can verify that the + * returned value is the original one */ threads[0] = 1; - error = clEnqueueNDRangeKernel( queue, kernel, 1, NULL, threads, threads, 0, NULL, NULL ); - test_error( error, "Unable to execute test kernel" ); + error = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, threads, threads, 0, + NULL, NULL); + test_error(error, "Unable to execute test kernel"); - error = clEnqueueReadBuffer( queue, streams[1], true, 0, typeSize, refValues, 0, NULL, NULL ); - test_error( error, "Unable to read reference values!" ); + error = clEnqueueReadBuffer(queue, streams[1], true, 0, typeSize, refValues, + 0, NULL, NULL); + test_error(error, "Unable to read reference values!"); - if( memcmp( refValues, destItems, typeSize ) != 0 ) + if (memcmp(refValues, destItems, typeSize) != 0) { - if( typeSize == 4 ) + if (typeSize == 4) { cl_int *s = (cl_int *)destItems; cl_int *r = (cl_int *)refValues; - log_error( "ERROR: atomic function operated correctly but did NOT return correct 'old' value " - " (should have been %d, returned %d)!\n", *s, *r ); + log_error("ERROR: atomic function operated correctly but did NOT " + "return correct 'old' value " + " (should have been %d, returned %d)!\n", + *s, *r); } else { cl_long *s = (cl_long *)destItems; cl_long *r = (cl_long *)refValues; - log_error( "ERROR: atomic function operated correctly but did NOT return correct 'old' value " - " (should have been %lld, returned %lld)!\n", *s, *r ); + log_error("ERROR: atomic function operated correctly but did NOT " + "return correct 'old' value " + " (should 
have been %" PRId64 ", returned %" PRId64 + ")!\n", + *s, *r); } return -1; } - delete [] destItems; - free( refValues ); - if( startRefValues != NULL ) - free( startRefValues ); + delete[] destItems; + free(refValues); + if (startRefValues != NULL) free(startRefValues); return 0; } -int test_atomic_function_set(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements, const char *programCore, - TestFns testFns, - bool extended, bool matchGroupSize, bool usingAtomicPrefix ) +int test_atomic_function_set(cl_device_id deviceID, cl_context context, + cl_command_queue queue, int num_elements, + const char *programCore, TestFns testFns, + bool extended, bool matchGroupSize, + bool usingAtomicPrefix) { - log_info(" Testing %s functions...\n", usingAtomicPrefix ? "atomic_" : "atom_"); + log_info(" Testing %s functions...\n", + usingAtomicPrefix ? "atomic_" : "atom_"); int errors = 0; - errors |= test_atomic_function( deviceID, context, queue, num_elements, programCore, testFns, extended, false, kInt, matchGroupSize ); - errors |= test_atomic_function( deviceID, context, queue, num_elements, programCore, testFns, extended, false, kUInt, matchGroupSize ); - errors |= test_atomic_function( deviceID, context, queue, num_elements, programCore, testFns, extended, true, kInt, matchGroupSize ); - errors |= test_atomic_function( deviceID, context, queue, num_elements, programCore, testFns, extended, true, kUInt, matchGroupSize ); - - // Only the 32 bit atomic functions use the "atomic" prefix in 1.1, the 64 bit functions still use the "atom" prefix. - // The argument usingAtomicPrefix is set to true if programCore was generated with the "atomic" prefix. 
- if (!usingAtomicPrefix) { - errors |= test_atomic_function( deviceID, context, queue, num_elements, programCore, testFns, extended, false, kLong, matchGroupSize ); - errors |= test_atomic_function( deviceID, context, queue, num_elements, programCore, testFns, extended, false, kULong, matchGroupSize ); - errors |= test_atomic_function( deviceID, context, queue, num_elements, programCore, testFns, extended, true, kLong, matchGroupSize ); - errors |= test_atomic_function( deviceID, context, queue, num_elements, programCore, testFns, extended, true, kULong, matchGroupSize ); + errors |= test_atomic_function(deviceID, context, queue, num_elements, + programCore, testFns, extended, false, kInt, + matchGroupSize); + errors |= test_atomic_function(deviceID, context, queue, num_elements, + programCore, testFns, extended, false, kUInt, + matchGroupSize); + errors |= test_atomic_function(deviceID, context, queue, num_elements, + programCore, testFns, extended, true, kInt, + matchGroupSize); + errors |= test_atomic_function(deviceID, context, queue, num_elements, + programCore, testFns, extended, true, kUInt, + matchGroupSize); + + // Only the 32 bit atomic functions use the "atomic" prefix in 1.1, the 64 + // bit functions still use the "atom" prefix. The argument usingAtomicPrefix + // is set to true if programCore was generated with the "atomic" prefix. 
+ if (!usingAtomicPrefix) + { + errors |= test_atomic_function(deviceID, context, queue, num_elements, + programCore, testFns, extended, false, + kLong, matchGroupSize); + errors |= test_atomic_function(deviceID, context, queue, num_elements, + programCore, testFns, extended, false, + kULong, matchGroupSize); + errors |= test_atomic_function(deviceID, context, queue, num_elements, + programCore, testFns, extended, true, + kLong, matchGroupSize); + errors |= test_atomic_function(deviceID, context, queue, num_elements, + programCore, testFns, extended, true, + kULong, matchGroupSize); } return errors; @@ -450,265 +541,346 @@ int test_atomic_function_set(cl_device_id deviceID, cl_context context, cl_comma #pragma mark ---- add const char atom_add_core[] = -" oldValues[tid] = atom_add( &destMemory[0], tid + 3 );\n" -" atom_add( &destMemory[0], tid + 3 );\n" -" atom_add( &destMemory[0], tid + 3 );\n" -" atom_add( &destMemory[0], tid + 3 );\n"; + " oldValues[tid] = atom_add( &destMemory[0], tid + 3 );\n" + " atom_add( &destMemory[0], tid + 3 );\n" + " atom_add( &destMemory[0], tid + 3 );\n" + " atom_add( &destMemory[0], tid + 3 );\n"; const char atomic_add_core[] = -" oldValues[tid] = atomic_add( &destMemory[0], tid + 3 );\n" -" atomic_add( &destMemory[0], tid + 3 );\n" -" atomic_add( &destMemory[0], tid + 3 );\n" -" atomic_add( &destMemory[0], tid + 3 );\n"; + " oldValues[tid] = atomic_add( &destMemory[0], tid + 3 );\n" + " atomic_add( &destMemory[0], tid + 3 );\n" + " atomic_add( &destMemory[0], tid + 3 );\n" + " atomic_add( &destMemory[0], tid + 3 );\n"; -cl_int test_atomic_add_result_int( size_t size, cl_int *startRefValues, size_t whichDestValue ) +cl_int test_atomic_add_result_int(size_t size, cl_int *startRefValues, + size_t whichDestValue) { cl_int total = 0; - for( size_t i = 0; i < size; i++ ) - total += ( (cl_int)i + 3 ) * 4; + for (size_t i = 0; i < size; i++) total += ((cl_int)i + 3) * 4; return total; } -cl_long test_atomic_add_result_long( size_t size, 
cl_long *startRefValues, size_t whichDestValue ) +cl_long test_atomic_add_result_long(size_t size, cl_long *startRefValues, + size_t whichDestValue) { cl_long total = 0; - for( size_t i = 0; i < size; i++ ) - total += ( ( i + 3 ) * 4 ); + for (size_t i = 0; i < size; i++) total += ((i + 3) * 4); return total; } -int test_atomic_add(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements) +int test_atomic_add(cl_device_id deviceID, cl_context context, + cl_command_queue queue, int num_elements) { - TestFns set = { 0, 0LL, NULL, test_atomic_add_result_int, NULL, NULL, test_atomic_add_result_long, NULL, NULL }; - - if( test_atomic_function_set( deviceID, context, queue, num_elements, atom_add_core, set, false, /*matchGroupSize*/ false, /*usingAtomicPrefix*/ false ) != 0 ) + TestFns set = { 0, + 0LL, + NULL, + test_atomic_add_result_int, + NULL, + NULL, + test_atomic_add_result_long, + NULL, + NULL }; + + if (test_atomic_function_set( + deviceID, context, queue, num_elements, atom_add_core, set, false, + /*matchGroupSize*/ false, /*usingAtomicPrefix*/ false) + != 0) + return -1; + if (test_atomic_function_set( + deviceID, context, queue, num_elements, atomic_add_core, set, false, + /*matchGroupSize*/ false, /*usingAtomicPrefix*/ true) + != 0) return -1; - if( test_atomic_function_set( deviceID, context, queue, num_elements, atomic_add_core, set, false, /*matchGroupSize*/ false, /*usingAtomicPrefix*/ true ) != 0 ) - return -1; return 0; } #pragma mark ---- sub -const char atom_sub_core[] = " oldValues[tid] = atom_sub( &destMemory[0], tid + 3 );\n"; +const char atom_sub_core[] = + " oldValues[tid] = atom_sub( &destMemory[0], tid + 3 );\n"; -const char atomic_sub_core[] = " oldValues[tid] = atomic_sub( &destMemory[0], tid + 3 );\n"; +const char atomic_sub_core[] = + " oldValues[tid] = atomic_sub( &destMemory[0], tid + 3 );\n"; -cl_int test_atomic_sub_result_int( size_t size, cl_int *startRefValues, size_t whichDestValue ) +cl_int 
test_atomic_sub_result_int(size_t size, cl_int *startRefValues, + size_t whichDestValue) { cl_int total = INT_TEST_VALUE; - for( size_t i = 0; i < size; i++ ) - total -= (cl_int)i + 3; + for (size_t i = 0; i < size; i++) total -= (cl_int)i + 3; return total; } -cl_long test_atomic_sub_result_long( size_t size, cl_long *startRefValues, size_t whichDestValue ) +cl_long test_atomic_sub_result_long(size_t size, cl_long *startRefValues, + size_t whichDestValue) { cl_long total = LONG_TEST_VALUE; - for( size_t i = 0; i < size; i++ ) - total -= i + 3; + for (size_t i = 0; i < size; i++) total -= i + 3; return total; } -int test_atomic_sub(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements) +int test_atomic_sub(cl_device_id deviceID, cl_context context, + cl_command_queue queue, int num_elements) { - TestFns set = { INT_TEST_VALUE, LONG_TEST_VALUE, NULL, test_atomic_sub_result_int, NULL, NULL, test_atomic_sub_result_long, NULL, NULL }; - - if( test_atomic_function_set( deviceID, context, queue, num_elements, atom_sub_core, set, false, /*matchGroupSize*/ false, /*usingAtomicPrefix*/ false ) != 0 ) + TestFns set = { INT_TEST_VALUE, + LONG_TEST_VALUE, + NULL, + test_atomic_sub_result_int, + NULL, + NULL, + test_atomic_sub_result_long, + NULL, + NULL }; + + if (test_atomic_function_set( + deviceID, context, queue, num_elements, atom_sub_core, set, false, + /*matchGroupSize*/ false, /*usingAtomicPrefix*/ false) + != 0) return -1; - if( test_atomic_function_set( deviceID, context, queue, num_elements, atomic_sub_core, set, false, /*matchGroupSize*/ false, /*usingAtomicPrefix*/ true ) != 0 ) + if (test_atomic_function_set( + deviceID, context, queue, num_elements, atomic_sub_core, set, false, + /*matchGroupSize*/ false, /*usingAtomicPrefix*/ true) + != 0) return -1; return 0; } #pragma mark ---- xchg -const char atom_xchg_core[] = " oldValues[tid] = atom_xchg( &destMemory[0], tid );\n"; +const char atom_xchg_core[] = + " oldValues[tid] = 
atom_xchg( &destMemory[0], tid );\n"; -const char atomic_xchg_core[] = " oldValues[tid] = atomic_xchg( &destMemory[0], tid );\n"; -const char atomic_xchg_float_core[] = " oldValues[tid] = atomic_xchg( &destMemory[0], tid );\n"; +const char atomic_xchg_core[] = + " oldValues[tid] = atomic_xchg( &destMemory[0], tid );\n"; +const char atomic_xchg_float_core[] = + " oldValues[tid] = atomic_xchg( &destMemory[0], tid );\n"; -bool test_atomic_xchg_verify_int( size_t size, cl_int *refValues, cl_int finalValue ) +bool test_atomic_xchg_verify_int(size_t size, cl_int *refValues, + cl_int finalValue) { - /* For xchg, each value from 0 to size - 1 should have an entry in the ref array, and ONLY one entry */ + /* For xchg, each value from 0 to size - 1 should have an entry in the ref + * array, and ONLY one entry */ char *valids; size_t i; char originalValidCount = 0; - valids = (char *)malloc( sizeof( char ) * size ); - memset( valids, 0, sizeof( char ) * size ); + valids = (char *)malloc(sizeof(char) * size); + memset(valids, 0, sizeof(char) * size); - for( i = 0; i < size; i++ ) + for (i = 0; i < size; i++) { - if( refValues[ i ] == INT_TEST_VALUE ) + if (refValues[i] == INT_TEST_VALUE) { // Special initial value originalValidCount++; continue; } - if( refValues[ i ] < 0 || (size_t)refValues[ i ] >= size ) + if (refValues[i] < 0 || (size_t)refValues[i] >= size) { - log_error( "ERROR: Reference value %ld outside of valid range! (%d)\n", i, refValues[ i ] ); + log_error( + "ERROR: Reference value %zu outside of valid range! (%d)\n", i, + refValues[i]); return false; } - valids[ refValues[ i ] ] ++; + valids[refValues[i]]++; } - /* Note: ONE entry will have zero count. It'll be the last one that executed, because that value should be - the final value outputted */ - if( valids[ finalValue ] > 0 ) + /* Note: ONE entry will have zero count. 
It'll be the last one that + executed, because that value should be the final value outputted */ + if (valids[finalValue] > 0) { - log_error( "ERROR: Final value %d was also in ref list!\n", finalValue ); + log_error("ERROR: Final value %d was also in ref list!\n", finalValue); return false; } else - valids[ finalValue ] = 1; // So the following loop will be okay + valids[finalValue] = 1; // So the following loop will be okay /* Now check that every entry has one and only one count */ - if( originalValidCount != 1 ) + if (originalValidCount != 1) { - log_error( "ERROR: Starting reference value %d did not occur once-and-only-once (occurred %d)\n", 65191, originalValidCount ); + log_error("ERROR: Starting reference value %d did not occur " + "once-and-only-once (occurred %d)\n", + 65191, originalValidCount); return false; } - for( i = 0; i < size; i++ ) + for (i = 0; i < size; i++) { - if( valids[ i ] != 1 ) + if (valids[i] != 1) { - log_error( "ERROR: Reference value %ld did not occur once-and-only-once (occurred %d)\n", i, valids[ i ] ); - for( size_t j = 0; j < size; j++ ) - log_info( "%d: %d\n", (int)j, (int)valids[ j ] ); + log_error("ERROR: Reference value %zu did not occur " + "once-and-only-once (occurred %d)\n", + i, valids[i]); + for (size_t j = 0; j < size; j++) + log_info("%d: %d\n", (int)j, (int)valids[j]); return false; } } - free( valids ); + free(valids); return true; } -bool test_atomic_xchg_verify_long( size_t size, cl_long *refValues, cl_long finalValue ) +bool test_atomic_xchg_verify_long(size_t size, cl_long *refValues, + cl_long finalValue) { - /* For xchg, each value from 0 to size - 1 should have an entry in the ref array, and ONLY one entry */ + /* For xchg, each value from 0 to size - 1 should have an entry in the ref + * array, and ONLY one entry */ char *valids; size_t i; char originalValidCount = 0; - valids = (char *)malloc( sizeof( char ) * size ); - memset( valids, 0, sizeof( char ) * size ); + valids = (char *)malloc(sizeof(char) * 
size); + memset(valids, 0, sizeof(char) * size); - for( i = 0; i < size; i++ ) + for (i = 0; i < size; i++) { - if( refValues[ i ] == LONG_TEST_VALUE ) + if (refValues[i] == LONG_TEST_VALUE) { // Special initial value originalValidCount++; continue; } - if( refValues[ i ] < 0 || (size_t)refValues[ i ] >= size ) + if (refValues[i] < 0 || (size_t)refValues[i] >= size) { - log_error( "ERROR: Reference value %ld outside of valid range! (%lld)\n", i, refValues[ i ] ); + log_error( + "ERROR: Reference value %zu outside of valid range! (%" PRId64 + ")\n", + i, refValues[i]); return false; } - valids[ refValues[ i ] ] ++; + valids[refValues[i]]++; } - /* Note: ONE entry will have zero count. It'll be the last one that executed, because that value should be - the final value outputted */ - if( valids[ finalValue ] > 0 ) + /* Note: ONE entry will have zero count. It'll be the last one that + executed, because that value should be the final value outputted */ + if (valids[finalValue] > 0) { - log_error( "ERROR: Final value %lld was also in ref list!\n", finalValue ); + log_error("ERROR: Final value %" PRId64 " was also in ref list!\n", + finalValue); return false; } else - valids[ finalValue ] = 1; // So the following loop will be okay + valids[finalValue] = 1; // So the following loop will be okay /* Now check that every entry has one and only one count */ - if( originalValidCount != 1 ) + if (originalValidCount != 1) { - log_error( "ERROR: Starting reference value %d did not occur once-and-only-once (occurred %d)\n", 65191, originalValidCount ); + log_error("ERROR: Starting reference value %d did not occur " + "once-and-only-once (occurred %d)\n", + 65191, originalValidCount); return false; } - for( i = 0; i < size; i++ ) + for (i = 0; i < size; i++) { - if( valids[ i ] != 1 ) + if (valids[i] != 1) { - log_error( "ERROR: Reference value %ld did not occur once-and-only-once (occurred %d)\n", i, valids[ i ] ); - for( size_t j = 0; j < size; j++ ) - log_info( "%d: %d\n", 
(int)j, (int)valids[ j ] ); + log_error("ERROR: Reference value %zu did not occur " + "once-and-only-once (occurred %d)\n", + i, valids[i]); + for (size_t j = 0; j < size; j++) + log_info("%d: %d\n", (int)j, (int)valids[j]); return false; } } - free( valids ); + free(valids); return true; } -bool test_atomic_xchg_verify_float( size_t size, cl_float *refValues, cl_float finalValue ) +bool test_atomic_xchg_verify_float(size_t size, cl_float *refValues, + cl_float finalValue) { - /* For xchg, each value from 0 to size - 1 should have an entry in the ref array, and ONLY one entry */ + /* For xchg, each value from 0 to size - 1 should have an entry in the ref + * array, and ONLY one entry */ char *valids; size_t i; char originalValidCount = 0; - valids = (char *)malloc( sizeof( char ) * size ); - memset( valids, 0, sizeof( char ) * size ); + valids = (char *)malloc(sizeof(char) * size); + memset(valids, 0, sizeof(char) * size); - for( i = 0; i < size; i++ ) + for (i = 0; i < size; i++) { - cl_int *intRefValue = (cl_int *)( &refValues[ i ] ); - if( *intRefValue == INT_TEST_VALUE ) + cl_int *intRefValue = (cl_int *)(&refValues[i]); + if (*intRefValue == INT_TEST_VALUE) { // Special initial value originalValidCount++; continue; } - if( refValues[ i ] < 0 || (size_t)refValues[ i ] >= size ) + if (refValues[i] < 0 || (size_t)refValues[i] >= size) { - log_error( "ERROR: Reference value %ld outside of valid range! (%a)\n", i, refValues[ i ] ); + log_error( + "ERROR: Reference value %zu outside of valid range! (%a)\n", i, + refValues[i]); return false; } - valids[ (int)refValues[ i ] ] ++; + valids[(int)refValues[i]]++; } - /* Note: ONE entry will have zero count. It'll be the last one that executed, because that value should be - the final value outputted */ - if( valids[ (int)finalValue ] > 0 ) + /* Note: ONE entry will have zero count. 
It'll be the last one that + executed, because that value should be the final value outputted */ + if (valids[(int)finalValue] > 0) { - log_error( "ERROR: Final value %a was also in ref list!\n", finalValue ); + log_error("ERROR: Final value %a was also in ref list!\n", finalValue); return false; } else - valids[ (int)finalValue ] = 1; // So the following loop will be okay + valids[(int)finalValue] = 1; // So the following loop will be okay /* Now check that every entry has one and only one count */ - if( originalValidCount != 1 ) + if (originalValidCount != 1) { - log_error( "ERROR: Starting reference value %d did not occur once-and-only-once (occurred %d)\n", 65191, originalValidCount ); + log_error("ERROR: Starting reference value %d did not occur " + "once-and-only-once (occurred %d)\n", + 65191, originalValidCount); return false; } - for( i = 0; i < size; i++ ) + for (i = 0; i < size; i++) { - if( valids[ i ] != 1 ) + if (valids[i] != 1) { - log_error( "ERROR: Reference value %ld did not occur once-and-only-once (occurred %d)\n", i, valids[ i ] ); - for( size_t j = 0; j < size; j++ ) - log_info( "%d: %d\n", (int)j, (int)valids[ j ] ); + log_error("ERROR: Reference value %zu did not occur " + "once-and-only-once (occurred %d)\n", + i, valids[i]); + for (size_t j = 0; j < size; j++) + log_info("%d: %d\n", (int)j, (int)valids[j]); return false; } } - free( valids ); + free(valids); return true; } -int test_atomic_xchg(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements) +int test_atomic_xchg(cl_device_id deviceID, cl_context context, + cl_command_queue queue, int num_elements) { - TestFns set = { INT_TEST_VALUE, LONG_TEST_VALUE, NULL, NULL, NULL, test_atomic_xchg_verify_int, NULL, NULL, test_atomic_xchg_verify_long, NULL, NULL, test_atomic_xchg_verify_float }; - - int errors = test_atomic_function_set( deviceID, context, queue, num_elements, atom_xchg_core, set, false, true, /*usingAtomicPrefix*/ false ); - errors |= 
test_atomic_function_set( deviceID, context, queue, num_elements, atomic_xchg_core, set, false, true, /*usingAtomicPrefix*/ true ); - - errors |= test_atomic_function( deviceID, context, queue, num_elements, atomic_xchg_float_core, set, false, false, kFloat, true ); - errors |= test_atomic_function( deviceID, context, queue, num_elements, atomic_xchg_float_core, set, false, true, kFloat, true ); + TestFns set = { INT_TEST_VALUE, + LONG_TEST_VALUE, + NULL, + NULL, + NULL, + test_atomic_xchg_verify_int, + NULL, + NULL, + test_atomic_xchg_verify_long, + NULL, + NULL, + test_atomic_xchg_verify_float }; + + int errors = test_atomic_function_set( + deviceID, context, queue, num_elements, atom_xchg_core, set, false, + true, /*usingAtomicPrefix*/ false); + errors |= test_atomic_function_set(deviceID, context, queue, num_elements, + atomic_xchg_core, set, false, true, + /*usingAtomicPrefix*/ true); + + errors |= test_atomic_function(deviceID, context, queue, num_elements, + atomic_xchg_float_core, set, false, false, + kFloat, true); + errors |= test_atomic_function(deviceID, context, queue, num_elements, + atomic_xchg_float_core, set, false, true, + kFloat, true); return errors; } @@ -716,51 +888,71 @@ int test_atomic_xchg(cl_device_id deviceID, cl_context context, cl_command_queue #pragma mark ---- min -const char atom_min_core[] = " oldValues[tid] = atom_min( &destMemory[0], oldValues[tid] );\n"; +const char atom_min_core[] = + " oldValues[tid] = atom_min( &destMemory[0], oldValues[tid] );\n"; -const char atomic_min_core[] = " oldValues[tid] = atomic_min( &destMemory[0], oldValues[tid] );\n"; +const char atomic_min_core[] = + " oldValues[tid] = atomic_min( &destMemory[0], oldValues[tid] );\n"; -cl_int test_atomic_min_result_int( size_t size, cl_int *startRefValues, size_t whichDestValue ) +cl_int test_atomic_min_result_int(size_t size, cl_int *startRefValues, + size_t whichDestValue) { cl_int total = 0x7fffffffL; - for( size_t i = 0; i < size; i++ ) + for (size_t i = 0; i 
< size; i++) { - if( startRefValues[ i ] < total ) - total = startRefValues[ i ]; + if (startRefValues[i] < total) total = startRefValues[i]; } return total; } -void test_atomic_min_gen_int( size_t size, cl_int *startRefValues, MTdata d ) +void test_atomic_min_gen_int(size_t size, cl_int *startRefValues, MTdata d) { - for( size_t i = 0; i < size; i++ ) - startRefValues[i] = (cl_int)( genrand_int32(d) % 0x3fffffff ) + 0x3fffffff; + for (size_t i = 0; i < size; i++) + startRefValues[i] = + (cl_int)(genrand_int32(d) % 0x3fffffff) + 0x3fffffff; } -cl_long test_atomic_min_result_long( size_t size, cl_long *startRefValues, size_t whichDestValue ) +cl_long test_atomic_min_result_long(size_t size, cl_long *startRefValues, + size_t whichDestValue) { cl_long total = 0x7fffffffffffffffLL; - for( size_t i = 0; i < size; i++ ) + for (size_t i = 0; i < size; i++) { - if( startRefValues[ i ] < total ) - total = startRefValues[ i ]; + if (startRefValues[i] < total) total = startRefValues[i]; } return total; } -void test_atomic_min_gen_long( size_t size, cl_long *startRefValues, MTdata d ) +void test_atomic_min_gen_long(size_t size, cl_long *startRefValues, MTdata d) { - for( size_t i = 0; i < size; i++ ) - startRefValues[i] = (cl_long)( genrand_int32(d) | ( ( (cl_long)genrand_int32(d) & 0x7fffffffL ) << 16 ) ); + for (size_t i = 0; i < size; i++) + startRefValues[i] = + (cl_long)(genrand_int32(d) + | (((cl_long)genrand_int32(d) & 0x7fffffffL) << 16)); } -int test_atomic_min(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements) +int test_atomic_min(cl_device_id deviceID, cl_context context, + cl_command_queue queue, int num_elements) { - TestFns set = { 0x7fffffffL, 0x7fffffffffffffffLL, NULL, test_atomic_min_result_int, test_atomic_min_gen_int, NULL, test_atomic_min_result_long, test_atomic_min_gen_long, NULL }; - - if( test_atomic_function_set( deviceID, context, queue, num_elements, atom_min_core, set, true, /*matchGroupSize*/ false, 
/*usingAtomicPrefix*/ false ) != 0 ) + TestFns set = { 0x7fffffffL, + 0x7fffffffffffffffLL, + NULL, + test_atomic_min_result_int, + test_atomic_min_gen_int, + NULL, + test_atomic_min_result_long, + test_atomic_min_gen_long, + NULL }; + + if (test_atomic_function_set( + deviceID, context, queue, num_elements, atom_min_core, set, true, + /*matchGroupSize*/ false, /*usingAtomicPrefix*/ false) + != 0) return -1; - if( test_atomic_function_set( deviceID, context, queue, num_elements, atomic_min_core, set, true, /*matchGroupSize*/ false, /*usingAtomicPrefix*/ true ) != 0 ) + if (test_atomic_function_set( + deviceID, context, queue, num_elements, atomic_min_core, set, true, + /*matchGroupSize*/ false, /*usingAtomicPrefix*/ true) + != 0) return -1; return 0; } @@ -768,79 +960,118 @@ int test_atomic_min(cl_device_id deviceID, cl_context context, cl_command_queue #pragma mark ---- max -const char atom_max_core[] = " oldValues[tid] = atom_max( &destMemory[0], oldValues[tid] );\n"; +const char atom_max_core[] = + " oldValues[tid] = atom_max( &destMemory[0], oldValues[tid] );\n"; -const char atomic_max_core[] = " oldValues[tid] = atomic_max( &destMemory[0], oldValues[tid] );\n"; +const char atomic_max_core[] = + " oldValues[tid] = atomic_max( &destMemory[0], oldValues[tid] );\n"; -cl_int test_atomic_max_result_int( size_t size, cl_int *startRefValues, size_t whichDestValue ) +cl_int test_atomic_max_result_int(size_t size, cl_int *startRefValues, + size_t whichDestValue) { cl_int total = 0; - for( size_t i = 0; i < size; i++ ) + for (size_t i = 0; i < size; i++) { - if( startRefValues[ i ] > total ) - total = startRefValues[ i ]; + if (startRefValues[i] > total) total = startRefValues[i]; } return total; } -void test_atomic_max_gen_int( size_t size, cl_int *startRefValues, MTdata d ) +void test_atomic_max_gen_int(size_t size, cl_int *startRefValues, MTdata d) { - for( size_t i = 0; i < size; i++ ) - startRefValues[i] = (cl_int)( genrand_int32(d) % 0x3fffffff ) + 0x3fffffff; + 
for (size_t i = 0; i < size; i++) + startRefValues[i] = + (cl_int)(genrand_int32(d) % 0x3fffffff) + 0x3fffffff; } -cl_long test_atomic_max_result_long( size_t size, cl_long *startRefValues, size_t whichDestValue ) +cl_long test_atomic_max_result_long(size_t size, cl_long *startRefValues, + size_t whichDestValue) { cl_long total = 0; - for( size_t i = 0; i < size; i++ ) + for (size_t i = 0; i < size; i++) { - if( startRefValues[ i ] > total ) - total = startRefValues[ i ]; + if (startRefValues[i] > total) total = startRefValues[i]; } return total; } -void test_atomic_max_gen_long( size_t size, cl_long *startRefValues, MTdata d ) +void test_atomic_max_gen_long(size_t size, cl_long *startRefValues, MTdata d) { - for( size_t i = 0; i < size; i++ ) - startRefValues[i] = (cl_long)( genrand_int32(d) | ( ( (cl_long)genrand_int32(d) & 0x7fffffffL ) << 16 ) ); + for (size_t i = 0; i < size; i++) + startRefValues[i] = + (cl_long)(genrand_int32(d) + | (((cl_long)genrand_int32(d) & 0x7fffffffL) << 16)); } -int test_atomic_max(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements) +int test_atomic_max(cl_device_id deviceID, cl_context context, + cl_command_queue queue, int num_elements) { - TestFns set = { 0, 0, NULL, test_atomic_max_result_int, test_atomic_max_gen_int, NULL, test_atomic_max_result_long, test_atomic_max_gen_long, NULL }; - - if( test_atomic_function_set( deviceID, context, queue, num_elements, atom_max_core, set, true, /*matchGroupSize*/ false, /*usingAtomicPrefix*/ false ) != 0 ) + TestFns set = { 0, + 0, + NULL, + test_atomic_max_result_int, + test_atomic_max_gen_int, + NULL, + test_atomic_max_result_long, + test_atomic_max_gen_long, + NULL }; + + if (test_atomic_function_set( + deviceID, context, queue, num_elements, atom_max_core, set, true, + /*matchGroupSize*/ false, /*usingAtomicPrefix*/ false) + != 0) + return -1; + if (test_atomic_function_set( + deviceID, context, queue, num_elements, atomic_max_core, set, true, + 
/*matchGroupSize*/ false, /*usingAtomicPrefix*/ true) + != 0) return -1; - if( test_atomic_function_set( deviceID, context, queue, num_elements, atomic_max_core, set, true, /*matchGroupSize*/ false, /*usingAtomicPrefix*/ true ) != 0 ) - return -1; return 0; } #pragma mark ---- inc -const char atom_inc_core[] = " oldValues[tid] = atom_inc( &destMemory[0] );\n"; +const char atom_inc_core[] = + " oldValues[tid] = atom_inc( &destMemory[0] );\n"; -const char atomic_inc_core[] = " oldValues[tid] = atomic_inc( &destMemory[0] );\n"; +const char atomic_inc_core[] = + " oldValues[tid] = atomic_inc( &destMemory[0] );\n"; -cl_int test_atomic_inc_result_int( size_t size, cl_int *startRefValues, size_t whichDestValue ) +cl_int test_atomic_inc_result_int(size_t size, cl_int *startRefValues, + size_t whichDestValue) { return INT_TEST_VALUE + (cl_int)size; } -cl_long test_atomic_inc_result_long( size_t size, cl_long *startRefValues, size_t whichDestValue ) +cl_long test_atomic_inc_result_long(size_t size, cl_long *startRefValues, + size_t whichDestValue) { return LONG_TEST_VALUE + size; } -int test_atomic_inc(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements) +int test_atomic_inc(cl_device_id deviceID, cl_context context, + cl_command_queue queue, int num_elements) { - TestFns set = { INT_TEST_VALUE, LONG_TEST_VALUE, NULL, test_atomic_inc_result_int, NULL, NULL, test_atomic_inc_result_long, NULL, NULL }; - - if( test_atomic_function_set( deviceID, context, queue, num_elements, atom_inc_core, set, false, /*matchGroupSize*/ false, /*usingAtomicPrefix*/ false ) != 0 ) + TestFns set = { INT_TEST_VALUE, + LONG_TEST_VALUE, + NULL, + test_atomic_inc_result_int, + NULL, + NULL, + test_atomic_inc_result_long, + NULL, + NULL }; + + if (test_atomic_function_set( + deviceID, context, queue, num_elements, atom_inc_core, set, false, + /*matchGroupSize*/ false, /*usingAtomicPrefix*/ false) + != 0) return -1; - if( test_atomic_function_set( deviceID, context, 
queue, num_elements, atomic_inc_core, set, false, /*matchGroupSize*/ false, /*usingAtomicPrefix*/ true ) != 0 ) + if (test_atomic_function_set( + deviceID, context, queue, num_elements, atomic_inc_core, set, false, + /*matchGroupSize*/ false, /*usingAtomicPrefix*/ true) + != 0) return -1; return 0; } @@ -848,27 +1079,46 @@ int test_atomic_inc(cl_device_id deviceID, cl_context context, cl_command_queue #pragma mark ---- dec -const char atom_dec_core[] = " oldValues[tid] = atom_dec( &destMemory[0] );\n"; +const char atom_dec_core[] = + " oldValues[tid] = atom_dec( &destMemory[0] );\n"; -const char atomic_dec_core[] = " oldValues[tid] = atomic_dec( &destMemory[0] );\n"; +const char atomic_dec_core[] = + " oldValues[tid] = atomic_dec( &destMemory[0] );\n"; -cl_int test_atomic_dec_result_int( size_t size, cl_int *startRefValues, size_t whichDestValue ) +cl_int test_atomic_dec_result_int(size_t size, cl_int *startRefValues, + size_t whichDestValue) { return INT_TEST_VALUE - (cl_int)size; } -cl_long test_atomic_dec_result_long( size_t size, cl_long *startRefValues, size_t whichDestValue ) +cl_long test_atomic_dec_result_long(size_t size, cl_long *startRefValues, + size_t whichDestValue) { return LONG_TEST_VALUE - size; } -int test_atomic_dec(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements) +int test_atomic_dec(cl_device_id deviceID, cl_context context, + cl_command_queue queue, int num_elements) { - TestFns set = { INT_TEST_VALUE, LONG_TEST_VALUE, NULL, test_atomic_dec_result_int, NULL, NULL, test_atomic_dec_result_long, NULL, NULL }; - - if( test_atomic_function_set( deviceID, context, queue, num_elements, atom_dec_core, set, false, /*matchGroupSize*/ false, /*usingAtomicPrefix*/ false ) != 0 ) + TestFns set = { INT_TEST_VALUE, + LONG_TEST_VALUE, + NULL, + test_atomic_dec_result_int, + NULL, + NULL, + test_atomic_dec_result_long, + NULL, + NULL }; + + if (test_atomic_function_set( + deviceID, context, queue, num_elements, 
atom_dec_core, set, false, + /*matchGroupSize*/ false, /*usingAtomicPrefix*/ false) + != 0) return -1; - if( test_atomic_function_set( deviceID, context, queue, num_elements, atomic_dec_core, set, false, /*matchGroupSize*/ false, /*usingAtomicPrefix*/ true ) != 0 ) + if (test_atomic_function_set( + deviceID, context, queue, num_elements, atomic_dec_core, set, false, + /*matchGroupSize*/ false, /*usingAtomicPrefix*/ true) + != 0) return -1; return 0; } @@ -877,129 +1127,159 @@ int test_atomic_dec(cl_device_id deviceID, cl_context context, cl_command_queue #pragma mark ---- cmpxchg /* We test cmpxchg by implementing (the long way) atom_add */ +// clang-format off const char atom_cmpxchg_core[] = -" int oldValue, origValue, newValue;\n" -" do { \n" -" origValue = destMemory[0];\n" -" newValue = origValue + tid + 2;\n" -" oldValue = atom_cmpxchg( &destMemory[0], origValue, newValue );\n" -" } while( oldValue != origValue );\n" -" oldValues[tid] = oldValue;\n" -; + " int oldValue, origValue, newValue;\n" + " do { \n" + " origValue = destMemory[0];\n" + " newValue = origValue + tid + 2;\n" + " oldValue = atom_cmpxchg( &destMemory[0], origValue, newValue );\n" + " } while( oldValue != origValue );\n" + " oldValues[tid] = oldValue;\n"; const char atom_cmpxchg64_core[] = -" long oldValue, origValue, newValue;\n" -" do { \n" -" origValue = destMemory[0];\n" -" newValue = origValue + tid + 2;\n" -" oldValue = atom_cmpxchg( &destMemory[0], origValue, newValue );\n" -" } while( oldValue != origValue );\n" -" oldValues[tid] = oldValue;\n" -; + " long oldValue, origValue, newValue;\n" + " do { \n" + " origValue = destMemory[0];\n" + " newValue = origValue + tid + 2;\n" + " oldValue = atom_cmpxchg( &destMemory[0], origValue, newValue );\n" + " } while( oldValue != origValue );\n" + " oldValues[tid] = oldValue;\n"; const char atomic_cmpxchg_core[] = -" int oldValue, origValue, newValue;\n" -" do { \n" -" origValue = destMemory[0];\n" -" newValue = origValue + tid + 2;\n" -" 
oldValue = atomic_cmpxchg( &destMemory[0], origValue, newValue );\n" -" } while( oldValue != origValue );\n" -" oldValues[tid] = oldValue;\n" -; - -cl_int test_atomic_cmpxchg_result_int( size_t size, cl_int *startRefValues, size_t whichDestValue ) + " int oldValue, origValue, newValue;\n" + " do { \n" + " origValue = destMemory[0];\n" + " newValue = origValue + tid + 2;\n" + " oldValue = atomic_cmpxchg( &destMemory[0], origValue, newValue );\n" + " } while( oldValue != origValue );\n" + " oldValues[tid] = oldValue;\n"; +// clang-format on + +cl_int test_atomic_cmpxchg_result_int(size_t size, cl_int *startRefValues, + size_t whichDestValue) { cl_int total = INT_TEST_VALUE; - for( size_t i = 0; i < size; i++ ) - total += (cl_int)i + 2; + for (size_t i = 0; i < size; i++) total += (cl_int)i + 2; return total; } -cl_long test_atomic_cmpxchg_result_long( size_t size, cl_long *startRefValues, size_t whichDestValue ) +cl_long test_atomic_cmpxchg_result_long(size_t size, cl_long *startRefValues, + size_t whichDestValue) { cl_long total = LONG_TEST_VALUE; - for( size_t i = 0; i < size; i++ ) - total += i + 2; + for (size_t i = 0; i < size; i++) total += i + 2; return total; } -int test_atomic_cmpxchg(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements) +int test_atomic_cmpxchg(cl_device_id deviceID, cl_context context, + cl_command_queue queue, int num_elements) { - TestFns set = { INT_TEST_VALUE, LONG_TEST_VALUE, NULL, test_atomic_cmpxchg_result_int, NULL, NULL, test_atomic_cmpxchg_result_long, NULL, NULL }; + TestFns set = { INT_TEST_VALUE, + LONG_TEST_VALUE, + NULL, + test_atomic_cmpxchg_result_int, + NULL, + NULL, + test_atomic_cmpxchg_result_long, + NULL, + NULL }; int errors = 0; log_info(" Testing atom_ functions...\n"); - errors |= test_atomic_function( deviceID, context, queue, num_elements, atom_cmpxchg_core, set, false, false, kInt, true ); - errors |= test_atomic_function( deviceID, context, queue, num_elements, atom_cmpxchg_core, 
set, false, false, kUInt, true ); - errors |= test_atomic_function( deviceID, context, queue, num_elements, atom_cmpxchg_core, set, false, true, kInt, true ); - errors |= test_atomic_function( deviceID, context, queue, num_elements, atom_cmpxchg_core, set, false, true, kUInt, true ); - - errors |= test_atomic_function( deviceID, context, queue, num_elements, atom_cmpxchg64_core, set, false, false, kLong, true ); - errors |= test_atomic_function( deviceID, context, queue, num_elements, atom_cmpxchg64_core, set, false, false, kULong, true ); - errors |= test_atomic_function( deviceID, context, queue, num_elements, atom_cmpxchg64_core, set, false, true, kLong, true ); - errors |= test_atomic_function( deviceID, context, queue, num_elements, atom_cmpxchg64_core, set, false, true, kULong, true ); + errors |= + test_atomic_function(deviceID, context, queue, num_elements, + atom_cmpxchg_core, set, false, false, kInt, true); + errors |= + test_atomic_function(deviceID, context, queue, num_elements, + atom_cmpxchg_core, set, false, false, kUInt, true); + errors |= + test_atomic_function(deviceID, context, queue, num_elements, + atom_cmpxchg_core, set, false, true, kInt, true); + errors |= + test_atomic_function(deviceID, context, queue, num_elements, + atom_cmpxchg_core, set, false, true, kUInt, true); + + errors |= test_atomic_function(deviceID, context, queue, num_elements, + atom_cmpxchg64_core, set, false, false, + kLong, true); + errors |= test_atomic_function(deviceID, context, queue, num_elements, + atom_cmpxchg64_core, set, false, false, + kULong, true); + errors |= test_atomic_function(deviceID, context, queue, num_elements, + atom_cmpxchg64_core, set, false, true, kLong, + true); + errors |= test_atomic_function(deviceID, context, queue, num_elements, + atom_cmpxchg64_core, set, false, true, + kULong, true); log_info(" Testing atomic_ functions...\n"); - errors |= test_atomic_function( deviceID, context, queue, num_elements, atomic_cmpxchg_core, set, false, false, 
kInt, true ); - errors |= test_atomic_function( deviceID, context, queue, num_elements, atomic_cmpxchg_core, set, false, false, kUInt, true ); - errors |= test_atomic_function( deviceID, context, queue, num_elements, atomic_cmpxchg_core, set, false, true, kInt, true ); - errors |= test_atomic_function( deviceID, context, queue, num_elements, atomic_cmpxchg_core, set, false, true, kUInt, true ); - - if( errors ) - return -1; + errors |= test_atomic_function(deviceID, context, queue, num_elements, + atomic_cmpxchg_core, set, false, false, kInt, + true); + errors |= test_atomic_function(deviceID, context, queue, num_elements, + atomic_cmpxchg_core, set, false, false, + kUInt, true); + errors |= + test_atomic_function(deviceID, context, queue, num_elements, + atomic_cmpxchg_core, set, false, true, kInt, true); + errors |= test_atomic_function(deviceID, context, queue, num_elements, + atomic_cmpxchg_core, set, false, true, kUInt, + true); + + if (errors) return -1; return 0; } #pragma mark -------- Bitwise functions -size_t test_bitwise_num_results( size_t threadCount, ExplicitType dataType ) +size_t test_bitwise_num_results(size_t threadCount, ExplicitType dataType) { - size_t numBits = get_explicit_type_size( dataType ) * 8; + size_t numBits = get_explicit_type_size(dataType) * 8; - return ( threadCount + numBits - 1 ) / numBits; + return (threadCount + numBits - 1) / numBits; } #pragma mark ---- and +// clang-format off const char atom_and_core[] = -" size_t numBits = sizeof( destMemory[0] ) * 8;\n" -" int whichResult = tid / numBits;\n" -" int bitIndex = tid - ( whichResult * numBits );\n" -"\n" -" oldValues[tid] = atom_and( &destMemory[whichResult], ~( 1L << bitIndex ) );\n" -; + " size_t numBits = sizeof( destMemory[0] ) * 8;\n" + " int whichResult = tid / numBits;\n" + " int bitIndex = tid - ( whichResult * numBits );\n" + "\n" + " oldValues[tid] = atom_and( &destMemory[whichResult], ~( 1L << bitIndex ) );\n"; const char atomic_and_core[] = -" size_t numBits = 
sizeof( destMemory[0] ) * 8;\n" -" int whichResult = tid / numBits;\n" -" int bitIndex = tid - ( whichResult * numBits );\n" -"\n" -" oldValues[tid] = atomic_and( &destMemory[whichResult], ~( 1L << bitIndex ) );\n" -; + " size_t numBits = sizeof( destMemory[0] ) * 8;\n" + " int whichResult = tid / numBits;\n" + " int bitIndex = tid - ( whichResult * numBits );\n" + "\n" + " oldValues[tid] = atomic_and( &destMemory[whichResult], ~( 1L << bitIndex ) );\n"; +// clang-format on -cl_int test_atomic_and_result_int( size_t size, cl_int *startRefValues, size_t whichResult ) +cl_int test_atomic_and_result_int(size_t size, cl_int *startRefValues, + size_t whichResult) { - size_t numThreads = ( (size_t)size + 31 ) / 32; - if( whichResult < numThreads - 1 ) - return 0; + size_t numThreads = ((size_t)size + 31) / 32; + if (whichResult < numThreads - 1) return 0; // Last item doesn't get and'ed on every bit, so we have to mask away size_t numBits = (size_t)size - whichResult * 32; cl_int bits = (cl_int)0xffffffffL; - for( size_t i = 0; i < numBits; i++ ) - bits &= ~( 1 << i ); + for (size_t i = 0; i < numBits; i++) bits &= ~(1 << i); return bits; } -cl_long test_atomic_and_result_long( size_t size, cl_long *startRefValues, size_t whichResult ) +cl_long test_atomic_and_result_long(size_t size, cl_long *startRefValues, + size_t whichResult) { - size_t numThreads = ( (size_t)size + 63 ) / 64; - if( whichResult < numThreads - 1 ) - return 0; + size_t numThreads = ((size_t)size + 63) / 64; + if (whichResult < numThreads - 1) return 0; // Last item doesn't get and'ed on every bit, so we have to mask away size_t numBits = (size_t)size - whichResult * 64; @@ -1009,14 +1289,28 @@ cl_long test_atomic_and_result_long( size_t size, cl_long *startRefValues, size_ return bits; } -int test_atomic_and(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements) +int test_atomic_and(cl_device_id deviceID, cl_context context, + cl_command_queue queue, int num_elements) { - 
TestFns set = { 0xffffffff, 0xffffffffffffffffLL, test_bitwise_num_results, - test_atomic_and_result_int, NULL, NULL, test_atomic_and_result_long, NULL, NULL }; - - if( test_atomic_function_set( deviceID, context, queue, num_elements, atom_and_core, set, true, /*matchGroupSize*/ false, /*usingAtomicPrefix*/ false ) != 0 ) + TestFns set = { 0xffffffff, + 0xffffffffffffffffLL, + test_bitwise_num_results, + test_atomic_and_result_int, + NULL, + NULL, + test_atomic_and_result_long, + NULL, + NULL }; + + if (test_atomic_function_set( + deviceID, context, queue, num_elements, atom_and_core, set, true, + /*matchGroupSize*/ false, /*usingAtomicPrefix*/ false) + != 0) return -1; - if( test_atomic_function_set( deviceID, context, queue, num_elements, atomic_and_core, set, true, /*matchGroupSize*/ false, /*usingAtomicPrefix*/ true ) != 0 ) + if (test_atomic_function_set( + deviceID, context, queue, num_elements, atomic_and_core, set, true, + /*matchGroupSize*/ false, /*usingAtomicPrefix*/ true) + != 0) return -1; return 0; } @@ -1024,59 +1318,68 @@ int test_atomic_and(cl_device_id deviceID, cl_context context, cl_command_queue #pragma mark ---- or +// clang-format off const char atom_or_core[] = -" size_t numBits = sizeof( destMemory[0] ) * 8;\n" -" int whichResult = tid / numBits;\n" -" int bitIndex = tid - ( whichResult * numBits );\n" -"\n" -" oldValues[tid] = atom_or( &destMemory[whichResult], ( 1L << bitIndex ) );\n" -; + " size_t numBits = sizeof( destMemory[0] ) * 8;\n" + " int whichResult = tid / numBits;\n" + " int bitIndex = tid - ( whichResult * numBits );\n" + "\n" + " oldValues[tid] = atom_or( &destMemory[whichResult], ( 1L << bitIndex ) );\n"; const char atomic_or_core[] = -" size_t numBits = sizeof( destMemory[0] ) * 8;\n" -" int whichResult = tid / numBits;\n" -" int bitIndex = tid - ( whichResult * numBits );\n" -"\n" -" oldValues[tid] = atomic_or( &destMemory[whichResult], ( 1L << bitIndex ) );\n" -; - -cl_int test_atomic_or_result_int( size_t size, cl_int 
*startRefValues, size_t whichResult ) + " size_t numBits = sizeof( destMemory[0] ) * 8;\n" + " int whichResult = tid / numBits;\n" + " int bitIndex = tid - ( whichResult * numBits );\n" + "\n" + " oldValues[tid] = atomic_or( &destMemory[whichResult], ( 1L << bitIndex ) );\n"; +// clang-format on + +cl_int test_atomic_or_result_int(size_t size, cl_int *startRefValues, + size_t whichResult) { - size_t numThreads = ( (size_t)size + 31 ) / 32; - if( whichResult < numThreads - 1 ) - return 0xffffffff; + size_t numThreads = ((size_t)size + 31) / 32; + if (whichResult < numThreads - 1) return 0xffffffff; // Last item doesn't get and'ed on every bit, so we have to mask away size_t numBits = (size_t)size - whichResult * 32; cl_int bits = 0; - for( size_t i = 0; i < numBits; i++ ) - bits |= ( 1 << i ); + for (size_t i = 0; i < numBits; i++) bits |= (1 << i); return bits; } -cl_long test_atomic_or_result_long( size_t size, cl_long *startRefValues, size_t whichResult ) +cl_long test_atomic_or_result_long(size_t size, cl_long *startRefValues, + size_t whichResult) { - size_t numThreads = ( (size_t)size + 63 ) / 64; - if( whichResult < numThreads - 1 ) - return 0x0ffffffffffffffffLL; + size_t numThreads = ((size_t)size + 63) / 64; + if (whichResult < numThreads - 1) return 0x0ffffffffffffffffLL; // Last item doesn't get and'ed on every bit, so we have to mask away size_t numBits = (size_t)size - whichResult * 64; cl_long bits = 0; - for( size_t i = 0; i < numBits; i++ ) - bits |= ( 1LL << i ); + for (size_t i = 0; i < numBits; i++) bits |= (1LL << i); return bits; } -int test_atomic_or(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements) +int test_atomic_or(cl_device_id deviceID, cl_context context, + cl_command_queue queue, int num_elements) { - TestFns set = { 0, 0LL, test_bitwise_num_results, test_atomic_or_result_int, NULL, NULL, test_atomic_or_result_long, NULL, NULL }; + TestFns set = { + 0, 0LL, test_bitwise_num_results, 
test_atomic_or_result_int, + NULL, NULL, test_atomic_or_result_long, NULL, + NULL + }; - if( test_atomic_function_set( deviceID, context, queue, num_elements, atom_or_core, set, true, /*matchGroupSize*/ false, /*usingAtomicPrefix*/ false ) != 0 ) + if (test_atomic_function_set( + deviceID, context, queue, num_elements, atom_or_core, set, true, + /*matchGroupSize*/ false, /*usingAtomicPrefix*/ false) + != 0) return -1; - if( test_atomic_function_set( deviceID, context, queue, num_elements, atomic_or_core, set, true, /*matchGroupSize*/ false, /*usingAtomicPrefix*/ true ) != 0 ) + if (test_atomic_function_set( + deviceID, context, queue, num_elements, atomic_or_core, set, true, + /*matchGroupSize*/ false, /*usingAtomicPrefix*/ true) + != 0) return -1; return 0; } @@ -1096,33 +1399,44 @@ const char atomic_xor_core[] = "\n" " oldValues[tid] = atomic_xor( &destMemory[0], 1L << bitIndex );\n"; -cl_int test_atomic_xor_result_int( size_t size, cl_int *startRefValues, size_t whichResult ) +cl_int test_atomic_xor_result_int(size_t size, cl_int *startRefValues, + size_t whichResult) { cl_int total = 0x2f08ab41; - for( size_t i = 0; i < size; i++ ) - total ^= ( 1 << ( i & 31 ) ); + for (size_t i = 0; i < size; i++) total ^= (1 << (i & 31)); return total; } -cl_long test_atomic_xor_result_long( size_t size, cl_long *startRefValues, size_t whichResult ) +cl_long test_atomic_xor_result_long(size_t size, cl_long *startRefValues, + size_t whichResult) { cl_long total = 0x2f08ab418ba0541LL; - for( size_t i = 0; i < size; i++ ) - total ^= ( 1LL << ( i & 63 ) ); + for (size_t i = 0; i < size; i++) total ^= (1LL << (i & 63)); return total; } -int test_atomic_xor(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements) +int test_atomic_xor(cl_device_id deviceID, cl_context context, + cl_command_queue queue, int num_elements) { - TestFns set = { 0x2f08ab41, 0x2f08ab418ba0541LL, NULL, test_atomic_xor_result_int, NULL, NULL, test_atomic_xor_result_long, NULL, 
NULL }; - - if( test_atomic_function_set( deviceID, context, queue, num_elements, atom_xor_core, set, true, /*matchGroupSize*/ false, /*usingAtomicPrefix*/ false ) != 0 ) + TestFns set = { 0x2f08ab41, + 0x2f08ab418ba0541LL, + NULL, + test_atomic_xor_result_int, + NULL, + NULL, + test_atomic_xor_result_long, + NULL, + NULL }; + + if (test_atomic_function_set( + deviceID, context, queue, num_elements, atom_xor_core, set, true, + /*matchGroupSize*/ false, /*usingAtomicPrefix*/ false) + != 0) return -1; - if( test_atomic_function_set( deviceID, context, queue, num_elements, atomic_xor_core, set, true, /*matchGroupSize*/ false, /*usingAtomicPrefix*/ true ) != 0 ) + if (test_atomic_function_set( + deviceID, context, queue, num_elements, atomic_xor_core, set, true, + /*matchGroupSize*/ false, /*usingAtomicPrefix*/ true) + != 0) return -1; return 0; } - - - - diff --git a/test_conformance/atomics/test_indexed_cases.cpp b/test_conformance/atomics/test_indexed_cases.cpp index b85e3d24..2bba3e24 100644 --- a/test_conformance/atomics/test_indexed_cases.cpp +++ b/test_conformance/atomics/test_indexed_cases.cpp @@ -1,6 +1,6 @@ // // Copyright (c) 2017 The Khronos Group Inc. -// +// // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at @@ -16,48 +16,55 @@ #include "testBase.h" #include "harness/conversions.h" -const char * atomic_index_source = -"#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable\n" -"// Counter keeps track of which index in counts we are using.\n" -"// We get that value, increment it, and then set that index in counts to our thread ID.\n" -"// At the end of this we should have all thread IDs in some random location in counts\n" -"// exactly once. 
If atom_add failed then we will write over various thread IDs and we\n" -"// will be missing some.\n" -"\n" -"__kernel void add_index_test(__global int *counter, __global int *counts) {\n" -" int tid = get_global_id(0);\n" -" \n" -" int counter_to_use = atom_add(counter, 1);\n" -" counts[counter_to_use] = tid;\n" -"}"; - -int test_atomic_add_index(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements) +// clang-format off +const char *atomic_index_source = + "#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable\n" + "// Counter keeps track of which index in counts we are using.\n" + "// We get that value, increment it, and then set that index in counts to our thread ID.\n" + "// At the end of this we should have all thread IDs in some random location in counts\n" + "// exactly once. If atom_add failed then we will write over various thread IDs and we\n" + "// will be missing some.\n" + "\n" + "__kernel void add_index_test(__global int *counter, __global int *counts) {\n" + " int tid = get_global_id(0);\n" + " \n" + " int counter_to_use = atom_add(counter, 1);\n" + " counts[counter_to_use] = tid;\n" + "}"; +// clang-format on + +int test_atomic_add_index(cl_device_id deviceID, cl_context context, + cl_command_queue queue, int num_elements) { clProgramWrapper program; clKernelWrapper kernel; clMemWrapper counter, counters; size_t numGlobalThreads, numLocalThreads; - int fail = 0, succeed = 0, err; + int fail = 0, err; - /* Check if atomics are supported. */ - if (!is_extension_available(deviceID, "cl_khr_global_int32_base_atomics")) { - log_info("Base atomics not supported (cl_khr_global_int32_base_atomics). Skipping test.\n"); - return 0; - } + /* Check if atomics are supported. */ + if (!is_extension_available(deviceID, "cl_khr_global_int32_base_atomics")) + { + log_info("Base atomics not supported " + "(cl_khr_global_int32_base_atomics). 
Skipping test.\n"); + return 0; + } //===== add_index test // The index test replicates what particles does. - // It uses one memory location to keep track of the current index and then each thread - // does an atomic add to it to get its new location. The threads then write to their - // assigned location. At the end we check to make sure that each thread's ID shows up - // exactly once in the output. + // It uses one memory location to keep track of the current index and then + // each thread does an atomic add to it to get its new location. The threads + // then write to their assigned location. At the end we check to make sure + // that each thread's ID shows up exactly once in the output. numGlobalThreads = 2048; - if( create_single_kernel_helper( context, &program, &kernel, 1, &atomic_index_source, "add_index_test" ) ) + if (create_single_kernel_helper(context, &program, &kernel, 1, + &atomic_index_source, "add_index_test")) return -1; - if( get_max_common_work_group_size( context, kernel, numGlobalThreads, &numLocalThreads ) ) + if (get_max_common_work_group_size(context, kernel, numGlobalThreads, + &numLocalThreads)) return -1; log_info("Execute global_threads:%d local_threads:%d\n", @@ -72,103 +79,148 @@ int test_atomic_add_index(cl_device_id deviceID, cl_context context, cl_command_ sizeof(cl_int) * numGlobalThreads, NULL, NULL); // Reset all those locations to -1 to indciate they have not been used. 
- cl_int *values = (cl_int*) malloc(sizeof(cl_int)*numGlobalThreads); - if (values == NULL) { - log_error("add_index_test FAILED to allocate memory for initial values.\n"); - fail = 1; succeed = -1; - } else { + cl_int *values = (cl_int *)malloc(sizeof(cl_int) * numGlobalThreads); + if (values == NULL) + { + log_error( + "add_index_test FAILED to allocate memory for initial values.\n"); + fail = 1; + } + else + { memset(values, -1, numLocalThreads); - unsigned int i=0; - for (i=0; i<numGlobalThreads; i++) - values[i] = -1; - int init=0; - err = clEnqueueWriteBuffer(queue, counters, true, 0, numGlobalThreads*sizeof(cl_int), values, 0, NULL, NULL); - err |= clEnqueueWriteBuffer(queue, counter, true, 0,1*sizeof(cl_int), &init, 0, NULL, NULL); - if (err) { - log_error("add_index_test FAILED to write initial values to arrays: %d\n", err); - fail=1; succeed=-1; - } else { + unsigned int i = 0; + for (i = 0; i < numGlobalThreads; i++) values[i] = -1; + int init = 0; + err = clEnqueueWriteBuffer(queue, counters, true, 0, + numGlobalThreads * sizeof(cl_int), values, 0, + NULL, NULL); + err |= clEnqueueWriteBuffer(queue, counter, true, 0, 1 * sizeof(cl_int), + &init, 0, NULL, NULL); + if (err) + { + log_error( + "add_index_test FAILED to write initial values to arrays: %d\n", + err); + fail = 1; + } + else + { err = clSetKernelArg(kernel, 0, sizeof(counter), &counter); err |= clSetKernelArg(kernel, 1, sizeof(counters), &counters); - if (err) { - log_error("add_index_test FAILED to set kernel arguments: %d\n", err); - fail=1; succeed=-1; - } else { - err = clEnqueueNDRangeKernel( queue, kernel, 1, NULL, &numGlobalThreads, &numLocalThreads, 0, NULL, NULL ); - if (err) { - log_error("add_index_test FAILED to execute kernel: %d\n", err); - fail=1; succeed=-1; - } else { - err = clEnqueueReadBuffer( queue, counters, true, 0, sizeof(cl_int)*numGlobalThreads, values, 0, NULL, NULL ); - if (err) { - log_error("add_index_test FAILED to read back results: %d\n", err); - fail = 1; 
succeed=-1; - } else { + if (err) + { + log_error("add_index_test FAILED to set kernel arguments: %d\n", + err); + fail = 1; + } + else + { + err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, + &numGlobalThreads, + &numLocalThreads, 0, NULL, NULL); + if (err) + { + log_error("add_index_test FAILED to execute kernel: %d\n", + err); + fail = 1; + } + else + { + err = clEnqueueReadBuffer(queue, counters, true, 0, + sizeof(cl_int) * numGlobalThreads, + values, 0, NULL, NULL); + if (err) + { + log_error( + "add_index_test FAILED to read back results: %d\n", + err); + fail = 1; + } + else + { unsigned int looking_for, index; - for (looking_for=0; looking_for<numGlobalThreads; looking_for++) { - int instances_found=0; - for (index=0; index<numGlobalThreads; index++) { - if (values[index]==(int)looking_for) + for (looking_for = 0; looking_for < numGlobalThreads; + looking_for++) + { + int instances_found = 0; + for (index = 0; index < numGlobalThreads; index++) + { + if (values[index] == (int)looking_for) instances_found++; } - if (instances_found != 1) { - log_error("add_index_test FAILED: wrong number of instances (%d!=1) for counter %d.\n", instances_found, looking_for); - fail = 1; succeed=-1; + if (instances_found != 1) + { + log_error( + "add_index_test FAILED: wrong number of " + "instances (%d!=1) for counter %d.\n", + instances_found, looking_for); + fail = 1; } } } } } } - if (!fail) { - log_info("add_index_test passed. Each thread used exactly one index.\n"); + if (!fail) + { + log_info( + "add_index_test passed. 
Each thread used exactly one index.\n"); } free(values); } return fail; } +// clang-format off const char *add_index_bin_kernel[] = { -"#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable\n" -"// This test assigns a bunch of values to bins and then tries to put them in the bins in parallel\n" -"// using an atomic add to keep track of the current location to write into in each bin.\n" -"// This is the same as the memory update for the particles demo.\n" -"\n" -"__kernel void add_index_bin_test(__global int *bin_counters, __global int *bins, __global int *bin_assignments, int max_counts_per_bin) {\n" -" int tid = get_global_id(0);\n" -"\n" -" int location = bin_assignments[tid];\n" -" int counter = atom_add(&bin_counters[location], 1);\n" -" bins[location*max_counts_per_bin + counter] = tid;\n" -"}" }; - -// This test assigns a bunch of values to bins and then tries to put them in the bins in parallel -// using an atomic add to keep track of the current location to write into in each bin. -// This is the same as the memory update for the particles demo. 
-int add_index_bin_test(size_t *global_threads, cl_command_queue queue, cl_context context, MTdata d) + "#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable\n" + "// This test assigns a bunch of values to bins and then tries to put them in the bins in parallel\n" + "// using an atomic add to keep track of the current location to write into in each bin.\n" + "// This is the same as the memory update for the particles demo.\n" + "\n" + "__kernel void add_index_bin_test(__global int *bin_counters, __global int *bins, __global int *bin_assignments, int max_counts_per_bin) {\n" + " int tid = get_global_id(0);\n" + "\n" + " int location = bin_assignments[tid];\n" + " int counter = atom_add(&bin_counters[location], 1);\n" + " bins[location*max_counts_per_bin + counter] = tid;\n" + "}" }; +// clang-format on + +// This test assigns a bunch of values to bins and then tries to put them in the +// bins in parallel using an atomic add to keep track of the current location to +// write into in each bin. This is the same as the memory update for the +// particles demo. 
+int add_index_bin_test(size_t *global_threads, cl_command_queue queue, + cl_context context, MTdata d) { int number_of_items = (int)global_threads[0]; size_t local_threads[1]; int divisor = 12; - int number_of_bins = number_of_items/divisor; - int max_counts_per_bin = divisor*2; + int number_of_bins = number_of_items / divisor; + int max_counts_per_bin = divisor * 2; int fail = 0; - int succeed = 0; int err; clProgramWrapper program; clKernelWrapper kernel; - // log_info("add_index_bin_test: %d items, into %d bins, with a max of %d items per bin (bins is %d long).\n", - // number_of_items, number_of_bins, max_counts_per_bin, number_of_bins*max_counts_per_bin); + // log_info("add_index_bin_test: %d items, into %d bins, with a max of %d + // items per bin (bins is %d long).\n", + // number_of_items, number_of_bins, max_counts_per_bin, + // number_of_bins*max_counts_per_bin); //===== add_index_bin test // The index test replicates what particles does. - err = create_single_kernel_helper(context, &program, &kernel, 1, add_index_bin_kernel, "add_index_bin_test" ); - test_error( err, "Unable to create testing kernel" ); + err = + create_single_kernel_helper(context, &program, &kernel, 1, + add_index_bin_kernel, "add_index_bin_test"); + test_error(err, "Unable to create testing kernel"); - if( get_max_common_work_group_size( context, kernel, global_threads[0], &local_threads[0] ) ) + if (get_max_common_work_group_size(context, kernel, global_threads[0], + &local_threads[0])) return -1; log_info("Execute global_threads:%d local_threads:%d\n", @@ -185,152 +237,228 @@ int add_index_bin_test(size_t *global_threads, cl_command_queue queue, cl_contex clCreateBuffer(context, CL_MEM_READ_ONLY, sizeof(cl_int) * number_of_items, NULL, NULL); - if (bin_counters == NULL) { + if (bin_counters == NULL) + { log_error("add_index_bin_test FAILED to allocate bin_counters.\n"); return -1; } - if (bins == NULL) { + if (bins == NULL) + { log_error("add_index_bin_test FAILED to allocate 
bins.\n"); return -1; } - if (bin_assignments == NULL) { + if (bin_assignments == NULL) + { log_error("add_index_bin_test FAILED to allocate bin_assignments.\n"); return -1; } // Initialize our storage - cl_int *l_bin_counts = (cl_int*)malloc(sizeof(cl_int)*number_of_bins); - if (!l_bin_counts) { - log_error("add_index_bin_test FAILED to allocate initial values for bin_counters.\n"); + cl_int *l_bin_counts = (cl_int *)malloc(sizeof(cl_int) * number_of_bins); + if (!l_bin_counts) + { + log_error("add_index_bin_test FAILED to allocate initial values for " + "bin_counters.\n"); return -1; } int i; - for (i=0; i<number_of_bins; i++) - l_bin_counts[i] = 0; - err = clEnqueueWriteBuffer(queue, bin_counters, true, 0, sizeof(cl_int)*number_of_bins, l_bin_counts, 0, NULL, NULL); - if (err) { - log_error("add_index_bin_test FAILED to set initial values for bin_counters: %d\n", err); + for (i = 0; i < number_of_bins; i++) l_bin_counts[i] = 0; + err = clEnqueueWriteBuffer(queue, bin_counters, true, 0, + sizeof(cl_int) * number_of_bins, l_bin_counts, 0, + NULL, NULL); + if (err) + { + log_error("add_index_bin_test FAILED to set initial values for " + "bin_counters: %d\n", + err); return -1; } - cl_int *values = (cl_int*)malloc(sizeof(cl_int)*number_of_bins*max_counts_per_bin); - if (!values) { - log_error("add_index_bin_test FAILED to allocate initial values for bins.\n"); + cl_int *values = + (cl_int *)malloc(sizeof(cl_int) * number_of_bins * max_counts_per_bin); + if (!values) + { + log_error( + "add_index_bin_test FAILED to allocate initial values for bins.\n"); return -1; } - for (i=0; i<number_of_bins*max_counts_per_bin; i++) - values[i] = -1; - err = clEnqueueWriteBuffer(queue, bins, true, 0, sizeof(cl_int)*number_of_bins*max_counts_per_bin, values, 0, NULL, NULL); - if (err) { - log_error("add_index_bin_test FAILED to set initial values for bins: %d\n", err); + for (i = 0; i < number_of_bins * max_counts_per_bin; i++) values[i] = -1; + err = clEnqueueWriteBuffer(queue, 
bins, true, 0, + sizeof(cl_int) * number_of_bins + * max_counts_per_bin, + values, 0, NULL, NULL); + if (err) + { + log_error( + "add_index_bin_test FAILED to set initial values for bins: %d\n", + err); return -1; } free(values); - cl_int *l_bin_assignments = (cl_int*)malloc(sizeof(cl_int)*number_of_items); - if (!l_bin_assignments) { - log_error("add_index_bin_test FAILED to allocate initial values for l_bin_assignments.\n"); + cl_int *l_bin_assignments = + (cl_int *)malloc(sizeof(cl_int) * number_of_items); + if (!l_bin_assignments) + { + log_error("add_index_bin_test FAILED to allocate initial values for " + "l_bin_assignments.\n"); return -1; } - for (i=0; i<number_of_items; i++) { - int bin = random_in_range(0, number_of_bins-1, d); - while (l_bin_counts[bin] >= max_counts_per_bin) { - bin = random_in_range(0, number_of_bins-1, d); + for (i = 0; i < number_of_items; i++) + { + int bin = random_in_range(0, number_of_bins - 1, d); + while (l_bin_counts[bin] >= max_counts_per_bin) + { + bin = random_in_range(0, number_of_bins - 1, d); } if (bin >= number_of_bins) - log_error("add_index_bin_test internal error generating bin assignments: bin %d >= number_of_bins %d.\n", bin, number_of_bins); - if (l_bin_counts[bin]+1 > max_counts_per_bin) - log_error("add_index_bin_test internal error generating bin assignments: bin %d has more entries (%d) than max_counts_per_bin (%d).\n", bin, l_bin_counts[bin], max_counts_per_bin); + log_error("add_index_bin_test internal error generating bin " + "assignments: bin %d >= number_of_bins %d.\n", + bin, number_of_bins); + if (l_bin_counts[bin] + 1 > max_counts_per_bin) + log_error( + "add_index_bin_test internal error generating bin assignments: " + "bin %d has more entries (%d) than max_counts_per_bin (%d).\n", + bin, l_bin_counts[bin], max_counts_per_bin); l_bin_counts[bin]++; l_bin_assignments[i] = bin; - // log_info("item %d assigned to bin %d (%d items)\n", i, bin, l_bin_counts[bin]); + // log_info("item %d assigned to bin %d 
(%d items)\n", i, bin, + // l_bin_counts[bin]); } - err = clEnqueueWriteBuffer(queue, bin_assignments, true, 0, sizeof(cl_int)*number_of_items, l_bin_assignments, 0, NULL, NULL); - if (err) { - log_error("add_index_bin_test FAILED to set initial values for bin_assignments: %d\n", err); + err = clEnqueueWriteBuffer(queue, bin_assignments, true, 0, + sizeof(cl_int) * number_of_items, + l_bin_assignments, 0, NULL, NULL); + if (err) + { + log_error("add_index_bin_test FAILED to set initial values for " + "bin_assignments: %d\n", + err); return -1; } // Setup the kernel err = clSetKernelArg(kernel, 0, sizeof(bin_counters), &bin_counters); err |= clSetKernelArg(kernel, 1, sizeof(bins), &bins); err |= clSetKernelArg(kernel, 2, sizeof(bin_assignments), &bin_assignments); - err |= clSetKernelArg(kernel, 3, sizeof(max_counts_per_bin), &max_counts_per_bin); - if (err) { - log_error("add_index_bin_test FAILED to set kernel arguments: %d\n", err); - fail=1; succeed=-1; + err |= clSetKernelArg(kernel, 3, sizeof(max_counts_per_bin), + &max_counts_per_bin); + if (err) + { + log_error("add_index_bin_test FAILED to set kernel arguments: %d\n", + err); + fail = 1; return -1; } - err = clEnqueueNDRangeKernel( queue, kernel, 1, NULL, global_threads, local_threads, 0, NULL, NULL ); - if (err) { + err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, global_threads, + local_threads, 0, NULL, NULL); + if (err) + { log_error("add_index_bin_test FAILED to execute kernel: %d\n", err); - fail=1; succeed=-1; + fail = 1; } - cl_int *final_bin_assignments = (cl_int*)malloc(sizeof(cl_int)*number_of_bins*max_counts_per_bin); - if (!final_bin_assignments) { - log_error("add_index_bin_test FAILED to allocate initial values for final_bin_assignments.\n"); + cl_int *final_bin_assignments = + (cl_int *)malloc(sizeof(cl_int) * number_of_bins * max_counts_per_bin); + if (!final_bin_assignments) + { + log_error("add_index_bin_test FAILED to allocate initial values for " + "final_bin_assignments.\n"); 
return -1; } - err = clEnqueueReadBuffer( queue, bins, true, 0, sizeof(cl_int)*number_of_bins*max_counts_per_bin, final_bin_assignments, 0, NULL, NULL ); - if (err) { + err = clEnqueueReadBuffer(queue, bins, true, 0, + sizeof(cl_int) * number_of_bins + * max_counts_per_bin, + final_bin_assignments, 0, NULL, NULL); + if (err) + { log_error("add_index_bin_test FAILED to read back bins: %d\n", err); - fail = 1; succeed=-1; + fail = 1; } - cl_int *final_bin_counts = (cl_int*)malloc(sizeof(cl_int)*number_of_bins); - if (!final_bin_counts) { - log_error("add_index_bin_test FAILED to allocate initial values for final_bin_counts.\n"); + cl_int *final_bin_counts = + (cl_int *)malloc(sizeof(cl_int) * number_of_bins); + if (!final_bin_counts) + { + log_error("add_index_bin_test FAILED to allocate initial values for " + "final_bin_counts.\n"); return -1; } - err = clEnqueueReadBuffer( queue, bin_counters, true, 0, sizeof(cl_int)*number_of_bins, final_bin_counts, 0, NULL, NULL ); - if (err) { - log_error("add_index_bin_test FAILED to read back bin_counters: %d\n", err); - fail = 1; succeed=-1; + err = clEnqueueReadBuffer(queue, bin_counters, true, 0, + sizeof(cl_int) * number_of_bins, final_bin_counts, + 0, NULL, NULL); + if (err) + { + log_error("add_index_bin_test FAILED to read back bin_counters: %d\n", + err); + fail = 1; } // Verification. - int errors=0; + int errors = 0; int current_bin; int search; // Print out all the contents of the bins. // for (current_bin=0; current_bin<number_of_bins; current_bin++) // for (search=0; search<max_counts_per_bin; search++) - // log_info("[bin %d, entry %d] = %d\n", current_bin, search, final_bin_assignments[current_bin*max_counts_per_bin+search]); + // log_info("[bin %d, entry %d] = %d\n", current_bin, search, + // final_bin_assignments[current_bin*max_counts_per_bin+search]); // First verify that there are the correct number in each bin. 
- for (current_bin=0; current_bin<number_of_bins; current_bin++) { + for (current_bin = 0; current_bin < number_of_bins; current_bin++) + { int expected_number = l_bin_counts[current_bin]; int actual_number = final_bin_counts[current_bin]; - if (expected_number != actual_number) { - log_error("add_index_bin_test FAILED: bin %d reported %d entries when %d were expected.\n", current_bin, actual_number, expected_number); + if (expected_number != actual_number) + { + log_error("add_index_bin_test FAILED: bin %d reported %d entries " + "when %d were expected.\n", + current_bin, actual_number, expected_number); errors++; } - for (search=0; search<expected_number; search++) { - if (final_bin_assignments[current_bin*max_counts_per_bin+search] == -1) { - log_error("add_index_bin_test FAILED: bin %d had no entry at position %d when it should have had %d entries.\n", current_bin, search, expected_number); + for (search = 0; search < expected_number; search++) + { + if (final_bin_assignments[current_bin * max_counts_per_bin + search] + == -1) + { + log_error("add_index_bin_test FAILED: bin %d had no entry at " + "position %d when it should have had %d entries.\n", + current_bin, search, expected_number); errors++; } } - for (search=expected_number; search<max_counts_per_bin; search++) { - if (final_bin_assignments[current_bin*max_counts_per_bin+search] != -1) { - log_error("add_index_bin_test FAILED: bin %d had an extra entry at position %d when it should have had only %d entries.\n", current_bin, search, expected_number); + for (search = expected_number; search < max_counts_per_bin; search++) + { + if (final_bin_assignments[current_bin * max_counts_per_bin + search] + != -1) + { + log_error( + "add_index_bin_test FAILED: bin %d had an extra entry at " + "position %d when it should have had only %d entries.\n", + current_bin, search, expected_number); errors++; } } } // Now verify that the correct ones are in each bin int index; - for (index=0; index<number_of_items; index++) 
{ + for (index = 0; index < number_of_items; index++) + { int expected_bin = l_bin_assignments[index]; int found_it = 0; - for (search=0; search<l_bin_counts[expected_bin]; search++) { - if (final_bin_assignments[expected_bin*max_counts_per_bin+search] == index) { + for (search = 0; search < l_bin_counts[expected_bin]; search++) + { + if (final_bin_assignments[expected_bin * max_counts_per_bin + + search] + == index) + { found_it = 1; } } - if (found_it == 0) { - log_error("add_index_bin_test FAILED: did not find item %d in bin %d.\n", index, expected_bin); + if (found_it == 0) + { + log_error( + "add_index_bin_test FAILED: did not find item %d in bin %d.\n", + index, expected_bin); errors++; } } @@ -341,41 +469,49 @@ int add_index_bin_test(size_t *global_threads, cl_command_queue queue, cl_contex clReleaseMemObject(bin_counters); clReleaseMemObject(bins); clReleaseMemObject(bin_assignments); - if (errors == 0) { - log_info("add_index_bin_test passed. Each item was put in the correct bin in parallel.\n"); + if (errors == 0) + { + log_info("add_index_bin_test passed. Each item was put in the correct " + "bin in parallel.\n"); return 0; - } else { + } + else + { log_error("add_index_bin_test FAILED: %d errors.\n", errors); return -1; } } -int test_atomic_add_index_bin(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements) +int test_atomic_add_index_bin(cl_device_id deviceID, cl_context context, + cl_command_queue queue, int num_elements) { //===== add_index_bin test size_t numGlobalThreads = 2048; - int iteration=0; + int iteration = 0; int err, failed = 0; - MTdata d = init_genrand( gRandomSeed ); - - /* Check if atomics are supported. */ - if (!is_extension_available(deviceID, "cl_khr_global_int32_base_atomics")) { - log_info("Base atomics not supported (cl_khr_global_int32_base_atomics). Skipping test.\n"); - free_mtdata( d ); - return 0; - } + MTdata d = init_genrand(gRandomSeed); + + /* Check if atomics are supported. 
*/ + if (!is_extension_available(deviceID, "cl_khr_global_int32_base_atomics")) + { + log_info("Base atomics not supported " + "(cl_khr_global_int32_base_atomics). Skipping test.\n"); + free_mtdata(d); + return 0; + } - for(iteration=0; iteration<10; iteration++) { - log_info("add_index_bin_test with %d elements:\n", (int)numGlobalThreads); - err = add_index_bin_test(&numGlobalThreads, queue, context, d); - if (err) { + for (iteration = 0; iteration < 10; iteration++) + { + log_info("add_index_bin_test with %d elements:\n", + (int)numGlobalThreads); + err = add_index_bin_test(&numGlobalThreads, queue, context, d); + if (err) + { failed++; break; } - numGlobalThreads*=2; + numGlobalThreads *= 2; } - free_mtdata( d ); + free_mtdata(d); return failed; } - - diff --git a/test_conformance/basic/test_arraycopy.cpp b/test_conformance/basic/test_arraycopy.cpp index 5a352869..d9dbcc1b 100644 --- a/test_conformance/basic/test_arraycopy.cpp +++ b/test_conformance/basic/test_arraycopy.cpp @@ -181,9 +181,8 @@ test_arraycopy(cl_device_id device, cl_context context, cl_command_queue queue, } } - // Keep track of multiple errors. - if (error_count != 0) - err = error_count; + // Keep track of multiple errors. 
+ if (error_count != 0) err = error_count; if (err) log_error("\tCL_MEM_USE_HOST_PTR buffer with kernel copy FAILED\n"); diff --git a/test_conformance/basic/test_async_copy2D.cpp b/test_conformance/basic/test_async_copy2D.cpp index 9fbdcb6e..bf3f1552 100644 --- a/test_conformance/basic/test_async_copy2D.cpp +++ b/test_conformance/basic/test_async_copy2D.cpp @@ -25,77 +25,81 @@ #include "../../test_common/harness/conversions.h" #include "procs.h" -static const char *async_global_to_local_kernel2D = - "#pragma OPENCL EXTENSION cl_khr_extended_async_copies : enable\n" - "%s\n" // optional pragma string - "__kernel void test_fn( const __global %s *src, __global %s *dst, __local " - "%s *localBuffer, int numElementsPerLine, int lineCopiesPerWorkgroup, int " - "lineCopiesPerWorkItem, int srcStride, int dstStride )\n" - "{\n" - " int i, j;\n" - // Zero the local storage first - " for(i=0; i<lineCopiesPerWorkItem; i++)\n" - " for(j=0; j<numElementsPerLine; j++)\n" - " localBuffer[ (get_local_id( 0 " - ")*lineCopiesPerWorkItem+i)*(numElementsPerLine + dstStride)+j ] = " - "(%s)(%s)0;\n" - // Do this to verify all kernels are done zeroing the local buffer before we - // try the copy - " barrier( CLK_LOCAL_MEM_FENCE );\n" - " event_t event;\n" - " event = async_work_group_copy_2D2D( (__local %s*)localBuffer, " - "(__global const " - "%s*)(src+lineCopiesPerWorkgroup*get_group_id(0)*(numElementsPerLine + " - "srcStride)), (size_t)numElementsPerLine, (size_t)lineCopiesPerWorkgroup, " - "srcStride, dstStride, 0 );\n" - // Wait for the copy to complete, then verify by manually copying to the - // dest - " wait_group_events( 1, &event );\n" - " for(i=0; i<lineCopiesPerWorkItem; i++)\n" - " for(j=0; j<numElementsPerLine; j++)\n" - " dst[ (get_global_id( 0 " - ")*lineCopiesPerWorkItem+i)*(numElementsPerLine + dstStride)+j ] = " - "localBuffer[ (get_local_id( 0 " - ")*lineCopiesPerWorkItem+i)*(numElementsPerLine + dstStride)+j ];\n" - "}\n"; - -static const char 
*async_local_to_global_kernel2D = - "#pragma OPENCL EXTENSION cl_khr_extended_async_copies : enable\n" - "%s\n" // optional pragma string - "__kernel void test_fn( const __global %s *src, __global %s *dst, __local " - "%s *localBuffer, int numElementsPerLine, int lineCopiesPerWorkgroup, int " - "lineCopiesPerWorkItem, int srcStride, int dstStride )\n" - "{\n" - " int i, j;\n" - // Zero the local storage first - " for(i=0; i<lineCopiesPerWorkItem; i++)\n" - " for(j=0; j<numElementsPerLine; j++)\n" - " localBuffer[ (get_local_id( 0 " - ")*lineCopiesPerWorkItem+i)*(numElementsPerLine + srcStride)+j ] = " - "(%s)(%s)0;\n" - // Do this to verify all kernels are done zeroing the local buffer before we - // try the copy - " barrier( CLK_LOCAL_MEM_FENCE );\n" - " for(i=0; i<lineCopiesPerWorkItem; i++)\n" - " for(j=0; j<numElementsPerLine; j++)\n" - " localBuffer[ (get_local_id( 0 " - ")*lineCopiesPerWorkItem+i)*(numElementsPerLine + srcStride)+j ] = src[ " - "(get_global_id( 0 )*lineCopiesPerWorkItem+i)*(numElementsPerLine + " - "srcStride)+j ];\n" - // Do this to verify all kernels are done copying to the local buffer before - // we try the copy - " barrier( CLK_LOCAL_MEM_FENCE );\n" - " event_t event;\n" - " event = async_work_group_copy_2D2D((__global " - "%s*)(dst+lineCopiesPerWorkgroup*get_group_id(0)*(numElementsPerLine + " - "dstStride)), (__local const %s*)localBuffer, (size_t)numElementsPerLine, " - "(size_t)lineCopiesPerWorkgroup, srcStride, dstStride, 0 );\n" - " wait_group_events( 1, &event );\n" - "}\n"; +static const char *async_global_to_local_kernel2D = R"OpenCLC( +#pragma OPENCL EXTENSION cl_khr_extended_async_copies : enable +%s // optional pragma string + +__kernel void test_fn(const __global %s *src, __global %s *dst, + __local %s *localBuffer, int numElementsPerLine, + int lineCopiesPerWorkgroup, int lineCopiesPerWorkItem, + int srcStride, int dstStride) { + // Zero the local storage first + for (int i = 0; i < lineCopiesPerWorkItem; i++) { + for (int 
j = 0; j < numElementsPerLine; j++) { + const int index = (get_local_id(0) * lineCopiesPerWorkItem + i) * dstStride + j; + localBuffer[index] = (%s)(%s)0; + } + } + + // Do this to verify all kernels are done zeroing the local buffer before we + // try the copy + barrier( CLK_LOCAL_MEM_FENCE ); + event_t event = async_work_group_copy_2D2D(localBuffer, 0, src, + lineCopiesPerWorkgroup * get_group_id(0) * srcStride, sizeof(%s), + (size_t)numElementsPerLine, (size_t)lineCopiesPerWorkgroup, srcStride, dstStride, 0); + + // Wait for the copy to complete, then verify by manually copying to the dest + wait_group_events(1, &event); + + for (int i = 0; i < lineCopiesPerWorkItem; i++) { + for (int j = 0; j < numElementsPerLine; j++) { + const int local_index = (get_local_id(0) * lineCopiesPerWorkItem + i) * dstStride + j; + const int global_index = (get_global_id(0) * lineCopiesPerWorkItem + i) * dstStride + j; + dst[global_index] = localBuffer[local_index]; + } + } +} +)OpenCLC"; + +static const char *async_local_to_global_kernel2D = R"OpenCLC( +#pragma OPENCL EXTENSION cl_khr_extended_async_copies : enable +%s // optional pragma string + +__kernel void test_fn(const __global %s *src, __global %s *dst, __local %s *localBuffer, + int numElementsPerLine, int lineCopiesPerWorkgroup, + int lineCopiesPerWorkItem, int srcStride, int dstStride) { + // Zero the local storage first + for (int i = 0; i < lineCopiesPerWorkItem; i++) { + for (int j = 0; j < numElementsPerLine; j++) { + const int index = (get_local_id(0) * lineCopiesPerWorkItem + i) * srcStride + j; + localBuffer[index] = (%s)(%s)0; + } + } + + // Do this to verify all kernels are done zeroing the local buffer before we try the copy + barrier(CLK_LOCAL_MEM_FENCE); + + for (int i = 0; i < lineCopiesPerWorkItem; i++) { + for (int j = 0; j < numElementsPerLine; j++) { + const int local_index = (get_local_id(0) * lineCopiesPerWorkItem + i) * srcStride + j; + const int global_index = (get_global_id(0)*lineCopiesPerWorkItem + 
i) * srcStride + j; + localBuffer[local_index] = src[global_index]; + } + } + + // Do this to verify all kernels are done copying to the local buffer before we try the copy + barrier(CLK_LOCAL_MEM_FENCE); + event_t event = async_work_group_copy_2D2D(dst, lineCopiesPerWorkgroup * get_group_id(0) * dstStride, + localBuffer, 0, sizeof(%s), (size_t)numElementsPerLine, (size_t)lineCopiesPerWorkgroup, srcStride, + dstStride, 0 ); + + wait_group_events(1, &event); +}; +)OpenCLC"; int test_copy2D(cl_device_id deviceID, cl_context context, cl_command_queue queue, const char *kernelCode, - ExplicitType vecType, int vecSize, int srcStride, int dstStride, + ExplicitType vecType, int vecSize, int srcMargin, int dstMargin, bool localIsDst) { int error; @@ -114,8 +118,8 @@ int test_copy2D(cl_device_id deviceID, cl_context context, vecSize); size_t elementSize = get_explicit_type_size(vecType) * vecSize; - log_info("Testing %s with srcStride = %d, dstStride = %d\n", vecNameString, - srcStride, dstStride); + log_info("Testing %s with srcMargin = %d, dstMargin = %d\n", vecNameString, + srcMargin, dstMargin); cl_long max_local_mem_size; error = @@ -153,7 +157,7 @@ int test_copy2D(cl_device_id deviceID, cl_context context, vecType == kDouble ? 
"#pragma OPENCL EXTENSION cl_khr_fp64 : enable" : "", vecNameString, vecNameString, vecNameString, vecNameString, - get_explicit_type_name(vecType), vecNameString, vecNameString); + get_explicit_type_name(vecType), vecNameString); // log_info("program: %s\n", programSource); programPtr = programSource; @@ -180,12 +184,17 @@ int test_copy2D(cl_device_id deviceID, cl_context context, if (max_workgroup_size > max_local_workgroup_size[0]) max_workgroup_size = max_local_workgroup_size[0]; - size_t numElementsPerLine = 10; - size_t lineCopiesPerWorkItem = 13; + const size_t numElementsPerLine = 10; + const cl_int dstStride = numElementsPerLine + dstMargin; + const cl_int srcStride = numElementsPerLine + srcMargin; + elementSize = get_explicit_type_size(vecType) * ((vecSize == 3) ? 4 : vecSize); - size_t localStorageSpacePerWorkitem = lineCopiesPerWorkItem * elementSize - * (numElementsPerLine + (localIsDst ? dstStride : srcStride)); + + const size_t lineCopiesPerWorkItem = 13; + const size_t localStorageSpacePerWorkitem = lineCopiesPerWorkItem + * elementSize * (localIsDst ? 
dstStride : srcStride); + size_t maxLocalWorkgroupSize = (((int)max_local_mem_size / 2) / localStorageSpacePerWorkitem); @@ -199,34 +208,39 @@ int test_copy2D(cl_device_id deviceID, cl_context context, if (maxLocalWorkgroupSize > max_workgroup_size) localWorkgroupSize = max_workgroup_size; - size_t maxTotalLinesIn = (max_alloc_size / elementSize + srcStride) - / (numElementsPerLine + srcStride); - size_t maxTotalLinesOut = (max_alloc_size / elementSize + dstStride) - / (numElementsPerLine + dstStride); - size_t maxTotalLines = (std::min)(maxTotalLinesIn, maxTotalLinesOut); - size_t maxLocalWorkgroups = + + const size_t maxTotalLinesIn = + (max_alloc_size / elementSize + srcMargin) / srcStride; + const size_t maxTotalLinesOut = + (max_alloc_size / elementSize + dstMargin) / dstStride; + const size_t maxTotalLines = std::min(maxTotalLinesIn, maxTotalLinesOut); + const size_t maxLocalWorkgroups = maxTotalLines / (localWorkgroupSize * lineCopiesPerWorkItem); - size_t localBufferSize = localWorkgroupSize * localStorageSpacePerWorkitem - - (localIsDst ? dstStride : srcStride); - size_t numberOfLocalWorkgroups = (std::min)(1111, (int)maxLocalWorkgroups); - size_t totalLines = + const size_t localBufferSize = + localWorkgroupSize * localStorageSpacePerWorkitem + - (localIsDst ? 
dstMargin : srcMargin); + const size_t numberOfLocalWorkgroups = + std::min(1111, (int)maxLocalWorkgroups); + const size_t totalLines = numberOfLocalWorkgroups * localWorkgroupSize * lineCopiesPerWorkItem; - size_t inBufferSize = elementSize - * (totalLines * numElementsPerLine + (totalLines - 1) * srcStride); - size_t outBufferSize = elementSize - * (totalLines * numElementsPerLine + (totalLines - 1) * dstStride); - size_t globalWorkgroupSize = numberOfLocalWorkgroups * localWorkgroupSize; + const size_t inBufferSize = elementSize + * (totalLines * numElementsPerLine + (totalLines - 1) * srcMargin); + const size_t outBufferSize = elementSize + * (totalLines * numElementsPerLine + (totalLines - 1) * dstMargin); + const size_t globalWorkgroupSize = + numberOfLocalWorkgroups * localWorkgroupSize; inBuffer = (void *)malloc(inBufferSize); outBuffer = (void *)malloc(outBufferSize); outBufferCopy = (void *)malloc(outBufferSize); - cl_int lineCopiesPerWorkItemInt, numElementsPerLineInt, - lineCopiesPerWorkgroup; - lineCopiesPerWorkItemInt = (int)lineCopiesPerWorkItem; - numElementsPerLineInt = (int)numElementsPerLine; - lineCopiesPerWorkgroup = (int)(lineCopiesPerWorkItem * localWorkgroupSize); + const cl_int lineCopiesPerWorkItemInt = + static_cast<cl_int>(lineCopiesPerWorkItem); + const cl_int numElementsPerLineInt = + static_cast<cl_int>(numElementsPerLine); + const cl_int lineCopiesPerWorkgroup = + static_cast<cl_int>(lineCopiesPerWorkItem * localWorkgroupSize); log_info( "Global: %d, local %d, local buffer %db, global in buffer %db, " @@ -296,8 +310,8 @@ int test_copy2D(cl_device_id deviceID, cl_context context, for (int j = 0; j < (int)numElementsPerLine * elementSize; j += elementSize) { - int inIdx = i * (numElementsPerLine + srcStride) + j; - int outIdx = i * (numElementsPerLine + dstStride) + j; + int inIdx = i * srcStride + j; + int outIdx = i * dstStride + j; if (memcmp(((char *)inBuffer) + inIdx, ((char *)outBuffer) + outIdx, typeSize) != 0) @@ -332,11 
+346,10 @@ int test_copy2D(cl_device_id deviceID, cl_context context, if (i < (int)(globalWorkgroupSize * lineCopiesPerWorkItem - 1) * elementSize) { - int outIdx = i * (numElementsPerLine + dstStride) - + numElementsPerLine * elementSize; + int outIdx = i * dstStride + numElementsPerLine * elementSize; if (memcmp(((char *)outBuffer) + outIdx, ((char *)outBufferCopy) + outIdx, - dstStride * elementSize) + dstMargin * elementSize) != 0) { if (failuresPrinted == 0) @@ -373,9 +386,12 @@ int test_copy2D_all_types(cl_device_id deviceID, cl_context context, kChar, kUChar, kShort, kUShort, kInt, kUInt, kLong, kULong, kFloat, kDouble, kNumExplicitTypes }; + // The margins below represent the number of elements between the end of + // one line and the start of the next. The strides are equivalent to the + // length of the line plus the chosen margin. unsigned int vecSizes[] = { 1, 2, 3, 4, 8, 16, 0 }; - unsigned int smallTypesStrideSizes[] = { 0, 10, 100 }; - unsigned int size, typeIndex, srcStride, dstStride; + unsigned int smallTypesMarginSizes[] = { 0, 10, 100 }; + unsigned int size, typeIndex, srcMargin, dstMargin; int errors = 0; @@ -401,19 +417,19 @@ int test_copy2D_all_types(cl_device_id deviceID, cl_context context, if (get_explicit_type_size(vecType[typeIndex]) * vecSizes[size] <= 2) // small type { - for (srcStride = 0; srcStride < sizeof(smallTypesStrideSizes) - / sizeof(smallTypesStrideSizes[0]); - srcStride++) + for (srcMargin = 0; srcMargin < sizeof(smallTypesMarginSizes) + / sizeof(smallTypesMarginSizes[0]); + srcMargin++) { - for (dstStride = 0; - dstStride < sizeof(smallTypesStrideSizes) - / sizeof(smallTypesStrideSizes[0]); - dstStride++) + for (dstMargin = 0; + dstMargin < sizeof(smallTypesMarginSizes) + / sizeof(smallTypesMarginSizes[0]); + dstMargin++) { if (test_copy2D(deviceID, context, queue, kernelCode, vecType[typeIndex], vecSizes[size], - smallTypesStrideSizes[srcStride], - smallTypesStrideSizes[dstStride], + smallTypesMarginSizes[srcMargin], + 
smallTypesMarginSizes[dstMargin], localIsDst)) { errors++; diff --git a/test_conformance/basic/test_async_copy3D.cpp b/test_conformance/basic/test_async_copy3D.cpp index 252159bc..5eb41ebc 100644 --- a/test_conformance/basic/test_async_copy3D.cpp +++ b/test_conformance/basic/test_async_copy3D.cpp @@ -25,96 +25,95 @@ #include "../../test_common/harness/conversions.h" #include "procs.h" -static const char *async_global_to_local_kernel3D = - "#pragma OPENCL EXTENSION cl_khr_extended_async_copies : enable\n" - "%s\n" // optional pragma string - "__kernel void test_fn( const __global %s *src, __global %s *dst, __local " - "%s *localBuffer, int numElementsPerLine, int numLines, int " - "planesCopiesPerWorkgroup, int planesCopiesPerWorkItem, int srcLineStride, " - "int dstLineStride, int srcPlaneStride, int dstPlaneStride )\n" - "{\n" - " int i, j, k;\n" - // Zero the local storage first - " for(i=0; i<planesCopiesPerWorkItem; i++)\n" - " for(j=0; j<numLines; j++)\n" - " for(k=0; k<numElementsPerLine; k++)\n" - " localBuffer[ (get_local_id( 0 " - ")*planesCopiesPerWorkItem+i)*(numLines*numElementsPerLine + " - "numLines*dstLineStride + dstPlaneStride) + j*(numElementsPerLine + " - "dstLineStride) + k ] = (%s)(%s)0;\n" - // Do this to verify all kernels are done zeroing the local buffer before we - // try the copy - " barrier( CLK_LOCAL_MEM_FENCE );\n" - " event_t event;\n" - " event = async_work_group_copy_3D3D( (__local %s*)localBuffer, " - "(__global const " - "%s*)(src+planesCopiesPerWorkgroup*get_group_id(0)*(numLines*" - "numElementsPerLine + numLines*srcLineStride + srcPlaneStride)), " - "(size_t)numElementsPerLine, (size_t)numLines, srcLineStride, " - "dstLineStride, planesCopiesPerWorkgroup, srcPlaneStride, dstPlaneStride, " - "0 );\n" - // Wait for the copy to complete, then verify by manually copying to the - // dest - " wait_group_events( 1, &event );\n" - " for(i=0; i<planesCopiesPerWorkItem; i++)\n" - " for(j=0; j<numLines; j++)\n" - " for(k=0; 
k<numElementsPerLine; k++)\n" - " dst[ (get_global_id( 0 " - ")*planesCopiesPerWorkItem+i)*(numLines*numElementsPerLine + " - "numLines*dstLineStride + dstPlaneStride) + j*(numElementsPerLine + " - "dstLineStride) + k ] = localBuffer[ (get_local_id( 0 " - ")*planesCopiesPerWorkItem+i)*(numLines*numElementsPerLine + " - "numLines*dstLineStride + dstPlaneStride) + j*(numElementsPerLine + " - "dstLineStride) + k ];\n" - "}\n"; - -static const char *async_local_to_global_kernel3D = - "#pragma OPENCL EXTENSION cl_khr_extended_async_copies : enable\n" - "%s\n" // optional pragma string - "__kernel void test_fn( const __global %s *src, __global %s *dst, __local " - "%s *localBuffer, int numElementsPerLine, int numLines, int " - "planesCopiesPerWorkgroup, int planesCopiesPerWorkItem, int srcLineStride, " - "int dstLineStride, int srcPlaneStride, int dstPlaneStride )\n" - "{\n" - " int i, j, k;\n" - // Zero the local storage first - " for(i=0; i<planesCopiesPerWorkItem; i++)\n" - " for(j=0; j<numLines; j++)\n" - " for(k=0; k<numElementsPerLine; k++)\n" - " localBuffer[ (get_local_id( 0 " - ")*planesCopiesPerWorkItem+i)*(numLines*numElementsPerLine + " - "numLines*srcLineStride + srcPlaneStride) + j*(numElementsPerLine + " - "srcLineStride) + k ] = (%s)(%s)0;\n" - // Do this to verify all kernels are done zeroing the local buffer before we - // try the copy - " barrier( CLK_LOCAL_MEM_FENCE );\n" - " for(i=0; i<planesCopiesPerWorkItem; i++)\n" - " for(j=0; j<numLines; j++)\n" - " for(k=0; k<numElementsPerLine; k++)\n" - " localBuffer[ (get_local_id( 0 " - ")*planesCopiesPerWorkItem+i)*(numLines*numElementsPerLine + " - "numLines*srcLineStride + srcPlaneStride) + j*(numElementsPerLine + " - "srcLineStride) + k ] = src[ (get_global_id( 0 " - ")*planesCopiesPerWorkItem+i)*(numLines*numElementsPerLine + " - "numLines*srcLineStride + srcPlaneStride) + j*(numElementsPerLine + " - "srcLineStride) + k ];\n" - // Do this to verify all kernels are done copying to the local buffer 
before - // we try the copy - " barrier( CLK_LOCAL_MEM_FENCE );\n" - " event_t event;\n" - " event = async_work_group_copy_3D3D((__global " - "%s*)(dst+planesCopiesPerWorkgroup*get_group_id(0)*(numLines*" - "numElementsPerLine + numLines*dstLineStride + dstPlaneStride)), (__local " - "const %s*)localBuffer, (size_t)numElementsPerLine, (size_t)numLines, " - "srcLineStride, dstLineStride, planesCopiesPerWorkgroup, srcPlaneStride, " - "dstPlaneStride, 0 );\n" - " wait_group_events( 1, &event );\n" - "}\n"; +static const char *async_global_to_local_kernel3D = R"OpenCLC( +#pragma OPENCL EXTENSION cl_khr_extended_async_copies : enable +%s // optional pragma string + +__kernel void test_fn(const __global %s *src, __global %s *dst, __local %s *localBuffer, + int numElementsPerLine, int numLines, int planesCopiesPerWorkgroup, + int planesCopiesPerWorkItem, int srcLineStride, + int dstLineStride, int srcPlaneStride, int dstPlaneStride ) { + // Zero the local storage first + for (int i = 0; i < planesCopiesPerWorkItem; i++) { + for (int j = 0; j < numLines; j++) { + for (int k = 0; k < numElementsPerLine; k++) { + const int index = (get_local_id(0) * planesCopiesPerWorkItem + i) * dstPlaneStride + j * dstLineStride + k; + localBuffer[index] = (%s)(%s)0; + } + } + } + + // Do this to verify all kernels are done zeroing the local buffer before we try the copy + barrier(CLK_LOCAL_MEM_FENCE); + + event_t event = async_work_group_copy_3D3D(localBuffer, 0, src, + planesCopiesPerWorkgroup * get_group_id(0) * srcPlaneStride, + sizeof(%s), (size_t)numElementsPerLine, (size_t)numLines, + planesCopiesPerWorkgroup, srcLineStride, srcPlaneStride, dstLineStride, + dstPlaneStride, 0); + + // Wait for the copy to complete, then verify by manually copying to the dest + wait_group_events(1, &event); + + for (int i = 0; i < planesCopiesPerWorkItem; i++) { + for (int j = 0; j < numLines; j++) { + for(int k = 0; k < numElementsPerLine; k++) { + const int local_index = (get_local_id(0) * 
planesCopiesPerWorkItem + i) * dstPlaneStride + j * dstLineStride + k; + const int global_index = (get_global_id(0) * planesCopiesPerWorkItem + i) * dstPlaneStride + j * dstLineStride + k; + dst[global_index] = localBuffer[local_index]; + } + } + } +} +)OpenCLC"; + +static const char *async_local_to_global_kernel3D = R"OpenCLC( +#pragma OPENCL EXTENSION cl_khr_extended_async_copies : enable +%s // optional pragma string + +__kernel void test_fn(const __global %s *src, __global %s *dst, __local %s *localBuffer, + int numElementsPerLine, int numLines, int planesCopiesPerWorkgroup, + int planesCopiesPerWorkItem, int srcLineStride, + int dstLineStride, int srcPlaneStride, int dstPlaneStride) { + // Zero the local storage first + for (int i = 0; i < planesCopiesPerWorkItem; i++) { + for (int j = 0; j < numLines; j++) { + for (int k = 0; k < numElementsPerLine; k++) { + const int index = (get_local_id(0) * planesCopiesPerWorkItem + i) * srcPlaneStride + j * srcLineStride + k; + localBuffer[index] = (%s)(%s)0; + } + } + } + + // Do this to verify all kernels are done zeroing the local buffer before we try the copy + barrier(CLK_LOCAL_MEM_FENCE); + + for (int i=0; i < planesCopiesPerWorkItem; i++) { + for (int j=0; j < numLines; j++) { + for (int k=0; k < numElementsPerLine; k++) { + const int local_index = (get_local_id(0) * planesCopiesPerWorkItem + i) * srcPlaneStride + j * srcLineStride + k; + const int global_index = (get_global_id(0) * planesCopiesPerWorkItem + i) * srcPlaneStride + j*srcLineStride + k; + localBuffer[local_index] = src[global_index]; + } + } + } + + // Do this to verify all kernels are done copying to the local buffer before we try the copy + barrier(CLK_LOCAL_MEM_FENCE); + + event_t event = async_work_group_copy_3D3D(dst, + planesCopiesPerWorkgroup * get_group_id(0) * dstPlaneStride, localBuffer, 0, + sizeof(%s), (size_t)numElementsPerLine, (size_t)numLines, planesCopiesPerWorkgroup, + srcLineStride, srcPlaneStride, dstLineStride, dstPlaneStride, 
0); + + wait_group_events(1, &event); +} +)OpenCLC"; int test_copy3D(cl_device_id deviceID, cl_context context, cl_command_queue queue, const char *kernelCode, - ExplicitType vecType, int vecSize, int srcLineStride, - int dstLineStride, int srcPlaneStride, int dstPlaneStride, + ExplicitType vecType, int vecSize, int srcLineMargin, + int dstLineMargin, int srcPlaneMargin, int dstPlaneMargin, bool localIsDst) { int error; @@ -133,10 +132,10 @@ int test_copy3D(cl_device_id deviceID, cl_context context, vecSize); size_t elementSize = get_explicit_type_size(vecType) * vecSize; - log_info("Testing %s with srcLineStride = %d, dstLineStride = %d, " - "srcPlaneStride = %d, dstPlaneStride = %d\n", - vecNameString, srcLineStride, dstLineStride, srcPlaneStride, - dstPlaneStride); + log_info("Testing %s with srcLineMargin = %d, dstLineMargin = %d, " + "srcPlaneMargin = %d, dstPlaneMargin = %d\n", + vecNameString, srcLineMargin, dstLineMargin, srcPlaneMargin, + dstPlaneMargin); cl_long max_local_mem_size; error = @@ -201,16 +200,20 @@ int test_copy3D(cl_device_id deviceID, cl_context context, if (max_workgroup_size > max_local_workgroup_size[0]) max_workgroup_size = max_local_workgroup_size[0]; - size_t numElementsPerLine = 10; - size_t numLines = 13; - size_t planesCopiesPerWorkItem = 2; + const size_t numElementsPerLine = 10; + const cl_int dstLineStride = numElementsPerLine + dstLineMargin; + const cl_int srcLineStride = numElementsPerLine + srcLineMargin; + + const size_t numLines = 13; + const cl_int dstPlaneStride = (numLines * dstLineStride) + dstPlaneMargin; + const cl_int srcPlaneStride = (numLines * srcLineStride) + srcPlaneMargin; + elementSize = get_explicit_type_size(vecType) * ((vecSize == 3) ? 4 : vecSize); - size_t localStorageSpacePerWorkitem = elementSize - * (planesCopiesPerWorkItem - * (numLines * numElementsPerLine - + numLines * (localIsDst ? dstLineStride : srcLineStride) - + (localIsDst ? 
dstPlaneStride : srcPlaneStride))); + const size_t planesCopiesPerWorkItem = 2; + const size_t localStorageSpacePerWorkitem = elementSize + * planesCopiesPerWorkItem + * (localIsDst ? dstPlaneStride : srcPlaneStride); size_t maxLocalWorkgroupSize = (((int)max_local_mem_size / 2) / localStorageSpacePerWorkitem); @@ -224,42 +227,41 @@ int test_copy3D(cl_device_id deviceID, cl_context context, if (maxLocalWorkgroupSize > max_workgroup_size) localWorkgroupSize = max_workgroup_size; - size_t maxTotalPlanesIn = ((max_alloc_size / elementSize) + srcPlaneStride) - / ((numLines * numElementsPerLine + numLines * srcLineStride) - + srcPlaneStride); - size_t maxTotalPlanesOut = ((max_alloc_size / elementSize) + dstPlaneStride) - / ((numLines * numElementsPerLine + numLines * dstLineStride) - + dstPlaneStride); - size_t maxTotalPlanes = (std::min)(maxTotalPlanesIn, maxTotalPlanesOut); - size_t maxLocalWorkgroups = + const size_t maxTotalPlanesIn = + ((max_alloc_size / elementSize) + srcPlaneMargin) / srcPlaneStride; + const size_t maxTotalPlanesOut = + ((max_alloc_size / elementSize) + dstPlaneMargin) / dstPlaneStride; + const size_t maxTotalPlanes = std::min(maxTotalPlanesIn, maxTotalPlanesOut); + const size_t maxLocalWorkgroups = maxTotalPlanes / (localWorkgroupSize * planesCopiesPerWorkItem); - size_t localBufferSize = localWorkgroupSize * localStorageSpacePerWorkitem - - (localIsDst ? dstPlaneStride : srcPlaneStride); - size_t numberOfLocalWorkgroups = (std::min)(1111, (int)maxLocalWorkgroups); - size_t totalPlanes = + const size_t localBufferSize = + localWorkgroupSize * localStorageSpacePerWorkitem + - (localIsDst ? 
dstPlaneMargin : srcPlaneMargin); + const size_t numberOfLocalWorkgroups = + std::min(1111, (int)maxLocalWorkgroups); + const size_t totalPlanes = numberOfLocalWorkgroups * localWorkgroupSize * planesCopiesPerWorkItem; - size_t inBufferSize = elementSize - * (totalPlanes - * (numLines * numElementsPerLine + numLines * srcLineStride) - + (totalPlanes - 1) * srcPlaneStride); - size_t outBufferSize = elementSize - * (totalPlanes - * (numLines * numElementsPerLine + numLines * dstLineStride) - + (totalPlanes - 1) * dstPlaneStride); - size_t globalWorkgroupSize = numberOfLocalWorkgroups * localWorkgroupSize; + const size_t inBufferSize = elementSize + * (totalPlanes * numLines * srcLineStride + + (totalPlanes - 1) * srcPlaneMargin); + const size_t outBufferSize = elementSize + * (totalPlanes * numLines * dstLineStride + + (totalPlanes - 1) * dstPlaneMargin); + const size_t globalWorkgroupSize = + numberOfLocalWorkgroups * localWorkgroupSize; inBuffer = (void *)malloc(inBufferSize); outBuffer = (void *)malloc(outBufferSize); outBufferCopy = (void *)malloc(outBufferSize); - cl_int planesCopiesPerWorkItemInt, numElementsPerLineInt, numLinesInt, - planesCopiesPerWorkgroup; - planesCopiesPerWorkItemInt = (int)planesCopiesPerWorkItem; - numElementsPerLineInt = (int)numElementsPerLine; - numLinesInt = (int)numLines; - planesCopiesPerWorkgroup = - (int)(planesCopiesPerWorkItem * localWorkgroupSize); + const cl_int planesCopiesPerWorkItemInt = + static_cast<cl_int>(planesCopiesPerWorkItem); + const cl_int numElementsPerLineInt = + static_cast<cl_int>(numElementsPerLine); + const cl_int numLinesInt = static_cast<cl_int>(numLines); + const cl_int planesCopiesPerWorkgroup = + static_cast<cl_int>(planesCopiesPerWorkItem * localWorkgroupSize); log_info("Global: %d, local %d, local buffer %db, global in buffer %db, " "global out buffer %db, each work group will copy %d planes and " @@ -336,14 +338,8 @@ int test_copy3D(cl_device_id deviceID, cl_context context, for (int k = 0; k < 
(int)numElementsPerLine * elementSize; k += elementSize) { - int inIdx = i - * (numLines * numElementsPerLine - + numLines * srcLineStride + srcPlaneStride) - + j * (numElementsPerLine + srcLineStride) + k; - int outIdx = i - * (numLines * numElementsPerLine - + numLines * dstLineStride + dstPlaneStride) - + j * (numElementsPerLine + dstLineStride) + k; + int inIdx = i * srcPlaneStride + j * srcLineStride + k; + int outIdx = i * dstPlaneStride + j * dstLineStride + k; if (memcmp(((char *)inBuffer) + inIdx, ((char *)outBuffer) + outIdx, typeSize) != 0) @@ -378,14 +374,11 @@ int test_copy3D(cl_device_id deviceID, cl_context context, } if (j < (int)numLines * elementSize) { - int outIdx = i - * (numLines * numElementsPerLine - + numLines * dstLineStride + dstPlaneStride) - + j * (numElementsPerLine + dstLineStride) + int outIdx = i * dstPlaneStride + j * dstLineStride + numElementsPerLine * elementSize; if (memcmp(((char *)outBuffer) + outIdx, ((char *)outBufferCopy) + outIdx, - dstLineStride * elementSize) + dstLineMargin * elementSize) != 0) { if (failuresPrinted == 0) @@ -409,14 +402,11 @@ int test_copy3D(cl_device_id deviceID, cl_context context, if (i < (int)(globalWorkgroupSize * planesCopiesPerWorkItem - 1) * elementSize) { - int outIdx = i - * (numLines * numElementsPerLine + numLines * dstLineStride - + dstPlaneStride) - + (numLines * elementSize) * (numElementsPerLine) - + (numLines * elementSize) * (dstLineStride); + int outIdx = + i * dstPlaneStride + numLines * dstLineStride * elementSize; if (memcmp(((char *)outBuffer) + outIdx, ((char *)outBufferCopy) + outIdx, - dstPlaneStride * elementSize) + dstPlaneMargin * elementSize) != 0) { if (failuresPrinted == 0) @@ -453,10 +443,13 @@ int test_copy3D_all_types(cl_device_id deviceID, cl_context context, kChar, kUChar, kShort, kUShort, kInt, kUInt, kLong, kULong, kFloat, kDouble, kNumExplicitTypes }; + // The margins below represent the number of elements between the end of + // one line or plane and the start 
of the next. The strides are equivalent + // to the size of the line or plane plus the chosen margin. unsigned int vecSizes[] = { 1, 2, 3, 4, 8, 16, 0 }; - unsigned int smallTypesStrideSizes[] = { 0, 10, 100 }; - unsigned int size, typeIndex, srcLineStride, dstLineStride, srcPlaneStride, - dstPlaneStride; + unsigned int smallTypesMarginSizes[] = { 0, 10, 100 }; + unsigned int size, typeIndex, srcLineMargin, dstLineMargin, srcPlaneMargin, + dstPlaneMargin; int errors = 0; @@ -482,33 +475,33 @@ int test_copy3D_all_types(cl_device_id deviceID, cl_context context, if (get_explicit_type_size(vecType[typeIndex]) * vecSizes[size] <= 2) // small type { - for (srcLineStride = 0; - srcLineStride < sizeof(smallTypesStrideSizes) - / sizeof(smallTypesStrideSizes[0]); - srcLineStride++) + for (srcLineMargin = 0; + srcLineMargin < sizeof(smallTypesMarginSizes) + / sizeof(smallTypesMarginSizes[0]); + srcLineMargin++) { - for (dstLineStride = 0; - dstLineStride < sizeof(smallTypesStrideSizes) - / sizeof(smallTypesStrideSizes[0]); - dstLineStride++) + for (dstLineMargin = 0; + dstLineMargin < sizeof(smallTypesMarginSizes) + / sizeof(smallTypesMarginSizes[0]); + dstLineMargin++) { - for (srcPlaneStride = 0; - srcPlaneStride < sizeof(smallTypesStrideSizes) - / sizeof(smallTypesStrideSizes[0]); - srcPlaneStride++) + for (srcPlaneMargin = 0; + srcPlaneMargin < sizeof(smallTypesMarginSizes) + / sizeof(smallTypesMarginSizes[0]); + srcPlaneMargin++) { - for (dstPlaneStride = 0; - dstPlaneStride < sizeof(smallTypesStrideSizes) - / sizeof(smallTypesStrideSizes[0]); - dstPlaneStride++) + for (dstPlaneMargin = 0; + dstPlaneMargin < sizeof(smallTypesMarginSizes) + / sizeof(smallTypesMarginSizes[0]); + dstPlaneMargin++) { if (test_copy3D( deviceID, context, queue, kernelCode, vecType[typeIndex], vecSizes[size], - smallTypesStrideSizes[srcLineStride], - smallTypesStrideSizes[dstLineStride], - smallTypesStrideSizes[srcPlaneStride], - smallTypesStrideSizes[dstPlaneStride], + 
smallTypesMarginSizes[srcLineMargin], + smallTypesMarginSizes[dstLineMargin], + smallTypesMarginSizes[srcPlaneMargin], + smallTypesMarginSizes[dstPlaneMargin], localIsDst)) { errors++; diff --git a/test_conformance/basic/test_enqueue_map.cpp b/test_conformance/basic/test_enqueue_map.cpp index 3702726f..d28f7e41 100644 --- a/test_conformance/basic/test_enqueue_map.cpp +++ b/test_conformance/basic/test_enqueue_map.cpp @@ -146,7 +146,7 @@ int test_enqueue_map_image(cl_device_id deviceID, cl_context context, cl_command clMemWrapper memObject; log_info("Testing with cl_mem_flags src: %s\n", flag_set_names[src_flag_id]); - generate_random_data(kUInt, (unsigned int)(imageSize * imageSize), d, + generate_random_data(kUInt, (unsigned int)(imageSize * imageSize * 4), d, hostPtrData); memcpy(referenceData, hostPtrData, imageDataSize); diff --git a/test_conformance/basic/test_enqueued_local_size.cpp b/test_conformance/basic/test_enqueued_local_size.cpp index f52162a8..ea95df68 100644 --- a/test_conformance/basic/test_enqueued_local_size.cpp +++ b/test_conformance/basic/test_enqueued_local_size.cpp @@ -1,6 +1,6 @@ // // Copyright (c) 2017 The Khronos Group Inc. -// +// // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at @@ -14,42 +14,45 @@ // limitations under the License. 
// #include "harness/compat.h" +#include "harness/rounding_mode.h" #include <stdio.h> #include <stdlib.h> #include <string.h> #include <sys/types.h> #include <sys/stat.h> -#include "harness/rounding_mode.h" + +#include <algorithm> #include "procs.h" -static const char *enqueued_local_size_2d_code = -"__kernel void test_enqueued_local_size_2d(global int *dst)\n" -"{\n" -" if ((get_global_id(0) == 0) && (get_global_id(1) == 0))\n" -" {\n" -" dst[0] = (int)get_enqueued_local_size(0)\n;" -" dst[1] = (int)get_enqueued_local_size(1)\n;" -" }\n" -"}\n"; - -static const char *enqueued_local_size_1d_code = -"__kernel void test_enqueued_local_size_1d(global int *dst)\n" -"{\n" -" int tid_x = get_global_id(0);\n" -" if (get_global_id(0) == 0)\n" -" {\n" -" dst[tid_x] = (int)get_enqueued_local_size(0)\n;" -" }\n" -"}\n"; - - -static int -verify_enqueued_local_size(int *result, size_t *expected, int n) +static const char *enqueued_local_size_2d_code = R"( +__kernel void test_enqueued_local_size_2d(global int *dst) +{ + if ((get_global_id(0) == 0) && (get_global_id(1) == 0)) + { + dst[0] = (int)get_enqueued_local_size(0); + dst[1] = (int)get_enqueued_local_size(1); + } +} +)"; + +static const char *enqueued_local_size_1d_code = R"( +__kernel void test_enqueued_local_size_1d(global int *dst) +{ + int tid_x = get_global_id(0); + if (get_global_id(0) == 0) + { + dst[tid_x] = (int)get_enqueued_local_size(0); + } +} +)"; + + +static int verify_enqueued_local_size(int *result, size_t *expected, int n) { int i; - for (i=0; i<n; i++) + for (i = 0; i < n; i++) { if (result[i] != (int)expected[i]) { @@ -62,14 +65,14 @@ verify_enqueued_local_size(int *result, size_t *expected, int n) } -int -test_enqueued_local_size(cl_device_id device, cl_context context, cl_command_queue queue, int num_elements) +int test_enqueued_local_size(cl_device_id device, cl_context context, + cl_command_queue queue, int num_elements) { - cl_mem streams; - cl_program program[2]; - cl_kernel kernel[2]; + 
clMemWrapper stream; + clProgramWrapper program[2]; + clKernelWrapper kernel[2]; - int *output_ptr; + cl_int output_ptr[2]; size_t globalsize[2]; size_t localsize[2]; int err; @@ -95,37 +98,36 @@ test_enqueued_local_size(cl_device_id device, cl_context context, cl_command_que } } - output_ptr = (int*)malloc(2 * sizeof(int)); - - streams = - clCreateBuffer(context, CL_MEM_READ_WRITE, 2 * sizeof(int), NULL, &err); - test_error( err, "clCreateBuffer failed."); + stream = clCreateBuffer(context, CL_MEM_READ_WRITE, 2 * sizeof(cl_int), + nullptr, &err); + test_error(err, "clCreateBuffer failed."); std::string cl_std = "-cl-std=CL"; cl_std += (get_device_cl_version(device) == Version(3, 0)) ? "3.0" : "2.0"; err = create_single_kernel_helper_with_build_options( context, &program[0], &kernel[0], 1, &enqueued_local_size_1d_code, "test_enqueued_local_size_1d", cl_std.c_str()); - test_error( err, "create_single_kernel_helper failed"); + test_error(err, "create_single_kernel_helper failed"); err = create_single_kernel_helper_with_build_options( context, &program[1], &kernel[1], 1, &enqueued_local_size_2d_code, "test_enqueued_local_size_2d", cl_std.c_str()); - test_error( err, "create_single_kernel_helper failed"); + test_error(err, "create_single_kernel_helper failed"); - err = clSetKernelArg(kernel[0], 0, sizeof streams, &streams); - test_error( err, "clSetKernelArgs failed."); - err = clSetKernelArg(kernel[1], 0, sizeof streams, &streams); - test_error( err, "clSetKernelArgs failed."); + err = clSetKernelArg(kernel[0], 0, sizeof stream, &stream); + test_error(err, "clSetKernelArgs failed."); + err = clSetKernelArg(kernel[1], 0, sizeof stream, &stream); + test_error(err, "clSetKernelArgs failed."); - globalsize[0] = (size_t)num_elements; - globalsize[1] = (size_t)num_elements; + globalsize[0] = static_cast<size_t>(num_elements); + globalsize[1] = static_cast<size_t>(num_elements); size_t max_wgs; - err = clGetDeviceInfo(device, CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof(max_wgs), 
&max_wgs, NULL); - test_error( err, "clGetDeviceInfo failed."); + err = clGetDeviceInfo(device, CL_DEVICE_MAX_WORK_GROUP_SIZE, + sizeof(max_wgs), &max_wgs, nullptr); + test_error(err, "clGetDeviceInfo failed."); - localsize[0] = MIN(16, max_wgs); - localsize[1] = MIN(11, max_wgs / localsize[0]); + localsize[0] = std::min<size_t>(16, max_wgs); + localsize[1] = std::min<size_t>(11, max_wgs / localsize[0]); // If we need to use uniform workgroups because non-uniform workgroups are // not supported, round up to the next global size that is divisible by the // local size. @@ -141,35 +143,31 @@ test_enqueued_local_size(cl_device_id device, cl_context context, cl_command_que } } - err = clEnqueueNDRangeKernel(queue, kernel[1], 2, NULL, globalsize, localsize, 0, NULL, NULL); - test_error( err, "clEnqueueNDRangeKernel failed."); + err = clEnqueueNDRangeKernel(queue, kernel[1], 2, nullptr, globalsize, + localsize, 0, nullptr, nullptr); + test_error(err, "clEnqueueNDRangeKernel failed."); - err = clEnqueueReadBuffer(queue, streams, CL_TRUE, 0, 2*sizeof(int), output_ptr, 0, NULL, NULL); - test_error( err, "clEnqueueReadBuffer failed."); + err = clEnqueueReadBuffer(queue, stream, CL_BLOCKING, 0, 2 * sizeof(int), + output_ptr, 0, nullptr, nullptr); + test_error(err, "clEnqueueReadBuffer failed."); err = verify_enqueued_local_size(output_ptr, localsize, 2); - globalsize[0] = (size_t)num_elements; + globalsize[0] = static_cast<size_t>(num_elements); localsize[0] = 9; if (use_uniform_work_groups && (globalsize[0] % localsize[0])) { globalsize[0] += (localsize[0] - (globalsize[0] % localsize[0])); } - err = clEnqueueNDRangeKernel(queue, kernel[1], 1, NULL, globalsize, localsize, 0, NULL, NULL); - test_error( err, "clEnqueueNDRangeKernel failed."); + err = clEnqueueNDRangeKernel(queue, kernel[1], 1, nullptr, globalsize, + localsize, 0, nullptr, nullptr); + test_error(err, "clEnqueueNDRangeKernel failed."); - err = clEnqueueReadBuffer(queue, streams, CL_TRUE, 0, 2*sizeof(int), 
output_ptr, 0, NULL, NULL); - test_error( err, "clEnqueueReadBuffer failed."); + err = clEnqueueReadBuffer(queue, stream, CL_BLOCKING, 0, 2 * sizeof(int), + output_ptr, 0, nullptr, nullptr); + test_error(err, "clEnqueueReadBuffer failed."); err = verify_enqueued_local_size(output_ptr, localsize, 1); - // cleanup - clReleaseMemObject(streams); - clReleaseKernel(kernel[0]); - clReleaseKernel(kernel[1]); - clReleaseProgram(program[0]); - clReleaseProgram(program[1]); - free(output_ptr); - return err; } diff --git a/test_conformance/basic/test_fpmath_float.cpp b/test_conformance/basic/test_fpmath_float.cpp index 6e5deb4b..60d509b0 100644 --- a/test_conformance/basic/test_fpmath_float.cpp +++ b/test_conformance/basic/test_fpmath_float.cpp @@ -49,8 +49,6 @@ static const char *fpmul_kernel_code = "}\n"; -static const float MAX_ERR = 1e-5f; - static int verify_fpadd(float *inptrA, float *inptrB, float *outptr, int n) { diff --git a/test_conformance/basic/test_hiloeo.cpp b/test_conformance/basic/test_hiloeo.cpp index 4cdf2ac7..3470ad00 100644 --- a/test_conformance/basic/test_hiloeo.cpp +++ b/test_conformance/basic/test_hiloeo.cpp @@ -43,8 +43,6 @@ static const unsigned int out_vector_idx[] = { 0, 0, 1, 1, 3, 4}; // input type name is strcat(gentype, vector_size_names[i]); // and output type name is // strcat(gentype, vector_size_names[out_vector_idx[i]]); -static const int size_to_idx[] = {-1,0,1,2,3,-1,-1,-1,4, - -1,-1,-1,-1,-1,-1,-1,5}; static const char *vector_size_names[] = { "", "2", "3", "4", "8", "16"}; static const size_t kSizes[] = { 1, 1, 2, 2, 4, 4, 8, 8, 4, 8 }; diff --git a/test_conformance/basic/test_hostptr.cpp b/test_conformance/basic/test_hostptr.cpp index 65af5c3c..dee78675 100644 --- a/test_conformance/basic/test_hostptr.cpp +++ b/test_conformance/basic/test_hostptr.cpp @@ -32,8 +32,6 @@ const char *hostptr_kernel_code = " dst[tid] = srcA[tid] + srcB[tid];\n" "}\n"; -static const float MAX_ERR = 1e-5f; - static int verify_hostptr(cl_float *inptrA, 
cl_float *inptrB, cl_float *outptr, int n) { cl_float r; diff --git a/test_conformance/basic/test_multireadimageonefmt.cpp b/test_conformance/basic/test_multireadimageonefmt.cpp index b37c8414..c230e67a 100644 --- a/test_conformance/basic/test_multireadimageonefmt.cpp +++ b/test_conformance/basic/test_multireadimageonefmt.cpp @@ -153,14 +153,14 @@ int test_mri_one(cl_device_id device, cl_context context, cl_command_queue queue err = clSetKernelArg(kernel, 0, sizeof i, &i); err |= clSetKernelArg(kernel, 1, sizeof err, &err); err |= clSetKernelArg(kernel, 2, sizeof sampler, &sampler); - for (i=0; i<8; i++) - err |= clSetKernelArg(kernel, 3+i, sizeof streams[i], &streams[i]); + for (i = 0; i < 8; i++) + err |= clSetKernelArg(kernel, 3 + i, sizeof streams[i], &streams[i]); - if (err != CL_SUCCESS) - { - log_error("clSetKernelArgs failed\n"); - return -1; - } + if (err != CL_SUCCESS) + { + log_error("clSetKernelArgs failed\n"); + return -1; + } threads[0] = (unsigned int)img_width; threads[1] = (unsigned int)img_height; @@ -182,15 +182,13 @@ int test_mri_one(cl_device_id device, cl_context context, cl_command_queue queue // cleanup clReleaseSampler(sampler); - for (i=0; i<8; i++) - clReleaseMemObject(streams[i]); + for (i = 0; i < 8; i++) clReleaseMemObject(streams[i]); clReleaseKernel(kernel); clReleaseProgram(program); - for (i=0; i<7; i++) - free(input_ptr[i]); - free(output_ptr); + for (i = 0; i < 7; i++) free(input_ptr[i]); + free(output_ptr); - return err; + return err; } diff --git a/test_conformance/basic/test_preprocessors.cpp b/test_conformance/basic/test_preprocessors.cpp index 2038d150..e67487eb 100644 --- a/test_conformance/basic/test_preprocessors.cpp +++ b/test_conformance/basic/test_preprocessors.cpp @@ -97,10 +97,10 @@ int test_kernel_preprocessor_macros(cl_device_id deviceID, cl_context context, c char programSource[4096]; char curFileName[512]; char *programPtr = programSource; - int i = 0; snprintf(curFileName, 512, "%s", __FILE__); #ifdef _WIN32 // 
Replace "\" with "\\" + int i = 0; while(curFileName[i] != '\0') { if (curFileName[i] == '\\') { int j = i + 1; diff --git a/test_conformance/basic/test_progvar.cpp b/test_conformance/basic/test_progvar.cpp index 62c0a6be..e202d276 100644 --- a/test_conformance/basic/test_progvar.cpp +++ b/test_conformance/basic/test_progvar.cpp @@ -15,12 +15,13 @@ // #include "harness/compat.h" -// Bug: Missing in spec: atomic_intptr_t is always supported if device is 32-bits. +// Bug: Missing in spec: atomic_intptr_t is always supported if device is +// 32-bits. // Bug: Missing in spec: CL_DEVICE_GLOBAL_VARIABLE_PREFERRED_TOTAL_SIZE #define FLUSH fflush(stdout) -#define MAX_STR 16*1024 +#define MAX_STR 16 * 1024 #define ALIGNMENT 128 @@ -66,7 +67,11 @@ static int l_host_is_big_endian = 1; static size_t l_max_global_id0 = 0; static cl_bool l_linker_available = false; -#define check_error(errCode,msg,...) ((errCode != CL_SUCCESS) ? (log_error("ERROR: " msg "! (%s:%d)\n", ## __VA_ARGS__, __FILE__, __LINE__), 1) : 0) +#define check_error(errCode, msg, ...) \ + ((errCode != CL_SUCCESS) ? (log_error("ERROR: " msg "! (%s:%d)\n", \ + ##__VA_ARGS__, __FILE__, __LINE__), \ + 1) \ + : 0) //////////////////// // Info about types we can use for program scope variables. 
@@ -75,110 +80,135 @@ static cl_bool l_linker_available = false; class TypeInfo { public: - TypeInfo() : - name(""), - m_buf_elem_type(""), - m_is_vecbase(false), - m_is_atomic(false), - m_is_like_size_t(false), - m_is_bool(false), - m_elem_type(0), m_num_elem(0), - m_size(0), - m_value_size(0) - {} - TypeInfo(const char* name_arg) : - name(name_arg), - m_buf_elem_type(name_arg), - m_is_vecbase(false), - m_is_atomic(false), - m_is_like_size_t(false), - m_is_bool(false), - m_elem_type(0), m_num_elem(0), - m_size(0), - m_value_size(0) - { } + TypeInfo() + : name(""), m_buf_elem_type(""), m_is_vecbase(false), + m_is_atomic(false), m_is_like_size_t(false), m_is_bool(false), + m_elem_type(0), m_num_elem(0), m_size(0), m_value_size(0) + {} + TypeInfo(const char* name_arg) + : name(name_arg), m_buf_elem_type(name_arg), m_is_vecbase(false), + m_is_atomic(false), m_is_like_size_t(false), m_is_bool(false), + m_elem_type(0), m_num_elem(0), m_size(0), m_value_size(0) + {} // Vectors - TypeInfo( TypeInfo* elem_type, int num_elem ) : - m_is_vecbase(false), - m_is_atomic(false), - m_is_like_size_t(false), - m_is_bool(false), - m_elem_type(elem_type), - m_num_elem(num_elem) - { - char the_name[10]; // long enough for longest vector type name "double16" - snprintf(the_name,sizeof(the_name),"%s%d",elem_type->get_name_c_str(),m_num_elem); + TypeInfo(TypeInfo* elem_type, int num_elem) + : m_is_vecbase(false), m_is_atomic(false), m_is_like_size_t(false), + m_is_bool(false), m_elem_type(elem_type), m_num_elem(num_elem) + { + char + the_name[10]; // long enough for longest vector type name "double16" + snprintf(the_name, sizeof(the_name), "%s%d", + elem_type->get_name_c_str(), m_num_elem); this->name = std::string(the_name); this->m_buf_elem_type = std::string(the_name); this->m_value_size = num_elem * elem_type->get_size(); - if ( m_num_elem == 3 ) { + if (m_num_elem == 3) + { this->m_size = 4 * elem_type->get_size(); - } else { + } + else + { this->m_size = num_elem * 
elem_type->get_size(); } } const std::string& get_name(void) const { return name; } const char* get_name_c_str(void) const { return name.c_str(); } - TypeInfo& set_vecbase(void) { this->m_is_vecbase = true; return *this; } - TypeInfo& set_atomic(void) { this->m_is_atomic = true; return *this; } - TypeInfo& set_like_size_t(void) { + TypeInfo& set_vecbase(void) + { + this->m_is_vecbase = true; + return *this; + } + TypeInfo& set_atomic(void) + { + this->m_is_atomic = true; + return *this; + } + TypeInfo& set_like_size_t(void) + { this->m_is_like_size_t = true; - this->set_size( l_64bit_device ? 8 : 4 ); + this->set_size(l_64bit_device ? 8 : 4); this->m_buf_elem_type = l_64bit_device ? "ulong" : "uint"; return *this; } - TypeInfo& set_bool(void) { this->m_is_bool = true; return *this; } - TypeInfo& set_size(size_t n) { this->m_value_size = this->m_size = n; return *this; } - TypeInfo& set_buf_elem_type( const char* name ) { this->m_buf_elem_type = std::string(name); return *this; } + TypeInfo& set_bool(void) + { + this->m_is_bool = true; + return *this; + } + TypeInfo& set_size(size_t n) + { + this->m_value_size = this->m_size = n; + return *this; + } + TypeInfo& set_buf_elem_type(const char* name) + { + this->m_buf_elem_type = std::string(name); + return *this; + } const TypeInfo* elem_type(void) const { return m_elem_type; } int num_elem(void) const { return m_num_elem; } - bool is_vecbase(void) const {return m_is_vecbase;} - bool is_atomic(void) const {return m_is_atomic;} - bool is_atomic_64bit(void) const {return m_is_atomic && m_size == 8;} - bool is_like_size_t(void) const {return m_is_like_size_t;} - bool is_bool(void) const {return m_is_bool;} - size_t get_size(void) const {return m_size;} - size_t get_value_size(void) const {return m_value_size;} + bool is_vecbase(void) const { return m_is_vecbase; } + bool is_atomic(void) const { return m_is_atomic; } + bool is_atomic_64bit(void) const { return m_is_atomic && m_size == 8; } + bool is_like_size_t(void) const 
{ return m_is_like_size_t; } + bool is_bool(void) const { return m_is_bool; } + size_t get_size(void) const { return m_size; } + size_t get_value_size(void) const { return m_value_size; } // When passing values of this type to a kernel, what buffer type // should be used? - const char* get_buf_elem_type(void) const { return m_buf_elem_type.c_str(); } + const char* get_buf_elem_type(void) const + { + return m_buf_elem_type.c_str(); + } - std::string as_string(const cl_uchar* value_ptr) const { + std::string as_string(const cl_uchar* value_ptr) const + { // This method would be shorter if I had a real handle to element // vector type. - if ( this->is_bool() ) { - std::string result( name ); + if (this->is_bool()) + { + std::string result(name); result += "<"; result += (*value_ptr ? "true" : "false"); result += ", "; char buf[10]; - sprintf(buf,"%02x",*value_ptr); + sprintf(buf, "%02x", *value_ptr); result += buf; result += ">"; return result; - } else if ( this->num_elem() ) { - std::string result( name ); + } + else if (this->num_elem()) + { + std::string result(name); result += "<"; - for ( unsigned ielem = 0 ; ielem < this->num_elem() ; ielem++ ) { + for (unsigned ielem = 0; ielem < this->num_elem(); ielem++) + { char buf[MAX_STR]; - if ( ielem ) result += ", "; - for ( unsigned ibyte = 0; ibyte < this->m_elem_type->get_size() ; ibyte++ ) { - sprintf(buf + 2*ibyte,"%02x", value_ptr[ ielem * this->m_elem_type->get_size() + ibyte ] ); + if (ielem) result += ", "; + for (unsigned ibyte = 0; ibyte < this->m_elem_type->get_size(); + ibyte++) + { + sprintf(buf + 2 * ibyte, "%02x", + value_ptr[ielem * this->m_elem_type->get_size() + + ibyte]); } result += buf; } result += ">"; return result; - } else { - std::string result( name ); + } + else + { + std::string result(name); result += "<"; char buf[MAX_STR]; - for ( unsigned ibyte = 0; ibyte < this->get_size() ; ibyte++ ) { - sprintf(buf + 2*ibyte,"%02x", value_ptr[ ibyte ] ); + for (unsigned ibyte = 0; ibyte < 
this->get_size(); ibyte++) + { + sprintf(buf + 2 * ibyte, "%02x", value_ptr[ibyte]); } result += buf; result += ">"; @@ -189,51 +219,71 @@ public: // Initialize the given buffer to a constant value initialized as if it // were from the INIT_VAR macro below. // Only needs to support values 0 and 1. - void init( cl_uchar* buf, cl_uchar val) const { - if ( this->num_elem() ) { - for ( unsigned ielem = 0 ; ielem < this->num_elem() ; ielem++ ) { + void init(cl_uchar* buf, cl_uchar val) const + { + if (this->num_elem()) + { + for (unsigned ielem = 0; ielem < this->num_elem(); ielem++) + { // Delegate! - this->init_elem( buf + ielem * this->get_value_size()/this->num_elem(), val ); + this->init_elem( + buf + ielem * this->get_value_size() / this->num_elem(), + val); } - } else { - init_elem( buf, val ); + } + else + { + init_elem(buf, val); } } private: - void init_elem( cl_uchar* buf, cl_uchar val ) const { - size_t elem_size = this->num_elem() ? this->get_value_size()/this->num_elem() : this->get_size(); - memset(buf,0,elem_size); - if ( val ) { - if ( strstr( name.c_str(), "float" ) ) { + void init_elem(cl_uchar* buf, cl_uchar val) const + { + size_t elem_size = this->num_elem() + ? this->get_value_size() / this->num_elem() + : this->get_size(); + memset(buf, 0, elem_size); + if (val) + { + if (strstr(name.c_str(), "float")) + { *(float*)buf = (float)val; return; } - if ( strstr( name.c_str(), "double" ) ) { + if (strstr(name.c_str(), "double")) + { *(double*)buf = (double)val; return; } - if ( this->is_bool() ) { *buf = (bool)val; return; } + if (this->is_bool()) + { + *buf = (bool)val; + return; + } // Write a single character value to the correct spot, // depending on host endianness. 
- if ( l_host_is_big_endian ) *(buf + elem_size-1) = (cl_uchar)val; - else *buf = (cl_uchar)val; + if (l_host_is_big_endian) + *(buf + elem_size - 1) = (cl_uchar)val; + else + *buf = (cl_uchar)val; } } -public: - void dump(FILE* fp) const { - fprintf(fp,"Type %s : <%d,%d,%s> ", name.c_str(), - (int)m_size, - (int)m_value_size, - m_buf_elem_type.c_str() ); - if ( this->m_elem_type ) fprintf(fp, " vec(%s,%d)", this->m_elem_type->get_name_c_str(), this->num_elem() ); - if ( this->m_is_vecbase ) fprintf(fp, " vecbase"); - if ( this->m_is_bool ) fprintf(fp, " bool"); - if ( this->m_is_like_size_t ) fprintf(fp, " like-size_t"); - if ( this->m_is_atomic ) fprintf(fp, " atomic"); - fprintf(fp,"\n"); +public: + void dump(FILE* fp) const + { + fprintf(fp, "Type %s : <%d,%d,%s> ", name.c_str(), (int)m_size, + (int)m_value_size, m_buf_elem_type.c_str()); + if (this->m_elem_type) + fprintf(fp, " vec(%s,%d)", this->m_elem_type->get_name_c_str(), + this->num_elem()); + if (this->m_is_vecbase) fprintf(fp, " vecbase"); + if (this->m_is_bool) fprintf(fp, " bool"); + if (this->m_is_like_size_t) fprintf(fp, " like-size_t"); + if (this->m_is_atomic) fprintf(fp, " atomic"); + fprintf(fp, "\n"); fflush(fp); } @@ -246,7 +296,8 @@ private: bool m_is_like_size_t; bool m_is_bool; size_t m_size; // Number of bytes of storage occupied by this type. - size_t m_value_size; // Number of bytes of value significant for this type. Differs for vec3. + size_t m_value_size; // Number of bytes of value significant for this type. + // Differs for vec3. // When passing values of this type to a kernel, what buffer type // should be used? 
@@ -256,46 +307,65 @@ private: }; -#define NUM_SCALAR_TYPES (8+2) // signed and unsigned integral types, float and double -#define NUM_VECTOR_SIZES (5) // 2,3,4,8,16 -#define NUM_PLAIN_TYPES \ - 5 /*boolean and size_t family */ \ - + NUM_SCALAR_TYPES \ - + NUM_SCALAR_TYPES*NUM_VECTOR_SIZES \ - + 10 /* atomic types */ +#define NUM_SCALAR_TYPES \ + (8 + 2) // signed and unsigned integral types, float and double +#define NUM_VECTOR_SIZES (5) // 2,3,4,8,16 +#define NUM_PLAIN_TYPES \ + 5 /*boolean and size_t family */ \ + + NUM_SCALAR_TYPES + NUM_SCALAR_TYPES* NUM_VECTOR_SIZES \ + + 10 /* atomic types */ // Need room for plain, array, pointer, struct -#define MAX_TYPES (4*NUM_PLAIN_TYPES) +#define MAX_TYPES (4 * NUM_PLAIN_TYPES) static TypeInfo type_info[MAX_TYPES]; static int num_type_info = 0; // Number of valid entries in type_info[] - - // A helper class to form kernel source arguments for clCreateProgramWithSource. class StringTable { public: - StringTable() : m_c_strs(NULL), m_lengths(NULL), m_frozen(false), m_strings() {} + StringTable(): m_c_strs(NULL), m_lengths(NULL), m_frozen(false), m_strings() + {} ~StringTable() { release_frozen(); } - void add(std::string s) { release_frozen(); m_strings.push_back(s); } + void add(std::string s) + { + release_frozen(); + m_strings.push_back(s); + } - const size_t num_str() { freeze(); return m_strings.size(); } - const char** strs() { freeze(); return m_c_strs; } - const size_t* lengths() { freeze(); return m_lengths; } + const size_t num_str() + { + freeze(); + return m_strings.size(); + } + const char** strs() + { + freeze(); + return m_c_strs; + } + const size_t* lengths() + { + freeze(); + return m_lengths; + } private: - void freeze(void) { - if ( !m_frozen ) { + void freeze(void) + { + if (!m_frozen) + { release_frozen(); - m_c_strs = (const char**) malloc(sizeof(const char*) * m_strings.size()); - m_lengths = (size_t*) malloc(sizeof(size_t) * m_strings.size()); - assert( m_c_strs ); - assert( m_lengths ); + 
m_c_strs = + (const char**)malloc(sizeof(const char*) * m_strings.size()); + m_lengths = (size_t*)malloc(sizeof(size_t) * m_strings.size()); + assert(m_c_strs); + assert(m_lengths); - for ( size_t i = 0; i < m_strings.size() ; i++ ) { + for (size_t i = 0; i < m_strings.size(); i++) + { m_c_strs[i] = m_strings[i].c_str(); m_lengths[i] = strlen(m_c_strs[i]); } @@ -303,9 +373,18 @@ private: m_frozen = true; } } - void release_frozen(void) { - if ( m_c_strs ) { free(m_c_strs); m_c_strs = 0; } - if ( m_lengths ) { free(m_lengths); m_lengths = 0; } + void release_frozen(void) + { + if (m_c_strs) + { + free(m_c_strs); + m_c_strs = 0; + } + if (m_lengths) + { + free(m_lengths); + m_lengths = 0; + } m_frozen = false; } @@ -325,11 +404,15 @@ static const char* l_get_fp64_pragma(void); static const char* l_get_cles_int64_pragma(void); static int l_build_type_table(cl_device_id device); -static int l_get_device_info(cl_device_id device, size_t* max_size_ret, size_t* pref_size_ret); +static int l_get_device_info(cl_device_id device, size_t* max_size_ret, + size_t* pref_size_ret); -static void l_set_randomly( cl_uchar* buf, size_t buf_size, RandomSeed& rand_state ); -static int l_compare( const cl_uchar* expected, const cl_uchar* received, unsigned num_values, const TypeInfo&ti ); -static int l_copy( cl_uchar* dest, unsigned dest_idx, const cl_uchar* src, unsigned src_idx, const TypeInfo&ti ); +static void l_set_randomly(cl_uchar* buf, size_t buf_size, + RandomSeed& rand_state); +static int l_compare(const cl_uchar* expected, const cl_uchar* received, + unsigned num_values, const TypeInfo& ti); +static int l_copy(cl_uchar* dest, unsigned dest_idx, const cl_uchar* src, + unsigned src_idx, const TypeInfo& ti); static std::string conversion_functions(const TypeInfo& ti); static std::string global_decls(const TypeInfo& ti, bool with_init); @@ -337,90 +420,123 @@ static std::string global_check_function(const TypeInfo& ti); static std::string writer_function(const TypeInfo& ti); 
static std::string reader_function(const TypeInfo& ti); -static int l_write_read( cl_device_id device, cl_context context, cl_command_queue queue ); -static int l_write_read_for_type( cl_device_id device, cl_context context, cl_command_queue queue, const TypeInfo& ti, RandomSeed& rand_state ); +static int l_write_read(cl_device_id device, cl_context context, + cl_command_queue queue); +static int l_write_read_for_type(cl_device_id device, cl_context context, + cl_command_queue queue, const TypeInfo& ti, + RandomSeed& rand_state); -static int l_init_write_read( cl_device_id device, cl_context context, cl_command_queue queue ); -static int l_init_write_read_for_type( cl_device_id device, cl_context context, cl_command_queue queue, const TypeInfo& ti, RandomSeed& rand_state ); - -static int l_capacity( cl_device_id device, cl_context context, cl_command_queue queue, size_t max_size ); -static int l_user_type( cl_device_id device, cl_context context, cl_command_queue queue, size_t max_size, bool separate_compilation ); +static int l_init_write_read(cl_device_id device, cl_context context, + cl_command_queue queue); +static int l_init_write_read_for_type(cl_device_id device, cl_context context, + cl_command_queue queue, + const TypeInfo& ti, + RandomSeed& rand_state); +static int l_capacity(cl_device_id device, cl_context context, + cl_command_queue queue, size_t max_size); +static int l_user_type(cl_device_id device, cl_context context, + cl_command_queue queue, size_t max_size, + bool separate_compilation); //////////////////// // File scope function definitions -static cl_int print_build_log(cl_program program, cl_uint num_devices, cl_device_id *device_list, cl_uint count, const char **strings, const size_t *lengths, const char* options) +static cl_int print_build_log(cl_program program, cl_uint num_devices, + cl_device_id* device_list, cl_uint count, + const char** strings, const size_t* lengths, + const char* options) { cl_uint i; cl_int error; 
BufferOwningPtr<cl_device_id> devices; - if(num_devices == 0 || device_list == NULL) + if (num_devices == 0 || device_list == NULL) { - error = clGetProgramInfo(program, CL_PROGRAM_NUM_DEVICES, sizeof(num_devices), &num_devices, NULL); + error = clGetProgramInfo(program, CL_PROGRAM_NUM_DEVICES, + sizeof(num_devices), &num_devices, NULL); test_error(error, "clGetProgramInfo CL_PROGRAM_NUM_DEVICES failed"); - device_list = (cl_device_id*)malloc(sizeof(cl_device_id)*num_devices); + device_list = (cl_device_id*)malloc(sizeof(cl_device_id) * num_devices); devices.reset(device_list); memset(device_list, 0, sizeof(cl_device_id) * num_devices); - error = clGetProgramInfo(program, CL_PROGRAM_DEVICES, sizeof(cl_device_id) * num_devices, device_list, NULL); + error = clGetProgramInfo(program, CL_PROGRAM_DEVICES, + sizeof(cl_device_id) * num_devices, + device_list, NULL); test_error(error, "clGetProgramInfo CL_PROGRAM_DEVICES failed"); } cl_uint z; bool sourcePrinted = false; - for(z = 0; z < num_devices; z++) + for (z = 0; z < num_devices; z++) { char deviceName[4096] = ""; - error = clGetDeviceInfo(device_list[z], CL_DEVICE_NAME, sizeof(deviceName), deviceName, NULL); - check_error(error, "Device \"%d\" failed to return a name. clGetDeviceInfo CL_DEVICE_NAME failed", z); + error = clGetDeviceInfo(device_list[z], CL_DEVICE_NAME, + sizeof(deviceName), deviceName, NULL); + check_error(error, + "Device \"%d\" failed to return a name. 
clGetDeviceInfo " + "CL_DEVICE_NAME failed", + z); cl_build_status buildStatus; - error = clGetProgramBuildInfo(program, device_list[z], CL_PROGRAM_BUILD_STATUS, sizeof(buildStatus), &buildStatus, NULL); - check_error(error, "clGetProgramBuildInfo CL_PROGRAM_BUILD_STATUS failed"); + error = clGetProgramBuildInfo(program, device_list[z], + CL_PROGRAM_BUILD_STATUS, + sizeof(buildStatus), &buildStatus, NULL); + check_error(error, + "clGetProgramBuildInfo CL_PROGRAM_BUILD_STATUS failed"); - if(buildStatus != CL_BUILD_SUCCESS) + if (buildStatus != CL_BUILD_SUCCESS) { - if(!sourcePrinted) + if (!sourcePrinted) { log_error("Build options: %s\n", options); - if(count && strings) + if (count && strings) { log_error("Original source is: ------------\n"); - for(i = 0; i < count; i++) log_error("%s", strings[i]); + for (i = 0; i < count; i++) log_error("%s", strings[i]); } sourcePrinted = true; } char statusString[64] = ""; if (buildStatus == (cl_build_status)CL_BUILD_SUCCESS) - sprintf(statusString, "CL_BUILD_SUCCESS"); + sprintf(statusString, "CL_BUILD_SUCCESS"); else if (buildStatus == (cl_build_status)CL_BUILD_NONE) - sprintf(statusString, "CL_BUILD_NONE"); + sprintf(statusString, "CL_BUILD_NONE"); else if (buildStatus == (cl_build_status)CL_BUILD_ERROR) - sprintf(statusString, "CL_BUILD_ERROR"); + sprintf(statusString, "CL_BUILD_ERROR"); else if (buildStatus == (cl_build_status)CL_BUILD_IN_PROGRESS) - sprintf(statusString, "CL_BUILD_IN_PROGRESS"); + sprintf(statusString, "CL_BUILD_IN_PROGRESS"); else - sprintf(statusString, "UNKNOWN (%d)", buildStatus); + sprintf(statusString, "UNKNOWN (%d)", buildStatus); - log_error("Build not successful for device \"%s\", status: %s\n", deviceName, statusString); + log_error("Build not successful for device \"%s\", status: %s\n", + deviceName, statusString); size_t paramSize = 0; - error = clGetProgramBuildInfo(program, device_list[z], CL_PROGRAM_BUILD_LOG, 0, NULL, &paramSize); - if(check_error(error, "clGetProgramBuildInfo
CL_PROGRAM_BUILD_LOG failed")) break; + error = clGetProgramBuildInfo(program, device_list[z], + CL_PROGRAM_BUILD_LOG, 0, NULL, + &paramSize); + if (check_error( + error, "clGetProgramBuildInfo CL_PROGRAM_BUILD_LOG failed")) + break; std::string log; - log.resize(paramSize/sizeof(char)); - - error = clGetProgramBuildInfo(program, device_list[z], CL_PROGRAM_BUILD_LOG, paramSize, &log[0], NULL); - if(check_error(error, "Device %d (%s) failed to return a build log", z, deviceName)) break; - if(log[0] == 0) log_error("clGetProgramBuildInfo returned an empty log.\n"); + log.resize(paramSize / sizeof(char)); + + error = clGetProgramBuildInfo(program, device_list[z], + CL_PROGRAM_BUILD_LOG, paramSize, + &log[0], NULL); + if (check_error(error, + "Device %d (%s) failed to return a build log", z, + deviceName)) + break; + if (log[0] == 0) + log_error("clGetProgramBuildInfo returned an empty log.\n"); else { log_error("Build log:\n", deviceName); @@ -433,25 +549,29 @@ static cl_int print_build_log(cl_program program, cl_uint num_devices, cl_device static void l_load_abilities(cl_device_id device) { - l_has_half = is_extension_available(device,"cl_khr_fp16"); - l_has_double = is_extension_available(device,"cl_khr_fp64"); - l_has_cles_int64 = is_extension_available(device,"cles_khr_int64"); + l_has_half = is_extension_available(device, "cl_khr_fp16"); + l_has_double = is_extension_available(device, "cl_khr_fp64"); + l_has_cles_int64 = is_extension_available(device, "cles_khr_int64"); - l_has_int64_atomics - = is_extension_available(device,"cl_khr_int64_base_atomics") - && is_extension_available(device,"cl_khr_int64_extended_atomics"); + l_has_int64_atomics = + is_extension_available(device, "cl_khr_int64_base_atomics") + && is_extension_available(device, "cl_khr_int64_extended_atomics"); { int status = CL_SUCCESS; cl_uint addr_bits = 32; - status = clGetDeviceInfo(device,CL_DEVICE_ADDRESS_BITS,sizeof(addr_bits),&addr_bits,0); - l_64bit_device = ( status == CL_SUCCESS && addr_bits
== 64 ); + status = clGetDeviceInfo(device, CL_DEVICE_ADDRESS_BITS, + sizeof(addr_bits), &addr_bits, 0); + l_64bit_device = (status == CL_SUCCESS && addr_bits == 64); } // 32-bit devices always have intptr atomics. l_has_intptr_atomics = !l_64bit_device || l_has_int64_atomics; - union { char c[4]; int i; } probe; + union { + char c[4]; + int i; + } probe; probe.i = 1; l_host_is_big_endian = !probe.c[0]; @@ -459,33 +579,40 @@ static void l_load_abilities(cl_device_id device) { int status = CL_SUCCESS; cl_uint max_dim = 0; - status = clGetDeviceInfo(device,CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS,sizeof(max_dim),&max_dim,0); - assert( status == CL_SUCCESS ); - assert( max_dim > 0 ); + status = clGetDeviceInfo(device, CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS, + sizeof(max_dim), &max_dim, 0); + assert(status == CL_SUCCESS); + assert(max_dim > 0); size_t max_id[3]; max_id[0] = 0; - status = clGetDeviceInfo(device,CL_DEVICE_MAX_WORK_ITEM_SIZES,max_dim*sizeof(size_t),&max_id[0],0); - assert( status == CL_SUCCESS ); + status = clGetDeviceInfo(device, CL_DEVICE_MAX_WORK_ITEM_SIZES, + max_dim * sizeof(size_t), &max_id[0], 0); + assert(status == CL_SUCCESS); l_max_global_id0 = max_id[0]; } { // Is separate compilation supported? int status = CL_SUCCESS; l_linker_available = false; - status = clGetDeviceInfo(device,CL_DEVICE_LINKER_AVAILABLE,sizeof(l_linker_available),&l_linker_available,0); - assert( status == CL_SUCCESS ); + status = + clGetDeviceInfo(device, CL_DEVICE_LINKER_AVAILABLE, + sizeof(l_linker_available), &l_linker_available, 0); + assert(status == CL_SUCCESS); } } static const char* l_get_fp64_pragma(void) { - return l_has_double ? "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n" : ""; + return l_has_double ? "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n" + : ""; } static const char* l_get_cles_int64_pragma(void) { - return l_has_cles_int64 ? "#pragma OPENCL EXTENSION cles_khr_int64 : enable\n" : ""; + return l_has_cles_int64 + ? 
"#pragma OPENCL EXTENSION cles_khr_int64 : enable\n" + : ""; } static const char* l_get_int64_atomic_pragma(void) @@ -500,89 +627,83 @@ static int l_build_type_table(cl_device_id device) size_t iscalar = 0; size_t ivecsize = 0; int vecsizes[] = { 2, 3, 4, 8, 16 }; - const char* vecbase[] = { - "uchar", "char", - "ushort", "short", - "uint", "int", - "ulong", "long", - "float", - "double" - }; - int vecbase_size[] = { - 1, 1, - 2, 2, - 4, 4, - 8, 8, - 4, - 8 - }; - const char* like_size_t[] = { - "intptr_t", - "uintptr_t", - "size_t", - "ptrdiff_t" - }; + const char* vecbase[] = { "uchar", "char", "ushort", "short", "uint", + "int", "ulong", "long", "float", "double" }; + int vecbase_size[] = { 1, 1, 2, 2, 4, 4, 8, 8, 4, 8 }; + const char* like_size_t[] = { "intptr_t", "uintptr_t", "size_t", + "ptrdiff_t" }; const char* atomics[] = { - "atomic_int", "atomic_uint", - "atomic_long", "atomic_ulong", - "atomic_float", - "atomic_double", - }; - int atomics_size[] = { - 4, 4, - 8, 8, - 4, - 8 - }; - const char* intptr_atomics[] = { - "atomic_intptr_t", - "atomic_uintptr_t", - "atomic_size_t", - "atomic_ptrdiff_t" + "atomic_int", "atomic_uint", "atomic_long", + "atomic_ulong", "atomic_float", "atomic_double", }; + int atomics_size[] = { 4, 4, 8, 8, 4, 8 }; + const char* intptr_atomics[] = { "atomic_intptr_t", "atomic_uintptr_t", + "atomic_size_t", "atomic_ptrdiff_t" }; l_load_abilities(device); num_type_info = 0; // Boolean. - type_info[ num_type_info++ ] = TypeInfo( "bool" ).set_bool().set_size(1).set_buf_elem_type("uchar"); + type_info[num_type_info++] = + TypeInfo("bool").set_bool().set_size(1).set_buf_elem_type("uchar"); // Vector types, and the related scalar element types. 
- for ( iscalar=0; iscalar < sizeof(vecbase)/sizeof(vecbase[0]) ; ++iscalar ) { - if ( !gHasLong && strstr(vecbase[iscalar],"long") ) continue; - if ( !l_has_double && strstr(vecbase[iscalar],"double") ) continue; + for (iscalar = 0; iscalar < sizeof(vecbase) / sizeof(vecbase[0]); ++iscalar) + { + if (!gHasLong && strstr(vecbase[iscalar], "long")) continue; + if (!l_has_double && strstr(vecbase[iscalar], "double")) continue; // Scalar TypeInfo* elem_type = type_info + num_type_info++; - *elem_type = TypeInfo( vecbase[iscalar] ).set_vecbase().set_size( vecbase_size[iscalar] ); + *elem_type = TypeInfo(vecbase[iscalar]) + .set_vecbase() + .set_size(vecbase_size[iscalar]); // Vector - for ( ivecsize=0; ivecsize < sizeof(vecsizes)/sizeof(vecsizes[0]) ; ivecsize++ ) { - type_info[ num_type_info++ ] = TypeInfo( elem_type, vecsizes[ivecsize] ); + for (ivecsize = 0; ivecsize < sizeof(vecsizes) / sizeof(vecsizes[0]); + ivecsize++) + { + type_info[num_type_info++] = + TypeInfo(elem_type, vecsizes[ivecsize]); } } // Size_t-like types - for ( iscalar=0; iscalar < sizeof(like_size_t)/sizeof(like_size_t[0]) ; ++iscalar ) { - type_info[ num_type_info++ ] = TypeInfo( like_size_t[iscalar] ).set_like_size_t(); + for (iscalar = 0; iscalar < sizeof(like_size_t) / sizeof(like_size_t[0]); + ++iscalar) + { + type_info[num_type_info++] = + TypeInfo(like_size_t[iscalar]).set_like_size_t(); } // Atomic types. - for ( iscalar=0; iscalar < sizeof(atomics)/sizeof(atomics[0]) ; ++iscalar ) { - if ( !l_has_int64_atomics && strstr(atomics[iscalar],"long") ) continue; - if ( !(l_has_int64_atomics && l_has_double) && strstr(atomics[iscalar],"double") ) continue; + for (iscalar = 0; iscalar < sizeof(atomics) / sizeof(atomics[0]); ++iscalar) + { + if (!l_has_int64_atomics && strstr(atomics[iscalar], "long")) continue; + if (!(l_has_int64_atomics && l_has_double) + && strstr(atomics[iscalar], "double")) + continue; // The +7 is used to skip over the "atomic_" prefix. 
const char* buf_type = atomics[iscalar] + 7; - type_info[ num_type_info++ ] = TypeInfo( atomics[iscalar] ).set_atomic().set_size( atomics_size[iscalar] ).set_buf_elem_type( buf_type ); + type_info[num_type_info++] = TypeInfo(atomics[iscalar]) + .set_atomic() + .set_size(atomics_size[iscalar]) + .set_buf_elem_type(buf_type); } - if ( l_has_intptr_atomics ) { - for ( iscalar=0; iscalar < sizeof(intptr_atomics)/sizeof(intptr_atomics[0]) ; ++iscalar ) { - type_info[ num_type_info++ ] = TypeInfo( intptr_atomics[iscalar] ).set_atomic().set_like_size_t(); + if (l_has_intptr_atomics) + { + for (iscalar = 0; + iscalar < sizeof(intptr_atomics) / sizeof(intptr_atomics[0]); + ++iscalar) + { + type_info[num_type_info++] = TypeInfo(intptr_atomics[iscalar]) + .set_atomic() + .set_like_size_t(); } } - assert( num_type_info <= MAX_TYPES ); // or increase MAX_TYPES + assert(num_type_info <= MAX_TYPES); // or increase MAX_TYPES #if 0 for ( size_t i = 0 ; i < num_type_info ; i++ ) { @@ -594,7 +715,7 @@ static int l_build_type_table(cl_device_id device) return status; } -static const TypeInfo& l_find_type( const char* name ) +static const TypeInfo& l_find_type(const char* name) { auto itr = std::find_if(type_info, type_info + num_type_info, @@ -604,36 +725,54 @@ static const TypeInfo& l_find_type( const char* name ) } +// Populate return parameters for max program variable size, preferred program +// variable size. -// Populate return parameters for max program variable size, preferred program variable size. 
- -static int l_get_device_info(cl_device_id device, size_t* max_size_ret, size_t* pref_size_ret) +static int l_get_device_info(cl_device_id device, size_t* max_size_ret, + size_t* pref_size_ret) { int err = CL_SUCCESS; size_t return_size = 0; - err = clGetDeviceInfo(device, CL_DEVICE_MAX_GLOBAL_VARIABLE_SIZE, sizeof(*max_size_ret), max_size_ret, &return_size); - if ( err != CL_SUCCESS ) { - log_error("Error: Failed to get device info for CL_DEVICE_MAX_GLOBAL_VARIABLE_SIZE\n"); + err = clGetDeviceInfo(device, CL_DEVICE_MAX_GLOBAL_VARIABLE_SIZE, + sizeof(*max_size_ret), max_size_ret, &return_size); + if (err != CL_SUCCESS) + { + log_error("Error: Failed to get device info for " + "CL_DEVICE_MAX_GLOBAL_VARIABLE_SIZE\n"); return err; } - if ( return_size != sizeof(size_t) ) { - log_error("Error: Invalid size %d returned for CL_DEVICE_MAX_GLOBAL_VARIABLE_SIZE\n", (int)return_size ); + if (return_size != sizeof(size_t)) + { + log_error("Error: Invalid size %d returned for " + "CL_DEVICE_MAX_GLOBAL_VARIABLE_SIZE\n", + (int)return_size); return 1; } - if ( return_size != sizeof(size_t) ) { - log_error("Error: Invalid size %d returned for CL_DEVICE_MAX_GLOBAL_VARIABLE_SIZE\n", (int)return_size ); + if (return_size != sizeof(size_t)) + { + log_error("Error: Invalid size %d returned for " + "CL_DEVICE_MAX_GLOBAL_VARIABLE_SIZE\n", + (int)return_size); return 1; } return_size = 0; - err = clGetDeviceInfo(device, CL_DEVICE_GLOBAL_VARIABLE_PREFERRED_TOTAL_SIZE, sizeof(*pref_size_ret), pref_size_ret, &return_size); - if ( err != CL_SUCCESS ) { - log_error("Error: Failed to get device info for CL_DEVICE_GLOBAL_VARIABLE_PREFERRED_TOTAL_SIZE: %d\n",err); + err = + clGetDeviceInfo(device, CL_DEVICE_GLOBAL_VARIABLE_PREFERRED_TOTAL_SIZE, + sizeof(*pref_size_ret), pref_size_ret, &return_size); + if (err != CL_SUCCESS) + { + log_error("Error: Failed to get device info for " + "CL_DEVICE_GLOBAL_VARIABLE_PREFERRED_TOTAL_SIZE: %d\n", + err); return err; } - if ( return_size != 
sizeof(size_t) ) { - log_error("Error: Invalid size %d returned for CL_DEVICE_GLOBAL_VARIABLE_PREFERRED_TOTAL_SIZE\n", (int)return_size ); + if (return_size != sizeof(size_t)) + { + log_error("Error: Invalid size %d returned for " + "CL_DEVICE_GLOBAL_VARIABLE_PREFERRED_TOTAL_SIZE\n", + (int)return_size); return 1; } @@ -641,11 +780,13 @@ static int l_get_device_info(cl_device_id device, size_t* max_size_ret, size_t* } -static void l_set_randomly( cl_uchar* buf, size_t buf_size, RandomSeed& rand_state ) +static void l_set_randomly(cl_uchar* buf, size_t buf_size, + RandomSeed& rand_state) { - assert( 0 == (buf_size % sizeof(cl_uint) ) ); - for ( size_t i = 0; i < buf_size ; i += sizeof(cl_uint) ) { - *( (cl_uint*)(buf + i) ) = genrand_int32( rand_state ); + assert(0 == (buf_size % sizeof(cl_uint))); + for (size_t i = 0; i < buf_size; i += sizeof(cl_uint)) + { + *((cl_uint*)(buf + i)) = genrand_int32(rand_state); } #if 0 for ( size_t i = 0; i < buf_size ; i++ ) { @@ -657,20 +798,23 @@ static void l_set_randomly( cl_uchar* buf, size_t buf_size, RandomSeed& rand_sta // Return num_value values of the given type. // Returns CL_SUCCESS if they compared as equal. -static int l_compare( const char* test_name, const cl_uchar* expected, const cl_uchar* received, size_t num_values, const TypeInfo&ti ) +static int l_compare(const char* test_name, const cl_uchar* expected, + const cl_uchar* received, size_t num_values, + const TypeInfo& ti) { // Compare only the valid returned bytes. 
- for ( unsigned value_idx = 0; value_idx < num_values; value_idx++ ) { + for (unsigned value_idx = 0; value_idx < num_values; value_idx++) + { const cl_uchar* expv = expected + value_idx * ti.get_size(); const cl_uchar* gotv = received + value_idx * ti.get_size(); - if ( memcmp( expv, gotv, ti.get_value_size() ) ) { - std::string exp_str = ti.as_string( expv ); - std::string got_str = ti.as_string( gotv ); - log_error("Error: %s test for type %s, at index %d: Expected %s got %s\n", - test_name, - ti.get_name_c_str(), value_idx, - exp_str.c_str(), - got_str.c_str() ); + if (memcmp(expv, gotv, ti.get_value_size())) + { + std::string exp_str = ti.as_string(expv); + std::string got_str = ti.as_string(gotv); + log_error( + "Error: %s test for type %s, at index %d: Expected %s got %s\n", + test_name, ti.get_name_c_str(), value_idx, exp_str.c_str(), + got_str.c_str()); return 1; } } @@ -678,11 +822,12 @@ static int l_compare( const char* test_name, const cl_uchar* expected, const cl_ } // Copy a target value from src[idx] to dest[idx] -static int l_copy( cl_uchar* dest, unsigned dest_idx, const cl_uchar* src, unsigned src_idx, const TypeInfo&ti ) +static int l_copy(cl_uchar* dest, unsigned dest_idx, const cl_uchar* src, + unsigned src_idx, const TypeInfo& ti) { - cl_uchar* raw_dest = dest + dest_idx * ti.get_size(); - const cl_uchar* raw_src = src + src_idx * ti.get_size(); - memcpy( raw_dest, raw_src, ti.get_value_size() ); + cl_uchar* raw_dest = dest + dest_idx * ti.get_size(); + const cl_uchar* raw_src = src + src_idx * ti.get_size(); + memcpy(raw_dest, raw_src, ti.get_value_size()); return 0; } @@ -694,59 +839,70 @@ static std::string conversion_functions(const TypeInfo& ti) static char buf[MAX_STR]; int num_printed = 0; // The atomic types just use the base type. 
- if ( ti.is_atomic() || 0 == strcmp( ti.get_buf_elem_type(), ti.get_name_c_str() ) ) { + if (ti.is_atomic() + || 0 == strcmp(ti.get_buf_elem_type(), ti.get_name_c_str())) + { // The type is represented in a buffer by itself. - num_printed = snprintf(buf,MAX_STR, - "%s from_buf(%s a) { return a; }\n" - "%s to_buf(%s a) { return a; }\n", - ti.get_buf_elem_type(), ti.get_buf_elem_type(), - ti.get_buf_elem_type(), ti.get_buf_elem_type() ); - } else { + num_printed = snprintf(buf, MAX_STR, + "%s from_buf(%s a) { return a; }\n" + "%s to_buf(%s a) { return a; }\n", + ti.get_buf_elem_type(), ti.get_buf_elem_type(), + ti.get_buf_elem_type(), ti.get_buf_elem_type()); + } + else + { // Just use C-style cast. - num_printed = snprintf(buf,MAX_STR, - "%s from_buf(%s a) { return (%s)a; }\n" - "%s to_buf(%s a) { return (%s)a; }\n", - ti.get_name_c_str(), ti.get_buf_elem_type(), ti.get_name_c_str(), - ti.get_buf_elem_type(), ti.get_name_c_str(), ti.get_buf_elem_type() ); + num_printed = snprintf(buf, MAX_STR, + "%s from_buf(%s a) { return (%s)a; }\n" + "%s to_buf(%s a) { return (%s)a; }\n", + ti.get_name_c_str(), ti.get_buf_elem_type(), + ti.get_name_c_str(), ti.get_buf_elem_type(), + ti.get_name_c_str(), ti.get_buf_elem_type()); } // Add initializations. - if ( ti.is_atomic() ) { - num_printed += snprintf( buf + num_printed, MAX_STR-num_printed, - "#define INIT_VAR(a) ATOMIC_VAR_INIT(a)\n" ); - } else { + if (ti.is_atomic()) + { + num_printed += snprintf(buf + num_printed, MAX_STR - num_printed, + "#define INIT_VAR(a) ATOMIC_VAR_INIT(a)\n"); + } + else + { // This cast works even if the target type is a vector type. 
- num_printed += snprintf( buf + num_printed, MAX_STR-num_printed, - "#define INIT_VAR(a) ((%s)(a))\n", ti.get_name_c_str()); + num_printed += + snprintf(buf + num_printed, MAX_STR - num_printed, + "#define INIT_VAR(a) ((%s)(a))\n", ti.get_name_c_str()); } - assert( num_printed < MAX_STR ); // or increase MAX_STR + assert(num_printed < MAX_STR); // or increase MAX_STR result = buf; return result; } -static std::string global_decls(const TypeInfo& ti, bool with_init ) +static std::string global_decls(const TypeInfo& ti, bool with_init) { const char* tn = ti.get_name_c_str(); const char* vol = (ti.is_atomic() ? " volatile " : " "); static char decls[MAX_STR]; int num_printed = 0; - if ( with_init ) { - const char *decls_template_with_init = + if (with_init) + { + const char* decls_template_with_init = "%s %s var = INIT_VAR(0);\n" "global %s %s g_var = INIT_VAR(1);\n" "%s %s a_var[2] = { INIT_VAR(1), INIT_VAR(1) };\n" "volatile global %s %s* p_var = &a_var[1];\n\n"; - num_printed = snprintf(decls,sizeof(decls),decls_template_with_init, - vol,tn,vol,tn,vol,tn,vol,tn); - } else { - const char *decls_template_no_init = - "%s %s var;\n" - "global %s %s g_var;\n" - "%s %s a_var[2];\n" - "global %s %s* p_var;\n\n"; - num_printed = snprintf(decls,sizeof(decls),decls_template_no_init, - vol,tn,vol,tn,vol,tn,vol,tn); - } - assert( num_printed < sizeof(decls) ); + num_printed = snprintf(decls, sizeof(decls), decls_template_with_init, + vol, tn, vol, tn, vol, tn, vol, tn); + } + else + { + const char* decls_template_no_init = "%s %s var;\n" + "global %s %s g_var;\n" + "%s %s a_var[2];\n" + "global %s %s* p_var;\n\n"; + num_printed = snprintf(decls, sizeof(decls), decls_template_no_init, + vol, tn, vol, tn, vol, tn, vol, tn); + } + assert(num_printed < sizeof(decls)); return std::string(decls); } @@ -761,18 +917,26 @@ static std::string global_check_function(const TypeInfo& ti) // all() should only be used on vector inputs. 
For scalar comparison, the // result of the equality operator can be used as a bool value. - const bool is_scalar = ti.num_elem() == 0; // 0 is used to represent scalar types, not 1. + const bool is_scalar = + ti.num_elem() == 0; // 0 is used to represent scalar types, not 1. const std::string is_equality_true = is_scalar ? "" : "all"; std::string code = "kernel void global_check(global int* out) {\n"; code += " const " + type_name + " zero = ((" + type_name + ")0);\n"; code += " bool status = true;\n"; - if (ti.is_atomic()) { - code += " status &= " + is_equality_true + "(atomic_load(&var) == zero);\n"; - code += " status &= " + is_equality_true + "(atomic_load(&g_var) == zero);\n"; - code += " status &= " + is_equality_true + "(atomic_load(&a_var[0]) == zero);\n"; - code += " status &= " + is_equality_true + "(atomic_load(&a_var[1]) == zero);\n"; - } else { + if (ti.is_atomic()) + { + code += " status &= " + is_equality_true + + "(atomic_load(&var) == zero);\n"; + code += " status &= " + is_equality_true + + "(atomic_load(&g_var) == zero);\n"; + code += " status &= " + is_equality_true + + "(atomic_load(&a_var[0]) == zero);\n"; + code += " status &= " + is_equality_true + + "(atomic_load(&a_var[1]) == zero);\n"; + } + else + { code += " status &= " + is_equality_true + "(var == zero);\n"; code += " status &= " + is_equality_true + "(g_var == zero);\n"; code += " status &= " + is_equality_true + "(a_var[0] == zero);\n"; @@ -792,7 +956,8 @@ static std::string writer_function(const TypeInfo& ti) { static char writer_src[MAX_STR]; int num_printed = 0; - if ( !ti.is_atomic() ) { + if (!ti.is_atomic()) + { const char* writer_template_normal = "kernel void writer( global %s* src, uint idx ) {\n" " var = from_buf(src[0]);\n" @@ -801,8 +966,11 @@ static std::string writer_function(const TypeInfo& ti) " a_var[1] = from_buf(src[3]);\n" " p_var = a_var + idx;\n" "}\n\n"; - num_printed = snprintf(writer_src,sizeof(writer_src),writer_template_normal,ti.get_buf_elem_type()); - 
} else { + num_printed = snprintf(writer_src, sizeof(writer_src), + writer_template_normal, ti.get_buf_elem_type()); + } + else + { const char* writer_template_atomic = "kernel void writer( global %s* src, uint idx ) {\n" " atomic_store( &var, from_buf(src[0]) );\n" @@ -811,9 +979,10 @@ static std::string writer_function(const TypeInfo& ti) " atomic_store( &a_var[1], from_buf(src[3]) );\n" " p_var = a_var + idx;\n" "}\n\n"; - num_printed = snprintf(writer_src,sizeof(writer_src),writer_template_atomic,ti.get_buf_elem_type()); + num_printed = snprintf(writer_src, sizeof(writer_src), + writer_template_atomic, ti.get_buf_elem_type()); } - assert( num_printed < sizeof(writer_src) ); + assert(num_printed < sizeof(writer_src)); std::string result = writer_src; return result; } @@ -826,7 +995,8 @@ static std::string reader_function(const TypeInfo& ti) { static char reader_src[MAX_STR]; int num_printed = 0; - if ( !ti.is_atomic() ) { + if (!ti.is_atomic()) + { const char* reader_template_normal = "kernel void reader( global %s* dest, %s ptr_write_val ) {\n" " *p_var = from_buf(ptr_write_val);\n" @@ -835,8 +1005,12 @@ static std::string reader_function(const TypeInfo& ti) " dest[2] = to_buf(a_var[0]);\n" " dest[3] = to_buf(a_var[1]);\n" "}\n\n"; - num_printed = snprintf(reader_src,sizeof(reader_src),reader_template_normal,ti.get_buf_elem_type(),ti.get_buf_elem_type()); - } else { + num_printed = + snprintf(reader_src, sizeof(reader_src), reader_template_normal, + ti.get_buf_elem_type(), ti.get_buf_elem_type()); + } + else + { const char* reader_template_atomic = "kernel void reader( global %s* dest, %s ptr_write_val ) {\n" " atomic_store( p_var, from_buf(ptr_write_val) );\n" @@ -845,40 +1019,53 @@ static std::string reader_function(const TypeInfo& ti) " dest[2] = to_buf( atomic_load( &a_var[0] ) );\n" " dest[3] = to_buf( atomic_load( &a_var[1] ) );\n" "}\n\n"; - num_printed = 
snprintf(reader_src,sizeof(reader_src),reader_template_atomic,ti.get_buf_elem_type(),ti.get_buf_elem_type()); + num_printed = + snprintf(reader_src, sizeof(reader_src), reader_template_atomic, + ti.get_buf_elem_type(), ti.get_buf_elem_type()); } - assert( num_printed < sizeof(reader_src) ); + assert(num_printed < sizeof(reader_src)); std::string result = reader_src; return result; } // Check that all globals where appropriately default-initialized. -static int check_global_initialization(cl_context context, cl_program program, cl_command_queue queue) +static int check_global_initialization(cl_context context, cl_program program, + cl_command_queue queue) { int status = CL_SUCCESS; // Create a buffer on device to store a unique integer. cl_int is_init_valid = 0; - clMemWrapper buffer(clCreateBuffer(context, CL_MEM_WRITE_ONLY | CL_MEM_COPY_HOST_PTR, sizeof(is_init_valid), &is_init_valid, &status)); + clMemWrapper buffer( + clCreateBuffer(context, CL_MEM_WRITE_ONLY | CL_MEM_COPY_HOST_PTR, + sizeof(is_init_valid), &is_init_valid, &status)); test_error_ret(status, "Failed to allocate buffer", status); // Create, setup and invoke kernel. 
- clKernelWrapper global_check(clCreateKernel(program, "global_check", &status)); + clKernelWrapper global_check( + clCreateKernel(program, "global_check", &status)); test_error_ret(status, "Failed to create global_check kernel", status); status = clSetKernelArg(global_check, 0, sizeof(cl_mem), &buffer); - test_error_ret(status, "Failed to set up argument for the global_check kernel", status); + test_error_ret(status, + "Failed to set up argument for the global_check kernel", + status); const cl_uint work_dim = 1; const size_t global_work_offset[] = { 0 }; const size_t global_work_size[] = { 1 }; - status = clEnqueueNDRangeKernel(queue, global_check, work_dim, global_work_offset, global_work_size, nullptr, 0, nullptr, nullptr); + status = clEnqueueNDRangeKernel(queue, global_check, work_dim, + global_work_offset, global_work_size, + nullptr, 0, nullptr, nullptr); test_error_ret(status, "Failed to run global_check kernel", status); status = clFinish(queue); test_error_ret(status, "clFinish() failed", status); // Read back the memory buffer from the device. - status = clEnqueueReadBuffer(queue, buffer, CL_TRUE, 0, sizeof(is_init_valid), &is_init_valid, 0, nullptr, nullptr); + status = + clEnqueueReadBuffer(queue, buffer, CL_TRUE, 0, sizeof(is_init_valid), + &is_init_valid, 0, nullptr, nullptr); test_error_ret(status, "Failed to read buffer from device", status); - if (is_init_valid == 0) { + if (is_init_valid == 0) + { log_error("Unexpected default values were detected"); return 1; } @@ -887,58 +1074,75 @@ static int check_global_initialization(cl_context context, cl_program program, c } // Check write-then-read. 
-static int l_write_read( cl_device_id device, cl_context context, cl_command_queue queue ) +static int l_write_read(cl_device_id device, cl_context context, + cl_command_queue queue) { int status = CL_SUCCESS; int itype; - RandomSeed rand_state( gRandomSeed ); + RandomSeed rand_state(gRandomSeed); - for ( itype = 0; itype < num_type_info ; itype++ ) { - status = status | l_write_read_for_type(device,context,queue,type_info[itype], rand_state ); + for (itype = 0; itype < num_type_info; itype++) + { + status = status + | l_write_read_for_type(device, context, queue, type_info[itype], + rand_state); FLUSH; } return status; } -static int l_write_read_for_type( cl_device_id device, cl_context context, cl_command_queue queue, const TypeInfo& ti, RandomSeed& rand_state ) +static int l_write_read_for_type(cl_device_id device, cl_context context, + cl_command_queue queue, const TypeInfo& ti, + RandomSeed& rand_state) { int err = CL_SUCCESS; - std::string type_name( ti.get_name() ); + std::string type_name(ti.get_name()); const char* tn = type_name.c_str(); - log_info(" %s ",tn); + log_info(" %s ", tn); StringTable ksrc; - ksrc.add( l_get_fp64_pragma() ); - ksrc.add( l_get_cles_int64_pragma() ); - if (ti.is_atomic_64bit()) - ksrc.add( l_get_int64_atomic_pragma() ); - ksrc.add( conversion_functions(ti) ); - ksrc.add( global_decls(ti,false) ); - ksrc.add( global_check_function(ti) ); - ksrc.add( writer_function(ti) ); - ksrc.add( reader_function(ti) ); + ksrc.add(l_get_fp64_pragma()); + ksrc.add(l_get_cles_int64_pragma()); + if (ti.is_atomic_64bit()) ksrc.add(l_get_int64_atomic_pragma()); + ksrc.add(conversion_functions(ti)); + ksrc.add(global_decls(ti, false)); + ksrc.add(global_check_function(ti)); + ksrc.add(writer_function(ti)); + ksrc.add(reader_function(ti)); int status = CL_SUCCESS; clProgramWrapper program; clKernelWrapper writer; - status = create_single_kernel_helper_with_build_options(context, &program, &writer, ksrc.num_str(), ksrc.strs(), "writer", OPTIONS); - 
test_error_ret(status,"Failed to create program for read-after-write test",status); + status = create_single_kernel_helper_with_build_options( + context, &program, &writer, ksrc.num_str(), ksrc.strs(), "writer", + OPTIONS); + test_error_ret(status, "Failed to create program for read-after-write test", + status); - clKernelWrapper reader( clCreateKernel( program, "reader", &status ) ); - test_error_ret(status,"Failed to create reader kernel for read-after-write test",status); + clKernelWrapper reader(clCreateKernel(program, "reader", &status)); + test_error_ret(status, + "Failed to create reader kernel for read-after-write test", + status); // Check size query. size_t used_bytes = 0; - status = clGetProgramBuildInfo( program, device, CL_PROGRAM_BUILD_GLOBAL_VARIABLE_TOTAL_SIZE, sizeof(used_bytes), &used_bytes, 0 ); - test_error_ret(status,"Failed to query global variable total size",status); - size_t expected_used_bytes = - (NUM_TESTED_VALUES-1)*ti.get_size() // Two regular variables and an array of 2 elements. - + ( l_64bit_device ? 8 : 4 ); // The pointer - if ( used_bytes < expected_used_bytes ) { - log_error("Error program query for global variable total size query failed: Expected at least %llu but got %llu\n", (unsigned long long)expected_used_bytes, (unsigned long long)used_bytes ); + status = clGetProgramBuildInfo(program, device, + CL_PROGRAM_BUILD_GLOBAL_VARIABLE_TOTAL_SIZE, + sizeof(used_bytes), &used_bytes, 0); + test_error_ret(status, "Failed to query global variable total size", + status); + size_t expected_used_bytes = (NUM_TESTED_VALUES - 1) + * ti.get_size() // Two regular variables and an array of 2 elements. + + (l_64bit_device ? 
8 : 4); // The pointer + if (used_bytes < expected_used_bytes) + { + log_error("Error program query for global variable total size query " + "failed: Expected at least %llu but got %llu\n", + (unsigned long long)expected_used_bytes, + (unsigned long long)used_bytes); err |= 1; } @@ -951,90 +1155,131 @@ static int l_write_read_for_type( cl_device_id device, cl_context context, cl_co cl_uchar* write_data = (cl_uchar*)align_malloc(write_data_size, ALIGNMENT); cl_uchar* read_data = (cl_uchar*)align_malloc(read_data_size, ALIGNMENT); - clMemWrapper write_mem( clCreateBuffer( context, CL_MEM_USE_HOST_PTR, write_data_size, write_data, &status ) ); - test_error_ret(status,"Failed to allocate write buffer",status); - clMemWrapper read_mem( clCreateBuffer( context, CL_MEM_USE_HOST_PTR, read_data_size, read_data, &status ) ); - test_error_ret(status,"Failed to allocate read buffer",status); + clMemWrapper write_mem(clCreateBuffer( + context, CL_MEM_USE_HOST_PTR, write_data_size, write_data, &status)); + test_error_ret(status, "Failed to allocate write buffer", status); + clMemWrapper read_mem(clCreateBuffer(context, CL_MEM_USE_HOST_PTR, + read_data_size, read_data, &status)); + test_error_ret(status, "Failed to allocate read buffer", status); - status = clSetKernelArg(writer,0,sizeof(cl_mem),&write_mem); test_error_ret(status,"set arg",status); - status = clSetKernelArg(reader,0,sizeof(cl_mem),&read_mem); test_error_ret(status,"set arg",status); + status = clSetKernelArg(writer, 0, sizeof(cl_mem), &write_mem); + test_error_ret(status, "set arg", status); + status = clSetKernelArg(reader, 0, sizeof(cl_mem), &read_mem); + test_error_ret(status, "set arg", status); // Boolean random data needs to be massaged a bit more. - const int num_rounds = ti.is_bool() ? (1 << NUM_TESTED_VALUES ) : NUM_ROUNDS; + const int num_rounds = ti.is_bool() ? 
(1 << NUM_TESTED_VALUES) : NUM_ROUNDS; unsigned bool_iter = 0; - for ( int iround = 0; iround < num_rounds ; iround++ ) { - for ( cl_uint iptr_idx = 0; iptr_idx < 2 ; iptr_idx++ ) { // Index into array, to write via pointer + for (int iround = 0; iround < num_rounds; iround++) + { + for (cl_uint iptr_idx = 0; iptr_idx < 2; iptr_idx++) + { // Index into array, to write via pointer // Generate new random data to push through. - // Generate 5 * 128 bytes all the time, even though the test for many types use less than all that. + // Generate 5 * 128 bytes all the time, even though the test for + // many types use less than all that. - cl_uchar *write_ptr = (cl_uchar *)clEnqueueMapBuffer(queue, write_mem, CL_TRUE, CL_MAP_WRITE, 0, write_data_size, 0, 0, 0, 0); + cl_uchar* write_ptr = (cl_uchar*)clEnqueueMapBuffer( + queue, write_mem, CL_TRUE, CL_MAP_WRITE, 0, write_data_size, 0, + 0, 0, 0); - if ( ti.is_bool() ) { + if (ti.is_bool()) + { // For boolean, random data cast to bool isn't very random. // So use the bottom bit of bool_value_iter to get true // diversity. - for ( unsigned value_idx = 0; value_idx < NUM_TESTED_VALUES ; value_idx++ ) { - write_data[value_idx] = (1<<value_idx) & bool_iter; - //printf(" %s", (write_data[value_idx] ? "true" : "false" )); + for (unsigned value_idx = 0; value_idx < NUM_TESTED_VALUES; + value_idx++) + { + write_data[value_idx] = (1 << value_idx) & bool_iter; + // printf(" %s", (write_data[value_idx] ? "true" : "false" + // )); } bool_iter++; - } else { - l_set_randomly( write_data, write_data_size, rand_state ); } - status = clSetKernelArg(writer,1,sizeof(cl_uint),&iptr_idx); test_error_ret(status,"set arg",status); + else + { + l_set_randomly(write_data, write_data_size, rand_state); + } + status = clSetKernelArg(writer, 1, sizeof(cl_uint), &iptr_idx); + test_error_ret(status, "set arg", status); // The value to write via the pointer should be taken from the // 5th typed slot of the write_data. 
- status = clSetKernelArg(reader,1,ti.get_size(),write_data + (NUM_TESTED_VALUES-1)*ti.get_size()); test_error_ret(status,"set arg",status); + status = clSetKernelArg( + reader, 1, ti.get_size(), + write_data + (NUM_TESTED_VALUES - 1) * ti.get_size()); + test_error_ret(status, "set arg", status); // Determine the expected values. cl_uchar expected[read_data_size]; - memset( expected, -1, sizeof(expected) ); - l_copy( expected, 0, write_data, 0, ti ); - l_copy( expected, 1, write_data, 1, ti ); - l_copy( expected, 2, write_data, 2, ti ); - l_copy( expected, 3, write_data, 3, ti ); - // But we need to take into account the value from the pointer write. - // The 2 represents where the "a" array values begin in our read-back. - l_copy( expected, 2 + iptr_idx, write_data, 4, ti ); + memset(expected, -1, sizeof(expected)); + l_copy(expected, 0, write_data, 0, ti); + l_copy(expected, 1, write_data, 1, ti); + l_copy(expected, 2, write_data, 2, ti); + l_copy(expected, 3, write_data, 3, ti); + // But we need to take into account the value from the pointer + // write. The 2 represents where the "a" array values begin in our + // read-back. + l_copy(expected, 2 + iptr_idx, write_data, 4, ti); clEnqueueUnmapMemObject(queue, write_mem, write_ptr, 0, 0, 0); - if ( ti.is_bool() ) { + if (ti.is_bool()) + { // Collapse down to one bit. 
- for ( unsigned i = 0; i < NUM_TESTED_VALUES-1 ; i++ ) expected[i] = (bool)expected[i]; + for (unsigned i = 0; i < NUM_TESTED_VALUES - 1; i++) + expected[i] = (bool)expected[i]; } - cl_uchar *read_ptr = (cl_uchar *)clEnqueueMapBuffer(queue, read_mem, CL_TRUE, CL_MAP_READ, 0, read_data_size, 0, 0, 0, 0); + cl_uchar* read_ptr = (cl_uchar*)clEnqueueMapBuffer( + queue, read_mem, CL_TRUE, CL_MAP_READ, 0, read_data_size, 0, 0, + 0, 0); memset(read_data, -1, read_data_size); clEnqueueUnmapMemObject(queue, read_mem, read_ptr, 0, 0, 0); // Now run the kernel const size_t one = 1; - status = clEnqueueNDRangeKernel(queue,writer,1,0,&one,0,0,0,0); test_error_ret(status,"enqueue writer",status); - status = clEnqueueNDRangeKernel(queue,reader,1,0,&one,0,0,0,0); test_error_ret(status,"enqueue reader",status); - status = clFinish(queue); test_error_ret(status,"finish",status); - - read_ptr = (cl_uchar *)clEnqueueMapBuffer(queue, read_mem, CL_TRUE, CL_MAP_READ, 0, read_data_size, 0, 0, 0, 0); - - if ( ti.is_bool() ) { + status = + clEnqueueNDRangeKernel(queue, writer, 1, 0, &one, 0, 0, 0, 0); + test_error_ret(status, "enqueue writer", status); + status = + clEnqueueNDRangeKernel(queue, reader, 1, 0, &one, 0, 0, 0, 0); + test_error_ret(status, "enqueue reader", status); + status = clFinish(queue); + test_error_ret(status, "finish", status); + + read_ptr = (cl_uchar*)clEnqueueMapBuffer( + queue, read_mem, CL_TRUE, CL_MAP_READ, 0, read_data_size, 0, 0, + 0, 0); + + if (ti.is_bool()) + { // Collapse down to one bit. - for ( unsigned i = 0; i < NUM_TESTED_VALUES-1 ; i++ ) read_data[i] = (bool)read_data[i]; + for (unsigned i = 0; i < NUM_TESTED_VALUES - 1; i++) + read_data[i] = (bool)read_data[i]; } // Compare only the valid returned bytes. - int compare_result = l_compare( "read-after-write", expected, read_data, NUM_TESTED_VALUES-1, ti ); - // log_info("Compared %d values each of size %llu. 
Result %d\n", NUM_TESTED_VALUES-1, (unsigned long long)ti.get_value_size(), compare_result ); + int compare_result = + l_compare("read-after-write", expected, read_data, + NUM_TESTED_VALUES - 1, ti); + // log_info("Compared %d values each of size %llu. Result %d\n", + // NUM_TESTED_VALUES-1, (unsigned long long)ti.get_value_size(), + // compare_result ); err |= compare_result; clEnqueueUnmapMemObject(queue, read_mem, read_ptr, 0, 0, 0); - if ( err ) break; + if (err) break; } } - if ( CL_SUCCESS == err ) { log_info("OK\n"); FLUSH; } + if (CL_SUCCESS == err) + { + log_info("OK\n"); + FLUSH; + } align_free(write_data); align_free(read_data); return err; @@ -1042,74 +1287,97 @@ static int l_write_read_for_type( cl_device_id device, cl_context context, cl_co // Check initialization, then, read, then write, then read. -static int l_init_write_read( cl_device_id device, cl_context context, cl_command_queue queue ) +static int l_init_write_read(cl_device_id device, cl_context context, + cl_command_queue queue) { int status = CL_SUCCESS; int itype; - RandomSeed rand_state( gRandomSeed ); + RandomSeed rand_state(gRandomSeed); - for ( itype = 0; itype < num_type_info ; itype++ ) { - status = status | l_init_write_read_for_type(device,context,queue,type_info[itype], rand_state ); + for (itype = 0; itype < num_type_info; itype++) + { + status = status + | l_init_write_read_for_type(device, context, queue, + type_info[itype], rand_state); } return status; } -static int l_init_write_read_for_type( cl_device_id device, cl_context context, cl_command_queue queue, const TypeInfo& ti, RandomSeed& rand_state ) +static int l_init_write_read_for_type(cl_device_id device, cl_context context, + cl_command_queue queue, + const TypeInfo& ti, + RandomSeed& rand_state) { int err = CL_SUCCESS; - std::string type_name( ti.get_name() ); + std::string type_name(ti.get_name()); const char* tn = type_name.c_str(); - log_info(" %s ",tn); + log_info(" %s ", tn); StringTable ksrc; - ksrc.add( 
l_get_fp64_pragma() ); - ksrc.add( l_get_cles_int64_pragma() ); - if (ti.is_atomic_64bit()) - ksrc.add( l_get_int64_atomic_pragma() ); - ksrc.add( conversion_functions(ti) ); - ksrc.add( global_decls(ti,true) ); - ksrc.add( writer_function(ti) ); - ksrc.add( reader_function(ti) ); + ksrc.add(l_get_fp64_pragma()); + ksrc.add(l_get_cles_int64_pragma()); + if (ti.is_atomic_64bit()) ksrc.add(l_get_int64_atomic_pragma()); + ksrc.add(conversion_functions(ti)); + ksrc.add(global_decls(ti, true)); + ksrc.add(writer_function(ti)); + ksrc.add(reader_function(ti)); int status = CL_SUCCESS; clProgramWrapper program; clKernelWrapper writer; - status = create_single_kernel_helper_with_build_options(context, &program, &writer, ksrc.num_str(), ksrc.strs(), "writer", OPTIONS); - test_error_ret(status,"Failed to create program for init-read-after-write test",status); + status = create_single_kernel_helper_with_build_options( + context, &program, &writer, ksrc.num_str(), ksrc.strs(), "writer", + OPTIONS); + test_error_ret(status, + "Failed to create program for init-read-after-write test", + status); - clKernelWrapper reader( clCreateKernel( program, "reader", &status ) ); - test_error_ret(status,"Failed to create reader kernel for init-read-after-write test",status); + clKernelWrapper reader(clCreateKernel(program, "reader", &status)); + test_error_ret( + status, "Failed to create reader kernel for init-read-after-write test", + status); // Check size query. size_t used_bytes = 0; - status = clGetProgramBuildInfo( program, device, CL_PROGRAM_BUILD_GLOBAL_VARIABLE_TOTAL_SIZE, sizeof(used_bytes), &used_bytes, 0 ); - test_error_ret(status,"Failed to query global variable total size",status); - size_t expected_used_bytes = - (NUM_TESTED_VALUES-1)*ti.get_size() // Two regular variables and an array of 2 elements. - + ( l_64bit_device ? 
8 : 4 ); // The pointer - if ( used_bytes < expected_used_bytes ) { - log_error("Error: program query for global variable total size query failed: Expected at least %llu but got %llu\n", (unsigned long long)expected_used_bytes, (unsigned long long)used_bytes ); + status = clGetProgramBuildInfo(program, device, + CL_PROGRAM_BUILD_GLOBAL_VARIABLE_TOTAL_SIZE, + sizeof(used_bytes), &used_bytes, 0); + test_error_ret(status, "Failed to query global variable total size", + status); + size_t expected_used_bytes = (NUM_TESTED_VALUES - 1) + * ti.get_size() // Two regular variables and an array of 2 elements. + + (l_64bit_device ? 8 : 4); // The pointer + if (used_bytes < expected_used_bytes) + { + log_error("Error: program query for global variable total size query " + "failed: Expected at least %llu but got %llu\n", + (unsigned long long)expected_used_bytes, + (unsigned long long)used_bytes); err |= 1; } // We need to create 5 random values of the given type, // and read 4 of them back. const size_t write_data_size = NUM_TESTED_VALUES * sizeof(cl_ulong16); - const size_t read_data_size = (NUM_TESTED_VALUES-1) * sizeof(cl_ulong16); + const size_t read_data_size = (NUM_TESTED_VALUES - 1) * sizeof(cl_ulong16); cl_uchar* write_data = (cl_uchar*)align_malloc(write_data_size, ALIGNMENT); cl_uchar* read_data = (cl_uchar*)align_malloc(read_data_size, ALIGNMENT); - clMemWrapper write_mem( clCreateBuffer( context, CL_MEM_USE_HOST_PTR, write_data_size, write_data, &status ) ); - test_error_ret(status,"Failed to allocate write buffer",status); - clMemWrapper read_mem( clCreateBuffer( context, CL_MEM_USE_HOST_PTR, read_data_size, read_data, &status ) ); - test_error_ret(status,"Failed to allocate read buffer",status); - - status = clSetKernelArg(writer,0,sizeof(cl_mem),&write_mem); test_error_ret(status,"set arg",status); - status = clSetKernelArg(reader,0,sizeof(cl_mem),&read_mem); test_error_ret(status,"set arg",status); + clMemWrapper write_mem(clCreateBuffer( + context, 
CL_MEM_USE_HOST_PTR, write_data_size, write_data, &status)); + test_error_ret(status, "Failed to allocate write buffer", status); + clMemWrapper read_mem(clCreateBuffer(context, CL_MEM_USE_HOST_PTR, + read_data_size, read_data, &status)); + test_error_ret(status, "Failed to allocate read buffer", status); + + status = clSetKernelArg(writer, 0, sizeof(cl_mem), &write_mem); + test_error_ret(status, "set arg", status); + status = clSetKernelArg(reader, 0, sizeof(cl_mem), &read_mem); + test_error_ret(status, "set arg", status); // Boolean random data needs to be massaged a bit more. - const int num_rounds = ti.is_bool() ? (1 << NUM_TESTED_VALUES ) : NUM_ROUNDS; + const int num_rounds = ti.is_bool() ? (1 << NUM_TESTED_VALUES) : NUM_ROUNDS; unsigned bool_iter = 0; // We need to count iterations. We do something *different on the @@ -1117,107 +1385,152 @@ static int l_init_write_read_for_type( cl_device_id device, cl_context context, // values. unsigned iteration = 0; - for ( int iround = 0; iround < num_rounds ; iround++ ) { - for ( cl_uint iptr_idx = 0; iptr_idx < 2 ; iptr_idx++ ) { // Index into array, to write via pointer + for (int iround = 0; iround < num_rounds; iround++) + { + for (cl_uint iptr_idx = 0; iptr_idx < 2; iptr_idx++) + { // Index into array, to write via pointer // Generate new random data to push through. - // Generate 5 * 128 bytes all the time, even though the test for many types use less than all that. + // Generate 5 * 128 bytes all the time, even though the test for + // many types use less than all that. - cl_uchar *write_ptr = (cl_uchar *)clEnqueueMapBuffer(queue, write_mem, CL_TRUE, CL_MAP_WRITE, 0, write_data_size, 0, 0, 0, 0); + cl_uchar* write_ptr = (cl_uchar*)clEnqueueMapBuffer( + queue, write_mem, CL_TRUE, CL_MAP_WRITE, 0, write_data_size, 0, + 0, 0, 0); - if ( ti.is_bool() ) { + if (ti.is_bool()) + { // For boolean, random data cast to bool isn't very random. // So use the bottom bit of bool_value_iter to get true // diversity. 
- for ( unsigned value_idx = 0; value_idx < NUM_TESTED_VALUES ; value_idx++ ) { - write_data[value_idx] = (1<<value_idx) & bool_iter; - //printf(" %s", (write_data[value_idx] ? "true" : "false" )); + for (unsigned value_idx = 0; value_idx < NUM_TESTED_VALUES; + value_idx++) + { + write_data[value_idx] = (1 << value_idx) & bool_iter; + // printf(" %s", (write_data[value_idx] ? "true" : "false" + // )); } bool_iter++; - } else { - l_set_randomly( write_data, write_data_size, rand_state ); } - status = clSetKernelArg(writer,1,sizeof(cl_uint),&iptr_idx); test_error_ret(status,"set arg",status); + else + { + l_set_randomly(write_data, write_data_size, rand_state); + } + status = clSetKernelArg(writer, 1, sizeof(cl_uint), &iptr_idx); + test_error_ret(status, "set arg", status); - if ( !iteration ) { + if (!iteration) + { // On first iteration, the value we write via the last arg // to the "reader" function is 0. // It's way easier to code the test this way. - ti.init( write_data + (NUM_TESTED_VALUES-1)*ti.get_size(), 0 ); + ti.init(write_data + (NUM_TESTED_VALUES - 1) * ti.get_size(), + 0); } // The value to write via the pointer should be taken from the // 5th typed slot of the write_data. - status = clSetKernelArg(reader,1,ti.get_size(),write_data + (NUM_TESTED_VALUES-1)*ti.get_size()); test_error_ret(status,"set arg",status); + status = clSetKernelArg( + reader, 1, ti.get_size(), + write_data + (NUM_TESTED_VALUES - 1) * ti.get_size()); + test_error_ret(status, "set arg", status); // Determine the expected values. cl_uchar expected[read_data_size]; - memset( expected, -1, sizeof(expected) ); - if ( iteration ) { - l_copy( expected, 0, write_data, 0, ti ); - l_copy( expected, 1, write_data, 1, ti ); - l_copy( expected, 2, write_data, 2, ti ); - l_copy( expected, 3, write_data, 3, ti ); - // But we need to take into account the value from the pointer write. - // The 2 represents where the "a" array values begin in our read-back. 
- // But we need to take into account the value from the pointer write. - l_copy( expected, 2 + iptr_idx, write_data, 4, ti ); - } else { + memset(expected, -1, sizeof(expected)); + if (iteration) + { + l_copy(expected, 0, write_data, 0, ti); + l_copy(expected, 1, write_data, 1, ti); + l_copy(expected, 2, write_data, 2, ti); + l_copy(expected, 3, write_data, 3, ti); + // But we need to take into account the value from the pointer + // write. The 2 represents where the "a" array values begin in + // our read-back. But we need to take into account the value + // from the pointer write. + l_copy(expected, 2 + iptr_idx, write_data, 4, ti); + } + else + { // On first iteration, expect these initialized values! // See the decls_template_with_init above. - ti.init( expected, 0 ); - ti.init( expected + ti.get_size(), 1 ); - ti.init( expected + 2*ti.get_size(), 1 ); + ti.init(expected, 0); + ti.init(expected + ti.get_size(), 1); + ti.init(expected + 2 * ti.get_size(), 1); // Emulate the effect of the write via the pointer. // The value is 0, not 1 (see above). // The pointer is always initialized to the second element // of the array. So it goes into slot 3 of the "expected" array. - ti.init( expected + 3*ti.get_size(), 0 ); + ti.init(expected + 3 * ti.get_size(), 0); } - if ( ti.is_bool() ) { + if (ti.is_bool()) + { // Collapse down to one bit. 
- for ( unsigned i = 0; i < NUM_TESTED_VALUES-1 ; i++ ) expected[i] = (bool)expected[i]; + for (unsigned i = 0; i < NUM_TESTED_VALUES - 1; i++) + expected[i] = (bool)expected[i]; } clEnqueueUnmapMemObject(queue, write_mem, write_ptr, 0, 0, 0); - cl_uchar *read_ptr = (cl_uchar *)clEnqueueMapBuffer(queue, read_mem, CL_TRUE, CL_MAP_READ, 0, read_data_size, 0, 0, 0, 0); - memset( read_data, -1, read_data_size ); + cl_uchar* read_ptr = (cl_uchar*)clEnqueueMapBuffer( + queue, read_mem, CL_TRUE, CL_MAP_READ, 0, read_data_size, 0, 0, + 0, 0); + memset(read_data, -1, read_data_size); clEnqueueUnmapMemObject(queue, read_mem, read_ptr, 0, 0, 0); // Now run the kernel const size_t one = 1; - if ( iteration ) { - status = clEnqueueNDRangeKernel(queue,writer,1,0,&one,0,0,0,0); test_error_ret(status,"enqueue writer",status); - } else { + if (iteration) + { + status = clEnqueueNDRangeKernel(queue, writer, 1, 0, &one, 0, 0, + 0, 0); + test_error_ret(status, "enqueue writer", status); + } + else + { // On first iteration, we should be picking up the // initialized value. So don't enqueue the writer. } - status = clEnqueueNDRangeKernel(queue,reader,1,0,&one,0,0,0,0); test_error_ret(status,"enqueue reader",status); - status = clFinish(queue); test_error_ret(status,"finish",status); + status = + clEnqueueNDRangeKernel(queue, reader, 1, 0, &one, 0, 0, 0, 0); + test_error_ret(status, "enqueue reader", status); + status = clFinish(queue); + test_error_ret(status, "finish", status); - read_ptr = (cl_uchar *)clEnqueueMapBuffer(queue, read_mem, CL_TRUE, CL_MAP_READ, 0, read_data_size, 0, 0, 0, 0); + read_ptr = (cl_uchar*)clEnqueueMapBuffer( + queue, read_mem, CL_TRUE, CL_MAP_READ, 0, read_data_size, 0, 0, + 0, 0); - if ( ti.is_bool() ) { + if (ti.is_bool()) + { // Collapse down to one bit. 
- for ( unsigned i = 0; i < NUM_TESTED_VALUES-1 ; i++ ) read_data[i] = (bool)read_data[i]; + for (unsigned i = 0; i < NUM_TESTED_VALUES - 1; i++) + read_data[i] = (bool)read_data[i]; } // Compare only the valid returned bytes. - //log_info(" Round %d ptr_idx %u\n", iround, iptr_idx ); - int compare_result = l_compare( "init-write-read", expected, read_data, NUM_TESTED_VALUES-1, ti ); - //log_info("Compared %d values each of size %llu. Result %d\n", NUM_TESTED_VALUES-1, (unsigned long long)ti.get_value_size(), compare_result ); + // log_info(" Round %d ptr_idx %u\n", iround, iptr_idx ); + int compare_result = + l_compare("init-write-read", expected, read_data, + NUM_TESTED_VALUES - 1, ti); + // log_info("Compared %d values each of size %llu. Result %d\n", + // NUM_TESTED_VALUES-1, (unsigned long long)ti.get_value_size(), + // compare_result ); err |= compare_result; clEnqueueUnmapMemObject(queue, read_mem, read_ptr, 0, 0, 0); - if ( err ) break; + if (err) break; iteration++; } } - if ( CL_SUCCESS == err ) { log_info("OK\n"); FLUSH; } + if (CL_SUCCESS == err) + { + log_info("OK\n"); + FLUSH; + } align_free(write_data); align_free(read_data); @@ -1226,12 +1539,14 @@ static int l_init_write_read_for_type( cl_device_id device, cl_context context, // Check that we can make at least one variable with size -// max_size which is returned from the device info property : CL_DEVICE_MAX_GLOBAL_VARIABLE_SIZE. -static int l_capacity( cl_device_id device, cl_context context, cl_command_queue queue, size_t max_size ) +// max_size which is returned from the device info property : +// CL_DEVICE_MAX_GLOBAL_VARIABLE_SIZE. +static int l_capacity(cl_device_id device, cl_context context, + cl_command_queue queue, size_t max_size) { int err = CL_SUCCESS; // Just test one type. 
- const TypeInfo ti( l_find_type("uchar") ); + const TypeInfo ti(l_find_type("uchar")); log_info(" l_capacity..."); const char prog_src_template[] = @@ -1254,83 +1569,132 @@ static int l_capacity( cl_device_id device, cl_context context, cl_command_queue " dest[get_global_linear_id()] = var[get_global_id(0)];\n" "}\n\n"; char prog_src[MAX_STR]; - int num_printed = snprintf(prog_src,sizeof(prog_src),prog_src_template,max_size, max_size); - assert( num_printed < MAX_STR ); // or increase MAX_STR + int num_printed = snprintf(prog_src, sizeof(prog_src), prog_src_template, + max_size, max_size); + assert(num_printed < MAX_STR); // or increase MAX_STR + (void)num_printed; StringTable ksrc; - ksrc.add( prog_src ); + ksrc.add(prog_src); int status = CL_SUCCESS; clProgramWrapper program; clKernelWrapper get_max_size; - status = create_single_kernel_helper_with_build_options(context, &program, &get_max_size, ksrc.num_str(), ksrc.strs(), "get_max_size", OPTIONS); - test_error_ret(status,"Failed to create program for capacity test",status); + status = create_single_kernel_helper_with_build_options( + context, &program, &get_max_size, ksrc.num_str(), ksrc.strs(), + "get_max_size", OPTIONS); + test_error_ret(status, "Failed to create program for capacity test", + status); // Check size query. 
size_t used_bytes = 0; - status = clGetProgramBuildInfo( program, device, CL_PROGRAM_BUILD_GLOBAL_VARIABLE_TOTAL_SIZE, sizeof(used_bytes), &used_bytes, 0 ); - test_error_ret(status,"Failed to query global variable total size",status); - if ( used_bytes < max_size ) { - log_error("Error: program query for global variable total size query failed: Expected at least %llu but got %llu\n", (unsigned long long)max_size, (unsigned long long)used_bytes ); + status = clGetProgramBuildInfo(program, device, + CL_PROGRAM_BUILD_GLOBAL_VARIABLE_TOTAL_SIZE, + sizeof(used_bytes), &used_bytes, 0); + test_error_ret(status, "Failed to query global variable total size", + status); + if (used_bytes < max_size) + { + log_error("Error: program query for global variable total size query " + "failed: Expected at least %llu but got %llu\n", + (unsigned long long)max_size, (unsigned long long)used_bytes); err |= 1; } // Prepare to execute - clKernelWrapper writer( clCreateKernel( program, "writer", &status ) ); - test_error_ret(status,"Failed to create writer kernel for capacity test",status); - clKernelWrapper reader( clCreateKernel( program, "reader", &status ) ); - test_error_ret(status,"Failed to create reader kernel for capacity test",status); + clKernelWrapper writer(clCreateKernel(program, "writer", &status)); + test_error_ret(status, "Failed to create writer kernel for capacity test", + status); + clKernelWrapper reader(clCreateKernel(program, "reader", &status)); + test_error_ret(status, "Failed to create reader kernel for capacity test", + status); cl_ulong max_size_ret = 0; - const size_t arr_size = 10*1024*1024; - cl_uchar* buffer = (cl_uchar*) align_malloc( arr_size, ALIGNMENT ); + const size_t arr_size = 10 * 1024 * 1024; + cl_uchar* buffer = (cl_uchar*)align_malloc(arr_size, ALIGNMENT); - if ( !buffer ) { log_error("Failed to allocate buffer\n"); return 1; } - - clMemWrapper max_size_ret_mem( clCreateBuffer( context, CL_MEM_USE_HOST_PTR, sizeof(max_size_ret), &max_size_ret, 
&status ) ); - test_error_ret(status,"Failed to allocate size query buffer",status); - clMemWrapper buffer_mem( clCreateBuffer( context, CL_MEM_READ_WRITE, arr_size, 0, &status ) ); - test_error_ret(status,"Failed to allocate write buffer",status); + if (!buffer) + { + log_error("Failed to allocate buffer\n"); + return 1; + } - status = clSetKernelArg(get_max_size,0,sizeof(cl_mem),&max_size_ret_mem); test_error_ret(status,"set arg",status); - status = clSetKernelArg(writer,0,sizeof(cl_mem),&buffer_mem); test_error_ret(status,"set arg",status); - status = clSetKernelArg(reader,0,sizeof(cl_mem),&buffer_mem); test_error_ret(status,"set arg",status); + clMemWrapper max_size_ret_mem(clCreateBuffer(context, CL_MEM_USE_HOST_PTR, + sizeof(max_size_ret), + &max_size_ret, &status)); + test_error_ret(status, "Failed to allocate size query buffer", status); + clMemWrapper buffer_mem( + clCreateBuffer(context, CL_MEM_READ_WRITE, arr_size, 0, &status)); + test_error_ret(status, "Failed to allocate write buffer", status); + + status = clSetKernelArg(get_max_size, 0, sizeof(cl_mem), &max_size_ret_mem); + test_error_ret(status, "set arg", status); + status = clSetKernelArg(writer, 0, sizeof(cl_mem), &buffer_mem); + test_error_ret(status, "set arg", status); + status = clSetKernelArg(reader, 0, sizeof(cl_mem), &buffer_mem); + test_error_ret(status, "set arg", status); // Check the macro value of CL_DEVICE_MAX_GLOBAL_VARIABLE const size_t one = 1; - status = clEnqueueNDRangeKernel(queue,get_max_size,1,0,&one,0,0,0,0); test_error_ret(status,"enqueue size query",status); - status = clFinish(queue); test_error_ret(status,"finish",status); - - cl_uchar *max_size_ret_ptr = (cl_uchar *)clEnqueueMapBuffer(queue, max_size_ret_mem, CL_TRUE, CL_MAP_READ, 0, sizeof(max_size_ret), 0, 0, 0, 0); - if ( max_size_ret != max_size ) { - log_error("Error: preprocessor definition for CL_DEVICE_MAX_GLOBAL_VARIABLE_SIZE is %llu and does not match device query value %llu\n", - (unsigned long long) 
max_size_ret, - (unsigned long long) max_size ); + status = + clEnqueueNDRangeKernel(queue, get_max_size, 1, 0, &one, 0, 0, 0, 0); + test_error_ret(status, "enqueue size query", status); + status = clFinish(queue); + test_error_ret(status, "finish", status); + + cl_uchar* max_size_ret_ptr = (cl_uchar*)clEnqueueMapBuffer( + queue, max_size_ret_mem, CL_TRUE, CL_MAP_READ, 0, sizeof(max_size_ret), + 0, 0, 0, 0); + if (max_size_ret != max_size) + { + log_error("Error: preprocessor definition for " + "CL_DEVICE_MAX_GLOBAL_VARIABLE_SIZE is %llu and does not " + "match device query value %llu\n", + (unsigned long long)max_size_ret, + (unsigned long long)max_size); err |= 1; } clEnqueueUnmapMemObject(queue, max_size_ret_mem, max_size_ret_ptr, 0, 0, 0); - RandomSeed rand_state_write( gRandomSeed ); - for ( size_t offset = 0; offset < max_size ; offset += arr_size ) { - size_t curr_size = (max_size - offset) < arr_size ? (max_size - offset) : arr_size; - l_set_randomly( buffer, curr_size, rand_state_write ); - status = clEnqueueWriteBuffer (queue, buffer_mem, CL_TRUE, 0, curr_size, buffer, 0, 0, 0);test_error_ret(status,"populate buffer_mem object",status); - status = clEnqueueNDRangeKernel(queue,writer,1,&offset,&curr_size,0,0,0,0); test_error_ret(status,"enqueue writer",status); - status = clFinish(queue); test_error_ret(status,"finish",status); - } - - RandomSeed rand_state_read( gRandomSeed ); - for ( size_t offset = 0; offset < max_size ; offset += arr_size ) { - size_t curr_size = (max_size - offset) < arr_size ? 
(max_size - offset) : arr_size; - status = clEnqueueNDRangeKernel(queue,reader,1,&offset,&curr_size,0,0,0,0); test_error_ret(status,"enqueue reader",status); - cl_uchar* read_mem_ptr = (cl_uchar *)clEnqueueMapBuffer(queue, buffer_mem, CL_TRUE, CL_MAP_READ, 0, curr_size, 0, 0, 0, &status);test_error_ret(status,"map read data",status); - l_set_randomly( buffer, curr_size, rand_state_read ); - err |= l_compare( "capacity", buffer, read_mem_ptr, curr_size, ti ); + RandomSeed rand_state_write(gRandomSeed); + for (size_t offset = 0; offset < max_size; offset += arr_size) + { + size_t curr_size = + (max_size - offset) < arr_size ? (max_size - offset) : arr_size; + l_set_randomly(buffer, curr_size, rand_state_write); + status = clEnqueueWriteBuffer(queue, buffer_mem, CL_TRUE, 0, curr_size, + buffer, 0, 0, 0); + test_error_ret(status, "populate buffer_mem object", status); + status = clEnqueueNDRangeKernel(queue, writer, 1, &offset, &curr_size, + 0, 0, 0, 0); + test_error_ret(status, "enqueue writer", status); + status = clFinish(queue); + test_error_ret(status, "finish", status); + } + + RandomSeed rand_state_read(gRandomSeed); + for (size_t offset = 0; offset < max_size; offset += arr_size) + { + size_t curr_size = + (max_size - offset) < arr_size ? 
(max_size - offset) : arr_size; + status = clEnqueueNDRangeKernel(queue, reader, 1, &offset, &curr_size, + 0, 0, 0, 0); + test_error_ret(status, "enqueue reader", status); + cl_uchar* read_mem_ptr = (cl_uchar*)clEnqueueMapBuffer( + queue, buffer_mem, CL_TRUE, CL_MAP_READ, 0, curr_size, 0, 0, 0, + &status); + test_error_ret(status, "map read data", status); + l_set_randomly(buffer, curr_size, rand_state_read); + err |= l_compare("capacity", buffer, read_mem_ptr, curr_size, ti); clEnqueueUnmapMemObject(queue, buffer_mem, read_mem_ptr, 0, 0, 0); } - if ( CL_SUCCESS == err ) { log_info("OK\n"); FLUSH; } + if (CL_SUCCESS == err) + { + log_info("OK\n"); + FLUSH; + } align_free(buffer); return err; @@ -1338,32 +1702,33 @@ static int l_capacity( cl_device_id device, cl_context context, cl_command_queue // Check operation on a user type. -static int l_user_type( cl_device_id device, cl_context context, cl_command_queue queue, bool separate_compile ) +static int l_user_type(cl_device_id device, cl_context context, + cl_command_queue queue, bool separate_compile) { int err = CL_SUCCESS; // Just test one type. - const TypeInfo ti( l_find_type("uchar") ); - log_info(" l_user_type %s...", separate_compile ? "separate compilation" : "single source compilation" ); + const TypeInfo ti(l_find_type("uchar")); + log_info(" l_user_type %s...", + separate_compile ? "separate compilation" + : "single source compilation"); - if ( separate_compile && ! l_linker_available ) { + if (separate_compile && !l_linker_available) + { log_info("Separate compilation is not supported. Skipping test\n"); return err; } const char type_src[] = "typedef struct { uchar c; uint i; } my_struct_t;\n\n"; - const char def_src[] = - "my_struct_t var = { 'a', 42 };\n\n"; - const char decl_src[] = - "extern my_struct_t var;\n\n"; + const char def_src[] = "my_struct_t var = { 'a', 42 };\n\n"; + const char decl_src[] = "extern my_struct_t var;\n\n"; // Don't use a host struct. 
We can't guarantee that the host // compiler has the same structure layout as the device compiler. - const char writer_src[] = - "kernel void writer( uchar c, uint i ) {\n" - " var.c = c;\n" - " var.i = i;\n" - "}\n\n"; + const char writer_src[] = "kernel void writer( uchar c, uint i ) {\n" + " var.c = c;\n" + " var.i = i;\n" + "}\n\n"; const char reader_src[] = "kernel void reader( global uchar* C, global uint* I ) {\n" " *C = var.c;\n" @@ -1372,36 +1737,53 @@ static int l_user_type( cl_device_id device, cl_context context, cl_command_queu clProgramWrapper program; - if ( separate_compile ) { + if (separate_compile) + { // Separate compilation flow. StringTable wksrc; - wksrc.add( type_src ); - wksrc.add( def_src ); - wksrc.add( writer_src ); + wksrc.add(type_src); + wksrc.add(def_src); + wksrc.add(writer_src); StringTable rksrc; - rksrc.add( type_src ); - rksrc.add( decl_src ); - rksrc.add( reader_src ); + rksrc.add(type_src); + rksrc.add(decl_src); + rksrc.add(reader_src); int status = CL_SUCCESS; - clProgramWrapper writer_program( clCreateProgramWithSource( context, wksrc.num_str(), wksrc.strs(), wksrc.lengths(), &status ) ); - test_error_ret(status,"Failed to create writer program for user type test",status); - - status = clCompileProgram( writer_program, 1, &device, OPTIONS, 0, 0, 0, 0, 0 ); - if(check_error(status, "Failed to compile writer program for user type test (%s)", IGetErrorString(status))) + clProgramWrapper writer_program(clCreateProgramWithSource( + context, wksrc.num_str(), wksrc.strs(), wksrc.lengths(), &status)); + test_error_ret(status, + "Failed to create writer program for user type test", + status); + + status = clCompileProgram(writer_program, 1, &device, OPTIONS, 0, 0, 0, + 0, 0); + if (check_error( + status, + "Failed to compile writer program for user type test (%s)", + IGetErrorString(status))) { - print_build_log(writer_program, 1, &device, wksrc.num_str(), wksrc.strs(), wksrc.lengths(), OPTIONS); + print_build_log(writer_program, 1, 
&device, wksrc.num_str(), + wksrc.strs(), wksrc.lengths(), OPTIONS); return status; } - clProgramWrapper reader_program( clCreateProgramWithSource( context, rksrc.num_str(), rksrc.strs(), rksrc.lengths(), &status ) ); - test_error_ret(status,"Failed to create reader program for user type test",status); - - status = clCompileProgram( reader_program, 1, &device, OPTIONS, 0, 0, 0, 0, 0 ); - if(check_error(status, "Failed to compile reader program for user type test (%s)", IGetErrorString(status))) + clProgramWrapper reader_program(clCreateProgramWithSource( + context, rksrc.num_str(), rksrc.strs(), rksrc.lengths(), &status)); + test_error_ret(status, + "Failed to create reader program for user type test", + status); + + status = clCompileProgram(reader_program, 1, &device, OPTIONS, 0, 0, 0, + 0, 0); + if (check_error( + status, + "Failed to compile reader program for user type test (%s)", + IGetErrorString(status))) { - print_build_log(reader_program, 1, &device, rksrc.num_str(), rksrc.strs(), rksrc.lengths(), OPTIONS); + print_build_log(reader_program, 1, &device, rksrc.num_str(), + rksrc.strs(), rksrc.lengths(), OPTIONS); return status; } @@ -1409,33 +1791,45 @@ static int l_user_type( cl_device_id device, cl_context context, cl_command_queu progs[0] = writer_program; progs[1] = reader_program; - program = clLinkProgram( context, 1, &device, "", 2, progs, 0, 0, &status ); - if(check_error(status, "Failed to link program for user type test (%s)", IGetErrorString(status))) + program = + clLinkProgram(context, 1, &device, "", 2, progs, 0, 0, &status); + if (check_error(status, + "Failed to link program for user type test (%s)", + IGetErrorString(status))) { print_build_log(program, 1, &device, 0, NULL, NULL, ""); return status; } - } else { + } + else + { // Single compilation flow. 
StringTable ksrc; - ksrc.add( type_src ); - ksrc.add( def_src ); - ksrc.add( writer_src ); - ksrc.add( reader_src ); + ksrc.add(type_src); + ksrc.add(def_src); + ksrc.add(writer_src); + ksrc.add(reader_src); int status = CL_SUCCESS; - status = create_single_kernel_helper_create_program(context, &program, ksrc.num_str(), ksrc.strs(), OPTIONS); - if(check_error(status, "Failed to build program for user type test (%s)", IGetErrorString(status))) + status = create_single_kernel_helper_create_program( + context, &program, ksrc.num_str(), ksrc.strs(), OPTIONS); + if (check_error(status, + "Failed to build program for user type test (%s)", + IGetErrorString(status))) { - print_build_log(program, 1, &device, ksrc.num_str(), ksrc.strs(), ksrc.lengths(), OPTIONS); + print_build_log(program, 1, &device, ksrc.num_str(), ksrc.strs(), + ksrc.lengths(), OPTIONS); return status; } status = clBuildProgram(program, 1, &device, OPTIONS, 0, 0); - if(check_error(status, "Failed to compile program for user type test (%s)", IGetErrorString(status))) + if (check_error(status, + "Failed to compile program for user type test (%s)", + IGetErrorString(status))) { - print_build_log(program, 1, &device, ksrc.num_str(), ksrc.strs(), ksrc.lengths(), OPTIONS); + print_build_log(program, 1, &device, ksrc.num_str(), ksrc.strs(), + ksrc.lengths(), OPTIONS); return status; } } @@ -1443,48 +1837,71 @@ static int l_user_type( cl_device_id device, cl_context context, cl_command_queu // Check size query. 
size_t used_bytes = 0; - int status = clGetProgramBuildInfo( program, device, CL_PROGRAM_BUILD_GLOBAL_VARIABLE_TOTAL_SIZE, sizeof(used_bytes), &used_bytes, 0 ); - test_error_ret(status,"Failed to query global variable total size",status); + int status = clGetProgramBuildInfo( + program, device, CL_PROGRAM_BUILD_GLOBAL_VARIABLE_TOTAL_SIZE, + sizeof(used_bytes), &used_bytes, 0); + test_error_ret(status, "Failed to query global variable total size", + status); size_t expected_size = sizeof(cl_uchar) + sizeof(cl_uint); - if ( used_bytes < expected_size ) { - log_error("Error: program query for global variable total size query failed: Expected at least %llu but got %llu\n", (unsigned long long)expected_size, (unsigned long long)used_bytes ); + if (used_bytes < expected_size) + { + log_error("Error: program query for global variable total size query " + "failed: Expected at least %llu but got %llu\n", + (unsigned long long)expected_size, + (unsigned long long)used_bytes); err |= 1; } // Prepare to execute - clKernelWrapper writer( clCreateKernel( program, "writer", &status ) ); - test_error_ret(status,"Failed to create writer kernel for user type test",status); - clKernelWrapper reader( clCreateKernel( program, "reader", &status ) ); - test_error_ret(status,"Failed to create reader kernel for user type test",status); + clKernelWrapper writer(clCreateKernel(program, "writer", &status)); + test_error_ret(status, "Failed to create writer kernel for user type test", + status); + clKernelWrapper reader(clCreateKernel(program, "reader", &status)); + test_error_ret(status, "Failed to create reader kernel for user type test", + status); // Set up data. 
cl_uchar* uchar_data = (cl_uchar*)align_malloc(sizeof(cl_uchar), ALIGNMENT); cl_uint* uint_data = (cl_uint*)align_malloc(sizeof(cl_uint), ALIGNMENT); - clMemWrapper uchar_mem( clCreateBuffer( context, CL_MEM_USE_HOST_PTR, sizeof(cl_uchar), uchar_data, &status ) ); - test_error_ret(status,"Failed to allocate uchar buffer",status); - clMemWrapper uint_mem( clCreateBuffer( context, CL_MEM_USE_HOST_PTR, sizeof(cl_uint), uint_data, &status ) ); - test_error_ret(status,"Failed to allocate uint buffer",status); + clMemWrapper uchar_mem(clCreateBuffer( + context, CL_MEM_USE_HOST_PTR, sizeof(cl_uchar), uchar_data, &status)); + test_error_ret(status, "Failed to allocate uchar buffer", status); + clMemWrapper uint_mem(clCreateBuffer(context, CL_MEM_USE_HOST_PTR, + sizeof(cl_uint), uint_data, &status)); + test_error_ret(status, "Failed to allocate uint buffer", status); - status = clSetKernelArg(reader,0,sizeof(cl_mem),&uchar_mem); test_error_ret(status,"set arg",status); - status = clSetKernelArg(reader,1,sizeof(cl_mem),&uint_mem); test_error_ret(status,"set arg",status); + status = clSetKernelArg(reader, 0, sizeof(cl_mem), &uchar_mem); + test_error_ret(status, "set arg", status); + status = clSetKernelArg(reader, 1, sizeof(cl_mem), &uint_mem); + test_error_ret(status, "set arg", status); cl_uchar expected_uchar = 'a'; cl_uint expected_uint = 42; - for ( unsigned iter = 0; iter < 5 ; iter++ ) { // Must go around at least twice + for (unsigned iter = 0; iter < 5; iter++) + { // Must go around at least twice // Read back data *uchar_data = -1; *uint_data = -1; const size_t one = 1; - status = clEnqueueNDRangeKernel(queue,reader,1,0,&one,0,0,0,0); test_error_ret(status,"enqueue reader",status); - status = clFinish(queue); test_error_ret(status,"finish",status); - - cl_uchar *uint_data_ptr = (cl_uchar *)clEnqueueMapBuffer(queue, uint_mem, CL_TRUE, CL_MAP_READ, 0, sizeof(cl_uint), 0, 0, 0, 0); - cl_uchar *uchar_data_ptr = (cl_uchar *)clEnqueueMapBuffer(queue, uchar_mem, CL_TRUE, 
CL_MAP_READ, 0, sizeof(cl_uchar), 0, 0, 0, 0); - - if ( expected_uchar != *uchar_data || expected_uint != *uint_data ) { - log_error("FAILED: Iteration %d Got (0x%2x,%d) but expected (0x%2x,%d)\n", - iter, (int)*uchar_data, *uint_data, (int)expected_uchar, expected_uint ); + status = clEnqueueNDRangeKernel(queue, reader, 1, 0, &one, 0, 0, 0, 0); + test_error_ret(status, "enqueue reader", status); + status = clFinish(queue); + test_error_ret(status, "finish", status); + + cl_uchar* uint_data_ptr = + (cl_uchar*)clEnqueueMapBuffer(queue, uint_mem, CL_TRUE, CL_MAP_READ, + 0, sizeof(cl_uint), 0, 0, 0, 0); + cl_uchar* uchar_data_ptr = (cl_uchar*)clEnqueueMapBuffer( + queue, uchar_mem, CL_TRUE, CL_MAP_READ, 0, sizeof(cl_uchar), 0, 0, + 0, 0); + + if (expected_uchar != *uchar_data || expected_uint != *uint_data) + { + log_error( + "FAILED: Iteration %d Got (0x%2x,%d) but expected (0x%2x,%d)\n", + iter, (int)*uchar_data, *uint_data, (int)expected_uchar, + expected_uint); err |= 1; } @@ -1498,13 +1915,21 @@ static int l_user_type( cl_device_id device, cl_context context, cl_command_queu // Write the new values into persistent store. 
*uchar_data = expected_uchar; *uint_data = expected_uint; - status = clSetKernelArg(writer,0,sizeof(cl_uchar),uchar_data); test_error_ret(status,"set arg",status); - status = clSetKernelArg(writer,1,sizeof(cl_uint),uint_data); test_error_ret(status,"set arg",status); - status = clEnqueueNDRangeKernel(queue,writer,1,0,&one,0,0,0,0); test_error_ret(status,"enqueue writer",status); - status = clFinish(queue); test_error_ret(status,"finish",status); + status = clSetKernelArg(writer, 0, sizeof(cl_uchar), uchar_data); + test_error_ret(status, "set arg", status); + status = clSetKernelArg(writer, 1, sizeof(cl_uint), uint_data); + test_error_ret(status, "set arg", status); + status = clEnqueueNDRangeKernel(queue, writer, 1, 0, &one, 0, 0, 0, 0); + test_error_ret(status, "enqueue writer", status); + status = clFinish(queue); + test_error_ret(status, "finish", status); } - if ( CL_SUCCESS == err ) { log_info("OK\n"); FLUSH; } + if (CL_SUCCESS == err) + { + log_info("OK\n"); + FLUSH; + } align_free(uchar_data); align_free(uint_data); return err; @@ -1539,7 +1964,8 @@ static cl_int should_skip(cl_device_id device, cl_bool& skip) // Test support for variables at program scope. 
Miscellaneous -int test_progvar_prog_scope_misc(cl_device_id device, cl_context context, cl_command_queue queue, int num_elements) +int test_progvar_prog_scope_misc(cl_device_id device, cl_context context, + cl_command_queue queue, int num_elements) { cl_bool skip{ CL_FALSE }; auto error = should_skip(device, skip); @@ -1558,19 +1984,20 @@ int test_progvar_prog_scope_misc(cl_device_id device, cl_context context, cl_com cl_int err = CL_SUCCESS; - err = l_get_device_info( device, &max_size, &pref_size ); - err |= l_build_type_table( device ); + err = l_get_device_info(device, &max_size, &pref_size); + err |= l_build_type_table(device); - err |= l_capacity( device, context, queue, max_size ); - err |= l_user_type( device, context, queue, false ); - err |= l_user_type( device, context, queue, true ); + err |= l_capacity(device, context, queue, max_size); + err |= l_user_type(device, context, queue, false); + err |= l_user_type(device, context, queue, true); return err; } // Test support for variables at program scope. Unitialized data -int test_progvar_prog_scope_uninit(cl_device_id device, cl_context context, cl_command_queue queue, int num_elements) +int test_progvar_prog_scope_uninit(cl_device_id device, cl_context context, + cl_command_queue queue, int num_elements) { cl_bool skip{ CL_FALSE }; auto error = should_skip(device, skip); @@ -1590,16 +2017,17 @@ int test_progvar_prog_scope_uninit(cl_device_id device, cl_context context, cl_c cl_int err = CL_SUCCESS; - err = l_get_device_info( device, &max_size, &pref_size ); - err |= l_build_type_table( device ); + err = l_get_device_info(device, &max_size, &pref_size); + err |= l_build_type_table(device); - err |= l_write_read( device, context, queue ); + err |= l_write_read(device, context, queue); return err; } // Test support for variables at program scope. Initialized data. 
-int test_progvar_prog_scope_init(cl_device_id device, cl_context context, cl_command_queue queue, int num_elements) +int test_progvar_prog_scope_init(cl_device_id device, cl_context context, + cl_command_queue queue, int num_elements) { cl_bool skip{ CL_FALSE }; auto error = should_skip(device, skip); @@ -1618,17 +2046,18 @@ int test_progvar_prog_scope_init(cl_device_id device, cl_context context, cl_com cl_int err = CL_SUCCESS; - err = l_get_device_info( device, &max_size, &pref_size ); - err |= l_build_type_table( device ); + err = l_get_device_info(device, &max_size, &pref_size); + err |= l_build_type_table(device); - err |= l_init_write_read( device, context, queue ); + err |= l_init_write_read(device, context, queue); return err; } // A simple test for support of static variables inside a kernel. -int test_progvar_func_scope(cl_device_id device, cl_context context, cl_command_queue queue, int num_elements) +int test_progvar_func_scope(cl_device_id device, cl_context context, + cl_command_queue queue, int num_elements) { cl_bool skip{ CL_FALSE }; auto error = should_skip(device, skip); @@ -1642,56 +2071,70 @@ int test_progvar_func_scope(cl_device_id device, cl_context context, cl_command_ "supported on this device\n"); return TEST_SKIPPED_ITSELF; } - size_t max_size = 0; - size_t pref_size = 0; cl_int err = CL_SUCCESS; // Deliberately have two variables with the same name but in different // scopes. // Also, use a large initialized structure in both cases. + // clang-format off const char prog_src[] = "typedef struct { char c; int16 i; } mystruct_t;\n" - "kernel void test_bump( global int* value, int which ) {\n" - " if ( which ) {\n" + "kernel void test_bump(global int* value, int which) {\n" + " if (which) {\n" // Explicit address space. 
// Last element set to 0 - " static global mystruct_t persistent = {'a',(int16)(0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,0) };\n" + " static global mystruct_t persistent = { 'a', (int16)(0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,0) };\n" " *value = persistent.i.sf++;\n" " } else {\n" // Implicitly global // Last element set to 100 - " static mystruct_t persistent = {'b',(int16)(0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,100) };\n" + " static mystruct_t persistent = { 'b' , (int16)(0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,100) };\n" " *value = persistent.i.sf++;\n" " }\n" "}\n"; + // clang-format on StringTable ksrc; - ksrc.add( prog_src ); + ksrc.add(prog_src); int status = CL_SUCCESS; clProgramWrapper program; clKernelWrapper test_bump; - status = create_single_kernel_helper_with_build_options(context, &program, &test_bump, ksrc.num_str(), ksrc.strs(), "test_bump", OPTIONS); - test_error_ret(status, "Failed to create program for function static variable test", status); + status = create_single_kernel_helper_with_build_options( + context, &program, &test_bump, ksrc.num_str(), ksrc.strs(), "test_bump", + OPTIONS); + test_error_ret(status, + "Failed to create program for function static variable test", + status); // Check size query. size_t used_bytes = 0; - status = clGetProgramBuildInfo( program, device, CL_PROGRAM_BUILD_GLOBAL_VARIABLE_TOTAL_SIZE, sizeof(used_bytes), &used_bytes, 0 ); - test_error_ret(status,"Failed to query global variable total size",status); + status = clGetProgramBuildInfo(program, device, + CL_PROGRAM_BUILD_GLOBAL_VARIABLE_TOTAL_SIZE, + sizeof(used_bytes), &used_bytes, 0); + test_error_ret(status, "Failed to query global variable total size", + status); size_t expected_size = 2 * sizeof(cl_int); // Two ints. 
- if ( used_bytes < expected_size ) { - log_error("Error: program query for global variable total size query failed: Expected at least %llu but got %llu\n", (unsigned long long)expected_size, (unsigned long long)used_bytes ); + if (used_bytes < expected_size) + { + log_error("Error: program query for global variable total size query " + "failed: Expected at least %llu but got %llu\n", + (unsigned long long)expected_size, + (unsigned long long)used_bytes); err |= 1; } // Prepare the data. cl_int counter_value = 0; - clMemWrapper counter_value_mem( clCreateBuffer( context, CL_MEM_USE_HOST_PTR, sizeof(counter_value), &counter_value, &status ) ); - test_error_ret(status,"Failed to allocate counter query buffer",status); + clMemWrapper counter_value_mem(clCreateBuffer(context, CL_MEM_USE_HOST_PTR, + sizeof(counter_value), + &counter_value, &status)); + test_error_ret(status, "Failed to allocate counter query buffer", status); - status = clSetKernelArg(test_bump,0,sizeof(cl_mem),&counter_value_mem); test_error_ret(status,"set arg",status); + status = clSetKernelArg(test_bump, 0, sizeof(cl_mem), &counter_value_mem); + test_error_ret(status, "set arg", status); // Go a few rounds, alternating between the two counters in the kernel. 
@@ -1701,26 +2144,41 @@ int test_progvar_func_scope(cl_device_id device, cl_context context, cl_command_ cl_int expected_counter[2] = { 100, 0 }; const size_t one = 1; - for ( int iround = 0; iround < 5 ; iround++ ) { // Must go at least twice around - for ( int iwhich = 0; iwhich < 2 ; iwhich++ ) { // Cover both counters - status = clSetKernelArg(test_bump,1,sizeof(iwhich),&iwhich); test_error_ret(status,"set arg",status); - status = clEnqueueNDRangeKernel(queue,test_bump,1,0,&one,0,0,0,0); test_error_ret(status,"enqueue test_bump",status); - status = clFinish(queue); test_error_ret(status,"finish",status); - - cl_uchar *counter_value_ptr = (cl_uchar *)clEnqueueMapBuffer(queue, counter_value_mem, CL_TRUE, CL_MAP_READ, 0, sizeof(counter_value), 0, 0, 0, 0); - - if ( counter_value != expected_counter[iwhich] ) { - log_error("Error: Round %d on counter %d: Expected %d but got %d\n", - iround, iwhich, expected_counter[iwhich], counter_value ); + for (int iround = 0; iround < 5; iround++) + { // Must go at least twice around + for (int iwhich = 0; iwhich < 2; iwhich++) + { // Cover both counters + status = clSetKernelArg(test_bump, 1, sizeof(iwhich), &iwhich); + test_error_ret(status, "set arg", status); + status = clEnqueueNDRangeKernel(queue, test_bump, 1, 0, &one, 0, 0, + 0, 0); + test_error_ret(status, "enqueue test_bump", status); + status = clFinish(queue); + test_error_ret(status, "finish", status); + + cl_uchar* counter_value_ptr = (cl_uchar*)clEnqueueMapBuffer( + queue, counter_value_mem, CL_TRUE, CL_MAP_READ, 0, + sizeof(counter_value), 0, 0, 0, 0); + + if (counter_value != expected_counter[iwhich]) + { + log_error( + "Error: Round %d on counter %d: Expected %d but got %d\n", + iround, iwhich, expected_counter[iwhich], counter_value); err |= 1; } expected_counter[iwhich]++; // Emulate behaviour of the kernel. 
- clEnqueueUnmapMemObject(queue, counter_value_mem, counter_value_ptr, 0, 0, 0); + clEnqueueUnmapMemObject(queue, counter_value_mem, counter_value_ptr, + 0, 0, 0); } } - if ( CL_SUCCESS == err ) { log_info("OK\n"); FLUSH; } + if (CL_SUCCESS == err) + { + log_info("OK\n"); + FLUSH; + } return err; } diff --git a/test_conformance/basic/test_queue_priority.cpp b/test_conformance/basic/test_queue_priority.cpp index 57ce5041..ff6283cd 100644 --- a/test_conformance/basic/test_queue_priority.cpp +++ b/test_conformance/basic/test_queue_priority.cpp @@ -48,13 +48,9 @@ static const char *fpmul_kernel_code = " dst[tid] = srcA[tid] * srcB[tid];\n" "}\n"; - -static const float MAX_ERR = 1e-5f; - static int verify_fpadd(float *inptrA, float *inptrB, float *outptr, int n, int fileNum) { - float r; int i; float * reference_ptr = (float *)malloc(n * sizeof(float)); @@ -82,7 +78,6 @@ verify_fpadd(float *inptrA, float *inptrB, float *outptr, int n, int fileNum) static int verify_fpsub(float *inptrA, float *inptrB, float *outptr, int n, int fileNum) { - float r; int i; float * reference_ptr = (float *)malloc(n * sizeof(float)); @@ -110,7 +105,6 @@ verify_fpsub(float *inptrA, float *inptrB, float *outptr, int n, int fileNum) static int verify_fpmul(float *inptrA, float *inptrB, float *outptr, int n, int fileNum) { - float r; int i; float * reference_ptr = (float *)malloc(n * sizeof(float)); diff --git a/test_conformance/basic/test_readimage3d.cpp b/test_conformance/basic/test_readimage3d.cpp index 1337c9fb..5fd7d109 100644 --- a/test_conformance/basic/test_readimage3d.cpp +++ b/test_conformance/basic/test_readimage3d.cpp @@ -142,7 +142,7 @@ int test_readimage3d(cl_device_id device, cl_context context, cl_command_queue q int img_width = 64; int img_height = 64; int img_depth = 64; - int i, err; + int err; size_t origin[3] = {0, 0, 0}; size_t region[3] = {img_width, img_height, img_depth}; size_t length = img_width * img_height * img_depth * 4 * sizeof(float); diff --git 
a/test_conformance/basic/test_simple_image_pitch.cpp b/test_conformance/basic/test_simple_image_pitch.cpp index 1cd82b6f..2eb43b3a 100644 --- a/test_conformance/basic/test_simple_image_pitch.cpp +++ b/test_conformance/basic/test_simple_image_pitch.cpp @@ -83,7 +83,7 @@ int test_simple_read_image_pitch(cl_device_id device, cl_context cl_context_, cl free(host_image); free(host_buffer); - return CL_SUCCESS; + return errors == 0 ? TEST_PASS : TEST_FAIL; } int test_simple_write_image_pitch(cl_device_id device, cl_context cl_context_, cl_command_queue q, int num_elements) @@ -149,5 +149,5 @@ int test_simple_write_image_pitch(cl_device_id device, cl_context cl_context_, c free(host_image); free(host_buffer); - return CL_SUCCESS; + return errors == 0 ? TEST_PASS : TEST_FAIL; } diff --git a/test_conformance/basic/test_sizeof.cpp b/test_conformance/basic/test_sizeof.cpp index 66a6c563..e980ed68 100644 --- a/test_conformance/basic/test_sizeof.cpp +++ b/test_conformance/basic/test_sizeof.cpp @@ -35,9 +35,9 @@ cl_int get_type_size( cl_context context, cl_command_queue queue, const char *ty "}\n" }; - cl_program p; - cl_kernel k; - cl_mem m; + clProgramWrapper p; + clKernelWrapper k; + clMemWrapper m; cl_uint temp; @@ -51,42 +51,19 @@ cl_int get_type_size( cl_context context, cl_command_queue queue, const char *ty } cl_int err = create_single_kernel_helper_with_build_options( context, &p, &k, 4, sizeof_kernel_code, "test_sizeof", nullptr); - if( err ) - return err; + test_error(err, "Failed to build kernel/program."); m = clCreateBuffer( context, CL_MEM_WRITE_ONLY | CL_MEM_COPY_HOST_PTR, sizeof( cl_ulong ), size, &err ); - if( NULL == m ) - { - clReleaseProgram( p ); - clReleaseKernel( k ); - log_error("\nclCreateBuffer FAILED\n"); - return err; - } + test_error(err, "clCreateBuffer failed."); err = clSetKernelArg( k, 0, sizeof( cl_mem ), &m ); - if( err ) - { - clReleaseProgram( p ); - clReleaseKernel( k ); - clReleaseMemObject( m ); - log_error("\nclSetKernelArg FAILED\n"); - 
return err; - } + test_error(err, "clSetKernelArg failed."); err = clEnqueueTask( queue, k, 0, NULL, NULL ); - clReleaseProgram( p ); - clReleaseKernel( k ); - if( err ) - { - clReleaseMemObject( m ); - log_error( "\nclEnqueueTask FAILED\n" ); - return err; - } + test_error(err, "clEnqueueTask failed."); err = clEnqueueReadBuffer( queue, m, CL_TRUE, 0, sizeof( cl_uint ), &temp, 0, NULL, NULL ); - clReleaseMemObject( m ); - if( err ) - log_error( "\nclEnqueueReadBuffer FAILED\n" ); + test_error(err, "clEnqueueReadBuffer failed."); *size = (cl_ulong) temp; @@ -292,11 +269,11 @@ int test_sizeof(cl_device_id device, cl_context context, cl_command_queue queue, continue; } - if( gIsEmbedded && - 0 == strcmp(other_types[i], "image3d_t") && - checkFor3DImageSupport( device ) == CL_IMAGE_FORMAT_NOT_SUPPORTED) + if (0 == strcmp(other_types[i], "image3d_t") + && checkFor3DImageSupport(device) == CL_IMAGE_FORMAT_NOT_SUPPORTED) { - log_info("\n3D images are not supported by this device. Skipping test.\t"); + log_info("\n3D images are not supported by this device. 
" + "Skipping test.\t"); continue; } diff --git a/test_conformance/basic/test_vector_swizzle.cpp b/test_conformance/basic/test_vector_swizzle.cpp index 5ab3ea4f..884bcf36 100644 --- a/test_conformance/basic/test_vector_swizzle.cpp +++ b/test_conformance/basic/test_vector_swizzle.cpp @@ -610,9 +610,6 @@ static int test_vectype(const char* type_name, cl_device_id device, cl_int error = CL_SUCCESS; int result = TEST_PASS; - clProgramWrapper program; - clKernelWrapper kernel; - std::string buildOptions{ "-DTYPE=" }; buildOptions += type_name; buildOptions += std::to_string(N); @@ -628,35 +625,50 @@ static int test_vectype(const char* type_name, cl_device_id device, makeReference<T, N, S>(reference); // XYZW swizzles: + { + clProgramWrapper program; + clKernelWrapper kernel; - const char* xyzw_source = TestInfo<N>::kernel_source_xyzw; - error = create_single_kernel_helper( - context, &program, &kernel, 1, &xyzw_source, "test_vector_swizzle_xyzw", - buildOptions.c_str()); - test_error(error, "Unable to create xyzw test kernel"); + const char* xyzw_source = TestInfo<N>::kernel_source_xyzw; + error = create_single_kernel_helper( + context, &program, &kernel, 1, &xyzw_source, + "test_vector_swizzle_xyzw", buildOptions.c_str()); + test_error(error, "Unable to create xyzw test kernel"); - result |= test_vectype_case(value, reference, context, kernel, queue); + result |= test_vectype_case(value, reference, context, kernel, queue); + } // sN swizzles: - const char* sN_source = TestInfo<N>::kernel_source_sN; - error = create_single_kernel_helper(context, &program, &kernel, 1, - &sN_source, "test_vector_swizzle_sN", - buildOptions.c_str()); - test_error(error, "Unable to create sN test kernel"); + { + clProgramWrapper program; + clKernelWrapper kernel; + + const char* sN_source = TestInfo<N>::kernel_source_sN; + error = create_single_kernel_helper( + context, &program, &kernel, 1, &sN_source, "test_vector_swizzle_sN", + buildOptions.c_str()); + test_error(error, "Unable to create 
sN test kernel"); - result |= test_vectype_case(value, reference, context, kernel, queue); + result |= test_vectype_case(value, reference, context, kernel, queue); + } // RGBA swizzles for OpenCL 3.0 and newer: - const Version device_version = get_device_cl_version(device); - if (device_version >= Version(3, 0)) { - const char* rgba_source = TestInfo<N>::kernel_source_rgba; - error = create_single_kernel_helper( - context, &program, &kernel, 1, &rgba_source, - "test_vector_swizzle_rgba", buildOptions.c_str()); - test_error(error, "Unable to create rgba test kernel"); + clProgramWrapper program; + clKernelWrapper kernel; - result |= test_vectype_case(value, reference, context, kernel, queue); + const Version device_version = get_device_cl_version(device); + if (device_version >= Version(3, 0)) + { + const char* rgba_source = TestInfo<N>::kernel_source_rgba; + error = create_single_kernel_helper( + context, &program, &kernel, 1, &rgba_source, + "test_vector_swizzle_rgba", buildOptions.c_str()); + test_error(error, "Unable to create rgba test kernel"); + + result |= + test_vectype_case(value, reference, context, kernel, queue); + } } return result; diff --git a/test_conformance/basic/test_writeimage_fp32.cpp b/test_conformance/basic/test_writeimage_fp32.cpp index fef71874..c68463ac 100644 --- a/test_conformance/basic/test_writeimage_fp32.cpp +++ b/test_conformance/basic/test_writeimage_fp32.cpp @@ -122,9 +122,10 @@ int test_writeimage_fp32(cl_device_id device, cl_context context, cl_command_que return -1; } - err = create_single_kernel_helper(context, &program, &kernel[0], 1, &rgbaFFFF_write_kernel_code, "test_rgbaFFFF_write" ); - if (err) - return -1; + err = create_single_kernel_helper(context, &program, &kernel[0], 1, + &rgbaFFFF_write_kernel_code, + "test_rgbaFFFF_write"); + if (err) return -1; kernel[1] = clCreateKernel(program, "test_rgbaFFFF_write", NULL); if (!kernel[1]) { diff --git a/test_conformance/basic/test_writeimage_int16.cpp 
b/test_conformance/basic/test_writeimage_int16.cpp index 8afb77a9..d863a3a3 100644 --- a/test_conformance/basic/test_writeimage_int16.cpp +++ b/test_conformance/basic/test_writeimage_int16.cpp @@ -128,9 +128,10 @@ int test_writeimage_int16(cl_device_id device, cl_context context, cl_command_qu return -1; } - err = create_single_kernel_helper(context, &program, &kernel[0], 1, &rgba16_write_kernel_code, "test_rgba16_write" ); - if (err) - return -1; + err = create_single_kernel_helper(context, &program, &kernel[0], 1, + &rgba16_write_kernel_code, + "test_rgba16_write"); + if (err) return -1; kernel[1] = clCreateKernel(program, "test_rgba16_write", NULL); if (!kernel[1]) { diff --git a/test_conformance/buffers/test_buffer_fill.cpp b/test_conformance/buffers/test_buffer_fill.cpp index 9c9c7d17..92079794 100644 --- a/test_conformance/buffers/test_buffer_fill.cpp +++ b/test_conformance/buffers/test_buffer_fill.cpp @@ -703,8 +703,6 @@ int test_buffer_fill( cl_device_id deviceID, cl_context context, cl_command_queu int test_buffer_fill_struct( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements ) { TestStruct pattern; - clProgramWrapper program; - clKernelWrapper kernel; size_t ptrSize = sizeof( TestStruct ); size_t global_work_size[3]; int n, err; @@ -720,6 +718,8 @@ int test_buffer_fill_struct( cl_device_id deviceID, cl_context context, cl_comma for (src_flag_id = 0; src_flag_id < NUM_FLAGS; src_flag_id++) { + clProgramWrapper program; + clKernelWrapper kernel; log_info("Testing with cl_mem_flags: %s\n", flag_set_names[src_flag_id]); diff --git a/test_conformance/buffers/test_buffer_migrate.cpp b/test_conformance/buffers/test_buffer_migrate.cpp index f3098366..6cdc271b 100644 --- a/test_conformance/buffers/test_buffer_migrate.cpp +++ b/test_conformance/buffers/test_buffer_migrate.cpp @@ -80,7 +80,7 @@ static cl_int migrateMemObject(enum migrations migrate, cl_command_queue *queues static cl_int restoreBuffer(cl_command_queue *queues, 
cl_mem *buffers, cl_uint num_devices, cl_mem_migration_flags *flags, cl_uint *buffer) { - cl_uint i, j; + cl_uint i; cl_int err; // If the buffer was previously migrated with undefined content, reload the content. diff --git a/test_conformance/buffers/test_buffer_read.cpp b/test_conformance/buffers/test_buffer_read.cpp index 39cf3297..49a57f92 100644 --- a/test_conformance/buffers/test_buffer_read.cpp +++ b/test_conformance/buffers/test_buffer_read.cpp @@ -763,7 +763,6 @@ int test_buffer_read_async( cl_device_id deviceID, cl_context context, cl_comman { clProgramWrapper program[5]; clKernelWrapper kernel[5]; - clEventWrapper event; void *outptr[5]; void *inptr[5]; size_t global_work_size[3]; @@ -805,6 +804,7 @@ int test_buffer_read_async( cl_device_id deviceID, cl_context context, cl_comman for (src_flag_id = 0; src_flag_id < NUM_FLAGS; src_flag_id++) { clMemWrapper buffer; + clEventWrapper event; outptr[i] = align_malloc(ptrSizes[i] * num_elements, min_alignment); if ( ! outptr[i] ){ log_error( " unable to allocate %d bytes for outptr\n", (int)(ptrSizes[i] * num_elements) ); @@ -900,7 +900,6 @@ int test_buffer_read_array_barrier( cl_device_id deviceID, cl_context context, c { clProgramWrapper program[5]; clKernelWrapper kernel[5]; - clEventWrapper event; void *outptr[5], *inptr[5]; size_t global_work_size[3]; cl_int err; @@ -941,6 +940,7 @@ int test_buffer_read_array_barrier( cl_device_id deviceID, cl_context context, c for (src_flag_id = 0; src_flag_id < NUM_FLAGS; src_flag_id++) { clMemWrapper buffer; + clEventWrapper event; outptr[i] = align_malloc(ptrSizes[i] * num_elements, min_alignment); if ( ! 
outptr[i] ){ log_error( " unable to allocate %d bytes for outptr\n", (int)(ptrSizes[i] * num_elements) ); diff --git a/test_conformance/buffers/test_image_migrate.cpp b/test_conformance/buffers/test_image_migrate.cpp index dbdca9cc..6c8acdce 100644 --- a/test_conformance/buffers/test_image_migrate.cpp +++ b/test_conformance/buffers/test_image_migrate.cpp @@ -128,7 +128,6 @@ int test_image_migrate(cl_device_id deviceID, cl_context context, cl_command_que cl_mem_migration_flags *flagsA, *flagsB, *flagsC; cl_device_partition_property property[] = {CL_DEVICE_PARTITION_BY_AFFINITY_DOMAIN, 0, 0}; cl_mem *imageA, *imageB, *imageC; - cl_mem_flags flags; cl_image_format format; cl_sampler sampler = NULL; cl_program program = NULL; diff --git a/test_conformance/buffers/test_sub_buffers.cpp b/test_conformance/buffers/test_sub_buffers.cpp index 3e50121a..d6ab111e 100644 --- a/test_conformance/buffers/test_sub_buffers.cpp +++ b/test_conformance/buffers/test_sub_buffers.cpp @@ -15,6 +15,8 @@ // #include "procs.h" +#include <algorithm> + // Design: // To test sub buffers, we first create one main buffer. We then create several sub-buffers and // queue Actions on each one. Each Action is encapsulated in a class so it can keep track of @@ -39,7 +41,8 @@ public: region.size = mSize; cl_int error; - mMem = clCreateSubBuffer( mParentBuffer, flags, CL_BUFFER_CREATE_TYPE_REGION, ®ion, &error ); + reset(clCreateSubBuffer(mParentBuffer, flags, + CL_BUFFER_CREATE_TYPE_REGION, ®ion, &error)); return error; } }; @@ -100,13 +103,6 @@ public: } }; -#ifndef MAX -#define MAX( _a, _b ) ( (_a) > (_b) ? (_a) : (_b) ) -#endif -#ifndef MIN -#define MIN( _a, _b ) ( (_a) < (_b) ? 
(_a) : (_b) ) -#endif - class CopyAction : public Action { public: @@ -116,7 +112,8 @@ public: virtual cl_int Execute( cl_context context, cl_command_queue queue, cl_char tag, SubBufferWrapper &buffer1, SubBufferWrapper &buffer2, cl_char *parentBufferState ) { // Copy from sub-buffer 1 to sub-buffer 2 - size_t size = get_random_size_t( 0, MIN( buffer1.mSize, buffer2.mSize ), GetRandSeed() ); + size_t size = get_random_size_t( + 0, std::min(buffer1.mSize, buffer2.mSize), GetRandSeed()); size_t startOffset = get_random_size_t( 0, buffer1.mSize - size, GetRandSeed() ); size_t endOffset = get_random_size_t( 0, buffer2.mSize - size, GetRandSeed() ); @@ -265,7 +262,11 @@ int test_sub_buffers_read_write_core( cl_context context, cl_command_queue queue endRange = mainSize; size_t offset = get_random_size_t( toStartFrom / addressAlign, endRange / addressAlign, Action::GetRandSeed() ) * addressAlign; - size_t size = get_random_size_t( 1, ( MIN( mainSize / 8, mainSize - offset ) ) / addressAlign, Action::GetRandSeed() ) * addressAlign; + size_t size = + get_random_size_t( + 1, (std::min(mainSize / 8, mainSize - offset)) / addressAlign, + Action::GetRandSeed()) + * addressAlign; error = subBuffers[ numSubBuffers ].Allocate( mainBuffer, CL_MEM_READ_WRITE, offset, size ); test_error( error, "Unable to allocate sub buffer" ); @@ -442,7 +443,7 @@ int test_sub_buffers_read_write_dual_devices( cl_device_id deviceID, cl_context error = get_reasonable_buffer_size( otherDevice, maxBuffer2 ); test_error( error, "Unable to get buffer size for secondary device" ); - maxBuffer1 = MIN( maxBuffer1, maxBuffer2 ); + maxBuffer1 = std::min(maxBuffer1, maxBuffer2); cl_uint addressAlign1Bits, addressAlign2Bits; error = clGetDeviceInfo( deviceID, CL_DEVICE_MEM_BASE_ADDR_ALIGN, sizeof( addressAlign1Bits ), &addressAlign1Bits, NULL ); @@ -451,7 +452,7 @@ int test_sub_buffers_read_write_dual_devices( cl_device_id deviceID, cl_context error = clGetDeviceInfo( otherDevice, CL_DEVICE_MEM_BASE_ADDR_ALIGN, 
sizeof( addressAlign2Bits ), &addressAlign2Bits, NULL ); test_error( error, "Unable to get secondary device's address alignment" ); - cl_uint addressAlign1 = MAX( addressAlign1Bits, addressAlign2Bits ) / 8; + cl_uint addressAlign1 = std::max(addressAlign1Bits, addressAlign2Bits) / 8; // Finally time to run! return test_sub_buffers_read_write_core( testingContext, queue1, queue2, maxBuffer1, addressAlign1 ); diff --git a/test_conformance/c11_atomics/common.h b/test_conformance/c11_atomics/common.h index bbcc68c6..6c7d0b12 100644 --- a/test_conformance/c11_atomics/common.h +++ b/test_conformance/c11_atomics/common.h @@ -28,10 +28,9 @@ #define MAX_DEVICE_THREADS (gHost ? 0U : gMaxDeviceThreads) #define MAX_HOST_THREADS GetThreadCount() -#define EXECUTE_TEST(error, test)\ - error |= test;\ - if(error && !gContinueOnError)\ - return error; +#define EXECUTE_TEST(error, test) \ + error |= test; \ + if (error && !gContinueOnError) return error; enum TExplicitAtomicType { @@ -57,764 +56,918 @@ enum TExplicitMemoryScopeType MEMORY_SCOPE_ALL_SVM_DEVICES }; -extern bool gHost; // temporary flag for testing native host threads (test verification) +extern bool + gHost; // temporary flag for testing native host threads (test verification) extern bool gOldAPI; // temporary flag for testing with old API (OpenCL 1.2) extern bool gContinueOnError; // execute all cases even when errors detected -extern bool gNoGlobalVariables; // disable cases with global atomics in program scope +extern bool + gNoGlobalVariables; // disable cases with global atomics in program scope extern bool gNoGenericAddressSpace; // disable cases with generic address space extern bool gUseHostPtr; // use malloc/free instead of clSVMAlloc/clSVMFree extern bool gDebug; // print OpenCL kernel code -extern int gInternalIterations; // internal test iterations for atomic operation, sufficient to verify atomicity -extern int gMaxDeviceThreads; // maximum number of threads executed on OCL device +extern int 
gInternalIterations; // internal test iterations for atomic + // operation, sufficient to verify atomicity +extern int + gMaxDeviceThreads; // maximum number of threads executed on OCL device extern cl_device_atomic_capabilities gAtomicMemCap, gAtomicFenceCap; // atomic memory and fence capabilities for this device -extern const char *get_memory_order_type_name(TExplicitMemoryOrderType orderType); -extern const char *get_memory_scope_type_name(TExplicitMemoryScopeType scopeType); +extern const char * +get_memory_order_type_name(TExplicitMemoryOrderType orderType); +extern const char * +get_memory_scope_type_name(TExplicitMemoryScopeType scopeType); extern cl_int getSupportedMemoryOrdersAndScopes( cl_device_id device, std::vector<TExplicitMemoryOrderType> &memoryOrders, std::vector<TExplicitMemoryScopeType> &memoryScopes); -class AtomicTypeInfo -{ +class AtomicTypeInfo { public: - TExplicitAtomicType _type; - AtomicTypeInfo(TExplicitAtomicType type): _type(type) {} - cl_uint Size(cl_device_id device); - const char* AtomicTypeName(); - const char* RegularTypeName(); - const char* AddSubOperandTypeName(); - int IsSupported(cl_device_id device); + TExplicitAtomicType _type; + AtomicTypeInfo(TExplicitAtomicType type): _type(type) {} + cl_uint Size(cl_device_id device); + const char *AtomicTypeName(); + const char *RegularTypeName(); + const char *AddSubOperandTypeName(); + int IsSupported(cl_device_id device); }; -template<typename HostDataType> -class AtomicTypeExtendedInfo : public AtomicTypeInfo -{ +template <typename HostDataType> +class AtomicTypeExtendedInfo : public AtomicTypeInfo { public: - AtomicTypeExtendedInfo(TExplicitAtomicType type) : AtomicTypeInfo(type) {} - HostDataType MinValue(); - HostDataType MaxValue(); - HostDataType SpecialValue(cl_uchar x) - { - HostDataType tmp; - cl_uchar *ptr = (cl_uchar*)&tmp; - for(cl_uint i = 0; i < sizeof(HostDataType)/sizeof(cl_uchar); i++) - ptr[i] = x; - return tmp; - } - HostDataType SpecialValue(cl_ushort x) - { - 
HostDataType tmp; - cl_ushort *ptr = (cl_ushort*)&tmp; - for(cl_uint i = 0; i < sizeof(HostDataType)/sizeof(cl_ushort); i++) - ptr[i] = x; - return tmp; - } + AtomicTypeExtendedInfo(TExplicitAtomicType type): AtomicTypeInfo(type) {} + HostDataType MinValue(); + HostDataType MaxValue(); + HostDataType SpecialValue(cl_uchar x) + { + HostDataType tmp; + cl_uchar *ptr = (cl_uchar *)&tmp; + for (cl_uint i = 0; i < sizeof(HostDataType) / sizeof(cl_uchar); i++) + ptr[i] = x; + return tmp; + } + HostDataType SpecialValue(cl_ushort x) + { + HostDataType tmp; + cl_ushort *ptr = (cl_ushort *)&tmp; + for (cl_uint i = 0; i < sizeof(HostDataType) / sizeof(cl_ushort); i++) + ptr[i] = x; + return tmp; + } }; -class CTest { +class CTest { public: - virtual int Execute(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements) = 0; + virtual int Execute(cl_device_id deviceID, cl_context context, + cl_command_queue queue, int num_elements) = 0; }; -template<typename HostAtomicType, typename HostDataType> -class CBasicTest : CTest -{ +template <typename HostAtomicType, typename HostDataType> +class CBasicTest : CTest { public: - typedef struct { - CBasicTest *test; - cl_uint tid; - cl_uint threadCount; - volatile HostAtomicType *destMemory; - HostDataType *oldValues; - } THostThreadContext; - static cl_int HostThreadFunction(cl_uint job_id, cl_uint thread_id, void *userInfo) - { - THostThreadContext *threadContext = ((THostThreadContext*)userInfo)+job_id; - threadContext->test->HostFunction(threadContext->tid, threadContext->threadCount, threadContext->destMemory, threadContext->oldValues); - return 0; - } - CBasicTest(TExplicitAtomicType dataType, bool useSVM) : CTest(), - _maxDeviceThreads(MAX_DEVICE_THREADS), - _dataType(dataType), _useSVM(useSVM), _startValue(255), - _localMemory(false), _declaredInProgram(false), - _usedInFunction(false), _genericAddrSpace(false), - _oldValueCheck(true), _localRefValues(false), - _maxGroupSize(0), _passCount(0), 
_iterations(gInternalIterations) - { - } - virtual ~CBasicTest() - { - if(_passCount) - log_info(" %u tests executed successfully for %s\n", _passCount, DataType().AtomicTypeName()); - } - virtual cl_uint NumResults(cl_uint threadCount, cl_device_id deviceID) - { - return 1; - } - virtual cl_uint NumNonAtomicVariablesPerThread() - { - return 1; - } - virtual bool ExpectedValue(HostDataType &expected, cl_uint threadCount, HostDataType *startRefValues, cl_uint whichDestValue) - { - return false; - } - virtual bool GenerateRefs(cl_uint threadCount, HostDataType *startRefValues, MTdata d) - { - return false; - } - virtual bool VerifyRefs(bool &correct, cl_uint threadCount, HostDataType *refValues, HostAtomicType *finalValues) - { - return false; - } - virtual std::string PragmaHeader(cl_device_id deviceID); - virtual std::string ProgramHeader(cl_uint maxNumDestItems); - virtual std::string FunctionCode(); - virtual std::string KernelCode(cl_uint maxNumDestItems); - virtual std::string ProgramCore() = 0; - virtual std::string SingleTestName() - { - std::string testName = LocalMemory() ? 
"local" : "global"; - testName += " "; - testName += DataType().AtomicTypeName(); - if(DeclaredInProgram()) - { - testName += " declared in program"; - } - if(DeclaredInProgram() && UsedInFunction()) - testName += ","; - if(UsedInFunction()) - { - testName += " used in "; - if(GenericAddrSpace()) - testName += "generic "; - testName += "function"; - } - return testName; - } - virtual int ExecuteSingleTest(cl_device_id deviceID, cl_context context, cl_command_queue queue); - int ExecuteForEachPointerType(cl_device_id deviceID, cl_context context, cl_command_queue queue) - { - int error = 0; - UsedInFunction(false); - EXECUTE_TEST(error, ExecuteSingleTest(deviceID, context, queue)); - UsedInFunction(true); - GenericAddrSpace(false); - EXECUTE_TEST(error, ExecuteSingleTest(deviceID, context, queue)); - GenericAddrSpace(true); - EXECUTE_TEST(error, ExecuteSingleTest(deviceID, context, queue)); - GenericAddrSpace(false); - return error; - } - int ExecuteForEachDeclarationType(cl_device_id deviceID, cl_context context, cl_command_queue queue) - { - int error = 0; - DeclaredInProgram(false); - EXECUTE_TEST(error, ExecuteForEachPointerType(deviceID, context, queue)); - if(!UseSVM()) - { - DeclaredInProgram(true); - EXECUTE_TEST(error, ExecuteForEachPointerType(deviceID, context, queue)); - } - return error; - } - virtual int ExecuteForEachParameterSet(cl_device_id deviceID, cl_context context, cl_command_queue queue) - { - int error = 0; - if(_maxDeviceThreads > 0 && !UseSVM()) - { - LocalMemory(true); - EXECUTE_TEST(error, ExecuteForEachDeclarationType(deviceID, context, queue)); - } - if(_maxDeviceThreads+MaxHostThreads() > 0) - { - LocalMemory(false); - EXECUTE_TEST(error, ExecuteForEachDeclarationType(deviceID, context, queue)); - } - return error; - } - virtual int Execute(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements) - { - if(sizeof(HostAtomicType) != DataType().Size(deviceID)) - { - log_info("Invalid test: Host atomic type size 
(%u) is different than OpenCL type size (%u)\n", (cl_uint)sizeof(HostAtomicType), DataType().Size(deviceID)); - return -1; - } - if(sizeof(HostAtomicType) != sizeof(HostDataType)) - { - log_info("Invalid test: Host atomic type size (%u) is different than corresponding type size (%u)\n", (cl_uint)sizeof(HostAtomicType), (cl_uint)sizeof(HostDataType)); - return -1; - } - // Verify we can run first - if(UseSVM() && !gUseHostPtr) - { - cl_device_svm_capabilities caps; - cl_int error = clGetDeviceInfo(deviceID, CL_DEVICE_SVM_CAPABILITIES, sizeof(caps), &caps, 0); - test_error(error, "clGetDeviceInfo failed"); - if((caps & CL_DEVICE_SVM_ATOMICS) == 0) - { - log_info("\t%s - SVM_ATOMICS not supported\n", DataType().AtomicTypeName()); - // implicit pass + typedef struct + { + CBasicTest *test; + cl_uint tid; + cl_uint threadCount; + volatile HostAtomicType *destMemory; + HostDataType *oldValues; + } THostThreadContext; + static cl_int HostThreadFunction(cl_uint job_id, cl_uint thread_id, + void *userInfo) + { + THostThreadContext *threadContext = + ((THostThreadContext *)userInfo) + job_id; + threadContext->test->HostFunction( + threadContext->tid, threadContext->threadCount, + threadContext->destMemory, threadContext->oldValues); return 0; - } } - if(!DataType().IsSupported(deviceID)) + CBasicTest(TExplicitAtomicType dataType, bool useSVM) + : CTest(), _maxDeviceThreads(MAX_DEVICE_THREADS), _dataType(dataType), + _useSVM(useSVM), _startValue(255), _localMemory(false), + _declaredInProgram(false), _usedInFunction(false), + _genericAddrSpace(false), _oldValueCheck(true), + _localRefValues(false), _maxGroupSize(0), _passCount(0), + _iterations(gInternalIterations) + {} + virtual ~CBasicTest() + { + if (_passCount) + log_info(" %u tests executed successfully for %s\n", _passCount, + DataType().AtomicTypeName()); + } + virtual cl_uint NumResults(cl_uint threadCount, cl_device_id deviceID) + { + return 1; + } + virtual cl_uint NumNonAtomicVariablesPerThread() { return 1; } + 
virtual bool ExpectedValue(HostDataType &expected, cl_uint threadCount, + HostDataType *startRefValues, + cl_uint whichDestValue) + { + return false; + } + virtual bool GenerateRefs(cl_uint threadCount, HostDataType *startRefValues, + MTdata d) + { + return false; + } + virtual bool VerifyRefs(bool &correct, cl_uint threadCount, + HostDataType *refValues, + HostAtomicType *finalValues) + { + return false; + } + virtual std::string PragmaHeader(cl_device_id deviceID); + virtual std::string ProgramHeader(cl_uint maxNumDestItems); + virtual std::string FunctionCode(); + virtual std::string KernelCode(cl_uint maxNumDestItems); + virtual std::string ProgramCore() = 0; + virtual std::string SingleTestName() + { + std::string testName = LocalMemory() ? "local" : "global"; + testName += " "; + testName += DataType().AtomicTypeName(); + if (DeclaredInProgram()) + { + testName += " declared in program"; + } + if (DeclaredInProgram() && UsedInFunction()) testName += ","; + if (UsedInFunction()) + { + testName += " used in "; + if (GenericAddrSpace()) testName += "generic "; + testName += "function"; + } + return testName; + } + virtual int ExecuteSingleTest(cl_device_id deviceID, cl_context context, + cl_command_queue queue); + int ExecuteForEachPointerType(cl_device_id deviceID, cl_context context, + cl_command_queue queue) + { + int error = 0; + UsedInFunction(false); + EXECUTE_TEST(error, ExecuteSingleTest(deviceID, context, queue)); + UsedInFunction(true); + GenericAddrSpace(false); + EXECUTE_TEST(error, ExecuteSingleTest(deviceID, context, queue)); + GenericAddrSpace(true); + EXECUTE_TEST(error, ExecuteSingleTest(deviceID, context, queue)); + GenericAddrSpace(false); + return error; + } + int ExecuteForEachDeclarationType(cl_device_id deviceID, cl_context context, + cl_command_queue queue) + { + int error = 0; + DeclaredInProgram(false); + EXECUTE_TEST(error, + ExecuteForEachPointerType(deviceID, context, queue)); + if (!UseSVM()) + { + DeclaredInProgram(true); + 
EXECUTE_TEST(error, + ExecuteForEachPointerType(deviceID, context, queue)); + } + return error; + } + virtual int ExecuteForEachParameterSet(cl_device_id deviceID, + cl_context context, + cl_command_queue queue) + { + int error = 0; + if (_maxDeviceThreads > 0 && !UseSVM()) + { + LocalMemory(true); + EXECUTE_TEST( + error, ExecuteForEachDeclarationType(deviceID, context, queue)); + } + if (_maxDeviceThreads + MaxHostThreads() > 0) + { + LocalMemory(false); + EXECUTE_TEST( + error, ExecuteForEachDeclarationType(deviceID, context, queue)); + } + return error; + } + virtual int Execute(cl_device_id deviceID, cl_context context, + cl_command_queue queue, int num_elements) + { + if (sizeof(HostAtomicType) != DataType().Size(deviceID)) + { + log_info("Invalid test: Host atomic type size (%u) is different " + "than OpenCL type size (%u)\n", + (cl_uint)sizeof(HostAtomicType), + DataType().Size(deviceID)); + return -1; + } + if (sizeof(HostAtomicType) != sizeof(HostDataType)) + { + log_info("Invalid test: Host atomic type size (%u) is different " + "than corresponding type size (%u)\n", + (cl_uint)sizeof(HostAtomicType), + (cl_uint)sizeof(HostDataType)); + return -1; + } + // Verify we can run first + if (UseSVM() && !gUseHostPtr) + { + cl_device_svm_capabilities caps; + cl_int error = clGetDeviceInfo(deviceID, CL_DEVICE_SVM_CAPABILITIES, + sizeof(caps), &caps, 0); + test_error(error, "clGetDeviceInfo failed"); + if ((caps & CL_DEVICE_SVM_ATOMICS) == 0) + { + log_info("\t%s - SVM_ATOMICS not supported\n", + DataType().AtomicTypeName()); + // implicit pass + return 0; + } + } + if (!DataType().IsSupported(deviceID)) + { + log_info("\t%s not supported\n", DataType().AtomicTypeName()); + // implicit pass or host test (debug feature) + if (UseSVM()) return 0; + _maxDeviceThreads = 0; + } + if (_maxDeviceThreads + MaxHostThreads() == 0) return 0; + return ExecuteForEachParameterSet(deviceID, context, queue); + } + virtual void HostFunction(cl_uint tid, cl_uint threadCount, + 
volatile HostAtomicType *destMemory, + HostDataType *oldValues) + { + log_info("Empty thread function %u\n", (cl_uint)tid); + } + AtomicTypeExtendedInfo<HostDataType> DataType() const + { + return AtomicTypeExtendedInfo<HostDataType>(_dataType); + } + cl_uint _maxDeviceThreads; + virtual cl_uint MaxHostThreads() + { + if (UseSVM() || gHost) + return MAX_HOST_THREADS; + else + return 0; + } + + int CheckCapabilities(TExplicitMemoryScopeType memoryScope, + TExplicitMemoryOrderType memoryOrder) { - log_info("\t%s not supported\n", DataType().AtomicTypeName()); - // implicit pass or host test (debug feature) - if(UseSVM()) + /* + Differentiation between atomic fence and other atomic operations + does not need to occur here. + + The initialisation of this test checks that the minimum required + capabilities are supported by this device. + + The following switches allow the test to skip if optional + capabilites are not supported by the device. + */ + switch (memoryScope) + { + case MEMORY_SCOPE_EMPTY: { + break; + } + case MEMORY_SCOPE_WORK_GROUP: { + if ((gAtomicMemCap & CL_DEVICE_ATOMIC_SCOPE_WORK_GROUP) == 0) + { + return TEST_SKIPPED_ITSELF; + } + break; + } + case MEMORY_SCOPE_DEVICE: { + if ((gAtomicMemCap & CL_DEVICE_ATOMIC_SCOPE_DEVICE) == 0) + { + return TEST_SKIPPED_ITSELF; + } + break; + } + case MEMORY_SCOPE_ALL_DEVICES: // fallthough + case MEMORY_SCOPE_ALL_SVM_DEVICES: { + if ((gAtomicMemCap & CL_DEVICE_ATOMIC_SCOPE_ALL_DEVICES) == 0) + { + return TEST_SKIPPED_ITSELF; + } + break; + } + default: { + log_info("Invalid memory scope\n"); + break; + } + } + + switch (memoryOrder) + { + case MEMORY_ORDER_EMPTY: { + break; + } + case MEMORY_ORDER_RELAXED: { + if ((gAtomicMemCap & CL_DEVICE_ATOMIC_ORDER_RELAXED) == 0) + { + return TEST_SKIPPED_ITSELF; + } + break; + } + case MEMORY_ORDER_ACQUIRE: + case MEMORY_ORDER_RELEASE: + case MEMORY_ORDER_ACQ_REL: { + if ((gAtomicMemCap & CL_DEVICE_ATOMIC_ORDER_ACQ_REL) == 0) + { + return TEST_SKIPPED_ITSELF; + } + break; + 
} + case MEMORY_ORDER_SEQ_CST: { + if ((gAtomicMemCap & CL_DEVICE_ATOMIC_ORDER_SEQ_CST) == 0) + { + return TEST_SKIPPED_ITSELF; + } + break; + } + default: { + log_info("Invalid memory order\n"); + break; + } + } + return 0; - _maxDeviceThreads = 0; - } - if(_maxDeviceThreads+MaxHostThreads() == 0) - return 0; - return ExecuteForEachParameterSet(deviceID, context, queue); - } - virtual void HostFunction(cl_uint tid, cl_uint threadCount, volatile HostAtomicType *destMemory, HostDataType *oldValues) - { - log_info("Empty thread function %u\n", (cl_uint)tid); - } - AtomicTypeExtendedInfo<HostDataType> DataType() const - { - return AtomicTypeExtendedInfo<HostDataType>(_dataType); - } - cl_uint _maxDeviceThreads; - virtual cl_uint MaxHostThreads() - { - if(UseSVM() || gHost) - return MAX_HOST_THREADS; - else - return 0; - } - - int CheckCapabilities(TExplicitMemoryScopeType memoryScope, - TExplicitMemoryOrderType memoryOrder) - { - /* - Differentiation between atomic fence and other atomic operations - does not need to occur here. - - The initialisation of this test checks that the minimum required - capabilities are supported by this device. - - The following switches allow the test to skip if optional capabilites - are not supported by the device. 
- */ - switch (memoryScope) - { - case MEMORY_SCOPE_EMPTY: { - break; - } - case MEMORY_SCOPE_WORK_GROUP: { - if ((gAtomicMemCap & CL_DEVICE_ATOMIC_SCOPE_WORK_GROUP) == 0) - { - return TEST_SKIPPED_ITSELF; - } - break; - } - case MEMORY_SCOPE_DEVICE: { - if ((gAtomicMemCap & CL_DEVICE_ATOMIC_SCOPE_DEVICE) == 0) - { - return TEST_SKIPPED_ITSELF; - } - break; - } - case MEMORY_SCOPE_ALL_DEVICES: // fallthough - case MEMORY_SCOPE_ALL_SVM_DEVICES: { - if ((gAtomicMemCap & CL_DEVICE_ATOMIC_SCOPE_ALL_DEVICES) == 0) - { - return TEST_SKIPPED_ITSELF; - } - break; - } - default: { - log_info("Invalid memory scope\n"); - break; - } - } - - switch (memoryOrder) - { - case MEMORY_ORDER_EMPTY: { - break; - } - case MEMORY_ORDER_RELAXED: { - if ((gAtomicMemCap & CL_DEVICE_ATOMIC_ORDER_RELAXED) == 0) - { - return TEST_SKIPPED_ITSELF; - } - break; - } - case MEMORY_ORDER_ACQUIRE: - case MEMORY_ORDER_RELEASE: - case MEMORY_ORDER_ACQ_REL: { - if ((gAtomicMemCap & CL_DEVICE_ATOMIC_ORDER_ACQ_REL) == 0) - { - return TEST_SKIPPED_ITSELF; - } - break; - } - case MEMORY_ORDER_SEQ_CST: { - if ((gAtomicMemCap & CL_DEVICE_ATOMIC_ORDER_SEQ_CST) == 0) - { - return TEST_SKIPPED_ITSELF; - } - break; - } - default: { - log_info("Invalid memory order\n"); - break; - } - } - - return 0; - } - virtual bool SVMDataBufferAllSVMConsistent() {return false;} - bool UseSVM() {return _useSVM;} - void StartValue(HostDataType startValue) {_startValue = startValue;} - HostDataType StartValue() {return _startValue;} - void LocalMemory(bool local) {_localMemory = local;} - bool LocalMemory() {return _localMemory;} - void DeclaredInProgram(bool declaredInProgram) {_declaredInProgram = declaredInProgram;} - bool DeclaredInProgram() {return _declaredInProgram;} - void UsedInFunction(bool local) {_usedInFunction = local;} - bool UsedInFunction() {return _usedInFunction;} - void GenericAddrSpace(bool genericAddrSpace) {_genericAddrSpace = genericAddrSpace;} - bool GenericAddrSpace() {return _genericAddrSpace;} - 
void OldValueCheck(bool check) {_oldValueCheck = check;} - bool OldValueCheck() {return _oldValueCheck;} - void LocalRefValues(bool localRefValues) {_localRefValues = localRefValues;} - bool LocalRefValues() {return _localRefValues;} - void MaxGroupSize(cl_uint maxGroupSize) {_maxGroupSize = maxGroupSize;} - cl_uint MaxGroupSize() {return _maxGroupSize;} - void CurrentGroupSize(cl_uint currentGroupSize) - { - if(MaxGroupSize() && MaxGroupSize() < currentGroupSize) - _currentGroupSize = MaxGroupSize(); - else - _currentGroupSize = currentGroupSize; - } - cl_uint CurrentGroupSize() {return _currentGroupSize;} - virtual cl_uint CurrentGroupNum(cl_uint threadCount) - { - if(threadCount == 0) - return 0; - if(LocalMemory()) - return 1; - return threadCount/CurrentGroupSize(); - } - cl_int Iterations() {return _iterations;} - std::string IterationsStr() {std::stringstream ss; ss << _iterations; return ss.str();} + } + virtual bool SVMDataBufferAllSVMConsistent() { return false; } + bool UseSVM() { return _useSVM; } + void StartValue(HostDataType startValue) { _startValue = startValue; } + HostDataType StartValue() { return _startValue; } + void LocalMemory(bool local) { _localMemory = local; } + bool LocalMemory() { return _localMemory; } + void DeclaredInProgram(bool declaredInProgram) + { + _declaredInProgram = declaredInProgram; + } + bool DeclaredInProgram() { return _declaredInProgram; } + void UsedInFunction(bool local) { _usedInFunction = local; } + bool UsedInFunction() { return _usedInFunction; } + void GenericAddrSpace(bool genericAddrSpace) + { + _genericAddrSpace = genericAddrSpace; + } + bool GenericAddrSpace() { return _genericAddrSpace; } + void OldValueCheck(bool check) { _oldValueCheck = check; } + bool OldValueCheck() { return _oldValueCheck; } + void LocalRefValues(bool localRefValues) + { + _localRefValues = localRefValues; + } + bool LocalRefValues() { return _localRefValues; } + void MaxGroupSize(cl_uint maxGroupSize) { _maxGroupSize = maxGroupSize; 
} + cl_uint MaxGroupSize() { return _maxGroupSize; } + void CurrentGroupSize(cl_uint currentGroupSize) + { + if (MaxGroupSize() && MaxGroupSize() < currentGroupSize) + _currentGroupSize = MaxGroupSize(); + else + _currentGroupSize = currentGroupSize; + } + cl_uint CurrentGroupSize() { return _currentGroupSize; } + virtual cl_uint CurrentGroupNum(cl_uint threadCount) + { + if (threadCount == 0) return 0; + if (LocalMemory()) return 1; + return threadCount / CurrentGroupSize(); + } + cl_int Iterations() { return _iterations; } + std::string IterationsStr() + { + std::stringstream ss; + ss << _iterations; + return ss.str(); + } + private: - const TExplicitAtomicType _dataType; - const bool _useSVM; - HostDataType _startValue; - bool _localMemory; - bool _declaredInProgram; - bool _usedInFunction; - bool _genericAddrSpace; - bool _oldValueCheck; - bool _localRefValues; - cl_uint _maxGroupSize; - cl_uint _currentGroupSize; - cl_uint _passCount; - const cl_int _iterations; + const TExplicitAtomicType _dataType; + const bool _useSVM; + HostDataType _startValue; + bool _localMemory; + bool _declaredInProgram; + bool _usedInFunction; + bool _genericAddrSpace; + bool _oldValueCheck; + bool _localRefValues; + cl_uint _maxGroupSize; + cl_uint _currentGroupSize; + cl_uint _passCount; + const cl_int _iterations; }; -template<typename HostAtomicType, typename HostDataType> -class CBasicTestMemOrderScope : public CBasicTest<HostAtomicType, HostDataType> -{ +template <typename HostAtomicType, typename HostDataType> +class CBasicTestMemOrderScope + : public CBasicTest<HostAtomicType, HostDataType> { public: - using CBasicTest<HostAtomicType, HostDataType>::LocalMemory; - using CBasicTest<HostAtomicType, HostDataType>::MaxGroupSize; - using CBasicTest<HostAtomicType, HostDataType>::CheckCapabilities; - CBasicTestMemOrderScope(TExplicitAtomicType dataType, bool useSVM = false) : CBasicTest<HostAtomicType, HostDataType>(dataType, useSVM) - { - } - virtual std::string 
ProgramHeader(cl_uint maxNumDestItems) - { - std::string header; - if(gOldAPI) - { - std::string s = MemoryScope() == MEMORY_SCOPE_EMPTY ? "" : ",s"; - header += - "#define atomic_store_explicit(x,y,o"+s+") atomic_store(x,y)\n" - "#define atomic_load_explicit(x,o"+s+") atomic_load(x)\n" - "#define atomic_exchange_explicit(x,y,o"+s+") atomic_exchange(x,y)\n" - "#define atomic_compare_exchange_strong_explicit(x,y,z,os,of"+s+") atomic_compare_exchange_strong(x,y,z)\n" - "#define atomic_compare_exchange_weak_explicit(x,y,z,os,of"+s+") atomic_compare_exchange_weak(x,y,z)\n" - "#define atomic_fetch_add_explicit(x,y,o"+s+") atomic_fetch_add(x,y)\n" - "#define atomic_fetch_sub_explicit(x,y,o"+s+") atomic_fetch_sub(x,y)\n" - "#define atomic_fetch_or_explicit(x,y,o"+s+") atomic_fetch_or(x,y)\n" - "#define atomic_fetch_xor_explicit(x,y,o"+s+") atomic_fetch_xor(x,y)\n" - "#define atomic_fetch_and_explicit(x,y,o"+s+") atomic_fetch_and(x,y)\n" - "#define atomic_fetch_min_explicit(x,y,o"+s+") atomic_fetch_min(x,y)\n" - "#define atomic_fetch_max_explicit(x,y,o"+s+") atomic_fetch_max(x,y)\n" - "#define atomic_flag_test_and_set_explicit(x,o"+s+") atomic_flag_test_and_set(x)\n" - "#define atomic_flag_clear_explicit(x,o"+s+") atomic_flag_clear(x)\n"; - } - return header+CBasicTest<HostAtomicType, HostDataType>::ProgramHeader(maxNumDestItems); - } - virtual std::string SingleTestName() - { - std::string testName = CBasicTest<HostAtomicType, HostDataType>::SingleTestName(); - if(MemoryOrder() != MEMORY_ORDER_EMPTY) - { - testName += std::string(", ")+std::string(get_memory_order_type_name(MemoryOrder())).substr(sizeof("memory")); - } - if(MemoryScope() != MEMORY_SCOPE_EMPTY) - { - testName += std::string(", ")+std::string(get_memory_scope_type_name(MemoryScope())).substr(sizeof("memory")); - } - return testName; - } - virtual int ExecuteSingleTest(cl_device_id deviceID, cl_context context, cl_command_queue queue) - { - if(LocalMemory() && - MemoryScope() != MEMORY_SCOPE_EMPTY && - 
MemoryScope() != MEMORY_SCOPE_WORK_GROUP) //memory scope should only be used for global memory - return 0; - if(MemoryScope() == MEMORY_SCOPE_DEVICE) - MaxGroupSize(16); // increase number of groups by forcing smaller group size - else - MaxGroupSize(0); // group size limited by device capabilities - - if (CheckCapabilities(MemoryScope(), MemoryOrder()) == TEST_SKIPPED_ITSELF) - return 0; // skip test - not applicable - - return CBasicTest<HostAtomicType, HostDataType>::ExecuteSingleTest(deviceID, context, queue); - } - virtual int ExecuteForEachParameterSet(cl_device_id deviceID, cl_context context, cl_command_queue queue) - { - // repeat test for each reasonable memory order/scope combination - std::vector<TExplicitMemoryOrderType> memoryOrder; - std::vector<TExplicitMemoryScopeType> memoryScope; - int error = 0; - - // For OpenCL-3.0 and later some orderings and scopes are optional, so here - // we query for the supported ones. - test_error_ret( - getSupportedMemoryOrdersAndScopes(deviceID, memoryOrder, memoryScope), - "getSupportedMemoryOrdersAndScopes failed\n", TEST_FAIL); - - for(unsigned oi = 0; oi < memoryOrder.size(); oi++) - { - for(unsigned si = 0; si < memoryScope.size(); si++) - { - if(memoryOrder[oi] == MEMORY_ORDER_EMPTY && memoryScope[si] != MEMORY_SCOPE_EMPTY) - continue; - MemoryOrder(memoryOrder[oi]); - MemoryScope(memoryScope[si]); - EXECUTE_TEST(error, (CBasicTest<HostAtomicType, HostDataType>::ExecuteForEachParameterSet(deviceID, context, queue))); - } - } - return error; - } - void MemoryOrder(TExplicitMemoryOrderType memoryOrder) {_memoryOrder = memoryOrder;} - TExplicitMemoryOrderType MemoryOrder() {return _memoryOrder;} - std::string MemoryOrderStr() - { - if(MemoryOrder() != MEMORY_ORDER_EMPTY) - return std::string(", ")+get_memory_order_type_name(MemoryOrder()); - return ""; - } - void MemoryScope(TExplicitMemoryScopeType memoryScope) {_memoryScope = memoryScope;} - TExplicitMemoryScopeType MemoryScope() {return _memoryScope;} - 
std::string MemoryScopeStr() - { - if(MemoryScope() != MEMORY_SCOPE_EMPTY) - return std::string(", ")+get_memory_scope_type_name(MemoryScope()); - return ""; - } - std::string MemoryOrderScopeStr() - { - return MemoryOrderStr()+MemoryScopeStr(); - } - virtual cl_uint CurrentGroupNum(cl_uint threadCount) - { - if(MemoryScope() == MEMORY_SCOPE_WORK_GROUP) - return 1; - return CBasicTest<HostAtomicType, HostDataType>::CurrentGroupNum(threadCount); - } - virtual cl_uint MaxHostThreads() - { - // block host threads execution for memory scope different than - // memory_scope_all_svm_devices - if (MemoryScope() == MEMORY_SCOPE_ALL_DEVICES - || MemoryScope() == MEMORY_SCOPE_ALL_SVM_DEVICES || gHost) - { - return CBasicTest<HostAtomicType, HostDataType>::MaxHostThreads(); - } - else - { - return 0; - } - } + using CBasicTest<HostAtomicType, HostDataType>::LocalMemory; + using CBasicTest<HostAtomicType, HostDataType>::MaxGroupSize; + using CBasicTest<HostAtomicType, HostDataType>::CheckCapabilities; + CBasicTestMemOrderScope(TExplicitAtomicType dataType, bool useSVM = false) + : CBasicTest<HostAtomicType, HostDataType>(dataType, useSVM) + {} + virtual std::string ProgramHeader(cl_uint maxNumDestItems) + { + std::string header; + if (gOldAPI) + { + std::string s = MemoryScope() == MEMORY_SCOPE_EMPTY ? 
"" : ",s"; + header += "#define atomic_store_explicit(x,y,o" + s + + ") atomic_store(x,y)\n" + "#define atomic_load_explicit(x,o" + + s + + ") atomic_load(x)\n" + "#define atomic_exchange_explicit(x,y,o" + + s + + ") atomic_exchange(x,y)\n" + "#define atomic_compare_exchange_strong_explicit(x,y,z,os,of" + + s + + ") atomic_compare_exchange_strong(x,y,z)\n" + "#define atomic_compare_exchange_weak_explicit(x,y,z,os,of" + + s + + ") atomic_compare_exchange_weak(x,y,z)\n" + "#define atomic_fetch_add_explicit(x,y,o" + + s + + ") atomic_fetch_add(x,y)\n" + "#define atomic_fetch_sub_explicit(x,y,o" + + s + + ") atomic_fetch_sub(x,y)\n" + "#define atomic_fetch_or_explicit(x,y,o" + + s + + ") atomic_fetch_or(x,y)\n" + "#define atomic_fetch_xor_explicit(x,y,o" + + s + + ") atomic_fetch_xor(x,y)\n" + "#define atomic_fetch_and_explicit(x,y,o" + + s + + ") atomic_fetch_and(x,y)\n" + "#define atomic_fetch_min_explicit(x,y,o" + + s + + ") atomic_fetch_min(x,y)\n" + "#define atomic_fetch_max_explicit(x,y,o" + + s + + ") atomic_fetch_max(x,y)\n" + "#define atomic_flag_test_and_set_explicit(x,o" + + s + + ") atomic_flag_test_and_set(x)\n" + "#define atomic_flag_clear_explicit(x,o" + + s + ") atomic_flag_clear(x)\n"; + } + return header + + CBasicTest<HostAtomicType, HostDataType>::ProgramHeader( + maxNumDestItems); + } + virtual std::string SingleTestName() + { + std::string testName = + CBasicTest<HostAtomicType, HostDataType>::SingleTestName(); + if (MemoryOrder() != MEMORY_ORDER_EMPTY) + { + testName += std::string(", ") + + std::string(get_memory_order_type_name(MemoryOrder())) + .substr(sizeof("memory")); + } + if (MemoryScope() != MEMORY_SCOPE_EMPTY) + { + testName += std::string(", ") + + std::string(get_memory_scope_type_name(MemoryScope())) + .substr(sizeof("memory")); + } + return testName; + } + virtual int ExecuteSingleTest(cl_device_id deviceID, cl_context context, + cl_command_queue queue) + { + if (LocalMemory() && MemoryScope() != MEMORY_SCOPE_EMPTY + && 
MemoryScope() + != MEMORY_SCOPE_WORK_GROUP) // memory scope should only be used + // for global memory + return 0; + if (MemoryScope() == MEMORY_SCOPE_DEVICE) + MaxGroupSize( + 16); // increase number of groups by forcing smaller group size + else + MaxGroupSize(0); // group size limited by device capabilities + + if (CheckCapabilities(MemoryScope(), MemoryOrder()) + == TEST_SKIPPED_ITSELF) + return 0; // skip test - not applicable + + return CBasicTest<HostAtomicType, HostDataType>::ExecuteSingleTest( + deviceID, context, queue); + } + virtual int ExecuteForEachParameterSet(cl_device_id deviceID, + cl_context context, + cl_command_queue queue) + { + // repeat test for each reasonable memory order/scope combination + std::vector<TExplicitMemoryOrderType> memoryOrder; + std::vector<TExplicitMemoryScopeType> memoryScope; + int error = 0; + + // For OpenCL-3.0 and later some orderings and scopes are optional, so + // here we query for the supported ones. + test_error_ret(getSupportedMemoryOrdersAndScopes(deviceID, memoryOrder, + memoryScope), + "getSupportedMemoryOrdersAndScopes failed\n", TEST_FAIL); + + for (unsigned oi = 0; oi < memoryOrder.size(); oi++) + { + for (unsigned si = 0; si < memoryScope.size(); si++) + { + if (memoryOrder[oi] == MEMORY_ORDER_EMPTY + && memoryScope[si] != MEMORY_SCOPE_EMPTY) + continue; + MemoryOrder(memoryOrder[oi]); + MemoryScope(memoryScope[si]); + EXECUTE_TEST( + error, + (CBasicTest<HostAtomicType, HostDataType>:: + ExecuteForEachParameterSet(deviceID, context, queue))); + } + } + return error; + } + void MemoryOrder(TExplicitMemoryOrderType memoryOrder) + { + _memoryOrder = memoryOrder; + } + TExplicitMemoryOrderType MemoryOrder() { return _memoryOrder; } + std::string MemoryOrderStr() + { + if (MemoryOrder() != MEMORY_ORDER_EMPTY) + return std::string(", ") + + get_memory_order_type_name(MemoryOrder()); + return ""; + } + void MemoryScope(TExplicitMemoryScopeType memoryScope) + { + _memoryScope = memoryScope; + } + 
TExplicitMemoryScopeType MemoryScope() { return _memoryScope; } + std::string MemoryScopeStr() + { + if (MemoryScope() != MEMORY_SCOPE_EMPTY) + return std::string(", ") + + get_memory_scope_type_name(MemoryScope()); + return ""; + } + std::string MemoryOrderScopeStr() + { + return MemoryOrderStr() + MemoryScopeStr(); + } + virtual cl_uint CurrentGroupNum(cl_uint threadCount) + { + if (MemoryScope() == MEMORY_SCOPE_WORK_GROUP) return 1; + return CBasicTest<HostAtomicType, HostDataType>::CurrentGroupNum( + threadCount); + } + virtual cl_uint MaxHostThreads() + { + // block host threads execution for memory scope different than + // memory_scope_all_svm_devices + if (MemoryScope() == MEMORY_SCOPE_ALL_DEVICES + || MemoryScope() == MEMORY_SCOPE_ALL_SVM_DEVICES || gHost) + { + return CBasicTest<HostAtomicType, HostDataType>::MaxHostThreads(); + } + else + { + return 0; + } + } + private: - TExplicitMemoryOrderType _memoryOrder; - TExplicitMemoryScopeType _memoryScope; + TExplicitMemoryOrderType _memoryOrder; + TExplicitMemoryScopeType _memoryScope; }; -template<typename HostAtomicType, typename HostDataType> -class CBasicTestMemOrder2Scope : public CBasicTestMemOrderScope<HostAtomicType, HostDataType> -{ +template <typename HostAtomicType, typename HostDataType> +class CBasicTestMemOrder2Scope + : public CBasicTestMemOrderScope<HostAtomicType, HostDataType> { public: - using CBasicTestMemOrderScope<HostAtomicType, HostDataType>::LocalMemory; - using CBasicTestMemOrderScope<HostAtomicType, HostDataType>::MemoryOrder; - using CBasicTestMemOrderScope<HostAtomicType, HostDataType>::MemoryScope; - using CBasicTestMemOrderScope<HostAtomicType, HostDataType>::MemoryOrderStr; - using CBasicTestMemOrderScope<HostAtomicType, HostDataType>::MemoryScopeStr; - using CBasicTest<HostAtomicType, HostDataType>::CheckCapabilities; - - CBasicTestMemOrder2Scope(TExplicitAtomicType dataType, bool useSVM = false) : CBasicTestMemOrderScope<HostAtomicType, HostDataType>(dataType, useSVM) - { - 
} - virtual std::string SingleTestName() - { - std::string testName = CBasicTest<HostAtomicType, HostDataType>::SingleTestName(); - if(MemoryOrder() != MEMORY_ORDER_EMPTY) - testName += std::string(", ")+std::string(get_memory_order_type_name(MemoryOrder())).substr(sizeof("memory")); - if(MemoryOrder2() != MEMORY_ORDER_EMPTY) - testName += std::string(", ")+std::string(get_memory_order_type_name(MemoryOrder2())).substr(sizeof("memory")); - if(MemoryScope() != MEMORY_SCOPE_EMPTY) - testName += std::string(", ")+std::string(get_memory_scope_type_name(MemoryScope())).substr(sizeof("memory")); - return testName; - } - virtual int ExecuteForEachParameterSet(cl_device_id deviceID, cl_context context, cl_command_queue queue) - { - // repeat test for each reasonable memory order/scope combination - std::vector<TExplicitMemoryOrderType> memoryOrder; - std::vector<TExplicitMemoryScopeType> memoryScope; - int error = 0; - - // For OpenCL-3.0 and later some orderings and scopes are optional, so here - // we query for the supported ones. 
- test_error_ret( - getSupportedMemoryOrdersAndScopes(deviceID, memoryOrder, memoryScope), - "getSupportedMemoryOrdersAndScopes failed\n", TEST_FAIL); - - for(unsigned oi = 0; oi < memoryOrder.size(); oi++) - { - for(unsigned o2i = 0; o2i < memoryOrder.size(); o2i++) - { - for(unsigned si = 0; si < memoryScope.size(); si++) + using CBasicTestMemOrderScope<HostAtomicType, HostDataType>::LocalMemory; + using CBasicTestMemOrderScope<HostAtomicType, HostDataType>::MemoryOrder; + using CBasicTestMemOrderScope<HostAtomicType, HostDataType>::MemoryScope; + using CBasicTestMemOrderScope<HostAtomicType, HostDataType>::MemoryOrderStr; + using CBasicTestMemOrderScope<HostAtomicType, HostDataType>::MemoryScopeStr; + using CBasicTest<HostAtomicType, HostDataType>::CheckCapabilities; + + CBasicTestMemOrder2Scope(TExplicitAtomicType dataType, bool useSVM = false) + : CBasicTestMemOrderScope<HostAtomicType, HostDataType>(dataType, + useSVM) + {} + virtual std::string SingleTestName() + { + std::string testName = + CBasicTest<HostAtomicType, HostDataType>::SingleTestName(); + if (MemoryOrder() != MEMORY_ORDER_EMPTY) + testName += std::string(", ") + + std::string(get_memory_order_type_name(MemoryOrder())) + .substr(sizeof("memory")); + if (MemoryOrder2() != MEMORY_ORDER_EMPTY) + testName += std::string(", ") + + std::string(get_memory_order_type_name(MemoryOrder2())) + .substr(sizeof("memory")); + if (MemoryScope() != MEMORY_SCOPE_EMPTY) + testName += std::string(", ") + + std::string(get_memory_scope_type_name(MemoryScope())) + .substr(sizeof("memory")); + return testName; + } + virtual int ExecuteForEachParameterSet(cl_device_id deviceID, + cl_context context, + cl_command_queue queue) + { + // repeat test for each reasonable memory order/scope combination + std::vector<TExplicitMemoryOrderType> memoryOrder; + std::vector<TExplicitMemoryScopeType> memoryScope; + int error = 0; + + // For OpenCL-3.0 and later some orderings and scopes are optional, so + // here we query for the 
supported ones. + test_error_ret(getSupportedMemoryOrdersAndScopes(deviceID, memoryOrder, + memoryScope), + "getSupportedMemoryOrdersAndScopes failed\n", TEST_FAIL); + + for (unsigned oi = 0; oi < memoryOrder.size(); oi++) { - if((memoryOrder[oi] == MEMORY_ORDER_EMPTY || memoryOrder[o2i] == MEMORY_ORDER_EMPTY) - && memoryOrder[oi] != memoryOrder[o2i]) - continue; // both memory order arguments must be set (or none) - if((memoryOrder[oi] == MEMORY_ORDER_EMPTY || memoryOrder[o2i] == MEMORY_ORDER_EMPTY) - && memoryScope[si] != MEMORY_SCOPE_EMPTY) - continue; // memory scope without memory order is not allowed - MemoryOrder(memoryOrder[oi]); - MemoryOrder2(memoryOrder[o2i]); - MemoryScope(memoryScope[si]); - - if (CheckCapabilities(MemoryScope(), MemoryOrder()) - == TEST_SKIPPED_ITSELF) - continue; // skip test - not applicable - - if (CheckCapabilities(MemoryScope(), MemoryOrder2()) - == TEST_SKIPPED_ITSELF) - continue; // skip test - not applicable - - EXECUTE_TEST(error, (CBasicTest<HostAtomicType, HostDataType>::ExecuteForEachParameterSet(deviceID, context, queue))); + for (unsigned o2i = 0; o2i < memoryOrder.size(); o2i++) + { + for (unsigned si = 0; si < memoryScope.size(); si++) + { + if ((memoryOrder[oi] == MEMORY_ORDER_EMPTY + || memoryOrder[o2i] == MEMORY_ORDER_EMPTY) + && memoryOrder[oi] != memoryOrder[o2i]) + continue; // both memory order arguments must be set (or + // none) + if ((memoryOrder[oi] == MEMORY_ORDER_EMPTY + || memoryOrder[o2i] == MEMORY_ORDER_EMPTY) + && memoryScope[si] != MEMORY_SCOPE_EMPTY) + continue; // memory scope without memory order is not + // allowed + MemoryOrder(memoryOrder[oi]); + MemoryOrder2(memoryOrder[o2i]); + MemoryScope(memoryScope[si]); + + if (CheckCapabilities(MemoryScope(), MemoryOrder()) + == TEST_SKIPPED_ITSELF) + continue; // skip test - not applicable + + if (CheckCapabilities(MemoryScope(), MemoryOrder2()) + == TEST_SKIPPED_ITSELF) + continue; // skip test - not applicable + + EXECUTE_TEST(error, + 
(CBasicTest<HostAtomicType, HostDataType>:: + ExecuteForEachParameterSet( + deviceID, context, queue))); + } + } } - } - } - return error; - } - void MemoryOrder2(TExplicitMemoryOrderType memoryOrderFail) {_memoryOrder2 = memoryOrderFail;} - TExplicitMemoryOrderType MemoryOrder2() {return _memoryOrder2;} - std::string MemoryOrderFailStr() - { - if(MemoryOrder2() != MEMORY_ORDER_EMPTY) - return std::string(", ")+get_memory_order_type_name(MemoryOrder2()); - return ""; - } - std::string MemoryOrderScope() - { - return MemoryOrderStr()+MemoryOrderFailStr()+MemoryScopeStr(); - } + return error; + } + void MemoryOrder2(TExplicitMemoryOrderType memoryOrderFail) + { + _memoryOrder2 = memoryOrderFail; + } + TExplicitMemoryOrderType MemoryOrder2() { return _memoryOrder2; } + std::string MemoryOrderFailStr() + { + if (MemoryOrder2() != MEMORY_ORDER_EMPTY) + return std::string(", ") + + get_memory_order_type_name(MemoryOrder2()); + return ""; + } + std::string MemoryOrderScope() + { + return MemoryOrderStr() + MemoryOrderFailStr() + MemoryScopeStr(); + } + private: - TExplicitMemoryOrderType _memoryOrder2; + TExplicitMemoryOrderType _memoryOrder2; }; -template<typename HostAtomicType, typename HostDataType> -std::string CBasicTest<HostAtomicType, HostDataType>::PragmaHeader(cl_device_id deviceID) +template <typename HostAtomicType, typename HostDataType> +std::string +CBasicTest<HostAtomicType, HostDataType>::PragmaHeader(cl_device_id deviceID) { - std::string pragma; - - if(gOldAPI) - { - pragma += "#pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable\n"; - pragma += "#pragma OPENCL EXTENSION cl_khr_local_int32_extended_atomics : enable\n"; - pragma += "#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable\n"; - pragma += "#pragma OPENCL EXTENSION cl_khr_global_int32_extended_atomics : enable\n"; - } - // Create the pragma lines for this kernel - if(DataType().Size(deviceID) == 8) - { - pragma += "#pragma OPENCL EXTENSION cl_khr_int64_base_atomics : 
enable\n"; - pragma += "#pragma OPENCL EXTENSION cl_khr_int64_extended_atomics : enable\n"; - } - if(_dataType == TYPE_ATOMIC_DOUBLE) - pragma += "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n"; - return pragma; + std::string pragma; + + if (gOldAPI) + { + pragma += "#pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : " + "enable\n"; + pragma += "#pragma OPENCL EXTENSION " + "cl_khr_local_int32_extended_atomics : enable\n"; + pragma += "#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : " + "enable\n"; + pragma += "#pragma OPENCL EXTENSION " + "cl_khr_global_int32_extended_atomics : enable\n"; + } + // Create the pragma lines for this kernel + if (DataType().Size(deviceID) == 8) + { + pragma += + "#pragma OPENCL EXTENSION cl_khr_int64_base_atomics : enable\n"; + pragma += + "#pragma OPENCL EXTENSION cl_khr_int64_extended_atomics : enable\n"; + } + if (_dataType == TYPE_ATOMIC_DOUBLE) + pragma += "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n"; + return pragma; } -template<typename HostAtomicType, typename HostDataType> -std::string CBasicTest<HostAtomicType, HostDataType>::ProgramHeader(cl_uint maxNumDestItems) +template <typename HostAtomicType, typename HostDataType> +std::string +CBasicTest<HostAtomicType, HostDataType>::ProgramHeader(cl_uint maxNumDestItems) { - // Create the program header - std::string header; - std::string aTypeName = DataType().AtomicTypeName(); - std::string cTypeName = DataType().RegularTypeName(); - std::string argListForKernel; - std::string argListForFunction; - std::string argListNoTypes; - std::string functionPrototype; - std::string addressSpace = LocalMemory() ? 
"__local " : "__global "; - - if(gOldAPI) - { - header += std::string("#define ")+aTypeName+" "+cTypeName+"\n" - "#define atomic_store(x,y) (*(x) = y)\n" - "#define atomic_load(x) (*(x))\n" - "#define ATOMIC_VAR_INIT(x) (x)\n" - "#define ATOMIC_FLAG_INIT 0\n" - "#define atomic_init(x,y) atomic_store(x,y)\n"; - if(aTypeName == "atomic_float") - header += "#define atomic_exchange(x,y) atomic_xchg(x,y)\n"; - else if(aTypeName == "atomic_double") - header += "double atomic_exchange(volatile "+addressSpace+"atomic_double *x, double y)\n" - "{\n" - " long tmp = *(long*)&y, res;\n" - " volatile "+addressSpace+"long *tmpA = (volatile "+addressSpace+"long)x;\n" - " res = atom_xchg(tmpA,tmp);\n" - " return *(double*)&res;\n" - "}\n"; - else - header += "#define atomic_exchange(x,y) atom_xchg(x,y)\n"; - if(aTypeName != "atomic_float" && aTypeName != "atomic_double") - header += - "bool atomic_compare_exchange_strong(volatile "+addressSpace+" "+aTypeName+" *a, "+cTypeName+" *expected, "+cTypeName+" desired)\n" - "{\n" - " "+cTypeName+" old = atom_cmpxchg(a, *expected, desired);\n" - " if(old == *expected)\n" - " return true;\n" - " *expected = old;\n" - " return false;\n" - "}\n" - "#define atomic_compare_exchange_weak atomic_compare_exchange_strong\n"; - header += - "#define atomic_fetch_add(x,y) atom_add(x,y)\n" - "#define atomic_fetch_sub(x,y) atom_sub(x,y)\n" - "#define atomic_fetch_or(x,y) atom_or(x,y)\n" - "#define atomic_fetch_xor(x,y) atom_xor(x,y)\n" - "#define atomic_fetch_and(x,y) atom_and(x,y)\n" - "#define atomic_fetch_min(x,y) atom_min(x,y)\n" - "#define atomic_fetch_max(x,y) atom_max(x,y)\n" - "#define atomic_flag_test_and_set(x) atomic_exchange(x,1)\n" - "#define atomic_flag_clear(x) atomic_store(x,0)\n" - "\n"; - } - if(!LocalMemory() && DeclaredInProgram()) - { - // additional atomic variable for results copying (last thread will do this) - header += "__global volatile atomic_uint finishedThreads = ATOMIC_VAR_INIT(0);\n"; - // atomic variables declared in 
program scope - test data - std::stringstream ss; - ss << maxNumDestItems; - header += std::string("__global volatile ")+aTypeName+" destMemory["+ss.str()+"] = {\n"; - ss.str(""); - ss << _startValue; - for(cl_uint i = 0; i < maxNumDestItems; i++) - { - if(aTypeName == "atomic_flag") - header += " ATOMIC_FLAG_INIT"; - else - header += " ATOMIC_VAR_INIT("+ss.str()+")"; - if(i+1 < maxNumDestItems) - header += ","; - header += "\n"; - } - header+= - "};\n" - "\n"; - } - return header; + // Create the program header + std::string header; + std::string aTypeName = DataType().AtomicTypeName(); + std::string cTypeName = DataType().RegularTypeName(); + std::string argListForKernel; + std::string argListForFunction; + std::string argListNoTypes; + std::string functionPrototype; + std::string addressSpace = LocalMemory() ? "__local " : "__global "; + + if (gOldAPI) + { + header += std::string("#define ") + aTypeName + " " + cTypeName + + "\n" + "#define atomic_store(x,y) (*(x) " + "= y)\n" + "#define atomic_load(x) " + "(*(x))\n" + "#define ATOMIC_VAR_INIT(x) (x)\n" + "#define ATOMIC_FLAG_INIT 0\n" + "#define atomic_init(x,y) " + "atomic_store(x,y)\n"; + if (aTypeName == "atomic_float") + header += "#define atomic_exchange(x,y) " + " atomic_xchg(x,y)\n"; + else if (aTypeName == "atomic_double") + header += "double atomic_exchange(volatile " + addressSpace + + "atomic_double *x, double y)\n" + "{\n" + " long tmp = *(long*)&y, res;\n" + " volatile " + + addressSpace + "long *tmpA = (volatile " + addressSpace + + "long)x;\n" + " res = atom_xchg(tmpA,tmp);\n" + " return *(double*)&res;\n" + "}\n"; + else + header += "#define atomic_exchange(x,y) " + " atom_xchg(x,y)\n"; + if (aTypeName != "atomic_float" && aTypeName != "atomic_double") + header += "bool atomic_compare_exchange_strong(volatile " + + addressSpace + " " + aTypeName + " *a, " + cTypeName + + " *expected, " + cTypeName + + " desired)\n" + "{\n" + " " + + cTypeName + + " old = atom_cmpxchg(a, *expected, desired);\n" + 
" if(old == *expected)\n" + " return true;\n" + " *expected = old;\n" + " return false;\n" + "}\n" + "#define atomic_compare_exchange_weak " + "atomic_compare_exchange_strong\n"; + header += "#define atomic_fetch_add(x,y) " + "atom_add(x,y)\n" + "#define atomic_fetch_sub(x,y) " + "atom_sub(x,y)\n" + "#define atomic_fetch_or(x,y) " + "atom_or(x,y)\n" + "#define atomic_fetch_xor(x,y) " + "atom_xor(x,y)\n" + "#define atomic_fetch_and(x,y) " + "atom_and(x,y)\n" + "#define atomic_fetch_min(x,y) " + "atom_min(x,y)\n" + "#define atomic_fetch_max(x,y) " + "atom_max(x,y)\n" + "#define atomic_flag_test_and_set(x) " + "atomic_exchange(x,1)\n" + "#define atomic_flag_clear(x) " + "atomic_store(x,0)\n" + "\n"; + } + if (!LocalMemory() && DeclaredInProgram()) + { + // additional atomic variable for results copying (last thread will do + // this) + header += "__global volatile atomic_uint finishedThreads = " + "ATOMIC_VAR_INIT(0);\n"; + // atomic variables declared in program scope - test data + std::stringstream ss; + ss << maxNumDestItems; + header += std::string("__global volatile ") + aTypeName + " destMemory[" + + ss.str() + "] = {\n"; + ss.str(""); + ss << _startValue; + for (cl_uint i = 0; i < maxNumDestItems; i++) + { + if (aTypeName == "atomic_flag") + header += " ATOMIC_FLAG_INIT"; + else + header += " ATOMIC_VAR_INIT(" + ss.str() + ")"; + if (i + 1 < maxNumDestItems) header += ","; + header += "\n"; + } + header += "};\n" + "\n"; + } + return header; } -template<typename HostAtomicType, typename HostDataType> +template <typename HostAtomicType, typename HostDataType> std::string CBasicTest<HostAtomicType, HostDataType>::FunctionCode() { - if(!UsedInFunction()) - return ""; - std::string addressSpace = LocalMemory() ? 
"__local " : "__global "; - std::string code = "void test_atomic_function(uint tid, uint threadCount, uint numDestItems, volatile "; - if(!GenericAddrSpace()) - code += addressSpace; - code += std::string(DataType().AtomicTypeName())+" *destMemory, __global "+DataType().RegularTypeName()+ - " *oldValues"; - if(LocalRefValues()) - code += std::string(", __local ")+DataType().RegularTypeName()+" *localValues"; - code += ")\n" - "{\n"; - code += ProgramCore(); - code += "}\n" - "\n"; - return code; + if (!UsedInFunction()) return ""; + std::string addressSpace = LocalMemory() ? "__local " : "__global "; + std::string code = "void test_atomic_function(uint tid, uint threadCount, " + "uint numDestItems, volatile "; + if (!GenericAddrSpace()) code += addressSpace; + code += std::string(DataType().AtomicTypeName()) + " *destMemory, __global " + + DataType().RegularTypeName() + " *oldValues"; + if (LocalRefValues()) + code += std::string(", __local ") + DataType().RegularTypeName() + + " *localValues"; + code += ")\n" + "{\n"; + code += ProgramCore(); + code += "}\n" + "\n"; + return code; } -template<typename HostAtomicType, typename HostDataType> -std::string CBasicTest<HostAtomicType, HostDataType>::KernelCode(cl_uint maxNumDestItems) +template <typename HostAtomicType, typename HostDataType> +std::string +CBasicTest<HostAtomicType, HostDataType>::KernelCode(cl_uint maxNumDestItems) { - std::string aTypeName = DataType().AtomicTypeName(); - std::string cTypeName = DataType().RegularTypeName(); - std::string addressSpace = LocalMemory() ? "__local " : "__global "; - std::string code = "__kernel void test_atomic_kernel(uint threadCount, uint numDestItems, "; - - // prepare list of arguments for kernel - if(LocalMemory()) - { - code += std::string("__global ")+cTypeName+" *finalDest, __global "+cTypeName+" *oldValues," - " volatile "+addressSpace+aTypeName+" *"+(DeclaredInProgram() ? 
"notUsed" : "")+"destMemory"; - } - else - { - code += "volatile "+addressSpace+(DeclaredInProgram() ? (cTypeName+" *finalDest") : (aTypeName+" *destMemory"))+ - ", __global "+cTypeName+" *oldValues"; - } - if(LocalRefValues()) - code += std::string(", __local ")+cTypeName+" *localValues"; - code += ")\n" - "{\n"; - if(LocalMemory() && DeclaredInProgram()) - { - // local atomics declared in kernel scope - std::stringstream ss; - ss << maxNumDestItems; - code += std::string(" __local volatile ")+aTypeName+" destMemory["+ss.str()+"];\n"; - } - code += " uint tid = get_global_id(0);\n" - "\n"; - if(LocalMemory()) - { - // memory_order_relaxed is sufficient for these initialization operations - // as the barrier below will act as a fence, providing an order to the - // operations. memory_scope_work_group is sufficient as local memory is - // only visible within the work-group. - code += R"( + std::string aTypeName = DataType().AtomicTypeName(); + std::string cTypeName = DataType().RegularTypeName(); + std::string addressSpace = LocalMemory() ? "__local " : "__global "; + std::string code = "__kernel void test_atomic_kernel(uint threadCount, " + "uint numDestItems, "; + + // prepare list of arguments for kernel + if (LocalMemory()) + { + code += std::string("__global ") + cTypeName + " *finalDest, __global " + + cTypeName + + " *oldValues," + " volatile " + + addressSpace + aTypeName + " *" + + (DeclaredInProgram() ? "notUsed" : "") + "destMemory"; + } + else + { + code += "volatile " + addressSpace + + (DeclaredInProgram() ? 
(cTypeName + " *finalDest") + : (aTypeName + " *destMemory")) + + ", __global " + cTypeName + " *oldValues"; + } + if (LocalRefValues()) + code += std::string(", __local ") + cTypeName + " *localValues"; + code += ")\n" + "{\n"; + if (LocalMemory() && DeclaredInProgram()) + { + // local atomics declared in kernel scope + std::stringstream ss; + ss << maxNumDestItems; + code += std::string(" __local volatile ") + aTypeName + " destMemory[" + + ss.str() + "];\n"; + } + code += " uint tid = get_global_id(0);\n" + "\n"; + if (LocalMemory()) + { + // memory_order_relaxed is sufficient for these initialization + // operations as the barrier below will act as a fence, providing an + // order to the operations. memory_scope_work_group is sufficient as + // local memory is only visible within the work-group. + code += R"( // initialize atomics not reachable from host (first thread // is doing this, other threads are waiting on barrier) if(get_local_id(0) == 0) for(uint dstItemIdx = 0; dstItemIdx < numDestItems; dstItemIdx++) {)"; - if (aTypeName == "atomic_flag") - { - code += R"( + if (aTypeName == "atomic_flag") + { + code += R"( if(finalDest[dstItemIdx]) atomic_flag_test_and_set_explicit(destMemory+dstItemIdx, memory_order_relaxed, @@ -823,512 +976,595 @@ std::string CBasicTest<HostAtomicType, HostDataType>::KernelCode(cl_uint maxNumD atomic_flag_clear_explicit(destMemory+dstItemIdx, memory_order_relaxed, memory_scope_work_group);)"; - } - else - { - code += R"( + } + else + { + code += R"( atomic_store_explicit(destMemory+dstItemIdx, finalDest[dstItemIdx], memory_order_relaxed, memory_scope_work_group);)"; + } + code += " }\n" + " barrier(CLK_LOCAL_MEM_FENCE);\n" + "\n"; } - code += - " }\n" - " barrier(CLK_LOCAL_MEM_FENCE);\n" - "\n"; - } - if (LocalRefValues()) - { - code += - " // Copy input reference values into local memory\n"; - if (NumNonAtomicVariablesPerThread() == 1) - code += " localValues[get_local_id(0)] = oldValues[tid];\n"; - else + if (LocalRefValues()) 
{ - std::stringstream ss; - ss << NumNonAtomicVariablesPerThread(); - code += - " for(uint rfId = 0; rfId < " + ss.str() + "; rfId++)\n" - " localValues[get_local_id(0)*" + ss.str() + "+rfId] = oldValues[tid*" + ss.str() + "+rfId];\n"; - } - code += - " barrier(CLK_LOCAL_MEM_FENCE);\n" - "\n"; - } - if (UsedInFunction()) - code += std::string(" test_atomic_function(tid, threadCount, numDestItems, destMemory, oldValues")+ - (LocalRefValues() ? ", localValues" : "")+");\n"; - else - code += ProgramCore(); - code += "\n"; - if (LocalRefValues()) - { - code += - " // Copy local reference values into output array\n" - " barrier(CLK_LOCAL_MEM_FENCE);\n"; - if (NumNonAtomicVariablesPerThread() == 1) - code += " oldValues[tid] = localValues[get_local_id(0)];\n"; + code += " // Copy input reference values into local memory\n"; + if (NumNonAtomicVariablesPerThread() == 1) + code += " localValues[get_local_id(0)] = oldValues[tid];\n"; + else + { + std::stringstream ss; + ss << NumNonAtomicVariablesPerThread(); + code += " for(uint rfId = 0; rfId < " + ss.str() + + "; rfId++)\n" + " localValues[get_local_id(0)*" + + ss.str() + "+rfId] = oldValues[tid*" + ss.str() + "+rfId];\n"; + } + code += " barrier(CLK_LOCAL_MEM_FENCE);\n" + "\n"; + } + if (UsedInFunction()) + code += std::string(" test_atomic_function(tid, threadCount, " + "numDestItems, destMemory, oldValues") + + (LocalRefValues() ? 
", localValues" : "") + ");\n"; else + code += ProgramCore(); + code += "\n"; + if (LocalRefValues()) { - std::stringstream ss; - ss << NumNonAtomicVariablesPerThread(); - code += - " for(uint rfId = 0; rfId < " + ss.str() + "; rfId++)\n" - " oldValues[tid*" + ss.str() + "+rfId] = localValues[get_local_id(0)*" + ss.str() + "+rfId];\n"; + code += " // Copy local reference values into output array\n" + " barrier(CLK_LOCAL_MEM_FENCE);\n"; + if (NumNonAtomicVariablesPerThread() == 1) + code += " oldValues[tid] = localValues[get_local_id(0)];\n"; + else + { + std::stringstream ss; + ss << NumNonAtomicVariablesPerThread(); + code += " for(uint rfId = 0; rfId < " + ss.str() + + "; rfId++)\n" + " oldValues[tid*" + + ss.str() + "+rfId] = localValues[get_local_id(0)*" + ss.str() + + "+rfId];\n"; + } + code += "\n"; } - code += "\n"; - } - if(LocalMemory() || DeclaredInProgram()) - { - code += " // Copy final values to host reachable buffer\n"; - if(LocalMemory()) - code += - " barrier(CLK_LOCAL_MEM_FENCE);\n" - " if(get_local_id(0) == 0) // first thread in workgroup\n"; - else - // global atomics declared in program scope - code += R"( - if(atomic_fetch_add_explicit(&finishedThreads, 1u, - memory_order_relaxed, - memory_scope_work_group) - == get_global_size(0)-1) // last finished thread - )"; - code += - " for(uint dstItemIdx = 0; dstItemIdx < numDestItems; dstItemIdx++)\n"; - if(aTypeName == "atomic_flag") + if (LocalMemory()) { - code += R"( + code += " // Copy final values to host reachable buffer\n"; + code += " barrier(CLK_LOCAL_MEM_FENCE);\n" + " if(get_local_id(0) == 0) // first thread in workgroup\n"; + code += " for(uint dstItemIdx = 0; dstItemIdx < numDestItems; " + "dstItemIdx++)\n"; + if (aTypeName == "atomic_flag") + { + code += R"( finalDest[dstItemIdx] = atomic_flag_test_and_set_explicit(destMemory+dstItemIdx, memory_order_relaxed, memory_scope_work_group);)"; + } + else + { + code += R"( + finalDest[dstItemIdx] = + atomic_load_explicit(destMemory+dstItemIdx, 
+ memory_order_relaxed, + memory_scope_work_group);)"; + } } - else + else if (DeclaredInProgram()) { + // global atomics declared in program scope + code += " // Copy final values to host reachable buffer\n"; code += R"( + if(atomic_fetch_add_explicit(&finishedThreads, 1u, + memory_order_acq_rel, + memory_scope_device) + == get_global_size(0)-1) // last finished thread + )"; + code += " for(uint dstItemIdx = 0; dstItemIdx < numDestItems; " + "dstItemIdx++)\n"; + if (aTypeName == "atomic_flag") + { + code += R"( + finalDest[dstItemIdx] = + atomic_flag_test_and_set_explicit(destMemory+dstItemIdx, + memory_order_relaxed, + memory_scope_device);)"; + } + else + { + code += R"( finalDest[dstItemIdx] = atomic_load_explicit(destMemory+dstItemIdx, memory_order_relaxed, - memory_scope_work_group);)"; + memory_scope_device);)"; + } } - } - code += "}\n" - "\n"; - return code; + code += "}\n" + "\n"; + return code; } template <typename HostAtomicType, typename HostDataType> -int CBasicTest<HostAtomicType, HostDataType>::ExecuteSingleTest(cl_device_id deviceID, cl_context context, cl_command_queue queue) +int CBasicTest<HostAtomicType, HostDataType>::ExecuteSingleTest( + cl_device_id deviceID, cl_context context, cl_command_queue queue) { - int error; - clProgramWrapper program; - clKernelWrapper kernel; - size_t threadNum[1]; - clMemWrapper streams[2]; - std::vector<HostAtomicType> destItems; - HostAtomicType *svmAtomicBuffer = 0; - std::vector<HostDataType> refValues, startRefValues; - HostDataType *svmDataBuffer = 0; - cl_uint deviceThreadCount, hostThreadCount, threadCount; - size_t groupSize = 0; - std::string programSource; - const char *programLine; - MTdata d; - size_t typeSize = DataType().Size(deviceID); - - deviceThreadCount = _maxDeviceThreads; - hostThreadCount = MaxHostThreads(); - threadCount = deviceThreadCount+hostThreadCount; - - //log_info("\t%s %s%s...\n", local ? 
"local" : "global", DataType().AtomicTypeName(), memoryOrderScope.c_str()); - log_info("\t%s...\n", SingleTestName().c_str()); - - if(!LocalMemory() && DeclaredInProgram() && gNoGlobalVariables) // no support for program scope global variables - { - log_info("\t\tTest disabled\n"); - return 0; - } - if(UsedInFunction() && GenericAddrSpace() && gNoGenericAddressSpace) - { - log_info("\t\tTest disabled\n"); - return 0; - } - - // set up work sizes based on device capabilities and test configuration - error = clGetDeviceInfo(deviceID, CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof(groupSize), &groupSize, NULL); - test_error(error, "Unable to obtain max work group size for device"); - CurrentGroupSize((cl_uint)groupSize); - if(CurrentGroupSize() > deviceThreadCount) - CurrentGroupSize(deviceThreadCount); - if(CurrentGroupNum(deviceThreadCount) == 1 || gOldAPI) - deviceThreadCount = CurrentGroupSize()*CurrentGroupNum(deviceThreadCount); - threadCount = deviceThreadCount+hostThreadCount; - - // If we're given a num_results function, we need to determine how many result objects we need. 
- // This is the first assessment for current maximum number of threads (exact thread count is not known here) - // - needed for program source code generation (arrays of atomics declared in program) - cl_uint numDestItems = NumResults(threadCount, deviceID); - - if(deviceThreadCount > 0) - { - // This loop iteratively reduces the workgroup size by 2 and then - // re-generates the kernel with the reduced - // workgroup size until we find a size which is admissible for the kernel - // being run or reduce the wg size - // to the trivial case of 1 (which was separately verified to be accurate - // for the kernel being run) - - while ((CurrentGroupSize() > 1)) - { - // Re-generate the kernel code with the current group size - if (kernel) clReleaseKernel(kernel); - if (program) clReleaseProgram(program); - programSource = PragmaHeader(deviceID) + ProgramHeader(numDestItems) - + FunctionCode() + KernelCode(numDestItems); - programLine = programSource.c_str(); - if (create_single_kernel_helper_with_build_options( - context, &program, &kernel, 1, &programLine, - "test_atomic_kernel", gOldAPI ? 
"" : nullptr)) - { - return -1; - } - // Get work group size for the new kernel - error = clGetKernelWorkGroupInfo(kernel, deviceID, - CL_KERNEL_WORK_GROUP_SIZE, - sizeof(groupSize), &groupSize, NULL); - test_error(error, - "Unable to obtain max work group size for device and " - "kernel combo"); - - if (LocalMemory()) - { - cl_ulong usedLocalMemory; - cl_ulong totalLocalMemory; - cl_uint maxWorkGroupSize; - - error = clGetKernelWorkGroupInfo( - kernel, deviceID, CL_KERNEL_LOCAL_MEM_SIZE, - sizeof(usedLocalMemory), &usedLocalMemory, NULL); - test_error(error, "clGetKernelWorkGroupInfo failed"); - - error = clGetDeviceInfo(deviceID, CL_DEVICE_LOCAL_MEM_SIZE, - sizeof(totalLocalMemory), - &totalLocalMemory, NULL); - test_error(error, "clGetDeviceInfo failed"); - - // We know that each work-group is going to use typeSize * - // deviceThreadCount bytes of local memory - // so pick the maximum value for deviceThreadCount that uses all - // the local memory. - maxWorkGroupSize = - ((totalLocalMemory - usedLocalMemory) / typeSize); - - if (maxWorkGroupSize < groupSize) groupSize = maxWorkGroupSize; - } - if (CurrentGroupSize() <= groupSize) - break; - else - CurrentGroupSize(CurrentGroupSize() / 2); - } - if(CurrentGroupSize() > deviceThreadCount) - CurrentGroupSize(deviceThreadCount); - if(CurrentGroupNum(deviceThreadCount) == 1 || gOldAPI) - deviceThreadCount = CurrentGroupSize()*CurrentGroupNum(deviceThreadCount); - threadCount = deviceThreadCount+hostThreadCount; - } - if (gDebug) - { - log_info("Program source:\n"); - log_info("%s\n", programLine); - } - if(deviceThreadCount > 0) - log_info("\t\t(thread count %u, group size %u)\n", deviceThreadCount, CurrentGroupSize()); - if(hostThreadCount > 0) - log_info("\t\t(host threads %u)\n", hostThreadCount); - - refValues.resize(threadCount*NumNonAtomicVariablesPerThread()); - - // Generate ref data if we have a ref generator provided - d = init_genrand(gRandomSeed); - 
startRefValues.resize(threadCount*NumNonAtomicVariablesPerThread()); - if(GenerateRefs(threadCount, &startRefValues[0], d)) - { - //copy ref values for host threads - memcpy(&refValues[0], &startRefValues[0], sizeof(HostDataType)*threadCount*NumNonAtomicVariablesPerThread()); - } - else - { - startRefValues.resize(0); - } - free_mtdata(d); - d = NULL; - - // If we're given a num_results function, we need to determine how many result objects we need. If - // we don't have it, we assume it's just 1 - // This is final value (exact thread count is known in this place) - numDestItems = NumResults(threadCount, deviceID); - - destItems.resize(numDestItems); - for(cl_uint i = 0; i < numDestItems; i++) - destItems[i] = _startValue; - - // Create main buffer with atomic variables (array size dependent on particular test) - if(UseSVM()) - { - if(gUseHostPtr) - svmAtomicBuffer = (HostAtomicType*)malloc(typeSize * numDestItems); - else - svmAtomicBuffer = (HostAtomicType*)clSVMAlloc(context, CL_MEM_SVM_FINE_GRAIN_BUFFER | CL_MEM_SVM_ATOMICS, typeSize * numDestItems, 0); - if(!svmAtomicBuffer) - { - log_error("ERROR: clSVMAlloc failed!\n"); - return -1; - } - memcpy(svmAtomicBuffer, &destItems[0], typeSize * numDestItems); - streams[0] = clCreateBuffer(context, CL_MEM_USE_HOST_PTR, - typeSize * numDestItems, svmAtomicBuffer, NULL); - } - else - { - streams[0] = clCreateBuffer(context, CL_MEM_COPY_HOST_PTR, - typeSize * numDestItems, &destItems[0], NULL); - } - if (!streams[0]) - { - log_error("ERROR: Creating output array failed!\n"); - return -1; - } - // Create buffer for per-thread input/output data - if(UseSVM()) - { - if(gUseHostPtr) - svmDataBuffer = (HostDataType*)malloc(typeSize*threadCount*NumNonAtomicVariablesPerThread()); - else - svmDataBuffer = (HostDataType*)clSVMAlloc(context, CL_MEM_SVM_FINE_GRAIN_BUFFER | (SVMDataBufferAllSVMConsistent() ? 
CL_MEM_SVM_ATOMICS : 0), typeSize*threadCount*NumNonAtomicVariablesPerThread(), 0); - if(!svmDataBuffer) - { - log_error("ERROR: clSVMAlloc failed!\n"); - return -1; - } - if(startRefValues.size()) - memcpy(svmDataBuffer, &startRefValues[0], typeSize*threadCount*NumNonAtomicVariablesPerThread()); - streams[1] = clCreateBuffer(context, CL_MEM_USE_HOST_PTR, - typeSize * threadCount - * NumNonAtomicVariablesPerThread(), - svmDataBuffer, NULL); - } - else - { - streams[1] = clCreateBuffer( - context, - ((startRefValues.size() ? CL_MEM_COPY_HOST_PTR : CL_MEM_READ_WRITE)), - typeSize * threadCount * NumNonAtomicVariablesPerThread(), - startRefValues.size() ? &startRefValues[0] : 0, NULL); - } - if (!streams[1]) - { - log_error("ERROR: Creating reference array failed!\n"); - return -1; - } - if(deviceThreadCount > 0) - { - cl_uint argInd = 0; - /* Set the arguments */ - error = clSetKernelArg(kernel, argInd++, sizeof(threadCount), &threadCount); - test_error(error, "Unable to set kernel argument"); - error = clSetKernelArg(kernel, argInd++, sizeof(numDestItems), &numDestItems); - test_error(error, "Unable to set indexed kernel argument"); - error = clSetKernelArg(kernel, argInd++, sizeof(streams[0]), &streams[0]); - test_error(error, "Unable to set indexed kernel arguments"); - error = clSetKernelArg(kernel, argInd++, sizeof(streams[1]), &streams[1]); - test_error(error, "Unable to set indexed kernel arguments"); - if(LocalMemory()) - { - error = clSetKernelArg(kernel, argInd++, typeSize * numDestItems, NULL); - test_error(error, "Unable to set indexed local kernel argument"); - } - if(LocalRefValues()) - { - error = clSetKernelArg(kernel, argInd++, LocalRefValues() ? 
typeSize*CurrentGroupSize()*NumNonAtomicVariablesPerThread() : 1, NULL); - test_error(error, "Unable to set indexed kernel argument"); - } - } - /* Configure host threads */ - std::vector<THostThreadContext> hostThreadContexts(hostThreadCount); - for(unsigned int t = 0; t < hostThreadCount; t++) - { - hostThreadContexts[t].test = this; - hostThreadContexts[t].tid = deviceThreadCount+t; - hostThreadContexts[t].threadCount = threadCount; - hostThreadContexts[t].destMemory = UseSVM() ? svmAtomicBuffer : &destItems[0]; - hostThreadContexts[t].oldValues = UseSVM() ? svmDataBuffer : &refValues[0]; - } - - if(deviceThreadCount > 0) - { - /* Run the kernel */ - threadNum[0] = deviceThreadCount; - groupSize = CurrentGroupSize(); - error = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, threadNum, &groupSize, 0, NULL, NULL); - test_error(error, "Unable to execute test kernel"); - /* start device threads */ - error = clFlush(queue); - test_error(error, "clFlush failed"); - } - - /* Start host threads and wait for finish */ - if(hostThreadCount > 0) - ThreadPool_Do(HostThreadFunction, hostThreadCount, &hostThreadContexts[0]); - - if(UseSVM()) - { - error = clFinish(queue); - test_error(error, "clFinish failed"); - memcpy(&destItems[0], svmAtomicBuffer, typeSize*numDestItems); - memcpy(&refValues[0], svmDataBuffer, typeSize*threadCount*NumNonAtomicVariablesPerThread()); - } - else - { - if(deviceThreadCount > 0) - { - error = clEnqueueReadBuffer(queue, streams[0], CL_TRUE, 0, typeSize * numDestItems, &destItems[0], 0, NULL, NULL); - test_error(error, "Unable to read result value!"); - error = clEnqueueReadBuffer(queue, streams[1], CL_TRUE, 0, typeSize * deviceThreadCount*NumNonAtomicVariablesPerThread(), &refValues[0], 0, NULL, NULL); - test_error(error, "Unable to read reference values!"); - } - } - bool dataVerified = false; - // If we have an expectedFn, then we need to generate a final value to compare against. 
If we don't - // have one, it's because we're comparing ref values only - for(cl_uint i = 0; i < numDestItems; i++) - { - HostDataType expected; - - if(!ExpectedValue(expected, threadCount, startRefValues.size() ? &startRefValues[0] : 0, i)) - break; // no expected value function provided - - if(expected != destItems[i]) - { - std::stringstream logLine; - logLine << "ERROR: Result " << i << " from kernel does not validate! (should be " << expected << ", was " << destItems[i] << ")\n"; - log_error("%s", logLine.str().c_str()); - for(i = 0; i < threadCount; i++) - { - logLine.str(""); - logLine << " --- " << i << " - "; - if(startRefValues.size()) - logLine << startRefValues[i] << " -> " << refValues[i]; - else - logLine << refValues[i]; - logLine << " --- "; - if(i < numDestItems) - logLine << destItems[i]; - logLine << "\n"; - log_info("%s", logLine.str().c_str()); - } - if(!gDebug) - { - log_info("Program source:\n"); - log_info("%s\n", programLine); - } - return -1; - } - dataVerified = true; - } - - bool dataCorrect = false; - /* Use the verify function (if provided) to also check the results */ - if(VerifyRefs(dataCorrect, threadCount, &refValues[0], &destItems[0])) - { - if(!dataCorrect) - { - log_error("ERROR: Reference values did not validate!\n"); - std::stringstream logLine; - for(cl_uint i = 0; i < threadCount; i++) - for (cl_uint j = 0; j < NumNonAtomicVariablesPerThread(); j++) - { - logLine.str(""); - logLine << " --- " << i << " - " << refValues[i*NumNonAtomicVariablesPerThread()+j] << " --- "; - if(j == 0 && i < numDestItems) - logLine << destItems[i]; - logLine << "\n"; - log_info("%s", logLine.str().c_str()); - } - if(!gDebug) - { + int error; + clProgramWrapper program; + clKernelWrapper kernel; + size_t threadNum[1]; + clMemWrapper streams[2]; + std::vector<HostAtomicType> destItems; + HostAtomicType *svmAtomicBuffer = 0; + std::vector<HostDataType> refValues, startRefValues; + HostDataType *svmDataBuffer = 0; + cl_uint deviceThreadCount, 
hostThreadCount, threadCount; + size_t groupSize = 0; + std::string programSource; + const char *programLine; + MTdata d; + size_t typeSize = DataType().Size(deviceID); + + deviceThreadCount = _maxDeviceThreads; + hostThreadCount = MaxHostThreads(); + threadCount = deviceThreadCount + hostThreadCount; + + // log_info("\t%s %s%s...\n", local ? "local" : "global", + // DataType().AtomicTypeName(), memoryOrderScope.c_str()); + log_info("\t%s...\n", SingleTestName().c_str()); + + if (!LocalMemory() && DeclaredInProgram() + && gNoGlobalVariables) // no support for program scope global variables + { + log_info("\t\tTest disabled\n"); + return 0; + } + if (UsedInFunction() && GenericAddrSpace() && gNoGenericAddressSpace) + { + log_info("\t\tTest disabled\n"); + return 0; + } + if (!LocalMemory() && DeclaredInProgram()) + { + if (((gAtomicMemCap & CL_DEVICE_ATOMIC_SCOPE_DEVICE) == 0) + || ((gAtomicMemCap & CL_DEVICE_ATOMIC_ORDER_ACQ_REL) == 0)) + { + log_info("\t\tTest disabled\n"); + return 0; + } + } + + // set up work sizes based on device capabilities and test configuration + error = clGetDeviceInfo(deviceID, CL_DEVICE_MAX_WORK_GROUP_SIZE, + sizeof(groupSize), &groupSize, NULL); + test_error(error, "Unable to obtain max work group size for device"); + CurrentGroupSize((cl_uint)groupSize); + if (CurrentGroupSize() > deviceThreadCount) + CurrentGroupSize(deviceThreadCount); + if (CurrentGroupNum(deviceThreadCount) == 1 || gOldAPI) + deviceThreadCount = + CurrentGroupSize() * CurrentGroupNum(deviceThreadCount); + threadCount = deviceThreadCount + hostThreadCount; + + // If we're given a num_results function, we need to determine how many + // result objects we need. 
This is the first assessment for current maximum + // number of threads (exact thread count is not known here) + // - needed for program source code generation (arrays of atomics declared + // in program) + cl_uint numDestItems = NumResults(threadCount, deviceID); + + if (deviceThreadCount > 0) + { + // This loop iteratively reduces the workgroup size by 2 and then + // re-generates the kernel with the reduced + // workgroup size until we find a size which is admissible for the + // kernel being run or reduce the wg size to the trivial case of 1 + // (which was separately verified to be accurate for the kernel being + // run) + + while ((CurrentGroupSize() > 1)) + { + // Re-generate the kernel code with the current group size + if (kernel) clReleaseKernel(kernel); + if (program) clReleaseProgram(program); + programSource = PragmaHeader(deviceID) + ProgramHeader(numDestItems) + + FunctionCode() + KernelCode(numDestItems); + programLine = programSource.c_str(); + if (create_single_kernel_helper_with_build_options( + context, &program, &kernel, 1, &programLine, + "test_atomic_kernel", gOldAPI ? 
"" : nullptr)) + { + return -1; + } + // Get work group size for the new kernel + error = clGetKernelWorkGroupInfo( + kernel, deviceID, CL_KERNEL_WORK_GROUP_SIZE, sizeof(groupSize), + &groupSize, NULL); + test_error(error, + "Unable to obtain max work group size for device and " + "kernel combo"); + + if (LocalMemory()) + { + cl_ulong usedLocalMemory; + cl_ulong totalLocalMemory; + cl_uint maxWorkGroupSize; + + error = clGetKernelWorkGroupInfo( + kernel, deviceID, CL_KERNEL_LOCAL_MEM_SIZE, + sizeof(usedLocalMemory), &usedLocalMemory, NULL); + test_error(error, "clGetKernelWorkGroupInfo failed"); + + error = clGetDeviceInfo(deviceID, CL_DEVICE_LOCAL_MEM_SIZE, + sizeof(totalLocalMemory), + &totalLocalMemory, NULL); + test_error(error, "clGetDeviceInfo failed"); + + // We know that each work-group is going to use typeSize * + // deviceThreadCount bytes of local memory + // so pick the maximum value for deviceThreadCount that uses all + // the local memory. + maxWorkGroupSize = + ((totalLocalMemory - usedLocalMemory) / typeSize); + + if (maxWorkGroupSize < groupSize) groupSize = maxWorkGroupSize; + } + if (CurrentGroupSize() <= groupSize) + break; + else + CurrentGroupSize(CurrentGroupSize() / 2); + } + if (CurrentGroupSize() > deviceThreadCount) + CurrentGroupSize(deviceThreadCount); + if (CurrentGroupNum(deviceThreadCount) == 1 || gOldAPI) + deviceThreadCount = + CurrentGroupSize() * CurrentGroupNum(deviceThreadCount); + threadCount = deviceThreadCount + hostThreadCount; + } + if (gDebug) + { log_info("Program source:\n"); log_info("%s\n", programLine); - } - return -1; - } - } - else if(!dataVerified) - { - log_error("ERROR: Test doesn't check total or refs; no values are verified!\n"); - return -1; - } - - if(OldValueCheck() && - !(DeclaredInProgram() && !LocalMemory())) // don't test for programs scope global atomics - // 'old' value has been overwritten by previous clEnqueueNDRangeKernel - { - /* Re-write the starting value */ - for(size_t i = 0; i < 
numDestItems; i++) - destItems[i] = _startValue; - refValues[0] = 0; - if(deviceThreadCount > 0) - { - error = clEnqueueWriteBuffer(queue, streams[0], CL_TRUE, 0, typeSize * numDestItems, &destItems[0], 0, NULL, NULL); - test_error(error, "Unable to write starting values!"); - - /* Run the kernel once for a single thread, so we can verify that the returned value is the original one */ - threadNum[0] = 1; - error = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, threadNum, threadNum, 0, NULL, NULL); - test_error(error, "Unable to execute test kernel"); - - error = clEnqueueReadBuffer(queue, streams[1], CL_TRUE, 0, typeSize, &refValues[0], 0, NULL, NULL); - test_error(error, "Unable to read reference values!"); + } + if (deviceThreadCount > 0) + log_info("\t\t(thread count %u, group size %u)\n", deviceThreadCount, + CurrentGroupSize()); + if (hostThreadCount > 0) + log_info("\t\t(host threads %u)\n", hostThreadCount); + + refValues.resize(threadCount * NumNonAtomicVariablesPerThread()); + + // Generate ref data if we have a ref generator provided + d = init_genrand(gRandomSeed); + startRefValues.resize(threadCount * NumNonAtomicVariablesPerThread()); + if (GenerateRefs(threadCount, &startRefValues[0], d)) + { + // copy ref values for host threads + memcpy(&refValues[0], &startRefValues[0], + sizeof(HostDataType) * threadCount + * NumNonAtomicVariablesPerThread()); } else { - /* Start host thread */ - HostFunction(0, 1, &destItems[0], &refValues[0]); + startRefValues.resize(0); } + free_mtdata(d); + d = NULL; - if(refValues[0] != _startValue)//destItems[0]) + // If we're given a num_results function, we need to determine how many + // result objects we need. 
If we don't have it, we assume it's just 1 This + // is final value (exact thread count is known in this place) + numDestItems = NumResults(threadCount, deviceID); + + destItems.resize(numDestItems); + for (cl_uint i = 0; i < numDestItems; i++) destItems[i] = _startValue; + + // Create main buffer with atomic variables (array size dependent on + // particular test) + if (UseSVM()) { - std::stringstream logLine; - logLine << "ERROR: atomic function operated correctly but did NOT return correct 'old' value " - " (should have been " << destItems[0] << ", returned " << refValues[0] << ")!\n"; - log_error("%s", logLine.str().c_str()); - if(!gDebug) - { - log_info("Program source:\n"); - log_info("%s\n", programLine); - } - return -1; - } - } - if(UseSVM()) - { - // the buffer object must first be released before the SVM buffer is freed - error = clReleaseMemObject(streams[0]); - streams[0] = 0; - test_error(error, "clReleaseMemObject failed"); - if(gUseHostPtr) - free(svmAtomicBuffer); + if (gUseHostPtr) + svmAtomicBuffer = (HostAtomicType *)malloc(typeSize * numDestItems); + else + svmAtomicBuffer = (HostAtomicType *)clSVMAlloc( + context, CL_MEM_SVM_FINE_GRAIN_BUFFER | CL_MEM_SVM_ATOMICS, + typeSize * numDestItems, 0); + if (!svmAtomicBuffer) + { + log_error("ERROR: clSVMAlloc failed!\n"); + return -1; + } + memcpy(svmAtomicBuffer, &destItems[0], typeSize * numDestItems); + streams[0] = + clCreateBuffer(context, CL_MEM_USE_HOST_PTR, + typeSize * numDestItems, svmAtomicBuffer, NULL); + } else - clSVMFree(context, svmAtomicBuffer); - error = clReleaseMemObject(streams[1]); - streams[1] = 0; - test_error(error, "clReleaseMemObject failed"); - if(gUseHostPtr) - free(svmDataBuffer); + { + streams[0] = + clCreateBuffer(context, CL_MEM_COPY_HOST_PTR, + typeSize * numDestItems, &destItems[0], NULL); + } + if (!streams[0]) + { + log_error("ERROR: Creating output array failed!\n"); + return -1; + } + // Create buffer for per-thread input/output data + if (UseSVM()) + { + if 
(gUseHostPtr) + svmDataBuffer = (HostDataType *)malloc( + typeSize * threadCount * NumNonAtomicVariablesPerThread()); + else + svmDataBuffer = (HostDataType *)clSVMAlloc( + context, + CL_MEM_SVM_FINE_GRAIN_BUFFER + | (SVMDataBufferAllSVMConsistent() ? CL_MEM_SVM_ATOMICS + : 0), + typeSize * threadCount * NumNonAtomicVariablesPerThread(), 0); + if (!svmDataBuffer) + { + log_error("ERROR: clSVMAlloc failed!\n"); + return -1; + } + if (startRefValues.size()) + memcpy(svmDataBuffer, &startRefValues[0], + typeSize * threadCount * NumNonAtomicVariablesPerThread()); + streams[1] = clCreateBuffer(context, CL_MEM_USE_HOST_PTR, + typeSize * threadCount + * NumNonAtomicVariablesPerThread(), + svmDataBuffer, NULL); + } else - clSVMFree(context, svmDataBuffer); - } - _passCount++; - return 0; + { + streams[1] = clCreateBuffer( + context, + ((startRefValues.size() ? CL_MEM_COPY_HOST_PTR + : CL_MEM_READ_WRITE)), + typeSize * threadCount * NumNonAtomicVariablesPerThread(), + startRefValues.size() ? &startRefValues[0] : 0, NULL); + } + if (!streams[1]) + { + log_error("ERROR: Creating reference array failed!\n"); + return -1; + } + if (deviceThreadCount > 0) + { + cl_uint argInd = 0; + /* Set the arguments */ + error = + clSetKernelArg(kernel, argInd++, sizeof(threadCount), &threadCount); + test_error(error, "Unable to set kernel argument"); + error = clSetKernelArg(kernel, argInd++, sizeof(numDestItems), + &numDestItems); + test_error(error, "Unable to set indexed kernel argument"); + error = + clSetKernelArg(kernel, argInd++, sizeof(streams[0]), &streams[0]); + test_error(error, "Unable to set indexed kernel arguments"); + error = + clSetKernelArg(kernel, argInd++, sizeof(streams[1]), &streams[1]); + test_error(error, "Unable to set indexed kernel arguments"); + if (LocalMemory()) + { + error = + clSetKernelArg(kernel, argInd++, typeSize * numDestItems, NULL); + test_error(error, "Unable to set indexed local kernel argument"); + } + if (LocalRefValues()) + { + error = + 
clSetKernelArg(kernel, argInd++, + LocalRefValues() ? typeSize + * (CurrentGroupSize() + * NumNonAtomicVariablesPerThread()) + : 1, + NULL); + test_error(error, "Unable to set indexed kernel argument"); + } + } + /* Configure host threads */ + std::vector<THostThreadContext> hostThreadContexts(hostThreadCount); + for (unsigned int t = 0; t < hostThreadCount; t++) + { + hostThreadContexts[t].test = this; + hostThreadContexts[t].tid = deviceThreadCount + t; + hostThreadContexts[t].threadCount = threadCount; + hostThreadContexts[t].destMemory = + UseSVM() ? svmAtomicBuffer : &destItems[0]; + hostThreadContexts[t].oldValues = + UseSVM() ? svmDataBuffer : &refValues[0]; + } + + if (deviceThreadCount > 0) + { + /* Run the kernel */ + threadNum[0] = deviceThreadCount; + groupSize = CurrentGroupSize(); + error = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, threadNum, + &groupSize, 0, NULL, NULL); + test_error(error, "Unable to execute test kernel"); + /* start device threads */ + error = clFlush(queue); + test_error(error, "clFlush failed"); + } + + /* Start host threads and wait for finish */ + if (hostThreadCount > 0) + ThreadPool_Do(HostThreadFunction, hostThreadCount, + &hostThreadContexts[0]); + + if (UseSVM()) + { + error = clFinish(queue); + test_error(error, "clFinish failed"); + memcpy(&destItems[0], svmAtomicBuffer, typeSize * numDestItems); + memcpy(&refValues[0], svmDataBuffer, + typeSize * threadCount * NumNonAtomicVariablesPerThread()); + } + else + { + if (deviceThreadCount > 0) + { + error = clEnqueueReadBuffer(queue, streams[0], CL_TRUE, 0, + typeSize * numDestItems, &destItems[0], + 0, NULL, NULL); + test_error(error, "Unable to read result value!"); + error = clEnqueueReadBuffer(queue, streams[1], CL_TRUE, 0, + typeSize * deviceThreadCount + * NumNonAtomicVariablesPerThread(), + &refValues[0], 0, NULL, NULL); + test_error(error, "Unable to read reference values!"); + } + } + bool dataVerified = false; + // If we have an expectedFn, then we need to 
generate a final value to + // compare against. If we don't have one, it's because we're comparing ref + // values only + for (cl_uint i = 0; i < numDestItems; i++) + { + HostDataType expected; + + if (!ExpectedValue(expected, threadCount, + startRefValues.size() ? &startRefValues[0] : 0, i)) + break; // no expected value function provided + + if (expected != destItems[i]) + { + std::stringstream logLine; + logLine << "ERROR: Result " << i + << " from kernel does not validate! (should be " << expected + << ", was " << destItems[i] << ")\n"; + log_error("%s", logLine.str().c_str()); + for (i = 0; i < threadCount; i++) + { + logLine.str(""); + logLine << " --- " << i << " - "; + if (startRefValues.size()) + logLine << startRefValues[i] << " -> " << refValues[i]; + else + logLine << refValues[i]; + logLine << " --- "; + if (i < numDestItems) logLine << destItems[i]; + logLine << "\n"; + log_info("%s", logLine.str().c_str()); + } + if (!gDebug) + { + log_info("Program source:\n"); + log_info("%s\n", programLine); + } + return -1; + } + dataVerified = true; + } + + bool dataCorrect = false; + /* Use the verify function (if provided) to also check the results */ + if (VerifyRefs(dataCorrect, threadCount, &refValues[0], &destItems[0])) + { + if (!dataCorrect) + { + log_error("ERROR: Reference values did not validate!\n"); + std::stringstream logLine; + for (cl_uint i = 0; i < threadCount; i++) + for (cl_uint j = 0; j < NumNonAtomicVariablesPerThread(); j++) + { + logLine.str(""); + logLine + << " --- " << i << " - " + << refValues[i * NumNonAtomicVariablesPerThread() + j] + << " --- "; + if (j == 0 && i < numDestItems) logLine << destItems[i]; + logLine << "\n"; + log_info("%s", logLine.str().c_str()); + } + if (!gDebug) + { + log_info("Program source:\n"); + log_info("%s\n", programLine); + } + return -1; + } + } + else if (!dataVerified) + { + log_error("ERROR: Test doesn't check total or refs; no values are " + "verified!\n"); + return -1; + } + + if (OldValueCheck() + 
&& !(DeclaredInProgram() + && !LocalMemory())) // don't test for programs scope global atomics + // 'old' value has been overwritten by previous + // clEnqueueNDRangeKernel + { + /* Re-write the starting value */ + for (size_t i = 0; i < numDestItems; i++) destItems[i] = _startValue; + refValues[0] = 0; + if (deviceThreadCount > 0) + { + error = clEnqueueWriteBuffer(queue, streams[0], CL_TRUE, 0, + typeSize * numDestItems, &destItems[0], + 0, NULL, NULL); + test_error(error, "Unable to write starting values!"); + + /* Run the kernel once for a single thread, so we can verify that + * the returned value is the original one */ + threadNum[0] = 1; + error = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, threadNum, + threadNum, 0, NULL, NULL); + test_error(error, "Unable to execute test kernel"); + + error = clEnqueueReadBuffer(queue, streams[1], CL_TRUE, 0, typeSize, + &refValues[0], 0, NULL, NULL); + test_error(error, "Unable to read reference values!"); + } + else + { + /* Start host thread */ + HostFunction(0, 1, &destItems[0], &refValues[0]); + } + + if (refValues[0] != _startValue) // destItems[0]) + { + std::stringstream logLine; + logLine << "ERROR: atomic function operated correctly but did NOT " + "return correct 'old' value " + " (should have been " + << destItems[0] << ", returned " << refValues[0] << ")!\n"; + log_error("%s", logLine.str().c_str()); + if (!gDebug) + { + log_info("Program source:\n"); + log_info("%s\n", programLine); + } + return -1; + } + } + if (UseSVM()) + { + // the buffer object must first be released before the SVM buffer is + // freed. 
The Wrapper Class method reset() will do that + streams[0].reset(); + if (gUseHostPtr) + free(svmAtomicBuffer); + else + clSVMFree(context, svmAtomicBuffer); + streams[1].reset(); + if (gUseHostPtr) + free(svmDataBuffer); + else + clSVMFree(context, svmDataBuffer); + } + _passCount++; + return 0; } #endif //_COMMON_H_ diff --git a/test_conformance/c11_atomics/test_atomics.cpp b/test_conformance/c11_atomics/test_atomics.cpp index c3a190b7..09c14ed1 100644 --- a/test_conformance/c11_atomics/test_atomics.cpp +++ b/test_conformance/c11_atomics/test_atomics.cpp @@ -1,6 +1,6 @@ // // Copyright (c) 2017 The Khronos Group Inc. -// +// // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at @@ -23,2188 +23,3209 @@ #include <sstream> #include <vector> -template<typename HostAtomicType, typename HostDataType> -class CBasicTestStore : public CBasicTestMemOrderScope<HostAtomicType, HostDataType> -{ +template <typename HostAtomicType, typename HostDataType> +class CBasicTestStore + : public CBasicTestMemOrderScope<HostAtomicType, HostDataType> { public: - using CBasicTestMemOrderScope<HostAtomicType, HostDataType>::OldValueCheck; - using CBasicTestMemOrderScope<HostAtomicType, HostDataType>::MemoryOrder; - using CBasicTestMemOrderScope<HostAtomicType, HostDataType>::MemoryScope; - using CBasicTestMemOrderScope<HostAtomicType, HostDataType>::MemoryOrderScopeStr; - using CBasicTest<HostAtomicType, HostDataType>::CheckCapabilities; - CBasicTestStore(TExplicitAtomicType dataType, bool useSVM) : CBasicTestMemOrderScope<HostAtomicType, HostDataType>(dataType, useSVM) - { - OldValueCheck(false); - } - virtual cl_uint NumResults(cl_uint threadCount, cl_device_id deviceID) - { - return threadCount; - } - virtual int ExecuteSingleTest(cl_device_id deviceID, cl_context context, cl_command_queue queue) - { - if(MemoryOrder() == MEMORY_ORDER_ACQUIRE || - MemoryOrder() == 
MEMORY_ORDER_ACQ_REL) - return 0; //skip test - not applicable - - if (CheckCapabilities(MemoryScope(), MemoryOrder()) == TEST_SKIPPED_ITSELF) - return 0; // skip test - not applicable - - return CBasicTestMemOrderScope<HostAtomicType, HostDataType>::ExecuteSingleTest(deviceID, context, queue); - } - virtual std::string ProgramCore() - { - std::string memoryOrderScope = MemoryOrderScopeStr(); - std::string postfix(memoryOrderScope.empty() ? "" : "_explicit"); - return - " atomic_store"+postfix+"(&destMemory[tid], tid"+memoryOrderScope+");\n"; - } - virtual void HostFunction(cl_uint tid, cl_uint threadCount, volatile HostAtomicType *destMemory, HostDataType *oldValues) - { - host_atomic_store(&destMemory[tid], (HostDataType)tid, MemoryOrder()); - } - virtual bool ExpectedValue(HostDataType &expected, cl_uint threadCount, HostDataType *startRefValues, cl_uint whichDestValue) - { - expected = (HostDataType)whichDestValue; - return true; - } + using CBasicTestMemOrderScope<HostAtomicType, HostDataType>::OldValueCheck; + using CBasicTestMemOrderScope<HostAtomicType, HostDataType>::MemoryOrder; + using CBasicTestMemOrderScope<HostAtomicType, HostDataType>::MemoryScope; + using CBasicTestMemOrderScope<HostAtomicType, + HostDataType>::MemoryOrderScopeStr; + using CBasicTest<HostAtomicType, HostDataType>::CheckCapabilities; + CBasicTestStore(TExplicitAtomicType dataType, bool useSVM) + : CBasicTestMemOrderScope<HostAtomicType, HostDataType>(dataType, + useSVM) + { + OldValueCheck(false); + } + virtual cl_uint NumResults(cl_uint threadCount, cl_device_id deviceID) + { + return threadCount; + } + virtual int ExecuteSingleTest(cl_device_id deviceID, cl_context context, + cl_command_queue queue) + { + if (MemoryOrder() == MEMORY_ORDER_ACQUIRE + || MemoryOrder() == MEMORY_ORDER_ACQ_REL) + return 0; // skip test - not applicable + + if (CheckCapabilities(MemoryScope(), MemoryOrder()) + == TEST_SKIPPED_ITSELF) + return 0; // skip test - not applicable + + return 
CBasicTestMemOrderScope< + HostAtomicType, HostDataType>::ExecuteSingleTest(deviceID, context, + queue); + } + virtual std::string ProgramCore() + { + std::string memoryOrderScope = MemoryOrderScopeStr(); + std::string postfix(memoryOrderScope.empty() ? "" : "_explicit"); + return " atomic_store" + postfix + "(&destMemory[tid], tid" + + memoryOrderScope + ");\n"; + } + virtual void HostFunction(cl_uint tid, cl_uint threadCount, + volatile HostAtomicType *destMemory, + HostDataType *oldValues) + { + host_atomic_store(&destMemory[tid], (HostDataType)tid, MemoryOrder()); + } + virtual bool ExpectedValue(HostDataType &expected, cl_uint threadCount, + HostDataType *startRefValues, + cl_uint whichDestValue) + { + expected = (HostDataType)whichDestValue; + return true; + } }; -int test_atomic_store_generic(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements, bool useSVM) +int test_atomic_store_generic(cl_device_id deviceID, cl_context context, + cl_command_queue queue, int num_elements, + bool useSVM) { - int error = 0; - CBasicTestStore<HOST_ATOMIC_INT, HOST_INT> test_int(TYPE_ATOMIC_INT, useSVM); - EXECUTE_TEST(error, test_int.Execute(deviceID, context, queue, num_elements)); - CBasicTestStore<HOST_ATOMIC_UINT, HOST_UINT> test_uint(TYPE_ATOMIC_UINT, useSVM); - EXECUTE_TEST(error, test_uint.Execute(deviceID, context, queue, num_elements)); - CBasicTestStore<HOST_ATOMIC_LONG, HOST_LONG> test_long(TYPE_ATOMIC_LONG, useSVM); - EXECUTE_TEST(error, test_long.Execute(deviceID, context, queue, num_elements)); - CBasicTestStore<HOST_ATOMIC_ULONG, HOST_ULONG> test_ulong(TYPE_ATOMIC_ULONG, useSVM); - EXECUTE_TEST(error, test_ulong.Execute(deviceID, context, queue, num_elements)); - CBasicTestStore<HOST_ATOMIC_FLOAT, HOST_FLOAT> test_float(TYPE_ATOMIC_FLOAT, useSVM); - EXECUTE_TEST(error, test_float.Execute(deviceID, context, queue, num_elements)); - CBasicTestStore<HOST_ATOMIC_DOUBLE, HOST_DOUBLE> test_double(TYPE_ATOMIC_DOUBLE, useSVM); - 
EXECUTE_TEST(error, test_double.Execute(deviceID, context, queue, num_elements)); - if(AtomicTypeInfo(TYPE_ATOMIC_SIZE_T).Size(deviceID) == 4) - { - CBasicTestStore<HOST_ATOMIC_INTPTR_T32, HOST_INTPTR_T32> test_intptr_t(TYPE_ATOMIC_INTPTR_T, useSVM); - EXECUTE_TEST(error, test_intptr_t.Execute(deviceID, context, queue, num_elements)); - CBasicTestStore<HOST_ATOMIC_UINTPTR_T32, HOST_UINTPTR_T32> test_uintptr_t(TYPE_ATOMIC_UINTPTR_T, useSVM); - EXECUTE_TEST(error, test_uintptr_t.Execute(deviceID, context, queue, num_elements)); - CBasicTestStore<HOST_ATOMIC_SIZE_T32, HOST_SIZE_T32> test_size_t(TYPE_ATOMIC_SIZE_T, useSVM); - EXECUTE_TEST(error, test_size_t.Execute(deviceID, context, queue, num_elements)); - CBasicTestStore<HOST_ATOMIC_PTRDIFF_T32, HOST_PTRDIFF_T32> test_ptrdiff_t(TYPE_ATOMIC_PTRDIFF_T, useSVM); - EXECUTE_TEST(error, test_ptrdiff_t.Execute(deviceID, context, queue, num_elements)); - } - else - { - CBasicTestStore<HOST_ATOMIC_INTPTR_T64, HOST_INTPTR_T64> test_intptr_t(TYPE_ATOMIC_INTPTR_T, useSVM); - EXECUTE_TEST(error, test_intptr_t.Execute(deviceID, context, queue, num_elements)); - CBasicTestStore<HOST_ATOMIC_UINTPTR_T64, HOST_UINTPTR_T64> test_uintptr_t(TYPE_ATOMIC_UINTPTR_T, useSVM); - EXECUTE_TEST(error, test_uintptr_t.Execute(deviceID, context, queue, num_elements)); - CBasicTestStore<HOST_ATOMIC_SIZE_T64, HOST_SIZE_T64> test_size_t(TYPE_ATOMIC_SIZE_T, useSVM); - EXECUTE_TEST(error, test_size_t.Execute(deviceID, context, queue, num_elements)); - CBasicTestStore<HOST_ATOMIC_PTRDIFF_T64, HOST_PTRDIFF_T64> test_ptrdiff_t(TYPE_ATOMIC_PTRDIFF_T, useSVM); - EXECUTE_TEST(error, test_ptrdiff_t.Execute(deviceID, context, queue, num_elements)); - } - return error; + int error = 0; + CBasicTestStore<HOST_ATOMIC_INT, HOST_INT> test_int(TYPE_ATOMIC_INT, + useSVM); + EXECUTE_TEST(error, + test_int.Execute(deviceID, context, queue, num_elements)); + CBasicTestStore<HOST_ATOMIC_UINT, HOST_UINT> test_uint(TYPE_ATOMIC_UINT, + useSVM); + EXECUTE_TEST(error, + 
test_uint.Execute(deviceID, context, queue, num_elements)); + CBasicTestStore<HOST_ATOMIC_LONG, HOST_LONG> test_long(TYPE_ATOMIC_LONG, + useSVM); + EXECUTE_TEST(error, + test_long.Execute(deviceID, context, queue, num_elements)); + CBasicTestStore<HOST_ATOMIC_ULONG, HOST_ULONG> test_ulong(TYPE_ATOMIC_ULONG, + useSVM); + EXECUTE_TEST(error, + test_ulong.Execute(deviceID, context, queue, num_elements)); + CBasicTestStore<HOST_ATOMIC_FLOAT, HOST_FLOAT> test_float(TYPE_ATOMIC_FLOAT, + useSVM); + EXECUTE_TEST(error, + test_float.Execute(deviceID, context, queue, num_elements)); + CBasicTestStore<HOST_ATOMIC_DOUBLE, HOST_DOUBLE> test_double( + TYPE_ATOMIC_DOUBLE, useSVM); + EXECUTE_TEST(error, + test_double.Execute(deviceID, context, queue, num_elements)); + if (AtomicTypeInfo(TYPE_ATOMIC_SIZE_T).Size(deviceID) == 4) + { + CBasicTestStore<HOST_ATOMIC_INTPTR_T32, HOST_INTPTR_T32> test_intptr_t( + TYPE_ATOMIC_INTPTR_T, useSVM); + EXECUTE_TEST( + error, + test_intptr_t.Execute(deviceID, context, queue, num_elements)); + CBasicTestStore<HOST_ATOMIC_UINTPTR_T32, HOST_UINTPTR_T32> + test_uintptr_t(TYPE_ATOMIC_UINTPTR_T, useSVM); + EXECUTE_TEST( + error, + test_uintptr_t.Execute(deviceID, context, queue, num_elements)); + CBasicTestStore<HOST_ATOMIC_SIZE_T32, HOST_SIZE_T32> test_size_t( + TYPE_ATOMIC_SIZE_T, useSVM); + EXECUTE_TEST( + error, test_size_t.Execute(deviceID, context, queue, num_elements)); + CBasicTestStore<HOST_ATOMIC_PTRDIFF_T32, HOST_PTRDIFF_T32> + test_ptrdiff_t(TYPE_ATOMIC_PTRDIFF_T, useSVM); + EXECUTE_TEST( + error, + test_ptrdiff_t.Execute(deviceID, context, queue, num_elements)); + } + else + { + CBasicTestStore<HOST_ATOMIC_INTPTR_T64, HOST_INTPTR_T64> test_intptr_t( + TYPE_ATOMIC_INTPTR_T, useSVM); + EXECUTE_TEST( + error, + test_intptr_t.Execute(deviceID, context, queue, num_elements)); + CBasicTestStore<HOST_ATOMIC_UINTPTR_T64, HOST_UINTPTR_T64> + test_uintptr_t(TYPE_ATOMIC_UINTPTR_T, useSVM); + EXECUTE_TEST( + error, + test_uintptr_t.Execute(deviceID, 
context, queue, num_elements)); + CBasicTestStore<HOST_ATOMIC_SIZE_T64, HOST_SIZE_T64> test_size_t( + TYPE_ATOMIC_SIZE_T, useSVM); + EXECUTE_TEST( + error, test_size_t.Execute(deviceID, context, queue, num_elements)); + CBasicTestStore<HOST_ATOMIC_PTRDIFF_T64, HOST_PTRDIFF_T64> + test_ptrdiff_t(TYPE_ATOMIC_PTRDIFF_T, useSVM); + EXECUTE_TEST( + error, + test_ptrdiff_t.Execute(deviceID, context, queue, num_elements)); + } + return error; } -int test_atomic_store(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements) +int test_atomic_store(cl_device_id deviceID, cl_context context, + cl_command_queue queue, int num_elements) { - return test_atomic_store_generic(deviceID, context, queue, num_elements, false); + return test_atomic_store_generic(deviceID, context, queue, num_elements, + false); } -int test_svm_atomic_store(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements) +int test_svm_atomic_store(cl_device_id deviceID, cl_context context, + cl_command_queue queue, int num_elements) { - return test_atomic_store_generic(deviceID, context, queue, num_elements, true); + return test_atomic_store_generic(deviceID, context, queue, num_elements, + true); } -template<typename HostAtomicType, typename HostDataType> -class CBasicTestInit : public CBasicTest<HostAtomicType, HostDataType> -{ +template <typename HostAtomicType, typename HostDataType> +class CBasicTestInit : public CBasicTest<HostAtomicType, HostDataType> { public: - using CBasicTest<HostAtomicType, HostDataType>::OldValueCheck; - CBasicTestInit(TExplicitAtomicType dataType, bool useSVM) : CBasicTest<HostAtomicType, HostDataType>(dataType, useSVM) - { - OldValueCheck(false); - } - virtual cl_uint NumResults(cl_uint threadCount, cl_device_id deviceID) - { - return threadCount; - } - virtual std::string ProgramCore() - { - return - " atomic_init(&destMemory[tid], tid);\n"; - } - virtual void HostFunction(cl_uint tid, cl_uint threadCount, volatile 
HostAtomicType *destMemory, HostDataType *oldValues) - { - host_atomic_init(&destMemory[tid], (HostDataType)tid); - } - virtual bool ExpectedValue(HostDataType &expected, cl_uint threadCount, HostDataType *startRefValues, cl_uint whichDestValue) - { - expected = (HostDataType)whichDestValue; - return true; - } + using CBasicTest<HostAtomicType, HostDataType>::OldValueCheck; + CBasicTestInit(TExplicitAtomicType dataType, bool useSVM) + : CBasicTest<HostAtomicType, HostDataType>(dataType, useSVM) + { + OldValueCheck(false); + } + virtual cl_uint NumResults(cl_uint threadCount, cl_device_id deviceID) + { + return threadCount; + } + virtual std::string ProgramCore() + { + return " atomic_init(&destMemory[tid], tid);\n"; + } + virtual void HostFunction(cl_uint tid, cl_uint threadCount, + volatile HostAtomicType *destMemory, + HostDataType *oldValues) + { + host_atomic_init(&destMemory[tid], (HostDataType)tid); + } + virtual bool ExpectedValue(HostDataType &expected, cl_uint threadCount, + HostDataType *startRefValues, + cl_uint whichDestValue) + { + expected = (HostDataType)whichDestValue; + return true; + } }; -int test_atomic_init_generic(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements, bool useSVM) +int test_atomic_init_generic(cl_device_id deviceID, cl_context context, + cl_command_queue queue, int num_elements, + bool useSVM) { - int error = 0; - CBasicTestInit<HOST_ATOMIC_INT, HOST_INT> test_int(TYPE_ATOMIC_INT, useSVM); - EXECUTE_TEST(error, test_int.Execute(deviceID, context, queue, num_elements)); - CBasicTestInit<HOST_ATOMIC_UINT, HOST_UINT> test_uint(TYPE_ATOMIC_UINT, useSVM); - EXECUTE_TEST(error, test_uint.Execute(deviceID, context, queue, num_elements)); - CBasicTestInit<HOST_ATOMIC_LONG, HOST_LONG> test_long(TYPE_ATOMIC_LONG, useSVM); - EXECUTE_TEST(error, test_long.Execute(deviceID, context, queue, num_elements)); - CBasicTestInit<HOST_ATOMIC_ULONG, HOST_ULONG> test_ulong(TYPE_ATOMIC_ULONG, useSVM); - 
EXECUTE_TEST(error, test_ulong.Execute(deviceID, context, queue, num_elements)); - CBasicTestInit<HOST_ATOMIC_FLOAT, HOST_FLOAT> test_float(TYPE_ATOMIC_FLOAT, useSVM); - EXECUTE_TEST(error, test_float.Execute(deviceID, context, queue, num_elements)); - CBasicTestInit<HOST_ATOMIC_DOUBLE, HOST_DOUBLE> test_double(TYPE_ATOMIC_DOUBLE, useSVM); - EXECUTE_TEST(error, test_double.Execute(deviceID, context, queue, num_elements)); - if(AtomicTypeInfo(TYPE_ATOMIC_SIZE_T).Size(deviceID) == 4) - { - CBasicTestInit<HOST_ATOMIC_INTPTR_T32, HOST_INTPTR_T32> test_intptr_t(TYPE_ATOMIC_INTPTR_T, useSVM); - EXECUTE_TEST(error, test_intptr_t.Execute(deviceID, context, queue, num_elements)); - CBasicTestInit<HOST_ATOMIC_UINTPTR_T32, HOST_UINTPTR_T32> test_uintptr_t(TYPE_ATOMIC_UINTPTR_T, useSVM); - EXECUTE_TEST(error, test_uintptr_t.Execute(deviceID, context, queue, num_elements)); - CBasicTestInit<HOST_ATOMIC_SIZE_T32, HOST_SIZE_T32> test_size_t(TYPE_ATOMIC_SIZE_T, useSVM); - EXECUTE_TEST(error, test_size_t.Execute(deviceID, context, queue, num_elements)); - CBasicTestInit<HOST_ATOMIC_PTRDIFF_T32, HOST_PTRDIFF_T32> test_ptrdiff_t(TYPE_ATOMIC_PTRDIFF_T, useSVM); - EXECUTE_TEST(error, test_ptrdiff_t.Execute(deviceID, context, queue, num_elements)); - } - else - { - CBasicTestInit<HOST_ATOMIC_INTPTR_T64, HOST_INTPTR_T64> test_intptr_t(TYPE_ATOMIC_INTPTR_T, useSVM); - EXECUTE_TEST(error, test_intptr_t.Execute(deviceID, context, queue, num_elements)); - CBasicTestInit<HOST_ATOMIC_UINTPTR_T64, HOST_UINTPTR_T64> test_uintptr_t(TYPE_ATOMIC_UINTPTR_T, useSVM); - EXECUTE_TEST(error, test_uintptr_t.Execute(deviceID, context, queue, num_elements)); - CBasicTestInit<HOST_ATOMIC_SIZE_T64, HOST_SIZE_T64> test_size_t(TYPE_ATOMIC_SIZE_T, useSVM); - EXECUTE_TEST(error, test_size_t.Execute(deviceID, context, queue, num_elements)); - CBasicTestInit<HOST_ATOMIC_PTRDIFF_T64, HOST_PTRDIFF_T64> test_ptrdiff_t(TYPE_ATOMIC_PTRDIFF_T, useSVM); - EXECUTE_TEST(error, test_ptrdiff_t.Execute(deviceID, context, 
queue, num_elements)); - } - return error; + int error = 0; + CBasicTestInit<HOST_ATOMIC_INT, HOST_INT> test_int(TYPE_ATOMIC_INT, useSVM); + EXECUTE_TEST(error, + test_int.Execute(deviceID, context, queue, num_elements)); + CBasicTestInit<HOST_ATOMIC_UINT, HOST_UINT> test_uint(TYPE_ATOMIC_UINT, + useSVM); + EXECUTE_TEST(error, + test_uint.Execute(deviceID, context, queue, num_elements)); + CBasicTestInit<HOST_ATOMIC_LONG, HOST_LONG> test_long(TYPE_ATOMIC_LONG, + useSVM); + EXECUTE_TEST(error, + test_long.Execute(deviceID, context, queue, num_elements)); + CBasicTestInit<HOST_ATOMIC_ULONG, HOST_ULONG> test_ulong(TYPE_ATOMIC_ULONG, + useSVM); + EXECUTE_TEST(error, + test_ulong.Execute(deviceID, context, queue, num_elements)); + CBasicTestInit<HOST_ATOMIC_FLOAT, HOST_FLOAT> test_float(TYPE_ATOMIC_FLOAT, + useSVM); + EXECUTE_TEST(error, + test_float.Execute(deviceID, context, queue, num_elements)); + CBasicTestInit<HOST_ATOMIC_DOUBLE, HOST_DOUBLE> test_double( + TYPE_ATOMIC_DOUBLE, useSVM); + EXECUTE_TEST(error, + test_double.Execute(deviceID, context, queue, num_elements)); + if (AtomicTypeInfo(TYPE_ATOMIC_SIZE_T).Size(deviceID) == 4) + { + CBasicTestInit<HOST_ATOMIC_INTPTR_T32, HOST_INTPTR_T32> test_intptr_t( + TYPE_ATOMIC_INTPTR_T, useSVM); + EXECUTE_TEST( + error, + test_intptr_t.Execute(deviceID, context, queue, num_elements)); + CBasicTestInit<HOST_ATOMIC_UINTPTR_T32, HOST_UINTPTR_T32> + test_uintptr_t(TYPE_ATOMIC_UINTPTR_T, useSVM); + EXECUTE_TEST( + error, + test_uintptr_t.Execute(deviceID, context, queue, num_elements)); + CBasicTestInit<HOST_ATOMIC_SIZE_T32, HOST_SIZE_T32> test_size_t( + TYPE_ATOMIC_SIZE_T, useSVM); + EXECUTE_TEST( + error, test_size_t.Execute(deviceID, context, queue, num_elements)); + CBasicTestInit<HOST_ATOMIC_PTRDIFF_T32, HOST_PTRDIFF_T32> + test_ptrdiff_t(TYPE_ATOMIC_PTRDIFF_T, useSVM); + EXECUTE_TEST( + error, + test_ptrdiff_t.Execute(deviceID, context, queue, num_elements)); + } + else + { + CBasicTestInit<HOST_ATOMIC_INTPTR_T64, 
HOST_INTPTR_T64> test_intptr_t( + TYPE_ATOMIC_INTPTR_T, useSVM); + EXECUTE_TEST( + error, + test_intptr_t.Execute(deviceID, context, queue, num_elements)); + CBasicTestInit<HOST_ATOMIC_UINTPTR_T64, HOST_UINTPTR_T64> + test_uintptr_t(TYPE_ATOMIC_UINTPTR_T, useSVM); + EXECUTE_TEST( + error, + test_uintptr_t.Execute(deviceID, context, queue, num_elements)); + CBasicTestInit<HOST_ATOMIC_SIZE_T64, HOST_SIZE_T64> test_size_t( + TYPE_ATOMIC_SIZE_T, useSVM); + EXECUTE_TEST( + error, test_size_t.Execute(deviceID, context, queue, num_elements)); + CBasicTestInit<HOST_ATOMIC_PTRDIFF_T64, HOST_PTRDIFF_T64> + test_ptrdiff_t(TYPE_ATOMIC_PTRDIFF_T, useSVM); + EXECUTE_TEST( + error, + test_ptrdiff_t.Execute(deviceID, context, queue, num_elements)); + } + return error; } -int test_atomic_init(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements) +int test_atomic_init(cl_device_id deviceID, cl_context context, + cl_command_queue queue, int num_elements) { - return test_atomic_init_generic(deviceID, context, queue, num_elements, false); + return test_atomic_init_generic(deviceID, context, queue, num_elements, + false); } -int test_svm_atomic_init(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements) +int test_svm_atomic_init(cl_device_id deviceID, cl_context context, + cl_command_queue queue, int num_elements) { - return test_atomic_init_generic(deviceID, context, queue, num_elements, true); + return test_atomic_init_generic(deviceID, context, queue, num_elements, + true); } -template<typename HostAtomicType, typename HostDataType> -class CBasicTestLoad : public CBasicTestMemOrderScope<HostAtomicType, HostDataType> -{ +template <typename HostAtomicType, typename HostDataType> +class CBasicTestLoad + : public CBasicTestMemOrderScope<HostAtomicType, HostDataType> { public: - using CBasicTestMemOrderScope<HostAtomicType, HostDataType>::OldValueCheck; - using CBasicTestMemOrderScope<HostAtomicType, HostDataType>::MemoryOrder; 
- using CBasicTestMemOrderScope<HostAtomicType, HostDataType>::MemoryScope; - using CBasicTestMemOrderScope<HostAtomicType, HostDataType>::MemoryOrderScopeStr; - using CBasicTestMemOrderScope<HostAtomicType, HostDataType>::MemoryScopeStr; - using CBasicTest<HostAtomicType, HostDataType>::CheckCapabilities; - CBasicTestLoad(TExplicitAtomicType dataType, bool useSVM) : CBasicTestMemOrderScope<HostAtomicType, HostDataType>(dataType, useSVM) - { - OldValueCheck(false); - } - virtual cl_uint NumResults(cl_uint threadCount, cl_device_id deviceID) - { - return threadCount; - } - virtual int ExecuteSingleTest(cl_device_id deviceID, cl_context context, cl_command_queue queue) - { - if(MemoryOrder() == MEMORY_ORDER_RELEASE || - MemoryOrder() == MEMORY_ORDER_ACQ_REL) - return 0; //skip test - not applicable - - if (CheckCapabilities(MemoryScope(), MemoryOrder()) == TEST_SKIPPED_ITSELF) - return 0; // skip test - not applicable - - return CBasicTestMemOrderScope<HostAtomicType, HostDataType>::ExecuteSingleTest(deviceID, context, queue); - } - virtual std::string ProgramCore() - { - // In the case this test is run with MEMORY_ORDER_ACQUIRE, the store - // should be MEMORY_ORDER_RELEASE - std::string memoryOrderScopeLoad = MemoryOrderScopeStr(); - std::string memoryOrderScopeStore = - (MemoryOrder() == MEMORY_ORDER_ACQUIRE) - ? (", memory_order_release" + MemoryScopeStr()) - : memoryOrderScopeLoad; - std::string postfix(memoryOrderScopeLoad.empty() ? 
"" : "_explicit"); - return " atomic_store" + postfix + "(&destMemory[tid], tid" - + memoryOrderScopeStore - + ");\n" - " oldValues[tid] = atomic_load" - + postfix + "(&destMemory[tid]" + memoryOrderScopeLoad + ");\n"; - } - virtual void HostFunction(cl_uint tid, cl_uint threadCount, volatile HostAtomicType *destMemory, HostDataType *oldValues) - { - host_atomic_store(&destMemory[tid], (HostDataType)tid, MEMORY_ORDER_SEQ_CST); - oldValues[tid] = host_atomic_load<HostAtomicType, HostDataType>(&destMemory[tid], MemoryOrder()); - } - virtual bool ExpectedValue(HostDataType &expected, cl_uint threadCount, HostDataType *startRefValues, cl_uint whichDestValue) - { - expected = (HostDataType)whichDestValue; - return true; - } - virtual bool VerifyRefs(bool &correct, cl_uint threadCount, HostDataType *refValues, HostAtomicType *finalValues) - { - correct = true; - for(cl_uint i = 0; i < threadCount; i++ ) - { - if(refValues[i] != (HostDataType)i) - { - log_error("Invalid value for thread %u\n", (cl_uint)i); - correct = false; + using CBasicTestMemOrderScope<HostAtomicType, HostDataType>::OldValueCheck; + using CBasicTestMemOrderScope<HostAtomicType, HostDataType>::MemoryOrder; + using CBasicTestMemOrderScope<HostAtomicType, HostDataType>::MemoryScope; + using CBasicTestMemOrderScope<HostAtomicType, + HostDataType>::MemoryOrderScopeStr; + using CBasicTestMemOrderScope<HostAtomicType, HostDataType>::MemoryScopeStr; + using CBasicTest<HostAtomicType, HostDataType>::CheckCapabilities; + CBasicTestLoad(TExplicitAtomicType dataType, bool useSVM) + : CBasicTestMemOrderScope<HostAtomicType, HostDataType>(dataType, + useSVM) + { + OldValueCheck(false); + } + virtual cl_uint NumResults(cl_uint threadCount, cl_device_id deviceID) + { + return threadCount; + } + virtual int ExecuteSingleTest(cl_device_id deviceID, cl_context context, + cl_command_queue queue) + { + if (MemoryOrder() == MEMORY_ORDER_RELEASE + || MemoryOrder() == MEMORY_ORDER_ACQ_REL) + return 0; // skip test - not 
applicable + + if (CheckCapabilities(MemoryScope(), MemoryOrder()) + == TEST_SKIPPED_ITSELF) + return 0; // skip test - not applicable + + return CBasicTestMemOrderScope< + HostAtomicType, HostDataType>::ExecuteSingleTest(deviceID, context, + queue); + } + virtual std::string ProgramCore() + { + // In the case this test is run with MEMORY_ORDER_ACQUIRE, the store + // should be MEMORY_ORDER_RELEASE + std::string memoryOrderScopeLoad = MemoryOrderScopeStr(); + std::string memoryOrderScopeStore = + (MemoryOrder() == MEMORY_ORDER_ACQUIRE) + ? (", memory_order_release" + MemoryScopeStr()) + : memoryOrderScopeLoad; + std::string postfix(memoryOrderScopeLoad.empty() ? "" : "_explicit"); + return " atomic_store" + postfix + "(&destMemory[tid], tid" + + memoryOrderScopeStore + + ");\n" + " oldValues[tid] = atomic_load" + + postfix + "(&destMemory[tid]" + memoryOrderScopeLoad + ");\n"; + } + virtual void HostFunction(cl_uint tid, cl_uint threadCount, + volatile HostAtomicType *destMemory, + HostDataType *oldValues) + { + host_atomic_store(&destMemory[tid], (HostDataType)tid, + MEMORY_ORDER_SEQ_CST); + oldValues[tid] = host_atomic_load<HostAtomicType, HostDataType>( + &destMemory[tid], MemoryOrder()); + } + virtual bool ExpectedValue(HostDataType &expected, cl_uint threadCount, + HostDataType *startRefValues, + cl_uint whichDestValue) + { + expected = (HostDataType)whichDestValue; + return true; + } + virtual bool VerifyRefs(bool &correct, cl_uint threadCount, + HostDataType *refValues, + HostAtomicType *finalValues) + { + correct = true; + for (cl_uint i = 0; i < threadCount; i++) + { + if (refValues[i] != (HostDataType)i) + { + log_error("Invalid value for thread %u\n", (cl_uint)i); + correct = false; + return true; + } + } return true; - } } - return true; - } }; -int test_atomic_load_generic(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements, bool useSVM) +int test_atomic_load_generic(cl_device_id deviceID, cl_context context, + 
cl_command_queue queue, int num_elements, + bool useSVM) { - int error = 0; - CBasicTestLoad<HOST_ATOMIC_INT, HOST_INT> test_int(TYPE_ATOMIC_INT, useSVM); - EXECUTE_TEST(error, test_int.Execute(deviceID, context, queue, num_elements)); - CBasicTestLoad<HOST_ATOMIC_UINT, HOST_UINT> test_uint(TYPE_ATOMIC_UINT, useSVM); - EXECUTE_TEST(error, test_uint.Execute(deviceID, context, queue, num_elements)); - CBasicTestLoad<HOST_ATOMIC_LONG, HOST_LONG> test_long(TYPE_ATOMIC_LONG, useSVM); - EXECUTE_TEST(error, test_long.Execute(deviceID, context, queue, num_elements)); - CBasicTestLoad<HOST_ATOMIC_ULONG, HOST_ULONG> test_ulong(TYPE_ATOMIC_ULONG, useSVM); - EXECUTE_TEST(error, test_ulong.Execute(deviceID, context, queue, num_elements)); - CBasicTestLoad<HOST_ATOMIC_FLOAT, HOST_FLOAT> test_float(TYPE_ATOMIC_FLOAT, useSVM); - EXECUTE_TEST(error, test_float.Execute(deviceID, context, queue, num_elements)); - CBasicTestLoad<HOST_ATOMIC_DOUBLE, HOST_DOUBLE> test_double(TYPE_ATOMIC_DOUBLE, useSVM); - EXECUTE_TEST(error, test_double.Execute(deviceID, context, queue, num_elements)); - if(AtomicTypeInfo(TYPE_ATOMIC_SIZE_T).Size(deviceID) == 4) - { - CBasicTestLoad<HOST_ATOMIC_INTPTR_T32, HOST_INTPTR_T32> test_intptr_t(TYPE_ATOMIC_INTPTR_T, useSVM); - EXECUTE_TEST(error, test_intptr_t.Execute(deviceID, context, queue, num_elements)); - CBasicTestLoad<HOST_ATOMIC_UINTPTR_T32, HOST_UINTPTR_T32> test_uintptr_t(TYPE_ATOMIC_UINTPTR_T, useSVM); - EXECUTE_TEST(error, test_uintptr_t.Execute(deviceID, context, queue, num_elements)); - CBasicTestLoad<HOST_ATOMIC_SIZE_T32, HOST_SIZE_T32> test_size_t(TYPE_ATOMIC_SIZE_T, useSVM); - EXECUTE_TEST(error, test_size_t.Execute(deviceID, context, queue, num_elements)); - CBasicTestLoad<HOST_ATOMIC_PTRDIFF_T32, HOST_PTRDIFF_T32> test_ptrdiff_t(TYPE_ATOMIC_PTRDIFF_T, useSVM); - EXECUTE_TEST(error, test_ptrdiff_t.Execute(deviceID, context, queue, num_elements)); - } - else - { - CBasicTestLoad<HOST_ATOMIC_INTPTR_T64, HOST_INTPTR_T64> 
test_intptr_t(TYPE_ATOMIC_INTPTR_T, useSVM); - EXECUTE_TEST(error, test_intptr_t.Execute(deviceID, context, queue, num_elements)); - CBasicTestLoad<HOST_ATOMIC_UINTPTR_T64, HOST_UINTPTR_T64> test_uintptr_t(TYPE_ATOMIC_UINTPTR_T, useSVM); - EXECUTE_TEST(error, test_uintptr_t.Execute(deviceID, context, queue, num_elements)); - CBasicTestLoad<HOST_ATOMIC_SIZE_T64, HOST_SIZE_T64> test_size_t(TYPE_ATOMIC_SIZE_T, useSVM); - EXECUTE_TEST(error, test_size_t.Execute(deviceID, context, queue, num_elements)); - CBasicTestLoad<HOST_ATOMIC_PTRDIFF_T64, HOST_PTRDIFF_T64> test_ptrdiff_t(TYPE_ATOMIC_PTRDIFF_T, useSVM); - EXECUTE_TEST(error, test_ptrdiff_t.Execute(deviceID, context, queue, num_elements)); - } - return error; + int error = 0; + CBasicTestLoad<HOST_ATOMIC_INT, HOST_INT> test_int(TYPE_ATOMIC_INT, useSVM); + EXECUTE_TEST(error, + test_int.Execute(deviceID, context, queue, num_elements)); + CBasicTestLoad<HOST_ATOMIC_UINT, HOST_UINT> test_uint(TYPE_ATOMIC_UINT, + useSVM); + EXECUTE_TEST(error, + test_uint.Execute(deviceID, context, queue, num_elements)); + CBasicTestLoad<HOST_ATOMIC_LONG, HOST_LONG> test_long(TYPE_ATOMIC_LONG, + useSVM); + EXECUTE_TEST(error, + test_long.Execute(deviceID, context, queue, num_elements)); + CBasicTestLoad<HOST_ATOMIC_ULONG, HOST_ULONG> test_ulong(TYPE_ATOMIC_ULONG, + useSVM); + EXECUTE_TEST(error, + test_ulong.Execute(deviceID, context, queue, num_elements)); + CBasicTestLoad<HOST_ATOMIC_FLOAT, HOST_FLOAT> test_float(TYPE_ATOMIC_FLOAT, + useSVM); + EXECUTE_TEST(error, + test_float.Execute(deviceID, context, queue, num_elements)); + CBasicTestLoad<HOST_ATOMIC_DOUBLE, HOST_DOUBLE> test_double( + TYPE_ATOMIC_DOUBLE, useSVM); + EXECUTE_TEST(error, + test_double.Execute(deviceID, context, queue, num_elements)); + if (AtomicTypeInfo(TYPE_ATOMIC_SIZE_T).Size(deviceID) == 4) + { + CBasicTestLoad<HOST_ATOMIC_INTPTR_T32, HOST_INTPTR_T32> test_intptr_t( + TYPE_ATOMIC_INTPTR_T, useSVM); + EXECUTE_TEST( + error, + test_intptr_t.Execute(deviceID, 
context, queue, num_elements)); + CBasicTestLoad<HOST_ATOMIC_UINTPTR_T32, HOST_UINTPTR_T32> + test_uintptr_t(TYPE_ATOMIC_UINTPTR_T, useSVM); + EXECUTE_TEST( + error, + test_uintptr_t.Execute(deviceID, context, queue, num_elements)); + CBasicTestLoad<HOST_ATOMIC_SIZE_T32, HOST_SIZE_T32> test_size_t( + TYPE_ATOMIC_SIZE_T, useSVM); + EXECUTE_TEST( + error, test_size_t.Execute(deviceID, context, queue, num_elements)); + CBasicTestLoad<HOST_ATOMIC_PTRDIFF_T32, HOST_PTRDIFF_T32> + test_ptrdiff_t(TYPE_ATOMIC_PTRDIFF_T, useSVM); + EXECUTE_TEST( + error, + test_ptrdiff_t.Execute(deviceID, context, queue, num_elements)); + } + else + { + CBasicTestLoad<HOST_ATOMIC_INTPTR_T64, HOST_INTPTR_T64> test_intptr_t( + TYPE_ATOMIC_INTPTR_T, useSVM); + EXECUTE_TEST( + error, + test_intptr_t.Execute(deviceID, context, queue, num_elements)); + CBasicTestLoad<HOST_ATOMIC_UINTPTR_T64, HOST_UINTPTR_T64> + test_uintptr_t(TYPE_ATOMIC_UINTPTR_T, useSVM); + EXECUTE_TEST( + error, + test_uintptr_t.Execute(deviceID, context, queue, num_elements)); + CBasicTestLoad<HOST_ATOMIC_SIZE_T64, HOST_SIZE_T64> test_size_t( + TYPE_ATOMIC_SIZE_T, useSVM); + EXECUTE_TEST( + error, test_size_t.Execute(deviceID, context, queue, num_elements)); + CBasicTestLoad<HOST_ATOMIC_PTRDIFF_T64, HOST_PTRDIFF_T64> + test_ptrdiff_t(TYPE_ATOMIC_PTRDIFF_T, useSVM); + EXECUTE_TEST( + error, + test_ptrdiff_t.Execute(deviceID, context, queue, num_elements)); + } + return error; } -int test_atomic_load(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements) +int test_atomic_load(cl_device_id deviceID, cl_context context, + cl_command_queue queue, int num_elements) { - return test_atomic_load_generic(deviceID, context, queue, num_elements, false); + return test_atomic_load_generic(deviceID, context, queue, num_elements, + false); } -int test_svm_atomic_load(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements) +int test_svm_atomic_load(cl_device_id deviceID, cl_context 
context, + cl_command_queue queue, int num_elements) { - return test_atomic_load_generic(deviceID, context, queue, num_elements, true); + return test_atomic_load_generic(deviceID, context, queue, num_elements, + true); } -template<typename HostAtomicType, typename HostDataType> -class CBasicTestExchange : public CBasicTestMemOrderScope<HostAtomicType, HostDataType> -{ +template <typename HostAtomicType, typename HostDataType> +class CBasicTestExchange + : public CBasicTestMemOrderScope<HostAtomicType, HostDataType> { public: - using CBasicTestMemOrderScope<HostAtomicType, HostDataType>::OldValueCheck; - using CBasicTestMemOrderScope<HostAtomicType, HostDataType>::StartValue; - using CBasicTestMemOrderScope<HostAtomicType, HostDataType>::MemoryOrder; - using CBasicTestMemOrderScope<HostAtomicType, HostDataType>::MemoryOrderScopeStr; - using CBasicTestMemOrderScope<HostAtomicType, HostDataType>::Iterations; - using CBasicTestMemOrderScope<HostAtomicType, HostDataType>::IterationsStr; - CBasicTestExchange(TExplicitAtomicType dataType, bool useSVM) : CBasicTestMemOrderScope<HostAtomicType, HostDataType>(dataType, useSVM) - { - StartValue(123456); - } - virtual std::string ProgramCore() - { - std::string memoryOrderScope = MemoryOrderScopeStr(); - std::string postfix(memoryOrderScope.empty() ? 
"" : "_explicit"); - return - " oldValues[tid] = atomic_exchange"+postfix+"(&destMemory[0], tid"+memoryOrderScope+");\n" - " for(int i = 0; i < "+IterationsStr()+"; i++)\n" - " oldValues[tid] = atomic_exchange"+postfix+"(&destMemory[0], oldValues[tid]"+memoryOrderScope+");\n"; - } - - virtual void HostFunction(cl_uint tid, cl_uint threadCount, volatile HostAtomicType *destMemory, HostDataType *oldValues) - { - oldValues[tid] = host_atomic_exchange(&destMemory[0], (HostDataType)tid, MemoryOrder()); - for(int i = 0; i < Iterations(); i++) - oldValues[tid] = host_atomic_exchange(&destMemory[0], oldValues[tid], MemoryOrder()); - } - virtual bool VerifyRefs(bool &correct, cl_uint threadCount, HostDataType *refValues, HostAtomicType *finalValues) - { - OldValueCheck(Iterations()%2 == 0); //check is valid for even number of iterations only - correct = true; - /* We are expecting values from 0 to size-1 and initial value from atomic variable */ - /* These values must be distributed across refValues array and atomic variable finalVaue[0] */ - /* Any repeated value is treated as an error */ - std::vector<bool> tidFound(threadCount); - bool startValueFound = false; - cl_uint i; - - for(i = 0; i <= threadCount; i++) - { - cl_uint value; - if(i == threadCount) - value = (cl_uint)finalValues[0]; //additional value from atomic variable (last written) - else - value = (cl_uint)refValues[i]; - if(value == (cl_uint)StartValue()) - { - // Special initial value - if(startValueFound) + using CBasicTestMemOrderScope<HostAtomicType, HostDataType>::OldValueCheck; + using CBasicTestMemOrderScope<HostAtomicType, HostDataType>::StartValue; + using CBasicTestMemOrderScope<HostAtomicType, HostDataType>::MemoryOrder; + using CBasicTestMemOrderScope<HostAtomicType, + HostDataType>::MemoryOrderScopeStr; + using CBasicTestMemOrderScope<HostAtomicType, HostDataType>::Iterations; + using CBasicTestMemOrderScope<HostAtomicType, HostDataType>::IterationsStr; + CBasicTestExchange(TExplicitAtomicType 
dataType, bool useSVM) + : CBasicTestMemOrderScope<HostAtomicType, HostDataType>(dataType, + useSVM) + { + StartValue(123456); + } + virtual std::string ProgramCore() + { + std::string memoryOrderScope = MemoryOrderScopeStr(); + std::string postfix(memoryOrderScope.empty() ? "" : "_explicit"); + return " oldValues[tid] = atomic_exchange" + postfix + + "(&destMemory[0], tid" + memoryOrderScope + + ");\n" + " for(int i = 0; i < " + + IterationsStr() + + "; i++)\n" + " oldValues[tid] = atomic_exchange" + + postfix + "(&destMemory[0], oldValues[tid]" + memoryOrderScope + + ");\n"; + } + + virtual void HostFunction(cl_uint tid, cl_uint threadCount, + volatile HostAtomicType *destMemory, + HostDataType *oldValues) + { + oldValues[tid] = host_atomic_exchange(&destMemory[0], (HostDataType)tid, + MemoryOrder()); + for (int i = 0; i < Iterations(); i++) + oldValues[tid] = host_atomic_exchange( + &destMemory[0], oldValues[tid], MemoryOrder()); + } + virtual bool VerifyRefs(bool &correct, cl_uint threadCount, + HostDataType *refValues, + HostAtomicType *finalValues) + { + OldValueCheck( + Iterations() % 2 + == 0); // check is valid for even number of iterations only + correct = true; + /* We are expecting values from 0 to size-1 and initial value from + * atomic variable */ + /* These values must be distributed across refValues array and atomic + * variable finalVaue[0] */ + /* Any repeated value is treated as an error */ + std::vector<bool> tidFound(threadCount); + bool startValueFound = false; + cl_uint i; + + for (i = 0; i <= threadCount; i++) { - log_error("ERROR: Starting reference value (%u) occurred more thane once\n", (cl_uint)StartValue()); - correct = false; - return true; + cl_uint value; + if (i == threadCount) + value = (cl_uint)finalValues[0]; // additional value from atomic + // variable (last written) + else + value = (cl_uint)refValues[i]; + if (value == (cl_uint)StartValue()) + { + // Special initial value + if (startValueFound) + { + log_error("ERROR: 
Starting reference value (%u) occurred " + "more thane once\n", + (cl_uint)StartValue()); + correct = false; + return true; + } + startValueFound = true; + continue; + } + if (value >= threadCount) + { + log_error( + "ERROR: Reference value %u outside of valid range! (%u)\n", + i, value); + correct = false; + return true; + } + if (tidFound[value]) + { + log_error("ERROR: Value (%u) occurred more thane once\n", + value); + correct = false; + return true; + } + tidFound[value] = true; } - startValueFound = true; - continue; - } - if(value >= threadCount) - { - log_error("ERROR: Reference value %u outside of valid range! (%u)\n", i, value); - correct = false; - return true; - } - if(tidFound[value]) - { - log_error("ERROR: Value (%u) occurred more thane once\n", value); - correct = false; return true; - } - tidFound[value] = true; } - return true; - } }; -int test_atomic_exchange_generic(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements, bool useSVM) +int test_atomic_exchange_generic(cl_device_id deviceID, cl_context context, + cl_command_queue queue, int num_elements, + bool useSVM) { - int error = 0; - CBasicTestExchange<HOST_ATOMIC_INT, HOST_INT> test_int(TYPE_ATOMIC_INT, useSVM); - EXECUTE_TEST(error, test_int.Execute(deviceID, context, queue, num_elements)); - CBasicTestExchange<HOST_ATOMIC_UINT, HOST_UINT> test_uint(TYPE_ATOMIC_UINT, useSVM); - EXECUTE_TEST(error, test_uint.Execute(deviceID, context, queue, num_elements)); - CBasicTestExchange<HOST_ATOMIC_LONG, HOST_LONG> test_long(TYPE_ATOMIC_LONG, useSVM); - EXECUTE_TEST(error, test_long.Execute(deviceID, context, queue, num_elements)); - CBasicTestExchange<HOST_ATOMIC_ULONG, HOST_ULONG> test_ulong(TYPE_ATOMIC_ULONG, useSVM); - EXECUTE_TEST(error, test_ulong.Execute(deviceID, context, queue, num_elements)); - CBasicTestExchange<HOST_ATOMIC_FLOAT, HOST_FLOAT> test_float(TYPE_ATOMIC_FLOAT, useSVM); - EXECUTE_TEST(error, test_float.Execute(deviceID, context, queue, 
num_elements)); - CBasicTestExchange<HOST_ATOMIC_DOUBLE, HOST_DOUBLE> test_double(TYPE_ATOMIC_DOUBLE, useSVM); - EXECUTE_TEST(error, test_double.Execute(deviceID, context, queue, num_elements)); - if(AtomicTypeInfo(TYPE_ATOMIC_SIZE_T).Size(deviceID) == 4) - { - CBasicTestExchange<HOST_ATOMIC_INTPTR_T32, HOST_INTPTR_T32> test_intptr_t(TYPE_ATOMIC_INTPTR_T, useSVM); - EXECUTE_TEST(error, test_intptr_t.Execute(deviceID, context, queue, num_elements)); - CBasicTestExchange<HOST_ATOMIC_UINTPTR_T32, HOST_UINTPTR_T32> test_uintptr_t(TYPE_ATOMIC_UINTPTR_T, useSVM); - EXECUTE_TEST(error, test_uintptr_t.Execute(deviceID, context, queue, num_elements)); - CBasicTestExchange<HOST_ATOMIC_SIZE_T32, HOST_SIZE_T32> test_size_t(TYPE_ATOMIC_SIZE_T, useSVM); - EXECUTE_TEST(error, test_size_t.Execute(deviceID, context, queue, num_elements)); - CBasicTestExchange<HOST_ATOMIC_PTRDIFF_T32, HOST_PTRDIFF_T32> test_ptrdiff_t(TYPE_ATOMIC_PTRDIFF_T, useSVM); - EXECUTE_TEST(error, test_ptrdiff_t.Execute(deviceID, context, queue, num_elements)); - } - else - { - CBasicTestExchange<HOST_ATOMIC_INTPTR_T64, HOST_INTPTR_T64> test_intptr_t(TYPE_ATOMIC_INTPTR_T, useSVM); - EXECUTE_TEST(error, test_intptr_t.Execute(deviceID, context, queue, num_elements)); - CBasicTestExchange<HOST_ATOMIC_UINTPTR_T64, HOST_UINTPTR_T64> test_uintptr_t(TYPE_ATOMIC_UINTPTR_T, useSVM); - EXECUTE_TEST(error, test_uintptr_t.Execute(deviceID, context, queue, num_elements)); - CBasicTestExchange<HOST_ATOMIC_SIZE_T64, HOST_SIZE_T64> test_size_t(TYPE_ATOMIC_SIZE_T, useSVM); - EXECUTE_TEST(error, test_size_t.Execute(deviceID, context, queue, num_elements)); - CBasicTestExchange<HOST_ATOMIC_PTRDIFF_T64, HOST_PTRDIFF_T64> test_ptrdiff_t(TYPE_ATOMIC_PTRDIFF_T, useSVM); - EXECUTE_TEST(error, test_ptrdiff_t.Execute(deviceID, context, queue, num_elements)); - } - return error; + int error = 0; + CBasicTestExchange<HOST_ATOMIC_INT, HOST_INT> test_int(TYPE_ATOMIC_INT, + useSVM); + EXECUTE_TEST(error, + test_int.Execute(deviceID, 
context, queue, num_elements)); + CBasicTestExchange<HOST_ATOMIC_UINT, HOST_UINT> test_uint(TYPE_ATOMIC_UINT, + useSVM); + EXECUTE_TEST(error, + test_uint.Execute(deviceID, context, queue, num_elements)); + CBasicTestExchange<HOST_ATOMIC_LONG, HOST_LONG> test_long(TYPE_ATOMIC_LONG, + useSVM); + EXECUTE_TEST(error, + test_long.Execute(deviceID, context, queue, num_elements)); + CBasicTestExchange<HOST_ATOMIC_ULONG, HOST_ULONG> test_ulong( + TYPE_ATOMIC_ULONG, useSVM); + EXECUTE_TEST(error, + test_ulong.Execute(deviceID, context, queue, num_elements)); + CBasicTestExchange<HOST_ATOMIC_FLOAT, HOST_FLOAT> test_float( + TYPE_ATOMIC_FLOAT, useSVM); + EXECUTE_TEST(error, + test_float.Execute(deviceID, context, queue, num_elements)); + CBasicTestExchange<HOST_ATOMIC_DOUBLE, HOST_DOUBLE> test_double( + TYPE_ATOMIC_DOUBLE, useSVM); + EXECUTE_TEST(error, + test_double.Execute(deviceID, context, queue, num_elements)); + if (AtomicTypeInfo(TYPE_ATOMIC_SIZE_T).Size(deviceID) == 4) + { + CBasicTestExchange<HOST_ATOMIC_INTPTR_T32, HOST_INTPTR_T32> + test_intptr_t(TYPE_ATOMIC_INTPTR_T, useSVM); + EXECUTE_TEST( + error, + test_intptr_t.Execute(deviceID, context, queue, num_elements)); + CBasicTestExchange<HOST_ATOMIC_UINTPTR_T32, HOST_UINTPTR_T32> + test_uintptr_t(TYPE_ATOMIC_UINTPTR_T, useSVM); + EXECUTE_TEST( + error, + test_uintptr_t.Execute(deviceID, context, queue, num_elements)); + CBasicTestExchange<HOST_ATOMIC_SIZE_T32, HOST_SIZE_T32> test_size_t( + TYPE_ATOMIC_SIZE_T, useSVM); + EXECUTE_TEST( + error, test_size_t.Execute(deviceID, context, queue, num_elements)); + CBasicTestExchange<HOST_ATOMIC_PTRDIFF_T32, HOST_PTRDIFF_T32> + test_ptrdiff_t(TYPE_ATOMIC_PTRDIFF_T, useSVM); + EXECUTE_TEST( + error, + test_ptrdiff_t.Execute(deviceID, context, queue, num_elements)); + } + else + { + CBasicTestExchange<HOST_ATOMIC_INTPTR_T64, HOST_INTPTR_T64> + test_intptr_t(TYPE_ATOMIC_INTPTR_T, useSVM); + EXECUTE_TEST( + error, + test_intptr_t.Execute(deviceID, context, queue, num_elements)); 
+ CBasicTestExchange<HOST_ATOMIC_UINTPTR_T64, HOST_UINTPTR_T64> + test_uintptr_t(TYPE_ATOMIC_UINTPTR_T, useSVM); + EXECUTE_TEST( + error, + test_uintptr_t.Execute(deviceID, context, queue, num_elements)); + CBasicTestExchange<HOST_ATOMIC_SIZE_T64, HOST_SIZE_T64> test_size_t( + TYPE_ATOMIC_SIZE_T, useSVM); + EXECUTE_TEST( + error, test_size_t.Execute(deviceID, context, queue, num_elements)); + CBasicTestExchange<HOST_ATOMIC_PTRDIFF_T64, HOST_PTRDIFF_T64> + test_ptrdiff_t(TYPE_ATOMIC_PTRDIFF_T, useSVM); + EXECUTE_TEST( + error, + test_ptrdiff_t.Execute(deviceID, context, queue, num_elements)); + } + return error; } -int test_atomic_exchange(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements) +int test_atomic_exchange(cl_device_id deviceID, cl_context context, + cl_command_queue queue, int num_elements) { - return test_atomic_exchange_generic(deviceID, context, queue, num_elements, false); + return test_atomic_exchange_generic(deviceID, context, queue, num_elements, + false); } -int test_svm_atomic_exchange(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements) +int test_svm_atomic_exchange(cl_device_id deviceID, cl_context context, + cl_command_queue queue, int num_elements) { - return test_atomic_exchange_generic(deviceID, context, queue, num_elements, true); + return test_atomic_exchange_generic(deviceID, context, queue, num_elements, + true); } -template<typename HostAtomicType, typename HostDataType> -class CBasicTestCompareStrong : public CBasicTestMemOrder2Scope<HostAtomicType, HostDataType> -{ +template <typename HostAtomicType, typename HostDataType> +class CBasicTestCompareStrong + : public CBasicTestMemOrder2Scope<HostAtomicType, HostDataType> { public: - using CBasicTestMemOrder2Scope<HostAtomicType, HostDataType>::StartValue; - using CBasicTestMemOrder2Scope<HostAtomicType, HostDataType>::OldValueCheck; - using CBasicTestMemOrder2Scope<HostAtomicType, HostDataType>::MemoryOrder; - using 
CBasicTestMemOrder2Scope<HostAtomicType, HostDataType>::MemoryOrder2; - using CBasicTestMemOrder2Scope<HostAtomicType, HostDataType>::MemoryOrderScope; - using CBasicTestMemOrder2Scope<HostAtomicType, HostDataType>::MemoryScope; - using CBasicTestMemOrder2Scope<HostAtomicType, HostDataType>::DataType; - using CBasicTestMemOrder2Scope<HostAtomicType, HostDataType>::Iterations; - using CBasicTestMemOrder2Scope<HostAtomicType, HostDataType>::IterationsStr; - using CBasicTest<HostAtomicType, HostDataType>::CheckCapabilities; - CBasicTestCompareStrong(TExplicitAtomicType dataType, bool useSVM) : CBasicTestMemOrder2Scope<HostAtomicType, HostDataType>(dataType, useSVM) - { - StartValue(123456); - OldValueCheck(false); - } - virtual int ExecuteSingleTest(cl_device_id deviceID, cl_context context, cl_command_queue queue) - { - if(MemoryOrder2() == MEMORY_ORDER_RELEASE || - MemoryOrder2() == MEMORY_ORDER_ACQ_REL) - return 0; // not allowed as 'failure' argument - if((MemoryOrder() == MEMORY_ORDER_RELAXED && MemoryOrder2() != MEMORY_ORDER_RELAXED) || - (MemoryOrder() != MEMORY_ORDER_SEQ_CST && MemoryOrder2() == MEMORY_ORDER_SEQ_CST)) - return 0; // failure argument shall be no stronger than the success - - if (CheckCapabilities(MemoryScope(), MemoryOrder()) == TEST_SKIPPED_ITSELF) - return 0; // skip test - not applicable - - if (CheckCapabilities(MemoryScope(), MemoryOrder2()) == TEST_SKIPPED_ITSELF) - return 0; // skip test - not applicable - - return CBasicTestMemOrder2Scope<HostAtomicType, HostDataType>::ExecuteSingleTest(deviceID, context, queue); - } - virtual std::string ProgramCore() - { - std::string memoryOrderScope = MemoryOrderScope(); - std::string postfix(memoryOrderScope.empty() ? 
"" : "_explicit"); - return - std::string(" ")+DataType().RegularTypeName()+" expected, previous;\n" - " int successCount = 0;\n" - " oldValues[tid] = tid;\n" - " expected = tid; // force failure at the beginning\n" - " if(atomic_compare_exchange_strong"+postfix+"(&destMemory[0], &expected, oldValues[tid]"+memoryOrderScope+") || expected == tid)\n" - " oldValues[tid] = threadCount+1; //mark unexpected success with invalid value\n" - " else\n" - " {\n" - " for(int i = 0; i < "+IterationsStr()+" || successCount == 0; i++)\n" - " {\n" - " previous = expected;\n" - " if(atomic_compare_exchange_strong"+postfix+"(&destMemory[0], &expected, oldValues[tid]"+memoryOrderScope+"))\n" - " {\n" - " oldValues[tid] = expected;\n" - " successCount++;\n" - " }\n" - " else\n" - " {\n" - " if(previous == expected) // spurious failure - shouldn't occur for 'strong'\n" - " {\n" - " oldValues[tid] = threadCount; //mark fail with invalid value\n" - " break;\n" - " }\n" - " }\n" - " }\n" - " }\n"; - } - virtual void HostFunction(cl_uint tid, cl_uint threadCount, volatile HostAtomicType *destMemory, HostDataType *oldValues) - { - HostDataType expected = (HostDataType)StartValue(), previous; - oldValues[tid] = (HostDataType)tid; - for(int i = 0; i < Iterations(); i++) - { - previous = expected; - if(host_atomic_compare_exchange(&destMemory[0], &expected, oldValues[tid], MemoryOrder(), MemoryOrder2())) - oldValues[tid] = expected; - else - { - if(previous == expected) // shouldn't occur for 'strong' + using CBasicTestMemOrder2Scope<HostAtomicType, HostDataType>::StartValue; + using CBasicTestMemOrder2Scope<HostAtomicType, HostDataType>::OldValueCheck; + using CBasicTestMemOrder2Scope<HostAtomicType, HostDataType>::MemoryOrder; + using CBasicTestMemOrder2Scope<HostAtomicType, HostDataType>::MemoryOrder2; + using CBasicTestMemOrder2Scope<HostAtomicType, + HostDataType>::MemoryOrderScope; + using CBasicTestMemOrder2Scope<HostAtomicType, HostDataType>::MemoryScope; + using 
CBasicTestMemOrder2Scope<HostAtomicType, HostDataType>::DataType; + using CBasicTestMemOrder2Scope<HostAtomicType, HostDataType>::Iterations; + using CBasicTestMemOrder2Scope<HostAtomicType, HostDataType>::IterationsStr; + using CBasicTest<HostAtomicType, HostDataType>::CheckCapabilities; + CBasicTestCompareStrong(TExplicitAtomicType dataType, bool useSVM) + : CBasicTestMemOrder2Scope<HostAtomicType, HostDataType>(dataType, + useSVM) + { + StartValue(123456); + OldValueCheck(false); + } + virtual int ExecuteSingleTest(cl_device_id deviceID, cl_context context, + cl_command_queue queue) + { + if (MemoryOrder2() == MEMORY_ORDER_RELEASE + || MemoryOrder2() == MEMORY_ORDER_ACQ_REL) + return 0; // not allowed as 'failure' argument + if ((MemoryOrder() == MEMORY_ORDER_RELAXED + && MemoryOrder2() != MEMORY_ORDER_RELAXED) + || (MemoryOrder() != MEMORY_ORDER_SEQ_CST + && MemoryOrder2() == MEMORY_ORDER_SEQ_CST)) + return 0; // failure argument shall be no stronger than the success + + if (CheckCapabilities(MemoryScope(), MemoryOrder()) + == TEST_SKIPPED_ITSELF) + return 0; // skip test - not applicable + + if (CheckCapabilities(MemoryScope(), MemoryOrder2()) + == TEST_SKIPPED_ITSELF) + return 0; // skip test - not applicable + + return CBasicTestMemOrder2Scope< + HostAtomicType, HostDataType>::ExecuteSingleTest(deviceID, context, + queue); + } + virtual std::string ProgramCore() + { + std::string memoryOrderScope = MemoryOrderScope(); + std::string postfix(memoryOrderScope.empty() ? 
"" : "_explicit"); + return std::string(" ") + DataType().RegularTypeName() + + " expected, previous;\n" + " int successCount = 0;\n" + " oldValues[tid] = tid;\n" + " expected = tid; // force failure at the beginning\n" + " if(atomic_compare_exchange_strong" + + postfix + "(&destMemory[0], &expected, oldValues[tid]" + + memoryOrderScope + + ") || expected == tid)\n" + " oldValues[tid] = threadCount+1; //mark unexpected success " + "with invalid value\n" + " else\n" + " {\n" + " for(int i = 0; i < " + + IterationsStr() + + " || successCount == 0; i++)\n" + " {\n" + " previous = expected;\n" + " if(atomic_compare_exchange_strong" + + postfix + "(&destMemory[0], &expected, oldValues[tid]" + + memoryOrderScope + + "))\n" + " {\n" + " oldValues[tid] = expected;\n" + " successCount++;\n" + " }\n" + " else\n" + " {\n" + " if(previous == expected) // spurious failure - " + "shouldn't occur for 'strong'\n" + " {\n" + " oldValues[tid] = threadCount; //mark fail with " + "invalid value\n" + " break;\n" + " }\n" + " }\n" + " }\n" + " }\n"; + } + virtual void HostFunction(cl_uint tid, cl_uint threadCount, + volatile HostAtomicType *destMemory, + HostDataType *oldValues) + { + HostDataType expected = (HostDataType)StartValue(), previous; + oldValues[tid] = (HostDataType)tid; + for (int i = 0; i < Iterations(); i++) { - oldValues[tid] = threadCount; //mark fail with invalid value + previous = expected; + if (host_atomic_compare_exchange(&destMemory[0], &expected, + oldValues[tid], MemoryOrder(), + MemoryOrder2())) + oldValues[tid] = expected; + else + { + if (previous == expected) // shouldn't occur for 'strong' + { + oldValues[tid] = threadCount; // mark fail with invalid + // value + } + } } - } - } - } - virtual bool VerifyRefs(bool &correct, cl_uint threadCount, HostDataType *refValues, HostAtomicType *finalValues) - { - correct = true; - /* We are expecting values from 0 to size-1 and initial value from atomic variable */ - /* These values must be distributed across 
refValues array and atomic variable finalVaue[0] */ - /* Any repeated value is treated as an error */ - std::vector<bool> tidFound(threadCount); - bool startValueFound = false; - cl_uint i; - - for(i = 0; i <= threadCount; i++) - { - cl_uint value; - if(i == threadCount) - value = (cl_uint)finalValues[0]; //additional value from atomic variable (last written) - else - value = (cl_uint)refValues[i]; - if(value == (cl_uint)StartValue()) - { - // Special initial value - if(startValueFound) + } + virtual bool VerifyRefs(bool &correct, cl_uint threadCount, + HostDataType *refValues, + HostAtomicType *finalValues) + { + correct = true; + /* We are expecting values from 0 to size-1 and initial value from + * atomic variable */ + /* These values must be distributed across refValues array and atomic + * variable finalVaue[0] */ + /* Any repeated value is treated as an error */ + std::vector<bool> tidFound(threadCount); + bool startValueFound = false; + cl_uint i; + + for (i = 0; i <= threadCount; i++) { - log_error("ERROR: Starting reference value (%u) occurred more thane once\n", (cl_uint)StartValue()); - correct = false; - return true; + cl_uint value; + if (i == threadCount) + value = (cl_uint)finalValues[0]; // additional value from atomic + // variable (last written) + else + value = (cl_uint)refValues[i]; + if (value == (cl_uint)StartValue()) + { + // Special initial value + if (startValueFound) + { + log_error("ERROR: Starting reference value (%u) occurred " + "more thane once\n", + (cl_uint)StartValue()); + correct = false; + return true; + } + startValueFound = true; + continue; + } + if (value >= threadCount) + { + if (value == threadCount) + log_error("ERROR: Spurious failure detected for " + "atomic_compare_exchange_strong\n"); + log_error( + "ERROR: Reference value %u outside of valid range! 
(%u)\n", + i, value); + correct = false; + return true; + } + if (tidFound[value]) + { + log_error("ERROR: Value (%u) occurred more thane once\n", + value); + correct = false; + return true; + } + tidFound[value] = true; } - startValueFound = true; - continue; - } - if(value >= threadCount) - { - if(value == threadCount) - log_error("ERROR: Spurious failure detected for atomic_compare_exchange_strong\n"); - log_error("ERROR: Reference value %u outside of valid range! (%u)\n", i, value); - correct = false; return true; - } - if(tidFound[value]) - { - log_error("ERROR: Value (%u) occurred more thane once\n", value); - correct = false; - return true; - } - tidFound[value] = true; } - return true; - } }; -int test_atomic_compare_exchange_strong_generic(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements, bool useSVM) +int test_atomic_compare_exchange_strong_generic(cl_device_id deviceID, + cl_context context, + cl_command_queue queue, + int num_elements, bool useSVM) { - int error = 0; - CBasicTestCompareStrong<HOST_ATOMIC_INT, HOST_INT> test_int(TYPE_ATOMIC_INT, useSVM); - EXECUTE_TEST(error, test_int.Execute(deviceID, context, queue, num_elements)); - CBasicTestCompareStrong<HOST_ATOMIC_UINT, HOST_UINT> test_uint(TYPE_ATOMIC_UINT, useSVM); - EXECUTE_TEST(error, test_uint.Execute(deviceID, context, queue, num_elements)); - CBasicTestCompareStrong<HOST_ATOMIC_LONG, HOST_LONG> test_long(TYPE_ATOMIC_LONG, useSVM); - EXECUTE_TEST(error, test_long.Execute(deviceID, context, queue, num_elements)); - CBasicTestCompareStrong<HOST_ATOMIC_ULONG, HOST_ULONG> test_ulong(TYPE_ATOMIC_ULONG, useSVM); - EXECUTE_TEST(error, test_ulong.Execute(deviceID, context, queue, num_elements)); - if(AtomicTypeInfo(TYPE_ATOMIC_SIZE_T).Size(deviceID) == 4) - { - CBasicTestCompareStrong<HOST_ATOMIC_INTPTR_T32, HOST_INTPTR_T32> test_intptr_t(TYPE_ATOMIC_INTPTR_T, useSVM); - EXECUTE_TEST(error, test_intptr_t.Execute(deviceID, context, queue, num_elements)); - 
CBasicTestCompareStrong<HOST_ATOMIC_UINTPTR_T32, HOST_UINTPTR_T32> test_uintptr_t(TYPE_ATOMIC_UINTPTR_T, useSVM); - EXECUTE_TEST(error, test_uintptr_t.Execute(deviceID, context, queue, num_elements)); - CBasicTestCompareStrong<HOST_ATOMIC_SIZE_T32, HOST_SIZE_T32> test_size_t(TYPE_ATOMIC_SIZE_T, useSVM); - EXECUTE_TEST(error, test_size_t.Execute(deviceID, context, queue, num_elements)); - CBasicTestCompareStrong<HOST_ATOMIC_PTRDIFF_T32, HOST_PTRDIFF_T32> test_ptrdiff_t(TYPE_ATOMIC_PTRDIFF_T, useSVM); - EXECUTE_TEST(error, test_ptrdiff_t.Execute(deviceID, context, queue, num_elements)); - } - else - { - CBasicTestCompareStrong<HOST_ATOMIC_INTPTR_T64, HOST_INTPTR_T64> test_intptr_t(TYPE_ATOMIC_INTPTR_T, useSVM); - EXECUTE_TEST(error, test_intptr_t.Execute(deviceID, context, queue, num_elements)); - CBasicTestCompareStrong<HOST_ATOMIC_UINTPTR_T64, HOST_UINTPTR_T64> test_uintptr_t(TYPE_ATOMIC_UINTPTR_T, useSVM); - EXECUTE_TEST(error, test_uintptr_t.Execute(deviceID, context, queue, num_elements)); - CBasicTestCompareStrong<HOST_ATOMIC_SIZE_T64, HOST_SIZE_T64> test_size_t(TYPE_ATOMIC_SIZE_T, useSVM); - EXECUTE_TEST(error, test_size_t.Execute(deviceID, context, queue, num_elements)); - CBasicTestCompareStrong<HOST_ATOMIC_PTRDIFF_T64, HOST_PTRDIFF_T64> test_ptrdiff_t(TYPE_ATOMIC_PTRDIFF_T, useSVM); - EXECUTE_TEST(error, test_ptrdiff_t.Execute(deviceID, context, queue, num_elements)); - } - return error; + int error = 0; + CBasicTestCompareStrong<HOST_ATOMIC_INT, HOST_INT> test_int(TYPE_ATOMIC_INT, + useSVM); + EXECUTE_TEST(error, + test_int.Execute(deviceID, context, queue, num_elements)); + CBasicTestCompareStrong<HOST_ATOMIC_UINT, HOST_UINT> test_uint( + TYPE_ATOMIC_UINT, useSVM); + EXECUTE_TEST(error, + test_uint.Execute(deviceID, context, queue, num_elements)); + CBasicTestCompareStrong<HOST_ATOMIC_LONG, HOST_LONG> test_long( + TYPE_ATOMIC_LONG, useSVM); + EXECUTE_TEST(error, + test_long.Execute(deviceID, context, queue, num_elements)); + 
CBasicTestCompareStrong<HOST_ATOMIC_ULONG, HOST_ULONG> test_ulong( + TYPE_ATOMIC_ULONG, useSVM); + EXECUTE_TEST(error, + test_ulong.Execute(deviceID, context, queue, num_elements)); + if (AtomicTypeInfo(TYPE_ATOMIC_SIZE_T).Size(deviceID) == 4) + { + CBasicTestCompareStrong<HOST_ATOMIC_INTPTR_T32, HOST_INTPTR_T32> + test_intptr_t(TYPE_ATOMIC_INTPTR_T, useSVM); + EXECUTE_TEST( + error, + test_intptr_t.Execute(deviceID, context, queue, num_elements)); + CBasicTestCompareStrong<HOST_ATOMIC_UINTPTR_T32, HOST_UINTPTR_T32> + test_uintptr_t(TYPE_ATOMIC_UINTPTR_T, useSVM); + EXECUTE_TEST( + error, + test_uintptr_t.Execute(deviceID, context, queue, num_elements)); + CBasicTestCompareStrong<HOST_ATOMIC_SIZE_T32, HOST_SIZE_T32> + test_size_t(TYPE_ATOMIC_SIZE_T, useSVM); + EXECUTE_TEST( + error, test_size_t.Execute(deviceID, context, queue, num_elements)); + CBasicTestCompareStrong<HOST_ATOMIC_PTRDIFF_T32, HOST_PTRDIFF_T32> + test_ptrdiff_t(TYPE_ATOMIC_PTRDIFF_T, useSVM); + EXECUTE_TEST( + error, + test_ptrdiff_t.Execute(deviceID, context, queue, num_elements)); + } + else + { + CBasicTestCompareStrong<HOST_ATOMIC_INTPTR_T64, HOST_INTPTR_T64> + test_intptr_t(TYPE_ATOMIC_INTPTR_T, useSVM); + EXECUTE_TEST( + error, + test_intptr_t.Execute(deviceID, context, queue, num_elements)); + CBasicTestCompareStrong<HOST_ATOMIC_UINTPTR_T64, HOST_UINTPTR_T64> + test_uintptr_t(TYPE_ATOMIC_UINTPTR_T, useSVM); + EXECUTE_TEST( + error, + test_uintptr_t.Execute(deviceID, context, queue, num_elements)); + CBasicTestCompareStrong<HOST_ATOMIC_SIZE_T64, HOST_SIZE_T64> + test_size_t(TYPE_ATOMIC_SIZE_T, useSVM); + EXECUTE_TEST( + error, test_size_t.Execute(deviceID, context, queue, num_elements)); + CBasicTestCompareStrong<HOST_ATOMIC_PTRDIFF_T64, HOST_PTRDIFF_T64> + test_ptrdiff_t(TYPE_ATOMIC_PTRDIFF_T, useSVM); + EXECUTE_TEST( + error, + test_ptrdiff_t.Execute(deviceID, context, queue, num_elements)); + } + return error; } -int test_atomic_compare_exchange_strong(cl_device_id deviceID, cl_context 
context, cl_command_queue queue, int num_elements) +int test_atomic_compare_exchange_strong(cl_device_id deviceID, + cl_context context, + cl_command_queue queue, + int num_elements) { - return test_atomic_compare_exchange_strong_generic(deviceID, context, queue, num_elements, false); + return test_atomic_compare_exchange_strong_generic(deviceID, context, queue, + num_elements, false); } -int test_svm_atomic_compare_exchange_strong(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements) +int test_svm_atomic_compare_exchange_strong(cl_device_id deviceID, + cl_context context, + cl_command_queue queue, + int num_elements) { - return test_atomic_compare_exchange_strong_generic(deviceID, context, queue, num_elements, true); + return test_atomic_compare_exchange_strong_generic(deviceID, context, queue, + num_elements, true); } -template<typename HostAtomicType, typename HostDataType> -class CBasicTestCompareWeak : public CBasicTestCompareStrong<HostAtomicType, HostDataType> -{ +template <typename HostAtomicType, typename HostDataType> +class CBasicTestCompareWeak + : public CBasicTestCompareStrong<HostAtomicType, HostDataType> { public: - using CBasicTestCompareStrong<HostAtomicType, HostDataType>::StartValue; - using CBasicTestCompareStrong<HostAtomicType, HostDataType>::MemoryOrderScope; - using CBasicTestCompareStrong<HostAtomicType, HostDataType>::DataType; - using CBasicTestCompareStrong<HostAtomicType, HostDataType>::Iterations; - using CBasicTestCompareStrong<HostAtomicType, HostDataType>::IterationsStr; - CBasicTestCompareWeak(TExplicitAtomicType dataType, bool useSVM) : CBasicTestCompareStrong<HostAtomicType, HostDataType>(dataType, useSVM) - { - } - virtual std::string ProgramCore() - { - std::string memoryOrderScope = MemoryOrderScope(); - std::string postfix(memoryOrderScope.empty() ? 
"" : "_explicit"); - return - std::string(" ")+DataType().RegularTypeName()+" expected , previous;\n" - " int successCount = 0;\n" - " oldValues[tid] = tid;\n" - " expected = tid; // force failure at the beginning\n" - " if(atomic_compare_exchange_weak"+postfix+"(&destMemory[0], &expected, oldValues[tid]"+memoryOrderScope+") || expected == tid)\n" - " oldValues[tid] = threadCount+1; //mark unexpected success with invalid value\n" - " else\n" - " {\n" - " for(int i = 0; i < "+IterationsStr()+" || successCount == 0; i++)\n" - " {\n" - " previous = expected;\n" - " if(atomic_compare_exchange_weak"+postfix+"(&destMemory[0], &expected, oldValues[tid]"+memoryOrderScope+"))\n" - " {\n" - " oldValues[tid] = expected;\n" - " successCount++;\n" - " }\n" - " }\n" - " }\n"; - } + using CBasicTestCompareStrong<HostAtomicType, HostDataType>::StartValue; + using CBasicTestCompareStrong<HostAtomicType, + HostDataType>::MemoryOrderScope; + using CBasicTestCompareStrong<HostAtomicType, HostDataType>::DataType; + using CBasicTestCompareStrong<HostAtomicType, HostDataType>::Iterations; + using CBasicTestCompareStrong<HostAtomicType, HostDataType>::IterationsStr; + CBasicTestCompareWeak(TExplicitAtomicType dataType, bool useSVM) + : CBasicTestCompareStrong<HostAtomicType, HostDataType>(dataType, + useSVM) + {} + virtual std::string ProgramCore() + { + std::string memoryOrderScope = MemoryOrderScope(); + std::string postfix(memoryOrderScope.empty() ? 
"" : "_explicit"); + return std::string(" ") + DataType().RegularTypeName() + + " expected , previous;\n" + " int successCount = 0;\n" + " oldValues[tid] = tid;\n" + " expected = tid; // force failure at the beginning\n" + " if(atomic_compare_exchange_weak" + + postfix + "(&destMemory[0], &expected, oldValues[tid]" + + memoryOrderScope + + ") || expected == tid)\n" + " oldValues[tid] = threadCount+1; //mark unexpected success " + "with invalid value\n" + " else\n" + " {\n" + " for(int i = 0; i < " + + IterationsStr() + + " || successCount == 0; i++)\n" + " {\n" + " previous = expected;\n" + " if(atomic_compare_exchange_weak" + + postfix + "(&destMemory[0], &expected, oldValues[tid]" + + memoryOrderScope + + "))\n" + " {\n" + " oldValues[tid] = expected;\n" + " successCount++;\n" + " }\n" + " }\n" + " }\n"; + } }; -int test_atomic_compare_exchange_weak_generic(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements, bool useSVM) +int test_atomic_compare_exchange_weak_generic(cl_device_id deviceID, + cl_context context, + cl_command_queue queue, + int num_elements, bool useSVM) { - int error = 0; - CBasicTestCompareWeak<HOST_ATOMIC_INT, HOST_INT> test_int(TYPE_ATOMIC_INT, useSVM); - EXECUTE_TEST(error, test_int.Execute(deviceID, context, queue, num_elements)); - CBasicTestCompareWeak<HOST_ATOMIC_UINT, HOST_UINT> test_uint(TYPE_ATOMIC_UINT, useSVM); - EXECUTE_TEST(error, test_uint.Execute(deviceID, context, queue, num_elements)); - CBasicTestCompareWeak<HOST_ATOMIC_LONG, HOST_LONG> test_long(TYPE_ATOMIC_LONG, useSVM); - EXECUTE_TEST(error, test_long.Execute(deviceID, context, queue, num_elements)); - CBasicTestCompareWeak<HOST_ATOMIC_ULONG, HOST_ULONG> test_ulong(TYPE_ATOMIC_ULONG, useSVM); - EXECUTE_TEST(error, test_ulong.Execute(deviceID, context, queue, num_elements)); - if(AtomicTypeInfo(TYPE_ATOMIC_SIZE_T).Size(deviceID) == 4) - { - CBasicTestCompareWeak<HOST_ATOMIC_INTPTR_T32, HOST_INTPTR_T32> test_intptr_t(TYPE_ATOMIC_INTPTR_T, 
useSVM); - EXECUTE_TEST(error, test_intptr_t.Execute(deviceID, context, queue, num_elements)); - CBasicTestCompareWeak<HOST_ATOMIC_UINTPTR_T32, HOST_UINTPTR_T32> test_uintptr_t(TYPE_ATOMIC_UINTPTR_T, useSVM); - EXECUTE_TEST(error, test_uintptr_t.Execute(deviceID, context, queue, num_elements)); - CBasicTestCompareWeak<HOST_ATOMIC_SIZE_T32, HOST_SIZE_T32> test_size_t(TYPE_ATOMIC_SIZE_T, useSVM); - EXECUTE_TEST(error, test_size_t.Execute(deviceID, context, queue, num_elements)); - CBasicTestCompareWeak<HOST_ATOMIC_PTRDIFF_T32, HOST_PTRDIFF_T32> test_ptrdiff_t(TYPE_ATOMIC_PTRDIFF_T, useSVM); - EXECUTE_TEST(error, test_ptrdiff_t.Execute(deviceID, context, queue, num_elements)); - } - else - { - CBasicTestCompareWeak<HOST_ATOMIC_INTPTR_T64, HOST_INTPTR_T64> test_intptr_t(TYPE_ATOMIC_INTPTR_T, useSVM); - EXECUTE_TEST(error, test_intptr_t.Execute(deviceID, context, queue, num_elements)); - CBasicTestCompareWeak<HOST_ATOMIC_UINTPTR_T64, HOST_UINTPTR_T64> test_uintptr_t(TYPE_ATOMIC_UINTPTR_T, useSVM); - EXECUTE_TEST(error, test_uintptr_t.Execute(deviceID, context, queue, num_elements)); - CBasicTestCompareWeak<HOST_ATOMIC_SIZE_T64, HOST_SIZE_T64> test_size_t(TYPE_ATOMIC_SIZE_T, useSVM); - EXECUTE_TEST(error, test_size_t.Execute(deviceID, context, queue, num_elements)); - CBasicTestCompareWeak<HOST_ATOMIC_PTRDIFF_T64, HOST_PTRDIFF_T64> test_ptrdiff_t(TYPE_ATOMIC_PTRDIFF_T, useSVM); - EXECUTE_TEST(error, test_ptrdiff_t.Execute(deviceID, context, queue, num_elements)); - } - return error; + int error = 0; + CBasicTestCompareWeak<HOST_ATOMIC_INT, HOST_INT> test_int(TYPE_ATOMIC_INT, + useSVM); + EXECUTE_TEST(error, + test_int.Execute(deviceID, context, queue, num_elements)); + CBasicTestCompareWeak<HOST_ATOMIC_UINT, HOST_UINT> test_uint( + TYPE_ATOMIC_UINT, useSVM); + EXECUTE_TEST(error, + test_uint.Execute(deviceID, context, queue, num_elements)); + CBasicTestCompareWeak<HOST_ATOMIC_LONG, HOST_LONG> test_long( + TYPE_ATOMIC_LONG, useSVM); + EXECUTE_TEST(error, + 
test_long.Execute(deviceID, context, queue, num_elements)); + CBasicTestCompareWeak<HOST_ATOMIC_ULONG, HOST_ULONG> test_ulong( + TYPE_ATOMIC_ULONG, useSVM); + EXECUTE_TEST(error, + test_ulong.Execute(deviceID, context, queue, num_elements)); + if (AtomicTypeInfo(TYPE_ATOMIC_SIZE_T).Size(deviceID) == 4) + { + CBasicTestCompareWeak<HOST_ATOMIC_INTPTR_T32, HOST_INTPTR_T32> + test_intptr_t(TYPE_ATOMIC_INTPTR_T, useSVM); + EXECUTE_TEST( + error, + test_intptr_t.Execute(deviceID, context, queue, num_elements)); + CBasicTestCompareWeak<HOST_ATOMIC_UINTPTR_T32, HOST_UINTPTR_T32> + test_uintptr_t(TYPE_ATOMIC_UINTPTR_T, useSVM); + EXECUTE_TEST( + error, + test_uintptr_t.Execute(deviceID, context, queue, num_elements)); + CBasicTestCompareWeak<HOST_ATOMIC_SIZE_T32, HOST_SIZE_T32> test_size_t( + TYPE_ATOMIC_SIZE_T, useSVM); + EXECUTE_TEST( + error, test_size_t.Execute(deviceID, context, queue, num_elements)); + CBasicTestCompareWeak<HOST_ATOMIC_PTRDIFF_T32, HOST_PTRDIFF_T32> + test_ptrdiff_t(TYPE_ATOMIC_PTRDIFF_T, useSVM); + EXECUTE_TEST( + error, + test_ptrdiff_t.Execute(deviceID, context, queue, num_elements)); + } + else + { + CBasicTestCompareWeak<HOST_ATOMIC_INTPTR_T64, HOST_INTPTR_T64> + test_intptr_t(TYPE_ATOMIC_INTPTR_T, useSVM); + EXECUTE_TEST( + error, + test_intptr_t.Execute(deviceID, context, queue, num_elements)); + CBasicTestCompareWeak<HOST_ATOMIC_UINTPTR_T64, HOST_UINTPTR_T64> + test_uintptr_t(TYPE_ATOMIC_UINTPTR_T, useSVM); + EXECUTE_TEST( + error, + test_uintptr_t.Execute(deviceID, context, queue, num_elements)); + CBasicTestCompareWeak<HOST_ATOMIC_SIZE_T64, HOST_SIZE_T64> test_size_t( + TYPE_ATOMIC_SIZE_T, useSVM); + EXECUTE_TEST( + error, test_size_t.Execute(deviceID, context, queue, num_elements)); + CBasicTestCompareWeak<HOST_ATOMIC_PTRDIFF_T64, HOST_PTRDIFF_T64> + test_ptrdiff_t(TYPE_ATOMIC_PTRDIFF_T, useSVM); + EXECUTE_TEST( + error, + test_ptrdiff_t.Execute(deviceID, context, queue, num_elements)); + } + return error; } -int 
test_atomic_compare_exchange_weak(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements) +int test_atomic_compare_exchange_weak(cl_device_id deviceID, cl_context context, + cl_command_queue queue, int num_elements) { - return test_atomic_compare_exchange_weak_generic(deviceID, context, queue, num_elements, false); + return test_atomic_compare_exchange_weak_generic(deviceID, context, queue, + num_elements, false); } -int test_svm_atomic_compare_exchange_weak(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements) +int test_svm_atomic_compare_exchange_weak(cl_device_id deviceID, + cl_context context, + cl_command_queue queue, + int num_elements) { - return test_atomic_compare_exchange_weak_generic(deviceID, context, queue, num_elements, true); + return test_atomic_compare_exchange_weak_generic(deviceID, context, queue, + num_elements, true); } -template<typename HostAtomicType, typename HostDataType> -class CBasicTestFetchAdd : public CBasicTestMemOrderScope<HostAtomicType, HostDataType> -{ +template <typename HostAtomicType, typename HostDataType> +class CBasicTestFetchAdd + : public CBasicTestMemOrderScope<HostAtomicType, HostDataType> { public: - using CBasicTestMemOrderScope<HostAtomicType, HostDataType>::MemoryOrder; - using CBasicTestMemOrderScope<HostAtomicType, HostDataType>::MemoryOrderScopeStr; - using CBasicTestMemOrderScope<HostAtomicType, HostDataType>::StartValue; - using CBasicTestMemOrderScope<HostAtomicType, HostDataType>::DataType; - CBasicTestFetchAdd(TExplicitAtomicType dataType, bool useSVM) : CBasicTestMemOrderScope<HostAtomicType, HostDataType>(dataType, useSVM) - { - } - virtual std::string ProgramCore() - { - std::string memoryOrderScope = MemoryOrderScopeStr(); - std::string postfix(memoryOrderScope.empty() ? 
"" : "_explicit"); - return - " oldValues[tid] = atomic_fetch_add"+postfix+"(&destMemory[0], ("+DataType().AddSubOperandTypeName()+")tid + 3"+memoryOrderScope+");\n"+ - " atomic_fetch_add"+postfix+"(&destMemory[0], ("+DataType().AddSubOperandTypeName()+")tid + 3"+memoryOrderScope+");\n" - " atomic_fetch_add"+postfix+"(&destMemory[0], ("+DataType().AddSubOperandTypeName()+")tid + 3"+memoryOrderScope+");\n" - " atomic_fetch_add"+postfix+"(&destMemory[0], (("+DataType().AddSubOperandTypeName()+")tid + 3) << (sizeof("+DataType().AddSubOperandTypeName()+")-1)*8"+memoryOrderScope+");\n"; - } - virtual void HostFunction(cl_uint tid, cl_uint threadCount, volatile HostAtomicType *destMemory, HostDataType *oldValues) - { - oldValues[tid] = host_atomic_fetch_add(&destMemory[0], (HostDataType)tid + 3, MemoryOrder()); - host_atomic_fetch_add(&destMemory[0], (HostDataType)tid + 3, MemoryOrder()); - host_atomic_fetch_add(&destMemory[0], (HostDataType)tid + 3, MemoryOrder()); - host_atomic_fetch_add(&destMemory[0], ((HostDataType)tid + 3) << (sizeof(HostDataType)-1)*8, MemoryOrder()); - } - virtual bool ExpectedValue(HostDataType &expected, cl_uint threadCount, HostDataType *startRefValues, cl_uint whichDestValue) - { - expected = StartValue(); - for(cl_uint i = 0; i < threadCount; i++) - expected += ((HostDataType)i+3)*3+(((HostDataType)i + 3) << (sizeof(HostDataType)-1)*8); - return true; - } + using CBasicTestMemOrderScope<HostAtomicType, HostDataType>::MemoryOrder; + using CBasicTestMemOrderScope<HostAtomicType, + HostDataType>::MemoryOrderScopeStr; + using CBasicTestMemOrderScope<HostAtomicType, HostDataType>::StartValue; + using CBasicTestMemOrderScope<HostAtomicType, HostDataType>::DataType; + CBasicTestFetchAdd(TExplicitAtomicType dataType, bool useSVM) + : CBasicTestMemOrderScope<HostAtomicType, HostDataType>(dataType, + useSVM) + {} + virtual std::string ProgramCore() + { + std::string memoryOrderScope = MemoryOrderScopeStr(); + std::string 
postfix(memoryOrderScope.empty() ? "" : "_explicit"); + return " oldValues[tid] = atomic_fetch_add" + postfix + + "(&destMemory[0], (" + DataType().AddSubOperandTypeName() + + ")tid + 3" + memoryOrderScope + ");\n" + " atomic_fetch_add" + + postfix + "(&destMemory[0], (" + + DataType().AddSubOperandTypeName() + ")tid + 3" + memoryOrderScope + + ");\n" + " atomic_fetch_add" + + postfix + "(&destMemory[0], (" + + DataType().AddSubOperandTypeName() + ")tid + 3" + memoryOrderScope + + ");\n" + " atomic_fetch_add" + + postfix + "(&destMemory[0], ((" + + DataType().AddSubOperandTypeName() + ")tid + 3) << (sizeof(" + + DataType().AddSubOperandTypeName() + ")-1)*8" + memoryOrderScope + + ");\n"; + } + virtual void HostFunction(cl_uint tid, cl_uint threadCount, + volatile HostAtomicType *destMemory, + HostDataType *oldValues) + { + oldValues[tid] = host_atomic_fetch_add( + &destMemory[0], (HostDataType)tid + 3, MemoryOrder()); + host_atomic_fetch_add(&destMemory[0], (HostDataType)tid + 3, + MemoryOrder()); + host_atomic_fetch_add(&destMemory[0], (HostDataType)tid + 3, + MemoryOrder()); + host_atomic_fetch_add(&destMemory[0], + ((HostDataType)tid + 3) + << (sizeof(HostDataType) - 1) * 8, + MemoryOrder()); + } + virtual bool ExpectedValue(HostDataType &expected, cl_uint threadCount, + HostDataType *startRefValues, + cl_uint whichDestValue) + { + expected = StartValue(); + for (cl_uint i = 0; i < threadCount; i++) + expected += ((HostDataType)i + 3) * 3 + + (((HostDataType)i + 3) << (sizeof(HostDataType) - 1) * 8); + return true; + } }; -int test_atomic_fetch_add_generic(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements, bool useSVM) +int test_atomic_fetch_add_generic(cl_device_id deviceID, cl_context context, + cl_command_queue queue, int num_elements, + bool useSVM) { - int error = 0; - CBasicTestFetchAdd<HOST_ATOMIC_INT, HOST_INT> test_int(TYPE_ATOMIC_INT, useSVM); - EXECUTE_TEST(error, test_int.Execute(deviceID, context, queue, 
num_elements)); - CBasicTestFetchAdd<HOST_ATOMIC_UINT, HOST_UINT> test_uint(TYPE_ATOMIC_UINT, useSVM); - EXECUTE_TEST(error, test_uint.Execute(deviceID, context, queue, num_elements)); - CBasicTestFetchAdd<HOST_ATOMIC_LONG, HOST_LONG> test_long(TYPE_ATOMIC_LONG, useSVM); - EXECUTE_TEST(error, test_long.Execute(deviceID, context, queue, num_elements)); - CBasicTestFetchAdd<HOST_ATOMIC_ULONG, HOST_ULONG> test_ulong(TYPE_ATOMIC_ULONG, useSVM); - EXECUTE_TEST(error, test_ulong.Execute(deviceID, context, queue, num_elements)); - if(AtomicTypeInfo(TYPE_ATOMIC_SIZE_T).Size(deviceID) == 4) - { - CBasicTestFetchAdd<HOST_ATOMIC_INTPTR_T32, HOST_INTPTR_T32> test_intptr_t(TYPE_ATOMIC_INTPTR_T, useSVM); - EXECUTE_TEST(error, test_intptr_t.Execute(deviceID, context, queue, num_elements)); - CBasicTestFetchAdd<HOST_ATOMIC_UINTPTR_T32, HOST_UINTPTR_T32> test_uintptr_t(TYPE_ATOMIC_UINTPTR_T, useSVM); - EXECUTE_TEST(error, test_uintptr_t.Execute(deviceID, context, queue, num_elements)); - CBasicTestFetchAdd<HOST_ATOMIC_SIZE_T32, HOST_SIZE_T32> test_size_t(TYPE_ATOMIC_SIZE_T, useSVM); - EXECUTE_TEST(error, test_size_t.Execute(deviceID, context, queue, num_elements)); - CBasicTestFetchAdd<HOST_ATOMIC_PTRDIFF_T32, HOST_PTRDIFF_T32> test_ptrdiff_t(TYPE_ATOMIC_PTRDIFF_T, useSVM); - EXECUTE_TEST(error, test_ptrdiff_t.Execute(deviceID, context, queue, num_elements)); - } - else - { - CBasicTestFetchAdd<HOST_ATOMIC_INTPTR_T64, HOST_INTPTR_T64> test_intptr_t(TYPE_ATOMIC_INTPTR_T, useSVM); - EXECUTE_TEST(error, test_intptr_t.Execute(deviceID, context, queue, num_elements)); - CBasicTestFetchAdd<HOST_ATOMIC_UINTPTR_T64, HOST_UINTPTR_T64> test_uintptr_t(TYPE_ATOMIC_UINTPTR_T, useSVM); - EXECUTE_TEST(error, test_uintptr_t.Execute(deviceID, context, queue, num_elements)); - CBasicTestFetchAdd<HOST_ATOMIC_SIZE_T64, HOST_SIZE_T64> test_size_t(TYPE_ATOMIC_SIZE_T, useSVM); - EXECUTE_TEST(error, test_size_t.Execute(deviceID, context, queue, num_elements)); - CBasicTestFetchAdd<HOST_ATOMIC_PTRDIFF_T64, 
HOST_PTRDIFF_T64> test_ptrdiff_t(TYPE_ATOMIC_PTRDIFF_T, useSVM); - EXECUTE_TEST(error, test_ptrdiff_t.Execute(deviceID, context, queue, num_elements)); - } - return error; + int error = 0; + CBasicTestFetchAdd<HOST_ATOMIC_INT, HOST_INT> test_int(TYPE_ATOMIC_INT, + useSVM); + EXECUTE_TEST(error, + test_int.Execute(deviceID, context, queue, num_elements)); + CBasicTestFetchAdd<HOST_ATOMIC_UINT, HOST_UINT> test_uint(TYPE_ATOMIC_UINT, + useSVM); + EXECUTE_TEST(error, + test_uint.Execute(deviceID, context, queue, num_elements)); + CBasicTestFetchAdd<HOST_ATOMIC_LONG, HOST_LONG> test_long(TYPE_ATOMIC_LONG, + useSVM); + EXECUTE_TEST(error, + test_long.Execute(deviceID, context, queue, num_elements)); + CBasicTestFetchAdd<HOST_ATOMIC_ULONG, HOST_ULONG> test_ulong( + TYPE_ATOMIC_ULONG, useSVM); + EXECUTE_TEST(error, + test_ulong.Execute(deviceID, context, queue, num_elements)); + if (AtomicTypeInfo(TYPE_ATOMIC_SIZE_T).Size(deviceID) == 4) + { + CBasicTestFetchAdd<HOST_ATOMIC_INTPTR_T32, HOST_INTPTR_T32> + test_intptr_t(TYPE_ATOMIC_INTPTR_T, useSVM); + EXECUTE_TEST( + error, + test_intptr_t.Execute(deviceID, context, queue, num_elements)); + CBasicTestFetchAdd<HOST_ATOMIC_UINTPTR_T32, HOST_UINTPTR_T32> + test_uintptr_t(TYPE_ATOMIC_UINTPTR_T, useSVM); + EXECUTE_TEST( + error, + test_uintptr_t.Execute(deviceID, context, queue, num_elements)); + CBasicTestFetchAdd<HOST_ATOMIC_SIZE_T32, HOST_SIZE_T32> test_size_t( + TYPE_ATOMIC_SIZE_T, useSVM); + EXECUTE_TEST( + error, test_size_t.Execute(deviceID, context, queue, num_elements)); + CBasicTestFetchAdd<HOST_ATOMIC_PTRDIFF_T32, HOST_PTRDIFF_T32> + test_ptrdiff_t(TYPE_ATOMIC_PTRDIFF_T, useSVM); + EXECUTE_TEST( + error, + test_ptrdiff_t.Execute(deviceID, context, queue, num_elements)); + } + else + { + CBasicTestFetchAdd<HOST_ATOMIC_INTPTR_T64, HOST_INTPTR_T64> + test_intptr_t(TYPE_ATOMIC_INTPTR_T, useSVM); + EXECUTE_TEST( + error, + test_intptr_t.Execute(deviceID, context, queue, num_elements)); + 
CBasicTestFetchAdd<HOST_ATOMIC_UINTPTR_T64, HOST_UINTPTR_T64> + test_uintptr_t(TYPE_ATOMIC_UINTPTR_T, useSVM); + EXECUTE_TEST( + error, + test_uintptr_t.Execute(deviceID, context, queue, num_elements)); + CBasicTestFetchAdd<HOST_ATOMIC_SIZE_T64, HOST_SIZE_T64> test_size_t( + TYPE_ATOMIC_SIZE_T, useSVM); + EXECUTE_TEST( + error, test_size_t.Execute(deviceID, context, queue, num_elements)); + CBasicTestFetchAdd<HOST_ATOMIC_PTRDIFF_T64, HOST_PTRDIFF_T64> + test_ptrdiff_t(TYPE_ATOMIC_PTRDIFF_T, useSVM); + EXECUTE_TEST( + error, + test_ptrdiff_t.Execute(deviceID, context, queue, num_elements)); + } + return error; } -int test_atomic_fetch_add(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements) +int test_atomic_fetch_add(cl_device_id deviceID, cl_context context, + cl_command_queue queue, int num_elements) { - return test_atomic_fetch_add_generic(deviceID, context, queue, num_elements, false); + return test_atomic_fetch_add_generic(deviceID, context, queue, num_elements, + false); } -int test_svm_atomic_fetch_add(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements) +int test_svm_atomic_fetch_add(cl_device_id deviceID, cl_context context, + cl_command_queue queue, int num_elements) { - return test_atomic_fetch_add_generic(deviceID, context, queue, num_elements, true); + return test_atomic_fetch_add_generic(deviceID, context, queue, num_elements, + true); } -template<typename HostAtomicType, typename HostDataType> -class CBasicTestFetchSub : public CBasicTestMemOrderScope<HostAtomicType, HostDataType> -{ +template <typename HostAtomicType, typename HostDataType> +class CBasicTestFetchSub + : public CBasicTestMemOrderScope<HostAtomicType, HostDataType> { public: - using CBasicTestMemOrderScope<HostAtomicType, HostDataType>::MemoryOrder; - using CBasicTestMemOrderScope<HostAtomicType, HostDataType>::MemoryOrderScopeStr; - using CBasicTestMemOrderScope<HostAtomicType, HostDataType>::StartValue; - using 
CBasicTestMemOrderScope<HostAtomicType, HostDataType>::DataType; - CBasicTestFetchSub(TExplicitAtomicType dataType, bool useSVM) : CBasicTestMemOrderScope<HostAtomicType, HostDataType>(dataType, useSVM) - { - } - virtual std::string ProgramCore() - { - std::string memoryOrderScope = MemoryOrderScopeStr(); - std::string postfix(memoryOrderScope.empty() ? "" : "_explicit"); - return - " oldValues[tid] = atomic_fetch_sub"+postfix+"(&destMemory[0], tid + 3 +((("+DataType().AddSubOperandTypeName()+")tid + 3) << (sizeof("+DataType().AddSubOperandTypeName()+")-1)*8)"+memoryOrderScope+");\n"; - } - virtual void HostFunction(cl_uint tid, cl_uint threadCount, volatile HostAtomicType *destMemory, HostDataType *oldValues) - { - oldValues[tid] = host_atomic_fetch_sub(&destMemory[0], (HostDataType)tid + 3+(((HostDataType)tid + 3) << (sizeof(HostDataType)-1)*8), MemoryOrder()); - } - virtual bool ExpectedValue(HostDataType &expected, cl_uint threadCount, HostDataType *startRefValues, cl_uint whichDestValue) - { - expected = StartValue(); - for(cl_uint i = 0; i < threadCount; i++) - expected -= (HostDataType)i + 3 +(((HostDataType)i + 3) << (sizeof(HostDataType)-1)*8); - return true; - } + using CBasicTestMemOrderScope<HostAtomicType, HostDataType>::MemoryOrder; + using CBasicTestMemOrderScope<HostAtomicType, + HostDataType>::MemoryOrderScopeStr; + using CBasicTestMemOrderScope<HostAtomicType, HostDataType>::StartValue; + using CBasicTestMemOrderScope<HostAtomicType, HostDataType>::DataType; + CBasicTestFetchSub(TExplicitAtomicType dataType, bool useSVM) + : CBasicTestMemOrderScope<HostAtomicType, HostDataType>(dataType, + useSVM) + {} + virtual std::string ProgramCore() + { + std::string memoryOrderScope = MemoryOrderScopeStr(); + std::string postfix(memoryOrderScope.empty() ? 
"" : "_explicit"); + return " oldValues[tid] = atomic_fetch_sub" + postfix + + "(&destMemory[0], tid + 3 +(((" + + DataType().AddSubOperandTypeName() + ")tid + 3) << (sizeof(" + + DataType().AddSubOperandTypeName() + ")-1)*8)" + memoryOrderScope + + ");\n"; + } + virtual void HostFunction(cl_uint tid, cl_uint threadCount, + volatile HostAtomicType *destMemory, + HostDataType *oldValues) + { + oldValues[tid] = host_atomic_fetch_sub( + &destMemory[0], + (HostDataType)tid + 3 + + (((HostDataType)tid + 3) << (sizeof(HostDataType) - 1) * 8), + MemoryOrder()); + } + virtual bool ExpectedValue(HostDataType &expected, cl_uint threadCount, + HostDataType *startRefValues, + cl_uint whichDestValue) + { + expected = StartValue(); + for (cl_uint i = 0; i < threadCount; i++) + expected -= (HostDataType)i + 3 + + (((HostDataType)i + 3) << (sizeof(HostDataType) - 1) * 8); + return true; + } }; -int test_atomic_fetch_sub_generic(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements, bool useSVM) +int test_atomic_fetch_sub_generic(cl_device_id deviceID, cl_context context, + cl_command_queue queue, int num_elements, + bool useSVM) { - int error = 0; - CBasicTestFetchSub<HOST_ATOMIC_INT, HOST_INT> test_int(TYPE_ATOMIC_INT, useSVM); - EXECUTE_TEST(error, test_int.Execute(deviceID, context, queue, num_elements)); - CBasicTestFetchSub<HOST_ATOMIC_UINT, HOST_UINT> test_uint(TYPE_ATOMIC_UINT, useSVM); - EXECUTE_TEST(error, test_uint.Execute(deviceID, context, queue, num_elements)); - CBasicTestFetchSub<HOST_ATOMIC_LONG, HOST_LONG> test_long(TYPE_ATOMIC_LONG, useSVM); - EXECUTE_TEST(error, test_long.Execute(deviceID, context, queue, num_elements)); - CBasicTestFetchSub<HOST_ATOMIC_ULONG, HOST_ULONG> test_ulong(TYPE_ATOMIC_ULONG, useSVM); - EXECUTE_TEST(error, test_ulong.Execute(deviceID, context, queue, num_elements)); - if(AtomicTypeInfo(TYPE_ATOMIC_SIZE_T).Size(deviceID) == 4) - { - CBasicTestFetchSub<HOST_ATOMIC_INTPTR_T32, HOST_INTPTR_T32> 
test_intptr_t(TYPE_ATOMIC_INTPTR_T, useSVM); - EXECUTE_TEST(error, test_intptr_t.Execute(deviceID, context, queue, num_elements)); - CBasicTestFetchSub<HOST_ATOMIC_UINTPTR_T32, HOST_UINTPTR_T32> test_uintptr_t(TYPE_ATOMIC_UINTPTR_T, useSVM); - EXECUTE_TEST(error, test_uintptr_t.Execute(deviceID, context, queue, num_elements)); - CBasicTestFetchSub<HOST_ATOMIC_SIZE_T32, HOST_SIZE_T32> test_size_t(TYPE_ATOMIC_SIZE_T, useSVM); - EXECUTE_TEST(error, test_size_t.Execute(deviceID, context, queue, num_elements)); - CBasicTestFetchSub<HOST_ATOMIC_PTRDIFF_T32, HOST_PTRDIFF_T32> test_ptrdiff_t(TYPE_ATOMIC_PTRDIFF_T, useSVM); - EXECUTE_TEST(error, test_ptrdiff_t.Execute(deviceID, context, queue, num_elements)); - } - else - { - CBasicTestFetchSub<HOST_ATOMIC_INTPTR_T64, HOST_INTPTR_T64> test_intptr_t(TYPE_ATOMIC_INTPTR_T, useSVM); - EXECUTE_TEST(error, test_intptr_t.Execute(deviceID, context, queue, num_elements)); - CBasicTestFetchSub<HOST_ATOMIC_UINTPTR_T64, HOST_UINTPTR_T64> test_uintptr_t(TYPE_ATOMIC_UINTPTR_T, useSVM); - EXECUTE_TEST(error, test_uintptr_t.Execute(deviceID, context, queue, num_elements)); - CBasicTestFetchSub<HOST_ATOMIC_SIZE_T64, HOST_SIZE_T64> test_size_t(TYPE_ATOMIC_SIZE_T, useSVM); - EXECUTE_TEST(error, test_size_t.Execute(deviceID, context, queue, num_elements)); - CBasicTestFetchSub<HOST_ATOMIC_PTRDIFF_T64, HOST_PTRDIFF_T64> test_ptrdiff_t(TYPE_ATOMIC_PTRDIFF_T, useSVM); - EXECUTE_TEST(error, test_ptrdiff_t.Execute(deviceID, context, queue, num_elements)); - } - return error; + int error = 0; + CBasicTestFetchSub<HOST_ATOMIC_INT, HOST_INT> test_int(TYPE_ATOMIC_INT, + useSVM); + EXECUTE_TEST(error, + test_int.Execute(deviceID, context, queue, num_elements)); + CBasicTestFetchSub<HOST_ATOMIC_UINT, HOST_UINT> test_uint(TYPE_ATOMIC_UINT, + useSVM); + EXECUTE_TEST(error, + test_uint.Execute(deviceID, context, queue, num_elements)); + CBasicTestFetchSub<HOST_ATOMIC_LONG, HOST_LONG> test_long(TYPE_ATOMIC_LONG, + useSVM); + EXECUTE_TEST(error, + 
test_long.Execute(deviceID, context, queue, num_elements)); + CBasicTestFetchSub<HOST_ATOMIC_ULONG, HOST_ULONG> test_ulong( + TYPE_ATOMIC_ULONG, useSVM); + EXECUTE_TEST(error, + test_ulong.Execute(deviceID, context, queue, num_elements)); + if (AtomicTypeInfo(TYPE_ATOMIC_SIZE_T).Size(deviceID) == 4) + { + CBasicTestFetchSub<HOST_ATOMIC_INTPTR_T32, HOST_INTPTR_T32> + test_intptr_t(TYPE_ATOMIC_INTPTR_T, useSVM); + EXECUTE_TEST( + error, + test_intptr_t.Execute(deviceID, context, queue, num_elements)); + CBasicTestFetchSub<HOST_ATOMIC_UINTPTR_T32, HOST_UINTPTR_T32> + test_uintptr_t(TYPE_ATOMIC_UINTPTR_T, useSVM); + EXECUTE_TEST( + error, + test_uintptr_t.Execute(deviceID, context, queue, num_elements)); + CBasicTestFetchSub<HOST_ATOMIC_SIZE_T32, HOST_SIZE_T32> test_size_t( + TYPE_ATOMIC_SIZE_T, useSVM); + EXECUTE_TEST( + error, test_size_t.Execute(deviceID, context, queue, num_elements)); + CBasicTestFetchSub<HOST_ATOMIC_PTRDIFF_T32, HOST_PTRDIFF_T32> + test_ptrdiff_t(TYPE_ATOMIC_PTRDIFF_T, useSVM); + EXECUTE_TEST( + error, + test_ptrdiff_t.Execute(deviceID, context, queue, num_elements)); + } + else + { + CBasicTestFetchSub<HOST_ATOMIC_INTPTR_T64, HOST_INTPTR_T64> + test_intptr_t(TYPE_ATOMIC_INTPTR_T, useSVM); + EXECUTE_TEST( + error, + test_intptr_t.Execute(deviceID, context, queue, num_elements)); + CBasicTestFetchSub<HOST_ATOMIC_UINTPTR_T64, HOST_UINTPTR_T64> + test_uintptr_t(TYPE_ATOMIC_UINTPTR_T, useSVM); + EXECUTE_TEST( + error, + test_uintptr_t.Execute(deviceID, context, queue, num_elements)); + CBasicTestFetchSub<HOST_ATOMIC_SIZE_T64, HOST_SIZE_T64> test_size_t( + TYPE_ATOMIC_SIZE_T, useSVM); + EXECUTE_TEST( + error, test_size_t.Execute(deviceID, context, queue, num_elements)); + CBasicTestFetchSub<HOST_ATOMIC_PTRDIFF_T64, HOST_PTRDIFF_T64> + test_ptrdiff_t(TYPE_ATOMIC_PTRDIFF_T, useSVM); + EXECUTE_TEST( + error, + test_ptrdiff_t.Execute(deviceID, context, queue, num_elements)); + } + return error; } -int test_atomic_fetch_sub(cl_device_id deviceID, 
cl_context context, cl_command_queue queue, int num_elements) +int test_atomic_fetch_sub(cl_device_id deviceID, cl_context context, + cl_command_queue queue, int num_elements) { - return test_atomic_fetch_sub_generic(deviceID, context, queue, num_elements, false); + return test_atomic_fetch_sub_generic(deviceID, context, queue, num_elements, + false); } -int test_svm_atomic_fetch_sub(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements) +int test_svm_atomic_fetch_sub(cl_device_id deviceID, cl_context context, + cl_command_queue queue, int num_elements) { - return test_atomic_fetch_sub_generic(deviceID, context, queue, num_elements, true); + return test_atomic_fetch_sub_generic(deviceID, context, queue, num_elements, + true); } -template<typename HostAtomicType, typename HostDataType> -class CBasicTestFetchOr : public CBasicTestMemOrderScope<HostAtomicType, HostDataType> -{ +template <typename HostAtomicType, typename HostDataType> +class CBasicTestFetchOr + : public CBasicTestMemOrderScope<HostAtomicType, HostDataType> { public: - using CBasicTestMemOrderScope<HostAtomicType, HostDataType>::StartValue; - using CBasicTestMemOrderScope<HostAtomicType, HostDataType>::DataType; - using CBasicTestMemOrderScope<HostAtomicType, HostDataType>::MemoryOrder; - using CBasicTestMemOrderScope<HostAtomicType, HostDataType>::MemoryOrderScopeStr; - CBasicTestFetchOr(TExplicitAtomicType dataType, bool useSVM) : CBasicTestMemOrderScope<HostAtomicType, HostDataType>(dataType, useSVM) - { - StartValue(0); - } - virtual cl_uint NumResults(cl_uint threadCount, cl_device_id deviceID) - { - cl_uint numBits = DataType().Size(deviceID) * 8; - - return (threadCount + numBits - 1) / numBits; - } - virtual std::string ProgramCore() - { - std::string memoryOrderScope = MemoryOrderScopeStr(); - std::string postfix(memoryOrderScope.empty() ? 
"" : "_explicit"); - return - std::string(" size_t numBits = sizeof(")+DataType().RegularTypeName()+") * 8;\n" - " int whichResult = tid / numBits;\n" - " int bitIndex = tid - (whichResult * numBits);\n" - "\n" - " oldValues[tid] = atomic_fetch_or"+postfix+"(&destMemory[whichResult], (("+DataType().RegularTypeName()+")1 << bitIndex) "+memoryOrderScope+");\n"; - } - virtual void HostFunction(cl_uint tid, cl_uint threadCount, volatile HostAtomicType *destMemory, HostDataType *oldValues) - { - size_t numBits = sizeof(HostDataType) * 8; - size_t whichResult = tid / numBits; - size_t bitIndex = tid - (whichResult * numBits); - - oldValues[tid] = host_atomic_fetch_or(&destMemory[whichResult], ((HostDataType)1 << bitIndex), MemoryOrder()); - } - virtual bool ExpectedValue(HostDataType &expected, cl_uint threadCount, HostDataType *startRefValues, cl_uint whichDestValue) - { - cl_uint numValues = (threadCount + (sizeof(HostDataType)*8-1)) / (sizeof(HostDataType)*8); - if(whichDestValue < numValues - 1) - { - expected = ~(HostDataType)0; - return true; - } - // Last item doesn't get or'ed on every bit, so we have to mask away - cl_uint numBits = threadCount - whichDestValue * (sizeof(HostDataType)*8); - expected = StartValue(); - for(cl_uint i = 0; i < numBits; i++) - expected |= ((HostDataType)1 << i); - return true; - } + using CBasicTestMemOrderScope<HostAtomicType, HostDataType>::StartValue; + using CBasicTestMemOrderScope<HostAtomicType, HostDataType>::DataType; + using CBasicTestMemOrderScope<HostAtomicType, HostDataType>::MemoryOrder; + using CBasicTestMemOrderScope<HostAtomicType, + HostDataType>::MemoryOrderScopeStr; + CBasicTestFetchOr(TExplicitAtomicType dataType, bool useSVM) + : CBasicTestMemOrderScope<HostAtomicType, HostDataType>(dataType, + useSVM) + { + StartValue(0); + } + virtual cl_uint NumResults(cl_uint threadCount, cl_device_id deviceID) + { + cl_uint numBits = DataType().Size(deviceID) * 8; + + return (threadCount + numBits - 1) / numBits; + } + 
virtual std::string ProgramCore() + { + std::string memoryOrderScope = MemoryOrderScopeStr(); + std::string postfix(memoryOrderScope.empty() ? "" : "_explicit"); + return std::string(" size_t numBits = sizeof(") + + DataType().RegularTypeName() + + ") * 8;\n" + " int whichResult = tid / numBits;\n" + " int bitIndex = tid - (whichResult * numBits);\n" + "\n" + " oldValues[tid] = atomic_fetch_or" + + postfix + "(&destMemory[whichResult], ((" + + DataType().RegularTypeName() + ")1 << bitIndex) " + + memoryOrderScope + ");\n"; + } + virtual void HostFunction(cl_uint tid, cl_uint threadCount, + volatile HostAtomicType *destMemory, + HostDataType *oldValues) + { + size_t numBits = sizeof(HostDataType) * 8; + size_t whichResult = tid / numBits; + size_t bitIndex = tid - (whichResult * numBits); + + oldValues[tid] = + host_atomic_fetch_or(&destMemory[whichResult], + ((HostDataType)1 << bitIndex), MemoryOrder()); + } + virtual bool ExpectedValue(HostDataType &expected, cl_uint threadCount, + HostDataType *startRefValues, + cl_uint whichDestValue) + { + cl_uint numValues = (threadCount + (sizeof(HostDataType) * 8 - 1)) + / (sizeof(HostDataType) * 8); + if (whichDestValue < numValues - 1) + { + expected = ~(HostDataType)0; + return true; + } + // Last item doesn't get or'ed on every bit, so we have to mask away + cl_uint numBits = + threadCount - whichDestValue * (sizeof(HostDataType) * 8); + expected = StartValue(); + for (cl_uint i = 0; i < numBits; i++) + expected |= ((HostDataType)1 << i); + return true; + } }; -int test_atomic_fetch_or_generic(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements, bool useSVM) +int test_atomic_fetch_or_generic(cl_device_id deviceID, cl_context context, + cl_command_queue queue, int num_elements, + bool useSVM) { - int error = 0; - CBasicTestFetchOr<HOST_ATOMIC_INT, HOST_INT> test_int(TYPE_ATOMIC_INT, useSVM); - EXECUTE_TEST(error, test_int.Execute(deviceID, context, queue, num_elements)); - 
CBasicTestFetchOr<HOST_ATOMIC_UINT, HOST_UINT> test_uint(TYPE_ATOMIC_UINT, useSVM); - EXECUTE_TEST(error, test_uint.Execute(deviceID, context, queue, num_elements)); - CBasicTestFetchOr<HOST_ATOMIC_LONG, HOST_LONG> test_long(TYPE_ATOMIC_LONG, useSVM); - EXECUTE_TEST(error, test_long.Execute(deviceID, context, queue, num_elements)); - CBasicTestFetchOr<HOST_ATOMIC_ULONG, HOST_ULONG> test_ulong(TYPE_ATOMIC_ULONG, useSVM); - EXECUTE_TEST(error, test_ulong.Execute(deviceID, context, queue, num_elements)); - if(AtomicTypeInfo(TYPE_ATOMIC_SIZE_T).Size(deviceID) == 4) - { - CBasicTestFetchOr<HOST_ATOMIC_INTPTR_T32, HOST_INTPTR_T32> test_intptr_t(TYPE_ATOMIC_INTPTR_T, useSVM); - EXECUTE_TEST(error, test_intptr_t.Execute(deviceID, context, queue, num_elements)); - CBasicTestFetchOr<HOST_ATOMIC_UINTPTR_T32, HOST_UINTPTR_T32> test_uintptr_t(TYPE_ATOMIC_UINTPTR_T, useSVM); - EXECUTE_TEST(error, test_uintptr_t.Execute(deviceID, context, queue, num_elements)); - CBasicTestFetchOr<HOST_ATOMIC_SIZE_T32, HOST_SIZE_T32> test_size_t(TYPE_ATOMIC_SIZE_T, useSVM); - EXECUTE_TEST(error, test_size_t.Execute(deviceID, context, queue, num_elements)); - CBasicTestFetchOr<HOST_ATOMIC_PTRDIFF_T32, HOST_PTRDIFF_T32> test_ptrdiff_t(TYPE_ATOMIC_PTRDIFF_T, useSVM); - EXECUTE_TEST(error, test_ptrdiff_t.Execute(deviceID, context, queue, num_elements)); - } - else - { - CBasicTestFetchOr<HOST_ATOMIC_INTPTR_T64, HOST_INTPTR_T64> test_intptr_t(TYPE_ATOMIC_INTPTR_T, useSVM); - EXECUTE_TEST(error, test_intptr_t.Execute(deviceID, context, queue, num_elements)); - CBasicTestFetchOr<HOST_ATOMIC_UINTPTR_T64, HOST_UINTPTR_T64> test_uintptr_t(TYPE_ATOMIC_UINTPTR_T, useSVM); - EXECUTE_TEST(error, test_uintptr_t.Execute(deviceID, context, queue, num_elements)); - CBasicTestFetchOr<HOST_ATOMIC_SIZE_T64, HOST_SIZE_T64> test_size_t(TYPE_ATOMIC_SIZE_T, useSVM); - EXECUTE_TEST(error, test_size_t.Execute(deviceID, context, queue, num_elements)); - CBasicTestFetchOr<HOST_ATOMIC_PTRDIFF_T64, HOST_PTRDIFF_T64> 
test_ptrdiff_t(TYPE_ATOMIC_PTRDIFF_T, useSVM); - EXECUTE_TEST(error, test_ptrdiff_t.Execute(deviceID, context, queue, num_elements)); - } - return error; + int error = 0; + CBasicTestFetchOr<HOST_ATOMIC_INT, HOST_INT> test_int(TYPE_ATOMIC_INT, + useSVM); + EXECUTE_TEST(error, + test_int.Execute(deviceID, context, queue, num_elements)); + CBasicTestFetchOr<HOST_ATOMIC_UINT, HOST_UINT> test_uint(TYPE_ATOMIC_UINT, + useSVM); + EXECUTE_TEST(error, + test_uint.Execute(deviceID, context, queue, num_elements)); + CBasicTestFetchOr<HOST_ATOMIC_LONG, HOST_LONG> test_long(TYPE_ATOMIC_LONG, + useSVM); + EXECUTE_TEST(error, + test_long.Execute(deviceID, context, queue, num_elements)); + CBasicTestFetchOr<HOST_ATOMIC_ULONG, HOST_ULONG> test_ulong( + TYPE_ATOMIC_ULONG, useSVM); + EXECUTE_TEST(error, + test_ulong.Execute(deviceID, context, queue, num_elements)); + if (AtomicTypeInfo(TYPE_ATOMIC_SIZE_T).Size(deviceID) == 4) + { + CBasicTestFetchOr<HOST_ATOMIC_INTPTR_T32, HOST_INTPTR_T32> + test_intptr_t(TYPE_ATOMIC_INTPTR_T, useSVM); + EXECUTE_TEST( + error, + test_intptr_t.Execute(deviceID, context, queue, num_elements)); + CBasicTestFetchOr<HOST_ATOMIC_UINTPTR_T32, HOST_UINTPTR_T32> + test_uintptr_t(TYPE_ATOMIC_UINTPTR_T, useSVM); + EXECUTE_TEST( + error, + test_uintptr_t.Execute(deviceID, context, queue, num_elements)); + CBasicTestFetchOr<HOST_ATOMIC_SIZE_T32, HOST_SIZE_T32> test_size_t( + TYPE_ATOMIC_SIZE_T, useSVM); + EXECUTE_TEST( + error, test_size_t.Execute(deviceID, context, queue, num_elements)); + CBasicTestFetchOr<HOST_ATOMIC_PTRDIFF_T32, HOST_PTRDIFF_T32> + test_ptrdiff_t(TYPE_ATOMIC_PTRDIFF_T, useSVM); + EXECUTE_TEST( + error, + test_ptrdiff_t.Execute(deviceID, context, queue, num_elements)); + } + else + { + CBasicTestFetchOr<HOST_ATOMIC_INTPTR_T64, HOST_INTPTR_T64> + test_intptr_t(TYPE_ATOMIC_INTPTR_T, useSVM); + EXECUTE_TEST( + error, + test_intptr_t.Execute(deviceID, context, queue, num_elements)); + CBasicTestFetchOr<HOST_ATOMIC_UINTPTR_T64, HOST_UINTPTR_T64> + 
test_uintptr_t(TYPE_ATOMIC_UINTPTR_T, useSVM); + EXECUTE_TEST( + error, + test_uintptr_t.Execute(deviceID, context, queue, num_elements)); + CBasicTestFetchOr<HOST_ATOMIC_SIZE_T64, HOST_SIZE_T64> test_size_t( + TYPE_ATOMIC_SIZE_T, useSVM); + EXECUTE_TEST( + error, test_size_t.Execute(deviceID, context, queue, num_elements)); + CBasicTestFetchOr<HOST_ATOMIC_PTRDIFF_T64, HOST_PTRDIFF_T64> + test_ptrdiff_t(TYPE_ATOMIC_PTRDIFF_T, useSVM); + EXECUTE_TEST( + error, + test_ptrdiff_t.Execute(deviceID, context, queue, num_elements)); + } + return error; } -int test_atomic_fetch_or(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements) +int test_atomic_fetch_or(cl_device_id deviceID, cl_context context, + cl_command_queue queue, int num_elements) { - return test_atomic_fetch_or_generic(deviceID, context, queue, num_elements, false); + return test_atomic_fetch_or_generic(deviceID, context, queue, num_elements, + false); } -int test_svm_atomic_fetch_or(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements) +int test_svm_atomic_fetch_or(cl_device_id deviceID, cl_context context, + cl_command_queue queue, int num_elements) { - return test_atomic_fetch_or_generic(deviceID, context, queue, num_elements, true); + return test_atomic_fetch_or_generic(deviceID, context, queue, num_elements, + true); } -template<typename HostAtomicType, typename HostDataType> -class CBasicTestFetchXor : public CBasicTestMemOrderScope<HostAtomicType, HostDataType> -{ +template <typename HostAtomicType, typename HostDataType> +class CBasicTestFetchXor + : public CBasicTestMemOrderScope<HostAtomicType, HostDataType> { public: - using CBasicTestMemOrderScope<HostAtomicType, HostDataType>::StartValue; - using CBasicTestMemOrderScope<HostAtomicType, HostDataType>::MemoryOrder; - using CBasicTestMemOrderScope<HostAtomicType, HostDataType>::MemoryOrderScopeStr; - using CBasicTestMemOrderScope<HostAtomicType, HostDataType>::DataType; - 
CBasicTestFetchXor(TExplicitAtomicType dataType, bool useSVM) : CBasicTestMemOrderScope<HostAtomicType, HostDataType>(dataType, useSVM) - { - StartValue((HostDataType)0x2f08ab418ba0541LL); - } - virtual std::string ProgramCore() - { - std::string memoryOrderScope = MemoryOrderScopeStr(); - std::string postfix(memoryOrderScope.empty() ? "" : "_explicit"); - return - std::string(" int numBits = sizeof(")+DataType().RegularTypeName()+") * 8;\n" - " int bitIndex = (numBits-1)*(tid+1)/threadCount;\n" - "\n" - " oldValues[tid] = atomic_fetch_xor"+postfix+"(&destMemory[0], (("+DataType().RegularTypeName()+")1 << bitIndex) "+memoryOrderScope+");\n"; - } - virtual void HostFunction(cl_uint tid, cl_uint threadCount, volatile HostAtomicType *destMemory, HostDataType *oldValues) - { - int numBits = sizeof(HostDataType) * 8; - int bitIndex = (numBits-1)*(tid+1)/threadCount; - - oldValues[tid] = host_atomic_fetch_xor(&destMemory[0], ((HostDataType)1 << bitIndex), MemoryOrder()); - } - virtual bool ExpectedValue(HostDataType &expected, cl_uint threadCount, HostDataType *startRefValues, cl_uint whichDestValue) - { - int numBits = sizeof(HostDataType)*8; - expected = StartValue(); - for(cl_uint i = 0; i < threadCount; i++) - { - int bitIndex = (numBits-1)*(i+1)/threadCount; - expected ^= ((HostDataType)1 << bitIndex); - } - return true; - } + using CBasicTestMemOrderScope<HostAtomicType, HostDataType>::StartValue; + using CBasicTestMemOrderScope<HostAtomicType, HostDataType>::MemoryOrder; + using CBasicTestMemOrderScope<HostAtomicType, + HostDataType>::MemoryOrderScopeStr; + using CBasicTestMemOrderScope<HostAtomicType, HostDataType>::DataType; + CBasicTestFetchXor(TExplicitAtomicType dataType, bool useSVM) + : CBasicTestMemOrderScope<HostAtomicType, HostDataType>(dataType, + useSVM) + { + StartValue((HostDataType)0x2f08ab418ba0541LL); + } + virtual std::string ProgramCore() + { + std::string memoryOrderScope = MemoryOrderScopeStr(); + std::string postfix(memoryOrderScope.empty() ? 
"" : "_explicit"); + return std::string(" int numBits = sizeof(") + + DataType().RegularTypeName() + + ") * 8;\n" + " int bitIndex = (numBits-1)*(tid+1)/threadCount;\n" + "\n" + " oldValues[tid] = atomic_fetch_xor" + + postfix + "(&destMemory[0], ((" + DataType().RegularTypeName() + + ")1 << bitIndex) " + memoryOrderScope + ");\n"; + } + virtual void HostFunction(cl_uint tid, cl_uint threadCount, + volatile HostAtomicType *destMemory, + HostDataType *oldValues) + { + int numBits = sizeof(HostDataType) * 8; + int bitIndex = (numBits - 1) * (tid + 1) / threadCount; + + oldValues[tid] = host_atomic_fetch_xor( + &destMemory[0], ((HostDataType)1 << bitIndex), MemoryOrder()); + } + virtual bool ExpectedValue(HostDataType &expected, cl_uint threadCount, + HostDataType *startRefValues, + cl_uint whichDestValue) + { + int numBits = sizeof(HostDataType) * 8; + expected = StartValue(); + for (cl_uint i = 0; i < threadCount; i++) + { + int bitIndex = (numBits - 1) * (i + 1) / threadCount; + expected ^= ((HostDataType)1 << bitIndex); + } + return true; + } }; -int test_atomic_fetch_xor_generic(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements, bool useSVM) +int test_atomic_fetch_xor_generic(cl_device_id deviceID, cl_context context, + cl_command_queue queue, int num_elements, + bool useSVM) { - int error = 0; - CBasicTestFetchXor<HOST_ATOMIC_INT, HOST_INT> test_int(TYPE_ATOMIC_INT, useSVM); - EXECUTE_TEST(error, test_int.Execute(deviceID, context, queue, num_elements)); - CBasicTestFetchXor<HOST_ATOMIC_UINT, HOST_UINT> test_uint(TYPE_ATOMIC_UINT, useSVM); - EXECUTE_TEST(error, test_uint.Execute(deviceID, context, queue, num_elements)); - CBasicTestFetchXor<HOST_ATOMIC_LONG, HOST_LONG> test_long(TYPE_ATOMIC_LONG, useSVM); - EXECUTE_TEST(error, test_long.Execute(deviceID, context, queue, num_elements)); - CBasicTestFetchXor<HOST_ATOMIC_ULONG, HOST_ULONG> test_ulong(TYPE_ATOMIC_ULONG, useSVM); - EXECUTE_TEST(error, test_ulong.Execute(deviceID, 
context, queue, num_elements)); - if(AtomicTypeInfo(TYPE_ATOMIC_SIZE_T).Size(deviceID) == 4) - { - CBasicTestFetchXor<HOST_ATOMIC_INTPTR_T32, HOST_INTPTR_T32> test_intptr_t(TYPE_ATOMIC_INTPTR_T, useSVM); - EXECUTE_TEST(error, test_intptr_t.Execute(deviceID, context, queue, num_elements)); - CBasicTestFetchXor<HOST_ATOMIC_UINTPTR_T32, HOST_UINTPTR_T32> test_uintptr_t(TYPE_ATOMIC_UINTPTR_T, useSVM); - EXECUTE_TEST(error, test_uintptr_t.Execute(deviceID, context, queue, num_elements)); - CBasicTestFetchXor<HOST_ATOMIC_SIZE_T32, HOST_SIZE_T32> test_size_t(TYPE_ATOMIC_SIZE_T, useSVM); - EXECUTE_TEST(error, test_size_t.Execute(deviceID, context, queue, num_elements)); - CBasicTestFetchXor<HOST_ATOMIC_PTRDIFF_T32, HOST_PTRDIFF_T32> test_ptrdiff_t(TYPE_ATOMIC_PTRDIFF_T, useSVM); - EXECUTE_TEST(error, test_ptrdiff_t.Execute(deviceID, context, queue, num_elements)); - } - else - { - CBasicTestFetchXor<HOST_ATOMIC_INTPTR_T64, HOST_INTPTR_T64> test_intptr_t(TYPE_ATOMIC_INTPTR_T, useSVM); - EXECUTE_TEST(error, test_intptr_t.Execute(deviceID, context, queue, num_elements)); - CBasicTestFetchXor<HOST_ATOMIC_UINTPTR_T64, HOST_UINTPTR_T64> test_uintptr_t(TYPE_ATOMIC_UINTPTR_T, useSVM); - EXECUTE_TEST(error, test_uintptr_t.Execute(deviceID, context, queue, num_elements)); - CBasicTestFetchXor<HOST_ATOMIC_SIZE_T64, HOST_SIZE_T64> test_size_t(TYPE_ATOMIC_SIZE_T, useSVM); - EXECUTE_TEST(error, test_size_t.Execute(deviceID, context, queue, num_elements)); - CBasicTestFetchXor<HOST_ATOMIC_PTRDIFF_T64, HOST_PTRDIFF_T64> test_ptrdiff_t(TYPE_ATOMIC_PTRDIFF_T, useSVM); - EXECUTE_TEST(error, test_ptrdiff_t.Execute(deviceID, context, queue, num_elements)); - } - return error; + int error = 0; + CBasicTestFetchXor<HOST_ATOMIC_INT, HOST_INT> test_int(TYPE_ATOMIC_INT, + useSVM); + EXECUTE_TEST(error, + test_int.Execute(deviceID, context, queue, num_elements)); + CBasicTestFetchXor<HOST_ATOMIC_UINT, HOST_UINT> test_uint(TYPE_ATOMIC_UINT, + useSVM); + EXECUTE_TEST(error, + 
test_uint.Execute(deviceID, context, queue, num_elements)); + CBasicTestFetchXor<HOST_ATOMIC_LONG, HOST_LONG> test_long(TYPE_ATOMIC_LONG, + useSVM); + EXECUTE_TEST(error, + test_long.Execute(deviceID, context, queue, num_elements)); + CBasicTestFetchXor<HOST_ATOMIC_ULONG, HOST_ULONG> test_ulong( + TYPE_ATOMIC_ULONG, useSVM); + EXECUTE_TEST(error, + test_ulong.Execute(deviceID, context, queue, num_elements)); + if (AtomicTypeInfo(TYPE_ATOMIC_SIZE_T).Size(deviceID) == 4) + { + CBasicTestFetchXor<HOST_ATOMIC_INTPTR_T32, HOST_INTPTR_T32> + test_intptr_t(TYPE_ATOMIC_INTPTR_T, useSVM); + EXECUTE_TEST( + error, + test_intptr_t.Execute(deviceID, context, queue, num_elements)); + CBasicTestFetchXor<HOST_ATOMIC_UINTPTR_T32, HOST_UINTPTR_T32> + test_uintptr_t(TYPE_ATOMIC_UINTPTR_T, useSVM); + EXECUTE_TEST( + error, + test_uintptr_t.Execute(deviceID, context, queue, num_elements)); + CBasicTestFetchXor<HOST_ATOMIC_SIZE_T32, HOST_SIZE_T32> test_size_t( + TYPE_ATOMIC_SIZE_T, useSVM); + EXECUTE_TEST( + error, test_size_t.Execute(deviceID, context, queue, num_elements)); + CBasicTestFetchXor<HOST_ATOMIC_PTRDIFF_T32, HOST_PTRDIFF_T32> + test_ptrdiff_t(TYPE_ATOMIC_PTRDIFF_T, useSVM); + EXECUTE_TEST( + error, + test_ptrdiff_t.Execute(deviceID, context, queue, num_elements)); + } + else + { + CBasicTestFetchXor<HOST_ATOMIC_INTPTR_T64, HOST_INTPTR_T64> + test_intptr_t(TYPE_ATOMIC_INTPTR_T, useSVM); + EXECUTE_TEST( + error, + test_intptr_t.Execute(deviceID, context, queue, num_elements)); + CBasicTestFetchXor<HOST_ATOMIC_UINTPTR_T64, HOST_UINTPTR_T64> + test_uintptr_t(TYPE_ATOMIC_UINTPTR_T, useSVM); + EXECUTE_TEST( + error, + test_uintptr_t.Execute(deviceID, context, queue, num_elements)); + CBasicTestFetchXor<HOST_ATOMIC_SIZE_T64, HOST_SIZE_T64> test_size_t( + TYPE_ATOMIC_SIZE_T, useSVM); + EXECUTE_TEST( + error, test_size_t.Execute(deviceID, context, queue, num_elements)); + CBasicTestFetchXor<HOST_ATOMIC_PTRDIFF_T64, HOST_PTRDIFF_T64> + test_ptrdiff_t(TYPE_ATOMIC_PTRDIFF_T, useSVM); 
+ EXECUTE_TEST( + error, + test_ptrdiff_t.Execute(deviceID, context, queue, num_elements)); + } + return error; } -int test_atomic_fetch_xor(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements) +int test_atomic_fetch_xor(cl_device_id deviceID, cl_context context, + cl_command_queue queue, int num_elements) { - return test_atomic_fetch_xor_generic(deviceID, context, queue, num_elements, false); + return test_atomic_fetch_xor_generic(deviceID, context, queue, num_elements, + false); } -int test_svm_atomic_fetch_xor(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements) +int test_svm_atomic_fetch_xor(cl_device_id deviceID, cl_context context, + cl_command_queue queue, int num_elements) { - return test_atomic_fetch_xor_generic(deviceID, context, queue, num_elements, true); + return test_atomic_fetch_xor_generic(deviceID, context, queue, num_elements, + true); } -template<typename HostAtomicType, typename HostDataType> -class CBasicTestFetchAnd : public CBasicTestMemOrderScope<HostAtomicType, HostDataType> -{ +template <typename HostAtomicType, typename HostDataType> +class CBasicTestFetchAnd + : public CBasicTestMemOrderScope<HostAtomicType, HostDataType> { public: - using CBasicTestMemOrderScope<HostAtomicType, HostDataType>::StartValue; - using CBasicTestMemOrderScope<HostAtomicType, HostDataType>::DataType; - using CBasicTestMemOrderScope<HostAtomicType, HostDataType>::MemoryOrder; - using CBasicTestMemOrderScope<HostAtomicType, HostDataType>::MemoryOrderScopeStr; - CBasicTestFetchAnd(TExplicitAtomicType dataType, bool useSVM) : CBasicTestMemOrderScope<HostAtomicType, HostDataType>(dataType, useSVM) - { - StartValue(~(HostDataType)0); - } - virtual cl_uint NumResults(cl_uint threadCount, cl_device_id deviceID) - { - cl_uint numBits = DataType().Size(deviceID) * 8; - - return (threadCount + numBits - 1) / numBits; - } - virtual std::string ProgramCore() - { - std::string memoryOrderScope = 
MemoryOrderScopeStr(); - std::string postfix(memoryOrderScope.empty() ? "" : "_explicit"); - return - std::string(" size_t numBits = sizeof(")+DataType().RegularTypeName()+") * 8;\n" - " int whichResult = tid / numBits;\n" - " int bitIndex = tid - (whichResult * numBits);\n" - "\n" - " oldValues[tid] = atomic_fetch_and"+postfix+"(&destMemory[whichResult], ~(("+DataType().RegularTypeName()+")1 << bitIndex) "+memoryOrderScope+");\n"; - } - virtual void HostFunction(cl_uint tid, cl_uint threadCount, volatile HostAtomicType *destMemory, HostDataType *oldValues) - { - size_t numBits = sizeof(HostDataType) * 8; - size_t whichResult = tid / numBits; - size_t bitIndex = tid - (whichResult * numBits); - - oldValues[tid] = host_atomic_fetch_and(&destMemory[whichResult], ~((HostDataType)1 << bitIndex), MemoryOrder()); - } - virtual bool ExpectedValue(HostDataType &expected, cl_uint threadCount, HostDataType *startRefValues, cl_uint whichDestValue) - { - cl_uint numValues = (threadCount + (sizeof(HostDataType)*8-1)) / (sizeof(HostDataType)*8); - if(whichDestValue < numValues - 1) - { - expected = 0; - return true; - } - // Last item doesn't get and'ed on every bit, so we have to mask away - size_t numBits = threadCount - whichDestValue * (sizeof(HostDataType)*8); - expected = StartValue(); - for(size_t i = 0; i < numBits; i++) - expected &= ~((HostDataType)1 << i); - return true; - } + using CBasicTestMemOrderScope<HostAtomicType, HostDataType>::StartValue; + using CBasicTestMemOrderScope<HostAtomicType, HostDataType>::DataType; + using CBasicTestMemOrderScope<HostAtomicType, HostDataType>::MemoryOrder; + using CBasicTestMemOrderScope<HostAtomicType, + HostDataType>::MemoryOrderScopeStr; + CBasicTestFetchAnd(TExplicitAtomicType dataType, bool useSVM) + : CBasicTestMemOrderScope<HostAtomicType, HostDataType>(dataType, + useSVM) + { + StartValue(~(HostDataType)0); + } + virtual cl_uint NumResults(cl_uint threadCount, cl_device_id deviceID) + { + cl_uint numBits = 
DataType().Size(deviceID) * 8; + + return (threadCount + numBits - 1) / numBits; + } + virtual std::string ProgramCore() + { + std::string memoryOrderScope = MemoryOrderScopeStr(); + std::string postfix(memoryOrderScope.empty() ? "" : "_explicit"); + return std::string(" size_t numBits = sizeof(") + + DataType().RegularTypeName() + + ") * 8;\n" + " int whichResult = tid / numBits;\n" + " int bitIndex = tid - (whichResult * numBits);\n" + "\n" + " oldValues[tid] = atomic_fetch_and" + + postfix + "(&destMemory[whichResult], ~((" + + DataType().RegularTypeName() + ")1 << bitIndex) " + + memoryOrderScope + ");\n"; + } + virtual void HostFunction(cl_uint tid, cl_uint threadCount, + volatile HostAtomicType *destMemory, + HostDataType *oldValues) + { + size_t numBits = sizeof(HostDataType) * 8; + size_t whichResult = tid / numBits; + size_t bitIndex = tid - (whichResult * numBits); + + oldValues[tid] = host_atomic_fetch_and(&destMemory[whichResult], + ~((HostDataType)1 << bitIndex), + MemoryOrder()); + } + virtual bool ExpectedValue(HostDataType &expected, cl_uint threadCount, + HostDataType *startRefValues, + cl_uint whichDestValue) + { + cl_uint numValues = (threadCount + (sizeof(HostDataType) * 8 - 1)) + / (sizeof(HostDataType) * 8); + if (whichDestValue < numValues - 1) + { + expected = 0; + return true; + } + // Last item doesn't get and'ed on every bit, so we have to mask away + size_t numBits = + threadCount - whichDestValue * (sizeof(HostDataType) * 8); + expected = StartValue(); + for (size_t i = 0; i < numBits; i++) + expected &= ~((HostDataType)1 << i); + return true; + } }; -int test_atomic_fetch_and_generic(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements, bool useSVM) +int test_atomic_fetch_and_generic(cl_device_id deviceID, cl_context context, + cl_command_queue queue, int num_elements, + bool useSVM) { - int error = 0; - CBasicTestFetchAnd<HOST_ATOMIC_INT, HOST_INT> test_int(TYPE_ATOMIC_INT, useSVM); - EXECUTE_TEST(error, 
test_int.Execute(deviceID, context, queue, num_elements)); - CBasicTestFetchAnd<HOST_ATOMIC_UINT, HOST_UINT> test_uint(TYPE_ATOMIC_UINT, useSVM); - EXECUTE_TEST(error, test_uint.Execute(deviceID, context, queue, num_elements)); - CBasicTestFetchAnd<HOST_ATOMIC_LONG, HOST_LONG> test_long(TYPE_ATOMIC_LONG, useSVM); - EXECUTE_TEST(error, test_long.Execute(deviceID, context, queue, num_elements)); - CBasicTestFetchAnd<HOST_ATOMIC_ULONG, HOST_ULONG> test_ulong(TYPE_ATOMIC_ULONG, useSVM); - EXECUTE_TEST(error, test_ulong.Execute(deviceID, context, queue, num_elements)); - if(AtomicTypeInfo(TYPE_ATOMIC_SIZE_T).Size(deviceID) == 4) - { - CBasicTestFetchAnd<HOST_ATOMIC_INTPTR_T32, HOST_INTPTR_T32> test_intptr_t(TYPE_ATOMIC_INTPTR_T, useSVM); - EXECUTE_TEST(error, test_intptr_t.Execute(deviceID, context, queue, num_elements)); - CBasicTestFetchAnd<HOST_ATOMIC_UINTPTR_T32, HOST_UINTPTR_T32> test_uintptr_t(TYPE_ATOMIC_UINTPTR_T, useSVM); - EXECUTE_TEST(error, test_uintptr_t.Execute(deviceID, context, queue, num_elements)); - CBasicTestFetchAnd<HOST_ATOMIC_SIZE_T32, HOST_SIZE_T32> test_size_t(TYPE_ATOMIC_SIZE_T, useSVM); - EXECUTE_TEST(error, test_size_t.Execute(deviceID, context, queue, num_elements)); - CBasicTestFetchAnd<HOST_ATOMIC_PTRDIFF_T32, HOST_PTRDIFF_T32> test_ptrdiff_t(TYPE_ATOMIC_PTRDIFF_T, useSVM); - EXECUTE_TEST(error, test_ptrdiff_t.Execute(deviceID, context, queue, num_elements)); - } - else - { - CBasicTestFetchAnd<HOST_ATOMIC_INTPTR_T64, HOST_INTPTR_T64> test_intptr_t(TYPE_ATOMIC_INTPTR_T, useSVM); - EXECUTE_TEST(error, test_intptr_t.Execute(deviceID, context, queue, num_elements)); - CBasicTestFetchAnd<HOST_ATOMIC_UINTPTR_T64, HOST_UINTPTR_T64> test_uintptr_t(TYPE_ATOMIC_UINTPTR_T, useSVM); - EXECUTE_TEST(error, test_uintptr_t.Execute(deviceID, context, queue, num_elements)); - CBasicTestFetchAnd<HOST_ATOMIC_SIZE_T64, HOST_SIZE_T64> test_size_t(TYPE_ATOMIC_SIZE_T, useSVM); - EXECUTE_TEST(error, test_size_t.Execute(deviceID, context, queue, num_elements)); - 
CBasicTestFetchAnd<HOST_ATOMIC_PTRDIFF_T64, HOST_PTRDIFF_T64> test_ptrdiff_t(TYPE_ATOMIC_PTRDIFF_T, useSVM); - EXECUTE_TEST(error, test_ptrdiff_t.Execute(deviceID, context, queue, num_elements)); - } - return error; + int error = 0; + CBasicTestFetchAnd<HOST_ATOMIC_INT, HOST_INT> test_int(TYPE_ATOMIC_INT, + useSVM); + EXECUTE_TEST(error, + test_int.Execute(deviceID, context, queue, num_elements)); + CBasicTestFetchAnd<HOST_ATOMIC_UINT, HOST_UINT> test_uint(TYPE_ATOMIC_UINT, + useSVM); + EXECUTE_TEST(error, + test_uint.Execute(deviceID, context, queue, num_elements)); + CBasicTestFetchAnd<HOST_ATOMIC_LONG, HOST_LONG> test_long(TYPE_ATOMIC_LONG, + useSVM); + EXECUTE_TEST(error, + test_long.Execute(deviceID, context, queue, num_elements)); + CBasicTestFetchAnd<HOST_ATOMIC_ULONG, HOST_ULONG> test_ulong( + TYPE_ATOMIC_ULONG, useSVM); + EXECUTE_TEST(error, + test_ulong.Execute(deviceID, context, queue, num_elements)); + if (AtomicTypeInfo(TYPE_ATOMIC_SIZE_T).Size(deviceID) == 4) + { + CBasicTestFetchAnd<HOST_ATOMIC_INTPTR_T32, HOST_INTPTR_T32> + test_intptr_t(TYPE_ATOMIC_INTPTR_T, useSVM); + EXECUTE_TEST( + error, + test_intptr_t.Execute(deviceID, context, queue, num_elements)); + CBasicTestFetchAnd<HOST_ATOMIC_UINTPTR_T32, HOST_UINTPTR_T32> + test_uintptr_t(TYPE_ATOMIC_UINTPTR_T, useSVM); + EXECUTE_TEST( + error, + test_uintptr_t.Execute(deviceID, context, queue, num_elements)); + CBasicTestFetchAnd<HOST_ATOMIC_SIZE_T32, HOST_SIZE_T32> test_size_t( + TYPE_ATOMIC_SIZE_T, useSVM); + EXECUTE_TEST( + error, test_size_t.Execute(deviceID, context, queue, num_elements)); + CBasicTestFetchAnd<HOST_ATOMIC_PTRDIFF_T32, HOST_PTRDIFF_T32> + test_ptrdiff_t(TYPE_ATOMIC_PTRDIFF_T, useSVM); + EXECUTE_TEST( + error, + test_ptrdiff_t.Execute(deviceID, context, queue, num_elements)); + } + else + { + CBasicTestFetchAnd<HOST_ATOMIC_INTPTR_T64, HOST_INTPTR_T64> + test_intptr_t(TYPE_ATOMIC_INTPTR_T, useSVM); + EXECUTE_TEST( + error, + test_intptr_t.Execute(deviceID, context, queue, 
num_elements)); + CBasicTestFetchAnd<HOST_ATOMIC_UINTPTR_T64, HOST_UINTPTR_T64> + test_uintptr_t(TYPE_ATOMIC_UINTPTR_T, useSVM); + EXECUTE_TEST( + error, + test_uintptr_t.Execute(deviceID, context, queue, num_elements)); + CBasicTestFetchAnd<HOST_ATOMIC_SIZE_T64, HOST_SIZE_T64> test_size_t( + TYPE_ATOMIC_SIZE_T, useSVM); + EXECUTE_TEST( + error, test_size_t.Execute(deviceID, context, queue, num_elements)); + CBasicTestFetchAnd<HOST_ATOMIC_PTRDIFF_T64, HOST_PTRDIFF_T64> + test_ptrdiff_t(TYPE_ATOMIC_PTRDIFF_T, useSVM); + EXECUTE_TEST( + error, + test_ptrdiff_t.Execute(deviceID, context, queue, num_elements)); + } + return error; } -int test_atomic_fetch_and(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements) +int test_atomic_fetch_and(cl_device_id deviceID, cl_context context, + cl_command_queue queue, int num_elements) { - return test_atomic_fetch_and_generic(deviceID, context, queue, num_elements, false); + return test_atomic_fetch_and_generic(deviceID, context, queue, num_elements, + false); } -int test_svm_atomic_fetch_and(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements) +int test_svm_atomic_fetch_and(cl_device_id deviceID, cl_context context, + cl_command_queue queue, int num_elements) { - return test_atomic_fetch_and_generic(deviceID, context, queue, num_elements, true); + return test_atomic_fetch_and_generic(deviceID, context, queue, num_elements, + true); } -template<typename HostAtomicType, typename HostDataType> -class CBasicTestFetchOrAnd : public CBasicTestMemOrderScope<HostAtomicType, HostDataType> -{ +template <typename HostAtomicType, typename HostDataType> +class CBasicTestFetchOrAnd + : public CBasicTestMemOrderScope<HostAtomicType, HostDataType> { public: - using CBasicTestMemOrderScope<HostAtomicType, HostDataType>::StartValue; - using CBasicTestMemOrderScope<HostAtomicType, HostDataType>::DataType; - using CBasicTestMemOrderScope<HostAtomicType, HostDataType>::MemoryOrder; - 
using CBasicTestMemOrderScope<HostAtomicType, HostDataType>::MemoryOrderScopeStr; - using CBasicTestMemOrderScope<HostAtomicType, HostDataType>::Iterations; - using CBasicTestMemOrderScope<HostAtomicType, HostDataType>::IterationsStr; - CBasicTestFetchOrAnd(TExplicitAtomicType dataType, bool useSVM) : CBasicTestMemOrderScope<HostAtomicType, HostDataType>(dataType, useSVM) - { - StartValue(0); - } - virtual cl_uint NumResults(cl_uint threadCount, cl_device_id deviceID) - { - return 1+(threadCount-1)/(DataType().Size(deviceID)*8); - } - // each thread modifies (with OR and AND operations) and verifies - // only one bit in atomic variable - // other bits are modified by other threads but it must not affect current thread operation - virtual std::string ProgramCore() - { - std::string memoryOrderScope = MemoryOrderScopeStr(); - std::string postfix(memoryOrderScope.empty() ? "" : "_explicit"); - return - std::string(" int bits = sizeof(")+DataType().RegularTypeName()+")*8;\n"+ - " size_t valueInd = tid/bits;\n" - " "+DataType().RegularTypeName()+" value, bitMask = ("+DataType().RegularTypeName()+")1 << tid%bits;\n" - " oldValues[tid] = 0;\n" - " for(int i = 0; i < "+IterationsStr()+"; i++)\n" - " {\n" - " value = atomic_fetch_or"+postfix+"(destMemory+valueInd, bitMask"+memoryOrderScope+");\n" - " if(value & bitMask) // bit should be set to 0\n" - " oldValues[tid]++;\n" - " value = atomic_fetch_and"+postfix+"(destMemory+valueInd, ~bitMask"+memoryOrderScope+");\n" - " if(!(value & bitMask)) // bit should be set to 1\n" - " oldValues[tid]++;\n" - " }\n"; - } - virtual void HostFunction(cl_uint tid, cl_uint threadCount, volatile HostAtomicType *destMemory, HostDataType *oldValues) - { - int bits = sizeof(HostDataType)*8; - size_t valueInd = tid/bits; - HostDataType value, bitMask = (HostDataType)1 << tid%bits; - oldValues[tid] = 0; - for(int i = 0; i < Iterations(); i++) - { - value = host_atomic_fetch_or(destMemory+valueInd, bitMask, MemoryOrder()); - if(value & bitMask) 
// bit should be set to 0 - oldValues[tid]++; - value = host_atomic_fetch_and(destMemory+valueInd, ~bitMask, MemoryOrder()); - if(!(value & bitMask)) // bit should be set to 1 - oldValues[tid]++; - } - } - virtual bool ExpectedValue(HostDataType &expected, cl_uint threadCount, HostDataType *startRefValues, cl_uint whichDestValue) - { - expected = 0; - return true; - } - virtual bool VerifyRefs(bool &correct, cl_uint threadCount, HostDataType *refValues, HostAtomicType *finalValues) - { - correct = true; - for(cl_uint i = 0; i < threadCount; i++) - { - if(refValues[i] > 0) - { - log_error("Thread %d found %d mismatch(es)\n", i, (cl_uint)refValues[i]); - correct = false; - } - } - return true; - } + using CBasicTestMemOrderScope<HostAtomicType, HostDataType>::StartValue; + using CBasicTestMemOrderScope<HostAtomicType, HostDataType>::DataType; + using CBasicTestMemOrderScope<HostAtomicType, HostDataType>::MemoryOrder; + using CBasicTestMemOrderScope<HostAtomicType, + HostDataType>::MemoryOrderScopeStr; + using CBasicTestMemOrderScope<HostAtomicType, HostDataType>::Iterations; + using CBasicTestMemOrderScope<HostAtomicType, HostDataType>::IterationsStr; + CBasicTestFetchOrAnd(TExplicitAtomicType dataType, bool useSVM) + : CBasicTestMemOrderScope<HostAtomicType, HostDataType>(dataType, + useSVM) + { + StartValue(0); + } + virtual cl_uint NumResults(cl_uint threadCount, cl_device_id deviceID) + { + return 1 + (threadCount - 1) / (DataType().Size(deviceID) * 8); + } + // each thread modifies (with OR and AND operations) and verifies + // only one bit in atomic variable + // other bits are modified by other threads but it must not affect current + // thread operation + virtual std::string ProgramCore() + { + std::string memoryOrderScope = MemoryOrderScopeStr(); + std::string postfix(memoryOrderScope.empty() ? 
"" : "_explicit"); + return std::string(" int bits = sizeof(") + + DataType().RegularTypeName() + ")*8;\n" + + " size_t valueInd = tid/bits;\n" + " " + + DataType().RegularTypeName() + " value, bitMask = (" + + DataType().RegularTypeName() + + ")1 << tid%bits;\n" + " oldValues[tid] = 0;\n" + " for(int i = 0; i < " + + IterationsStr() + + "; i++)\n" + " {\n" + " value = atomic_fetch_or" + + postfix + "(destMemory+valueInd, bitMask" + memoryOrderScope + + ");\n" + " if(value & bitMask) // bit should be set to 0\n" + " oldValues[tid]++;\n" + " value = atomic_fetch_and" + + postfix + "(destMemory+valueInd, ~bitMask" + memoryOrderScope + + ");\n" + " if(!(value & bitMask)) // bit should be set to 1\n" + " oldValues[tid]++;\n" + " }\n"; + } + virtual void HostFunction(cl_uint tid, cl_uint threadCount, + volatile HostAtomicType *destMemory, + HostDataType *oldValues) + { + int bits = sizeof(HostDataType) * 8; + size_t valueInd = tid / bits; + HostDataType value, bitMask = (HostDataType)1 << tid % bits; + oldValues[tid] = 0; + for (int i = 0; i < Iterations(); i++) + { + value = host_atomic_fetch_or(destMemory + valueInd, bitMask, + MemoryOrder()); + if (value & bitMask) // bit should be set to 0 + oldValues[tid]++; + value = host_atomic_fetch_and(destMemory + valueInd, ~bitMask, + MemoryOrder()); + if (!(value & bitMask)) // bit should be set to 1 + oldValues[tid]++; + } + } + virtual bool ExpectedValue(HostDataType &expected, cl_uint threadCount, + HostDataType *startRefValues, + cl_uint whichDestValue) + { + expected = 0; + return true; + } + virtual bool VerifyRefs(bool &correct, cl_uint threadCount, + HostDataType *refValues, + HostAtomicType *finalValues) + { + correct = true; + for (cl_uint i = 0; i < threadCount; i++) + { + if (refValues[i] > 0) + { + log_error("Thread %d found %d mismatch(es)\n", i, + (cl_uint)refValues[i]); + correct = false; + } + } + return true; + } }; -int test_atomic_fetch_orand_generic(cl_device_id deviceID, cl_context context, 
cl_command_queue queue, int num_elements, bool useSVM) +int test_atomic_fetch_orand_generic(cl_device_id deviceID, cl_context context, + cl_command_queue queue, int num_elements, + bool useSVM) { - int error = 0; - CBasicTestFetchOrAnd<HOST_ATOMIC_INT, HOST_INT> test_int(TYPE_ATOMIC_INT, useSVM); - EXECUTE_TEST(error, test_int.Execute(deviceID, context, queue, num_elements)); - CBasicTestFetchOrAnd<HOST_ATOMIC_UINT, HOST_UINT> test_uint(TYPE_ATOMIC_UINT, useSVM); - EXECUTE_TEST(error, test_uint.Execute(deviceID, context, queue, num_elements)); - CBasicTestFetchOrAnd<HOST_ATOMIC_LONG, HOST_LONG> test_long(TYPE_ATOMIC_LONG, useSVM); - EXECUTE_TEST(error, test_long.Execute(deviceID, context, queue, num_elements)); - CBasicTestFetchOrAnd<HOST_ATOMIC_ULONG, HOST_ULONG> test_ulong(TYPE_ATOMIC_ULONG, useSVM); - EXECUTE_TEST(error, test_ulong.Execute(deviceID, context, queue, num_elements)); - if(AtomicTypeInfo(TYPE_ATOMIC_SIZE_T).Size(deviceID) == 4) - { - CBasicTestFetchOrAnd<HOST_ATOMIC_INTPTR_T32, HOST_INTPTR_T32> test_intptr_t(TYPE_ATOMIC_INTPTR_T, useSVM); - EXECUTE_TEST(error, test_intptr_t.Execute(deviceID, context, queue, num_elements)); - CBasicTestFetchOrAnd<HOST_ATOMIC_UINTPTR_T32, HOST_UINTPTR_T32> test_uintptr_t(TYPE_ATOMIC_UINTPTR_T, useSVM); - EXECUTE_TEST(error, test_uintptr_t.Execute(deviceID, context, queue, num_elements)); - CBasicTestFetchOrAnd<HOST_ATOMIC_SIZE_T32, HOST_SIZE_T32> test_size_t(TYPE_ATOMIC_SIZE_T, useSVM); - EXECUTE_TEST(error, test_size_t.Execute(deviceID, context, queue, num_elements)); - CBasicTestFetchOrAnd<HOST_ATOMIC_PTRDIFF_T32, HOST_PTRDIFF_T32> test_ptrdiff_t(TYPE_ATOMIC_PTRDIFF_T, useSVM); - EXECUTE_TEST(error, test_ptrdiff_t.Execute(deviceID, context, queue, num_elements)); - } - else - { - CBasicTestFetchOrAnd<HOST_ATOMIC_INTPTR_T64, HOST_INTPTR_T64> test_intptr_t(TYPE_ATOMIC_INTPTR_T, useSVM); - EXECUTE_TEST(error, test_intptr_t.Execute(deviceID, context, queue, num_elements)); - CBasicTestFetchOrAnd<HOST_ATOMIC_UINTPTR_T64, 
HOST_UINTPTR_T64> test_uintptr_t(TYPE_ATOMIC_UINTPTR_T, useSVM); - EXECUTE_TEST(error, test_uintptr_t.Execute(deviceID, context, queue, num_elements)); - CBasicTestFetchOrAnd<HOST_ATOMIC_SIZE_T64, HOST_SIZE_T64> test_size_t(TYPE_ATOMIC_SIZE_T, useSVM); - EXECUTE_TEST(error, test_size_t.Execute(deviceID, context, queue, num_elements)); - CBasicTestFetchOrAnd<HOST_ATOMIC_PTRDIFF_T64, HOST_PTRDIFF_T64> test_ptrdiff_t(TYPE_ATOMIC_PTRDIFF_T, useSVM); - EXECUTE_TEST(error, test_ptrdiff_t.Execute(deviceID, context, queue, num_elements)); - } - return error; + int error = 0; + CBasicTestFetchOrAnd<HOST_ATOMIC_INT, HOST_INT> test_int(TYPE_ATOMIC_INT, + useSVM); + EXECUTE_TEST(error, + test_int.Execute(deviceID, context, queue, num_elements)); + CBasicTestFetchOrAnd<HOST_ATOMIC_UINT, HOST_UINT> test_uint( + TYPE_ATOMIC_UINT, useSVM); + EXECUTE_TEST(error, + test_uint.Execute(deviceID, context, queue, num_elements)); + CBasicTestFetchOrAnd<HOST_ATOMIC_LONG, HOST_LONG> test_long( + TYPE_ATOMIC_LONG, useSVM); + EXECUTE_TEST(error, + test_long.Execute(deviceID, context, queue, num_elements)); + CBasicTestFetchOrAnd<HOST_ATOMIC_ULONG, HOST_ULONG> test_ulong( + TYPE_ATOMIC_ULONG, useSVM); + EXECUTE_TEST(error, + test_ulong.Execute(deviceID, context, queue, num_elements)); + if (AtomicTypeInfo(TYPE_ATOMIC_SIZE_T).Size(deviceID) == 4) + { + CBasicTestFetchOrAnd<HOST_ATOMIC_INTPTR_T32, HOST_INTPTR_T32> + test_intptr_t(TYPE_ATOMIC_INTPTR_T, useSVM); + EXECUTE_TEST( + error, + test_intptr_t.Execute(deviceID, context, queue, num_elements)); + CBasicTestFetchOrAnd<HOST_ATOMIC_UINTPTR_T32, HOST_UINTPTR_T32> + test_uintptr_t(TYPE_ATOMIC_UINTPTR_T, useSVM); + EXECUTE_TEST( + error, + test_uintptr_t.Execute(deviceID, context, queue, num_elements)); + CBasicTestFetchOrAnd<HOST_ATOMIC_SIZE_T32, HOST_SIZE_T32> test_size_t( + TYPE_ATOMIC_SIZE_T, useSVM); + EXECUTE_TEST( + error, test_size_t.Execute(deviceID, context, queue, num_elements)); + CBasicTestFetchOrAnd<HOST_ATOMIC_PTRDIFF_T32, 
HOST_PTRDIFF_T32> + test_ptrdiff_t(TYPE_ATOMIC_PTRDIFF_T, useSVM); + EXECUTE_TEST( + error, + test_ptrdiff_t.Execute(deviceID, context, queue, num_elements)); + } + else + { + CBasicTestFetchOrAnd<HOST_ATOMIC_INTPTR_T64, HOST_INTPTR_T64> + test_intptr_t(TYPE_ATOMIC_INTPTR_T, useSVM); + EXECUTE_TEST( + error, + test_intptr_t.Execute(deviceID, context, queue, num_elements)); + CBasicTestFetchOrAnd<HOST_ATOMIC_UINTPTR_T64, HOST_UINTPTR_T64> + test_uintptr_t(TYPE_ATOMIC_UINTPTR_T, useSVM); + EXECUTE_TEST( + error, + test_uintptr_t.Execute(deviceID, context, queue, num_elements)); + CBasicTestFetchOrAnd<HOST_ATOMIC_SIZE_T64, HOST_SIZE_T64> test_size_t( + TYPE_ATOMIC_SIZE_T, useSVM); + EXECUTE_TEST( + error, test_size_t.Execute(deviceID, context, queue, num_elements)); + CBasicTestFetchOrAnd<HOST_ATOMIC_PTRDIFF_T64, HOST_PTRDIFF_T64> + test_ptrdiff_t(TYPE_ATOMIC_PTRDIFF_T, useSVM); + EXECUTE_TEST( + error, + test_ptrdiff_t.Execute(deviceID, context, queue, num_elements)); + } + return error; } -int test_atomic_fetch_orand(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements) +int test_atomic_fetch_orand(cl_device_id deviceID, cl_context context, + cl_command_queue queue, int num_elements) { - return test_atomic_fetch_orand_generic(deviceID, context, queue, num_elements, false); + return test_atomic_fetch_orand_generic(deviceID, context, queue, + num_elements, false); } -int test_svm_atomic_fetch_orand(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements) +int test_svm_atomic_fetch_orand(cl_device_id deviceID, cl_context context, + cl_command_queue queue, int num_elements) { - return test_atomic_fetch_orand_generic(deviceID, context, queue, num_elements, true); + return test_atomic_fetch_orand_generic(deviceID, context, queue, + num_elements, true); } -template<typename HostAtomicType, typename HostDataType> -class CBasicTestFetchXor2 : public CBasicTestMemOrderScope<HostAtomicType, HostDataType> -{ +template 
<typename HostAtomicType, typename HostDataType> +class CBasicTestFetchXor2 + : public CBasicTestMemOrderScope<HostAtomicType, HostDataType> { public: - using CBasicTestMemOrderScope<HostAtomicType, HostDataType>::StartValue; - using CBasicTestMemOrderScope<HostAtomicType, HostDataType>::DataType; - using CBasicTestMemOrderScope<HostAtomicType, HostDataType>::MemoryOrder; - using CBasicTestMemOrderScope<HostAtomicType, HostDataType>::MemoryOrderScopeStr; - using CBasicTestMemOrderScope<HostAtomicType, HostDataType>::Iterations; - using CBasicTestMemOrderScope<HostAtomicType, HostDataType>::IterationsStr; - CBasicTestFetchXor2(TExplicitAtomicType dataType, bool useSVM) : CBasicTestMemOrderScope<HostAtomicType, HostDataType>(dataType, useSVM) - { - StartValue(0); - } - virtual cl_uint NumResults(cl_uint threadCount, cl_device_id deviceID) - { - return 1+(threadCount-1)/(DataType().Size(deviceID)*8); - } - // each thread modifies (with XOR operation) and verifies - // only one bit in atomic variable - // other bits are modified by other threads but it must not affect current thread operation - virtual std::string ProgramCore() - { - std::string memoryOrderScope = MemoryOrderScopeStr(); - std::string postfix(memoryOrderScope.empty() ? 
"" : "_explicit"); - return - std::string(" int bits = sizeof(")+DataType().RegularTypeName()+")*8;\n"+ - " size_t valueInd = tid/bits;\n" - " "+DataType().RegularTypeName()+" value, bitMask = ("+DataType().RegularTypeName()+")1 << tid%bits;\n" - " oldValues[tid] = 0;\n" - " for(int i = 0; i < "+IterationsStr()+"; i++)\n" - " {\n" - " value = atomic_fetch_xor"+postfix+"(destMemory+valueInd, bitMask"+memoryOrderScope+");\n" - " if(value & bitMask) // bit should be set to 0\n" - " oldValues[tid]++;\n" - " value = atomic_fetch_xor"+postfix+"(destMemory+valueInd, bitMask"+memoryOrderScope+");\n" - " if(!(value & bitMask)) // bit should be set to 1\n" - " oldValues[tid]++;\n" - " }\n"; - } - virtual void HostFunction(cl_uint tid, cl_uint threadCount, volatile HostAtomicType *destMemory, HostDataType *oldValues) - { - int bits = sizeof(HostDataType)*8; - size_t valueInd = tid/bits; - HostDataType value, bitMask = (HostDataType)1 << tid%bits; - oldValues[tid] = 0; - for(int i = 0; i < Iterations(); i++) - { - value = host_atomic_fetch_xor(destMemory+valueInd, bitMask, MemoryOrder()); - if(value & bitMask) // bit should be set to 0 - oldValues[tid]++; - value = host_atomic_fetch_xor(destMemory+valueInd, bitMask, MemoryOrder()); - if(!(value & bitMask)) // bit should be set to 1 - oldValues[tid]++; - } - } - virtual bool ExpectedValue(HostDataType &expected, cl_uint threadCount, HostDataType *startRefValues, cl_uint whichDestValue) - { - expected = 0; - return true; - } - virtual bool VerifyRefs(bool &correct, cl_uint threadCount, HostDataType *refValues, HostAtomicType *finalValues) - { - correct = true; - for(cl_uint i = 0; i < threadCount; i++) - { - if(refValues[i] > 0) - { - log_error("Thread %d found %d mismatches\n", i, (cl_uint)refValues[i]); - correct = false; - } - } - return true; - } + using CBasicTestMemOrderScope<HostAtomicType, HostDataType>::StartValue; + using CBasicTestMemOrderScope<HostAtomicType, HostDataType>::DataType; + using 
CBasicTestMemOrderScope<HostAtomicType, HostDataType>::MemoryOrder; + using CBasicTestMemOrderScope<HostAtomicType, + HostDataType>::MemoryOrderScopeStr; + using CBasicTestMemOrderScope<HostAtomicType, HostDataType>::Iterations; + using CBasicTestMemOrderScope<HostAtomicType, HostDataType>::IterationsStr; + CBasicTestFetchXor2(TExplicitAtomicType dataType, bool useSVM) + : CBasicTestMemOrderScope<HostAtomicType, HostDataType>(dataType, + useSVM) + { + StartValue(0); + } + virtual cl_uint NumResults(cl_uint threadCount, cl_device_id deviceID) + { + return 1 + (threadCount - 1) / (DataType().Size(deviceID) * 8); + } + // each thread modifies (with XOR operation) and verifies + // only one bit in atomic variable + // other bits are modified by other threads but it must not affect current + // thread operation + virtual std::string ProgramCore() + { + std::string memoryOrderScope = MemoryOrderScopeStr(); + std::string postfix(memoryOrderScope.empty() ? "" : "_explicit"); + return std::string(" int bits = sizeof(") + + DataType().RegularTypeName() + ")*8;\n" + + " size_t valueInd = tid/bits;\n" + " " + + DataType().RegularTypeName() + " value, bitMask = (" + + DataType().RegularTypeName() + + ")1 << tid%bits;\n" + " oldValues[tid] = 0;\n" + " for(int i = 0; i < " + + IterationsStr() + + "; i++)\n" + " {\n" + " value = atomic_fetch_xor" + + postfix + "(destMemory+valueInd, bitMask" + memoryOrderScope + + ");\n" + " if(value & bitMask) // bit should be set to 0\n" + " oldValues[tid]++;\n" + " value = atomic_fetch_xor" + + postfix + "(destMemory+valueInd, bitMask" + memoryOrderScope + + ");\n" + " if(!(value & bitMask)) // bit should be set to 1\n" + " oldValues[tid]++;\n" + " }\n"; + } + virtual void HostFunction(cl_uint tid, cl_uint threadCount, + volatile HostAtomicType *destMemory, + HostDataType *oldValues) + { + int bits = sizeof(HostDataType) * 8; + size_t valueInd = tid / bits; + HostDataType value, bitMask = (HostDataType)1 << tid % bits; + oldValues[tid] = 0; + 
for (int i = 0; i < Iterations(); i++) + { + value = host_atomic_fetch_xor(destMemory + valueInd, bitMask, + MemoryOrder()); + if (value & bitMask) // bit should be set to 0 + oldValues[tid]++; + value = host_atomic_fetch_xor(destMemory + valueInd, bitMask, + MemoryOrder()); + if (!(value & bitMask)) // bit should be set to 1 + oldValues[tid]++; + } + } + virtual bool ExpectedValue(HostDataType &expected, cl_uint threadCount, + HostDataType *startRefValues, + cl_uint whichDestValue) + { + expected = 0; + return true; + } + virtual bool VerifyRefs(bool &correct, cl_uint threadCount, + HostDataType *refValues, + HostAtomicType *finalValues) + { + correct = true; + for (cl_uint i = 0; i < threadCount; i++) + { + if (refValues[i] > 0) + { + log_error("Thread %d found %d mismatches\n", i, + (cl_uint)refValues[i]); + correct = false; + } + } + return true; + } }; -int test_atomic_fetch_xor2_generic(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements, bool useSVM) +int test_atomic_fetch_xor2_generic(cl_device_id deviceID, cl_context context, + cl_command_queue queue, int num_elements, + bool useSVM) { - int error = 0; - CBasicTestFetchXor2<HOST_ATOMIC_INT, HOST_INT> test_int(TYPE_ATOMIC_INT, useSVM); - EXECUTE_TEST(error, test_int.Execute(deviceID, context, queue, num_elements)); - CBasicTestFetchXor2<HOST_ATOMIC_UINT, HOST_UINT> test_uint(TYPE_ATOMIC_UINT, useSVM); - EXECUTE_TEST(error, test_uint.Execute(deviceID, context, queue, num_elements)); - CBasicTestFetchXor2<HOST_ATOMIC_LONG, HOST_LONG> test_long(TYPE_ATOMIC_LONG, useSVM); - EXECUTE_TEST(error, test_long.Execute(deviceID, context, queue, num_elements)); - CBasicTestFetchXor2<HOST_ATOMIC_ULONG, HOST_ULONG> test_ulong(TYPE_ATOMIC_ULONG, useSVM); - EXECUTE_TEST(error, test_ulong.Execute(deviceID, context, queue, num_elements)); - if(AtomicTypeInfo(TYPE_ATOMIC_SIZE_T).Size(deviceID) == 4) - { - CBasicTestFetchXor2<HOST_ATOMIC_INTPTR_T32, HOST_INTPTR_T32> 
test_intptr_t(TYPE_ATOMIC_INTPTR_T, useSVM); - EXECUTE_TEST(error, test_intptr_t.Execute(deviceID, context, queue, num_elements)); - CBasicTestFetchXor2<HOST_ATOMIC_UINTPTR_T32, HOST_UINTPTR_T32> test_uintptr_t(TYPE_ATOMIC_UINTPTR_T, useSVM); - EXECUTE_TEST(error, test_uintptr_t.Execute(deviceID, context, queue, num_elements)); - CBasicTestFetchXor2<HOST_ATOMIC_SIZE_T32, HOST_SIZE_T32> test_size_t(TYPE_ATOMIC_SIZE_T, useSVM); - EXECUTE_TEST(error, test_size_t.Execute(deviceID, context, queue, num_elements)); - CBasicTestFetchXor2<HOST_ATOMIC_PTRDIFF_T32, HOST_PTRDIFF_T32> test_ptrdiff_t(TYPE_ATOMIC_PTRDIFF_T, useSVM); - EXECUTE_TEST(error, test_ptrdiff_t.Execute(deviceID, context, queue, num_elements)); - } - else - { - CBasicTestFetchXor2<HOST_ATOMIC_INTPTR_T64, HOST_INTPTR_T64> test_intptr_t(TYPE_ATOMIC_INTPTR_T, useSVM); - EXECUTE_TEST(error, test_intptr_t.Execute(deviceID, context, queue, num_elements)); - CBasicTestFetchXor2<HOST_ATOMIC_UINTPTR_T64, HOST_UINTPTR_T64> test_uintptr_t(TYPE_ATOMIC_UINTPTR_T, useSVM); - EXECUTE_TEST(error, test_uintptr_t.Execute(deviceID, context, queue, num_elements)); - CBasicTestFetchXor2<HOST_ATOMIC_SIZE_T64, HOST_SIZE_T64> test_size_t(TYPE_ATOMIC_SIZE_T, useSVM); - EXECUTE_TEST(error, test_size_t.Execute(deviceID, context, queue, num_elements)); - CBasicTestFetchXor2<HOST_ATOMIC_PTRDIFF_T64, HOST_PTRDIFF_T64> test_ptrdiff_t(TYPE_ATOMIC_PTRDIFF_T, useSVM); - EXECUTE_TEST(error, test_ptrdiff_t.Execute(deviceID, context, queue, num_elements)); - } - return error; + int error = 0; + CBasicTestFetchXor2<HOST_ATOMIC_INT, HOST_INT> test_int(TYPE_ATOMIC_INT, + useSVM); + EXECUTE_TEST(error, + test_int.Execute(deviceID, context, queue, num_elements)); + CBasicTestFetchXor2<HOST_ATOMIC_UINT, HOST_UINT> test_uint(TYPE_ATOMIC_UINT, + useSVM); + EXECUTE_TEST(error, + test_uint.Execute(deviceID, context, queue, num_elements)); + CBasicTestFetchXor2<HOST_ATOMIC_LONG, HOST_LONG> test_long(TYPE_ATOMIC_LONG, + useSVM); + EXECUTE_TEST(error, + 
test_long.Execute(deviceID, context, queue, num_elements)); + CBasicTestFetchXor2<HOST_ATOMIC_ULONG, HOST_ULONG> test_ulong( + TYPE_ATOMIC_ULONG, useSVM); + EXECUTE_TEST(error, + test_ulong.Execute(deviceID, context, queue, num_elements)); + if (AtomicTypeInfo(TYPE_ATOMIC_SIZE_T).Size(deviceID) == 4) + { + CBasicTestFetchXor2<HOST_ATOMIC_INTPTR_T32, HOST_INTPTR_T32> + test_intptr_t(TYPE_ATOMIC_INTPTR_T, useSVM); + EXECUTE_TEST( + error, + test_intptr_t.Execute(deviceID, context, queue, num_elements)); + CBasicTestFetchXor2<HOST_ATOMIC_UINTPTR_T32, HOST_UINTPTR_T32> + test_uintptr_t(TYPE_ATOMIC_UINTPTR_T, useSVM); + EXECUTE_TEST( + error, + test_uintptr_t.Execute(deviceID, context, queue, num_elements)); + CBasicTestFetchXor2<HOST_ATOMIC_SIZE_T32, HOST_SIZE_T32> test_size_t( + TYPE_ATOMIC_SIZE_T, useSVM); + EXECUTE_TEST( + error, test_size_t.Execute(deviceID, context, queue, num_elements)); + CBasicTestFetchXor2<HOST_ATOMIC_PTRDIFF_T32, HOST_PTRDIFF_T32> + test_ptrdiff_t(TYPE_ATOMIC_PTRDIFF_T, useSVM); + EXECUTE_TEST( + error, + test_ptrdiff_t.Execute(deviceID, context, queue, num_elements)); + } + else + { + CBasicTestFetchXor2<HOST_ATOMIC_INTPTR_T64, HOST_INTPTR_T64> + test_intptr_t(TYPE_ATOMIC_INTPTR_T, useSVM); + EXECUTE_TEST( + error, + test_intptr_t.Execute(deviceID, context, queue, num_elements)); + CBasicTestFetchXor2<HOST_ATOMIC_UINTPTR_T64, HOST_UINTPTR_T64> + test_uintptr_t(TYPE_ATOMIC_UINTPTR_T, useSVM); + EXECUTE_TEST( + error, + test_uintptr_t.Execute(deviceID, context, queue, num_elements)); + CBasicTestFetchXor2<HOST_ATOMIC_SIZE_T64, HOST_SIZE_T64> test_size_t( + TYPE_ATOMIC_SIZE_T, useSVM); + EXECUTE_TEST( + error, test_size_t.Execute(deviceID, context, queue, num_elements)); + CBasicTestFetchXor2<HOST_ATOMIC_PTRDIFF_T64, HOST_PTRDIFF_T64> + test_ptrdiff_t(TYPE_ATOMIC_PTRDIFF_T, useSVM); + EXECUTE_TEST( + error, + test_ptrdiff_t.Execute(deviceID, context, queue, num_elements)); + } + return error; } -int test_atomic_fetch_xor2(cl_device_id deviceID, 
cl_context context, cl_command_queue queue, int num_elements) +int test_atomic_fetch_xor2(cl_device_id deviceID, cl_context context, + cl_command_queue queue, int num_elements) { - return test_atomic_fetch_xor2_generic(deviceID, context, queue, num_elements, false); + return test_atomic_fetch_xor2_generic(deviceID, context, queue, + num_elements, false); } -int test_svm_atomic_fetch_xor2(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements) +int test_svm_atomic_fetch_xor2(cl_device_id deviceID, cl_context context, + cl_command_queue queue, int num_elements) { - return test_atomic_fetch_xor2_generic(deviceID, context, queue, num_elements, true); + return test_atomic_fetch_xor2_generic(deviceID, context, queue, + num_elements, true); } -template<typename HostAtomicType, typename HostDataType> -class CBasicTestFetchMin : public CBasicTestMemOrderScope<HostAtomicType, HostDataType> -{ +template <typename HostAtomicType, typename HostDataType> +class CBasicTestFetchMin + : public CBasicTestMemOrderScope<HostAtomicType, HostDataType> { public: - using CBasicTestMemOrderScope<HostAtomicType, HostDataType>::StartValue; - using CBasicTestMemOrderScope<HostAtomicType, HostDataType>::DataType; - using CBasicTestMemOrderScope<HostAtomicType, HostDataType>::MemoryOrder; - using CBasicTestMemOrderScope<HostAtomicType, HostDataType>::MemoryOrderScopeStr; - CBasicTestFetchMin(TExplicitAtomicType dataType, bool useSVM) : CBasicTestMemOrderScope<HostAtomicType, HostDataType>(dataType, useSVM) - { - StartValue(DataType().MaxValue()); - } - virtual std::string ProgramCore() - { - std::string memoryOrderScope = MemoryOrderScopeStr(); - std::string postfix(memoryOrderScope.empty() ? 
"" : "_explicit"); - return - " oldValues[tid] = atomic_fetch_min"+postfix+"(&destMemory[0], oldValues[tid] "+memoryOrderScope+");\n"; - } - virtual void HostFunction(cl_uint tid, cl_uint threadCount, volatile HostAtomicType *destMemory, HostDataType *oldValues) - { - oldValues[tid] = host_atomic_fetch_min(&destMemory[0], oldValues[tid], MemoryOrder()); - } - virtual bool GenerateRefs(cl_uint threadCount, HostDataType *startRefValues, MTdata d) - { - for(cl_uint i = 0; i < threadCount; i++) - { - startRefValues[i] = genrand_int32(d); - if(sizeof(HostDataType) >= 8) - startRefValues[i] |= (HostDataType)genrand_int32(d) << 16; - } - return true; - } - virtual bool ExpectedValue(HostDataType &expected, cl_uint threadCount, HostDataType *startRefValues, cl_uint whichDestValue) - { - expected = StartValue(); - for(cl_uint i = 0; i < threadCount; i++) - { - if(startRefValues[ i ] < expected) - expected = startRefValues[ i ]; - } - return true; - } + using CBasicTestMemOrderScope<HostAtomicType, HostDataType>::StartValue; + using CBasicTestMemOrderScope<HostAtomicType, HostDataType>::DataType; + using CBasicTestMemOrderScope<HostAtomicType, HostDataType>::MemoryOrder; + using CBasicTestMemOrderScope<HostAtomicType, + HostDataType>::MemoryOrderScopeStr; + CBasicTestFetchMin(TExplicitAtomicType dataType, bool useSVM) + : CBasicTestMemOrderScope<HostAtomicType, HostDataType>(dataType, + useSVM) + { + StartValue(DataType().MaxValue()); + } + virtual std::string ProgramCore() + { + std::string memoryOrderScope = MemoryOrderScopeStr(); + std::string postfix(memoryOrderScope.empty() ? 
"" : "_explicit"); + return " oldValues[tid] = atomic_fetch_min" + postfix + + "(&destMemory[0], oldValues[tid] " + memoryOrderScope + ");\n"; + } + virtual void HostFunction(cl_uint tid, cl_uint threadCount, + volatile HostAtomicType *destMemory, + HostDataType *oldValues) + { + oldValues[tid] = host_atomic_fetch_min(&destMemory[0], oldValues[tid], + MemoryOrder()); + } + virtual bool GenerateRefs(cl_uint threadCount, HostDataType *startRefValues, + MTdata d) + { + for (cl_uint i = 0; i < threadCount; i++) + { + startRefValues[i] = genrand_int32(d); + if (sizeof(HostDataType) >= 8) + startRefValues[i] |= (HostDataType)genrand_int32(d) << 16; + } + return true; + } + virtual bool ExpectedValue(HostDataType &expected, cl_uint threadCount, + HostDataType *startRefValues, + cl_uint whichDestValue) + { + expected = StartValue(); + for (cl_uint i = 0; i < threadCount; i++) + { + if (startRefValues[i] < expected) expected = startRefValues[i]; + } + return true; + } }; -int test_atomic_fetch_min_generic(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements, bool useSVM) +int test_atomic_fetch_min_generic(cl_device_id deviceID, cl_context context, + cl_command_queue queue, int num_elements, + bool useSVM) { - int error = 0; - CBasicTestFetchMin<HOST_ATOMIC_INT, HOST_INT> test_int(TYPE_ATOMIC_INT, useSVM); - EXECUTE_TEST(error, test_int.Execute(deviceID, context, queue, num_elements)); - CBasicTestFetchMin<HOST_ATOMIC_UINT, HOST_UINT> test_uint(TYPE_ATOMIC_UINT, useSVM); - EXECUTE_TEST(error, test_uint.Execute(deviceID, context, queue, num_elements)); - CBasicTestFetchMin<HOST_ATOMIC_LONG, HOST_LONG> test_long(TYPE_ATOMIC_LONG, useSVM); - EXECUTE_TEST(error, test_long.Execute(deviceID, context, queue, num_elements)); - CBasicTestFetchMin<HOST_ATOMIC_ULONG, HOST_ULONG> test_ulong(TYPE_ATOMIC_ULONG, useSVM); - EXECUTE_TEST(error, test_ulong.Execute(deviceID, context, queue, num_elements)); - if(AtomicTypeInfo(TYPE_ATOMIC_SIZE_T).Size(deviceID) == 
4) - { - CBasicTestFetchMin<HOST_ATOMIC_INTPTR_T32, HOST_INTPTR_T32> test_intptr_t(TYPE_ATOMIC_INTPTR_T, useSVM); - EXECUTE_TEST(error, test_intptr_t.Execute(deviceID, context, queue, num_elements)); - CBasicTestFetchMin<HOST_ATOMIC_UINTPTR_T32, HOST_UINTPTR_T32> test_uintptr_t(TYPE_ATOMIC_UINTPTR_T, useSVM); - EXECUTE_TEST(error, test_uintptr_t.Execute(deviceID, context, queue, num_elements)); - CBasicTestFetchMin<HOST_ATOMIC_SIZE_T32, HOST_SIZE_T32> test_size_t(TYPE_ATOMIC_SIZE_T, useSVM); - EXECUTE_TEST(error, test_size_t.Execute(deviceID, context, queue, num_elements)); - CBasicTestFetchMin<HOST_ATOMIC_PTRDIFF_T32, HOST_PTRDIFF_T32> test_ptrdiff_t(TYPE_ATOMIC_PTRDIFF_T, useSVM); - EXECUTE_TEST(error, test_ptrdiff_t.Execute(deviceID, context, queue, num_elements)); - } - else - { - CBasicTestFetchMin<HOST_ATOMIC_INTPTR_T64, HOST_INTPTR_T64> test_intptr_t(TYPE_ATOMIC_INTPTR_T, useSVM); - EXECUTE_TEST(error, test_intptr_t.Execute(deviceID, context, queue, num_elements)); - CBasicTestFetchMin<HOST_ATOMIC_UINTPTR_T64, HOST_UINTPTR_T64> test_uintptr_t(TYPE_ATOMIC_UINTPTR_T, useSVM); - EXECUTE_TEST(error, test_uintptr_t.Execute(deviceID, context, queue, num_elements)); - CBasicTestFetchMin<HOST_ATOMIC_SIZE_T64, HOST_SIZE_T64> test_size_t(TYPE_ATOMIC_SIZE_T, useSVM); - EXECUTE_TEST(error, test_size_t.Execute(deviceID, context, queue, num_elements)); - CBasicTestFetchMin<HOST_ATOMIC_PTRDIFF_T64, HOST_PTRDIFF_T64> test_ptrdiff_t(TYPE_ATOMIC_PTRDIFF_T, useSVM); - EXECUTE_TEST(error, test_ptrdiff_t.Execute(deviceID, context, queue, num_elements)); - } - return error; + int error = 0; + CBasicTestFetchMin<HOST_ATOMIC_INT, HOST_INT> test_int(TYPE_ATOMIC_INT, + useSVM); + EXECUTE_TEST(error, + test_int.Execute(deviceID, context, queue, num_elements)); + CBasicTestFetchMin<HOST_ATOMIC_UINT, HOST_UINT> test_uint(TYPE_ATOMIC_UINT, + useSVM); + EXECUTE_TEST(error, + test_uint.Execute(deviceID, context, queue, num_elements)); + CBasicTestFetchMin<HOST_ATOMIC_LONG, HOST_LONG> 
test_long(TYPE_ATOMIC_LONG, + useSVM); + EXECUTE_TEST(error, + test_long.Execute(deviceID, context, queue, num_elements)); + CBasicTestFetchMin<HOST_ATOMIC_ULONG, HOST_ULONG> test_ulong( + TYPE_ATOMIC_ULONG, useSVM); + EXECUTE_TEST(error, + test_ulong.Execute(deviceID, context, queue, num_elements)); + if (AtomicTypeInfo(TYPE_ATOMIC_SIZE_T).Size(deviceID) == 4) + { + CBasicTestFetchMin<HOST_ATOMIC_INTPTR_T32, HOST_INTPTR_T32> + test_intptr_t(TYPE_ATOMIC_INTPTR_T, useSVM); + EXECUTE_TEST( + error, + test_intptr_t.Execute(deviceID, context, queue, num_elements)); + CBasicTestFetchMin<HOST_ATOMIC_UINTPTR_T32, HOST_UINTPTR_T32> + test_uintptr_t(TYPE_ATOMIC_UINTPTR_T, useSVM); + EXECUTE_TEST( + error, + test_uintptr_t.Execute(deviceID, context, queue, num_elements)); + CBasicTestFetchMin<HOST_ATOMIC_SIZE_T32, HOST_SIZE_T32> test_size_t( + TYPE_ATOMIC_SIZE_T, useSVM); + EXECUTE_TEST( + error, test_size_t.Execute(deviceID, context, queue, num_elements)); + CBasicTestFetchMin<HOST_ATOMIC_PTRDIFF_T32, HOST_PTRDIFF_T32> + test_ptrdiff_t(TYPE_ATOMIC_PTRDIFF_T, useSVM); + EXECUTE_TEST( + error, + test_ptrdiff_t.Execute(deviceID, context, queue, num_elements)); + } + else + { + CBasicTestFetchMin<HOST_ATOMIC_INTPTR_T64, HOST_INTPTR_T64> + test_intptr_t(TYPE_ATOMIC_INTPTR_T, useSVM); + EXECUTE_TEST( + error, + test_intptr_t.Execute(deviceID, context, queue, num_elements)); + CBasicTestFetchMin<HOST_ATOMIC_UINTPTR_T64, HOST_UINTPTR_T64> + test_uintptr_t(TYPE_ATOMIC_UINTPTR_T, useSVM); + EXECUTE_TEST( + error, + test_uintptr_t.Execute(deviceID, context, queue, num_elements)); + CBasicTestFetchMin<HOST_ATOMIC_SIZE_T64, HOST_SIZE_T64> test_size_t( + TYPE_ATOMIC_SIZE_T, useSVM); + EXECUTE_TEST( + error, test_size_t.Execute(deviceID, context, queue, num_elements)); + CBasicTestFetchMin<HOST_ATOMIC_PTRDIFF_T64, HOST_PTRDIFF_T64> + test_ptrdiff_t(TYPE_ATOMIC_PTRDIFF_T, useSVM); + EXECUTE_TEST( + error, + test_ptrdiff_t.Execute(deviceID, context, queue, num_elements)); + } + return 
error; } -int test_atomic_fetch_min(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements) +int test_atomic_fetch_min(cl_device_id deviceID, cl_context context, + cl_command_queue queue, int num_elements) { - return test_atomic_fetch_min_generic(deviceID, context, queue, num_elements, false); + return test_atomic_fetch_min_generic(deviceID, context, queue, num_elements, + false); } -int test_svm_atomic_fetch_min(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements) +int test_svm_atomic_fetch_min(cl_device_id deviceID, cl_context context, + cl_command_queue queue, int num_elements) { - return test_atomic_fetch_min_generic(deviceID, context, queue, num_elements, true); + return test_atomic_fetch_min_generic(deviceID, context, queue, num_elements, + true); } -template<typename HostAtomicType, typename HostDataType> -class CBasicTestFetchMax : public CBasicTestMemOrderScope<HostAtomicType, HostDataType> -{ +template <typename HostAtomicType, typename HostDataType> +class CBasicTestFetchMax + : public CBasicTestMemOrderScope<HostAtomicType, HostDataType> { public: - using CBasicTestMemOrderScope<HostAtomicType, HostDataType>::StartValue; - using CBasicTestMemOrderScope<HostAtomicType, HostDataType>::DataType; - using CBasicTestMemOrderScope<HostAtomicType, HostDataType>::MemoryOrder; - using CBasicTestMemOrderScope<HostAtomicType, HostDataType>::MemoryOrderScopeStr; - CBasicTestFetchMax(TExplicitAtomicType dataType, bool useSVM) : CBasicTestMemOrderScope<HostAtomicType, HostDataType>(dataType, useSVM) - { - StartValue(DataType().MinValue()); - } - virtual std::string ProgramCore() - { - std::string memoryOrderScope = MemoryOrderScopeStr(); - std::string postfix(memoryOrderScope.empty() ? 
"" : "_explicit"); - return - " oldValues[tid] = atomic_fetch_max"+postfix+"(&destMemory[0], oldValues[tid] "+memoryOrderScope+");\n"; - } - virtual void HostFunction(cl_uint tid, cl_uint threadCount, volatile HostAtomicType *destMemory, HostDataType *oldValues) - { - oldValues[tid] = host_atomic_fetch_max(&destMemory[0], oldValues[tid], MemoryOrder()); - } - virtual bool GenerateRefs(cl_uint threadCount, HostDataType *startRefValues, MTdata d) - { - for(cl_uint i = 0; i < threadCount; i++) - { - startRefValues[i] = genrand_int32(d); - if(sizeof(HostDataType) >= 8) - startRefValues[i] |= (HostDataType)genrand_int32(d) << 16; - } - return true; - } - virtual bool ExpectedValue(HostDataType &expected, cl_uint threadCount, HostDataType *startRefValues, cl_uint whichDestValue) - { - expected = StartValue(); - for(cl_uint i = 0; i < threadCount; i++) - { - if(startRefValues[ i ] > expected) - expected = startRefValues[ i ]; - } - return true; - } + using CBasicTestMemOrderScope<HostAtomicType, HostDataType>::StartValue; + using CBasicTestMemOrderScope<HostAtomicType, HostDataType>::DataType; + using CBasicTestMemOrderScope<HostAtomicType, HostDataType>::MemoryOrder; + using CBasicTestMemOrderScope<HostAtomicType, + HostDataType>::MemoryOrderScopeStr; + CBasicTestFetchMax(TExplicitAtomicType dataType, bool useSVM) + : CBasicTestMemOrderScope<HostAtomicType, HostDataType>(dataType, + useSVM) + { + StartValue(DataType().MinValue()); + } + virtual std::string ProgramCore() + { + std::string memoryOrderScope = MemoryOrderScopeStr(); + std::string postfix(memoryOrderScope.empty() ? 
"" : "_explicit"); + return " oldValues[tid] = atomic_fetch_max" + postfix + + "(&destMemory[0], oldValues[tid] " + memoryOrderScope + ");\n"; + } + virtual void HostFunction(cl_uint tid, cl_uint threadCount, + volatile HostAtomicType *destMemory, + HostDataType *oldValues) + { + oldValues[tid] = host_atomic_fetch_max(&destMemory[0], oldValues[tid], + MemoryOrder()); + } + virtual bool GenerateRefs(cl_uint threadCount, HostDataType *startRefValues, + MTdata d) + { + for (cl_uint i = 0; i < threadCount; i++) + { + startRefValues[i] = genrand_int32(d); + if (sizeof(HostDataType) >= 8) + startRefValues[i] |= (HostDataType)genrand_int32(d) << 16; + } + return true; + } + virtual bool ExpectedValue(HostDataType &expected, cl_uint threadCount, + HostDataType *startRefValues, + cl_uint whichDestValue) + { + expected = StartValue(); + for (cl_uint i = 0; i < threadCount; i++) + { + if (startRefValues[i] > expected) expected = startRefValues[i]; + } + return true; + } }; -int test_atomic_fetch_max_generic(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements, bool useSVM) +int test_atomic_fetch_max_generic(cl_device_id deviceID, cl_context context, + cl_command_queue queue, int num_elements, + bool useSVM) { - int error = 0; - CBasicTestFetchMax<HOST_ATOMIC_INT, HOST_INT> test_int(TYPE_ATOMIC_INT, useSVM); - EXECUTE_TEST(error, test_int.Execute(deviceID, context, queue, num_elements)); - CBasicTestFetchMax<HOST_ATOMIC_UINT, HOST_UINT> test_uint(TYPE_ATOMIC_UINT, useSVM); - EXECUTE_TEST(error, test_uint.Execute(deviceID, context, queue, num_elements)); - CBasicTestFetchMax<HOST_ATOMIC_LONG, HOST_LONG> test_long(TYPE_ATOMIC_LONG, useSVM); - EXECUTE_TEST(error, test_long.Execute(deviceID, context, queue, num_elements)); - CBasicTestFetchMax<HOST_ATOMIC_ULONG, HOST_ULONG> test_ulong(TYPE_ATOMIC_ULONG, useSVM); - EXECUTE_TEST(error, test_ulong.Execute(deviceID, context, queue, num_elements)); - if(AtomicTypeInfo(TYPE_ATOMIC_SIZE_T).Size(deviceID) == 
4) - { - CBasicTestFetchMax<HOST_ATOMIC_INTPTR_T32, HOST_INTPTR_T32> test_intptr_t(TYPE_ATOMIC_INTPTR_T, useSVM); - EXECUTE_TEST(error, test_intptr_t.Execute(deviceID, context, queue, num_elements)); - CBasicTestFetchMax<HOST_ATOMIC_UINTPTR_T32, HOST_UINTPTR_T32> test_uintptr_t(TYPE_ATOMIC_UINTPTR_T, useSVM); - EXECUTE_TEST(error, test_uintptr_t.Execute(deviceID, context, queue, num_elements)); - CBasicTestFetchMax<HOST_ATOMIC_SIZE_T32, HOST_SIZE_T32> test_size_t(TYPE_ATOMIC_SIZE_T, useSVM); - EXECUTE_TEST(error, test_size_t.Execute(deviceID, context, queue, num_elements)); - CBasicTestFetchMax<HOST_ATOMIC_PTRDIFF_T32, HOST_PTRDIFF_T32> test_ptrdiff_t(TYPE_ATOMIC_PTRDIFF_T, useSVM); - EXECUTE_TEST(error, test_ptrdiff_t.Execute(deviceID, context, queue, num_elements)); - } - else - { - CBasicTestFetchMax<HOST_ATOMIC_INTPTR_T64, HOST_INTPTR_T64> test_intptr_t(TYPE_ATOMIC_INTPTR_T, useSVM); - EXECUTE_TEST(error, test_intptr_t.Execute(deviceID, context, queue, num_elements)); - CBasicTestFetchMax<HOST_ATOMIC_UINTPTR_T64, HOST_UINTPTR_T64> test_uintptr_t(TYPE_ATOMIC_UINTPTR_T, useSVM); - EXECUTE_TEST(error, test_uintptr_t.Execute(deviceID, context, queue, num_elements)); - CBasicTestFetchMax<HOST_ATOMIC_SIZE_T64, HOST_SIZE_T64> test_size_t(TYPE_ATOMIC_SIZE_T, useSVM); - EXECUTE_TEST(error, test_size_t.Execute(deviceID, context, queue, num_elements)); - CBasicTestFetchMax<HOST_ATOMIC_PTRDIFF_T64, HOST_PTRDIFF_T64> test_ptrdiff_t(TYPE_ATOMIC_PTRDIFF_T, useSVM); - EXECUTE_TEST(error, test_ptrdiff_t.Execute(deviceID, context, queue, num_elements)); - } - return error; + int error = 0; + CBasicTestFetchMax<HOST_ATOMIC_INT, HOST_INT> test_int(TYPE_ATOMIC_INT, + useSVM); + EXECUTE_TEST(error, + test_int.Execute(deviceID, context, queue, num_elements)); + CBasicTestFetchMax<HOST_ATOMIC_UINT, HOST_UINT> test_uint(TYPE_ATOMIC_UINT, + useSVM); + EXECUTE_TEST(error, + test_uint.Execute(deviceID, context, queue, num_elements)); + CBasicTestFetchMax<HOST_ATOMIC_LONG, HOST_LONG> 
test_long(TYPE_ATOMIC_LONG, + useSVM); + EXECUTE_TEST(error, + test_long.Execute(deviceID, context, queue, num_elements)); + CBasicTestFetchMax<HOST_ATOMIC_ULONG, HOST_ULONG> test_ulong( + TYPE_ATOMIC_ULONG, useSVM); + EXECUTE_TEST(error, + test_ulong.Execute(deviceID, context, queue, num_elements)); + if (AtomicTypeInfo(TYPE_ATOMIC_SIZE_T).Size(deviceID) == 4) + { + CBasicTestFetchMax<HOST_ATOMIC_INTPTR_T32, HOST_INTPTR_T32> + test_intptr_t(TYPE_ATOMIC_INTPTR_T, useSVM); + EXECUTE_TEST( + error, + test_intptr_t.Execute(deviceID, context, queue, num_elements)); + CBasicTestFetchMax<HOST_ATOMIC_UINTPTR_T32, HOST_UINTPTR_T32> + test_uintptr_t(TYPE_ATOMIC_UINTPTR_T, useSVM); + EXECUTE_TEST( + error, + test_uintptr_t.Execute(deviceID, context, queue, num_elements)); + CBasicTestFetchMax<HOST_ATOMIC_SIZE_T32, HOST_SIZE_T32> test_size_t( + TYPE_ATOMIC_SIZE_T, useSVM); + EXECUTE_TEST( + error, test_size_t.Execute(deviceID, context, queue, num_elements)); + CBasicTestFetchMax<HOST_ATOMIC_PTRDIFF_T32, HOST_PTRDIFF_T32> + test_ptrdiff_t(TYPE_ATOMIC_PTRDIFF_T, useSVM); + EXECUTE_TEST( + error, + test_ptrdiff_t.Execute(deviceID, context, queue, num_elements)); + } + else + { + CBasicTestFetchMax<HOST_ATOMIC_INTPTR_T64, HOST_INTPTR_T64> + test_intptr_t(TYPE_ATOMIC_INTPTR_T, useSVM); + EXECUTE_TEST( + error, + test_intptr_t.Execute(deviceID, context, queue, num_elements)); + CBasicTestFetchMax<HOST_ATOMIC_UINTPTR_T64, HOST_UINTPTR_T64> + test_uintptr_t(TYPE_ATOMIC_UINTPTR_T, useSVM); + EXECUTE_TEST( + error, + test_uintptr_t.Execute(deviceID, context, queue, num_elements)); + CBasicTestFetchMax<HOST_ATOMIC_SIZE_T64, HOST_SIZE_T64> test_size_t( + TYPE_ATOMIC_SIZE_T, useSVM); + EXECUTE_TEST( + error, test_size_t.Execute(deviceID, context, queue, num_elements)); + CBasicTestFetchMax<HOST_ATOMIC_PTRDIFF_T64, HOST_PTRDIFF_T64> + test_ptrdiff_t(TYPE_ATOMIC_PTRDIFF_T, useSVM); + EXECUTE_TEST( + error, + test_ptrdiff_t.Execute(deviceID, context, queue, num_elements)); + } + return 
error; } -int test_atomic_fetch_max(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements) +int test_atomic_fetch_max(cl_device_id deviceID, cl_context context, + cl_command_queue queue, int num_elements) { - return test_atomic_fetch_max_generic(deviceID, context, queue, num_elements, false); + return test_atomic_fetch_max_generic(deviceID, context, queue, num_elements, + false); } -int test_svm_atomic_fetch_max(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements) +int test_svm_atomic_fetch_max(cl_device_id deviceID, cl_context context, + cl_command_queue queue, int num_elements) { - return test_atomic_fetch_max_generic(deviceID, context, queue, num_elements, true); + return test_atomic_fetch_max_generic(deviceID, context, queue, num_elements, + true); } -template<typename HostAtomicType, typename HostDataType> -class CBasicTestFlag : public CBasicTestMemOrderScope<HostAtomicType, HostDataType> -{ - static const HostDataType CRITICAL_SECTION_NOT_VISITED = 1000000000; +template <typename HostAtomicType, typename HostDataType> +class CBasicTestFlag + : public CBasicTestMemOrderScope<HostAtomicType, HostDataType> { + static const HostDataType CRITICAL_SECTION_NOT_VISITED = 1000000000; + public: - using CBasicTestMemOrderScope<HostAtomicType, HostDataType>::StartValue; - using CBasicTestMemOrderScope<HostAtomicType, HostDataType>::OldValueCheck; - using CBasicTestMemOrderScope<HostAtomicType, HostDataType>::MemoryOrder; - using CBasicTestMemOrderScope<HostAtomicType, HostDataType>::MemoryScopeStr; - using CBasicTestMemOrderScope<HostAtomicType, HostDataType>::MemoryOrderScopeStr; - using CBasicTestMemOrderScope<HostAtomicType, HostDataType>::UseSVM; - using CBasicTestMemOrderScope<HostAtomicType, HostDataType>::LocalMemory; - CBasicTestFlag(TExplicitAtomicType dataType, bool useSVM) : CBasicTestMemOrderScope<HostAtomicType, HostDataType>(dataType, useSVM) - { - StartValue(0); - OldValueCheck(false); - } - 
virtual cl_uint NumResults(cl_uint threadCount, cl_device_id deviceID) - { - return threadCount; - } - TExplicitMemoryOrderType MemoryOrderForClear() - { - // Memory ordering for atomic_flag_clear function - // ("shall not be memory_order_acquire nor memory_order_acq_rel") - if(MemoryOrder() == MEMORY_ORDER_ACQUIRE) - return MEMORY_ORDER_RELAXED; - if (MemoryOrder() == MEMORY_ORDER_ACQ_REL) - return MEMORY_ORDER_RELEASE; - return MemoryOrder(); - } - std::string MemoryOrderScopeStrForClear() - { - std::string orderStr; - if (MemoryOrder() != MEMORY_ORDER_EMPTY) - orderStr = std::string(", ") + get_memory_order_type_name(MemoryOrderForClear()); - return orderStr + MemoryScopeStr(); - } - - virtual int ExecuteSingleTest(cl_device_id deviceID, cl_context context, - cl_command_queue queue) - { - // This test assumes support for the memory_scope_device scope in the case - // that LocalMemory() == false. Therefore we should skip this test in that - // configuration on a 3.0 driver since supporting the memory_scope_device - // scope is optionaly. - if (get_device_cl_version(deviceID) >= Version{ 3, 0 }) - { - if (!LocalMemory() - && !(gAtomicFenceCap & CL_DEVICE_ATOMIC_SCOPE_DEVICE)) - { - log_info( - "Skipping atomic_flag test due to use of atomic_scope_device " - "which is optionally not supported on this device\n"); - return 0; // skip test - not applicable - } - } - return CBasicTestMemOrderScope<HostAtomicType, - HostDataType>::ExecuteSingleTest(deviceID, - context, - queue); - } - virtual std::string ProgramCore() - { - std::string memoryOrderScope = MemoryOrderScopeStr(); - std::string postfix(memoryOrderScope.empty() ? 
"" : "_explicit"); - std::string program = - " uint cnt, stop = 0;\n" - " for(cnt = 0; !stop && cnt < threadCount; cnt++) // each thread must find critical section where it is the first visitor\n" - " {\n" - " bool set = atomic_flag_test_and_set" + postfix + "(&destMemory[cnt]" + memoryOrderScope + ");\n"; - if (MemoryOrder() == MEMORY_ORDER_RELAXED || MemoryOrder() == MEMORY_ORDER_RELEASE) - program += " atomic_work_item_fence(" + - std::string(LocalMemory() ? "CLK_LOCAL_MEM_FENCE, " : "CLK_GLOBAL_MEM_FENCE, ") + - "memory_order_acquire," + - std::string(LocalMemory() ? "memory_scope_work_group" : (UseSVM() ? "memory_scope_all_svm_devices" : "memory_scope_device") ) + - ");\n"; - - program += - " if (!set)\n" - " {\n"; - - if (LocalMemory()) - program += " uint csIndex = get_enqueued_local_size(0)*get_group_id(0)+cnt;\n"; - else - program += " uint csIndex = cnt;\n"; - - std::ostringstream csNotVisited; - csNotVisited << CRITICAL_SECTION_NOT_VISITED; - program += - " // verify that thread is the first visitor\n" - " if(oldValues[csIndex] == "+csNotVisited.str()+")\n" - " {\n" - " oldValues[csIndex] = tid; // set the winner id for this critical section\n" - " stop = 1;\n" - " }\n"; - - if (MemoryOrder() == MEMORY_ORDER_ACQUIRE || MemoryOrder() == MEMORY_ORDER_RELAXED) - program += " atomic_work_item_fence(" + - std::string(LocalMemory() ? "CLK_LOCAL_MEM_FENCE, " : "CLK_GLOBAL_MEM_FENCE, ") + - "memory_order_release," + - std::string(LocalMemory() ? "memory_scope_work_group" : (UseSVM() ? 
"memory_scope_all_svm_devices" : "memory_scope_device") ) + - ");\n"; - - program += - " atomic_flag_clear" + postfix + "(&destMemory[cnt]" + MemoryOrderScopeStrForClear() + ");\n" - " }\n" - " }\n"; - return program; - } - virtual void HostFunction(cl_uint tid, cl_uint threadCount, volatile HostAtomicType *destMemory, HostDataType *oldValues) - { - cl_uint cnt, stop = 0; - for (cnt = 0; !stop && cnt < threadCount; cnt++) // each thread must find critical section where it is the first visitor\n" - { - if (!host_atomic_flag_test_and_set(&destMemory[cnt], MemoryOrder())) - { - cl_uint csIndex = cnt; - // verify that thread is the first visitor\n" - if (oldValues[csIndex] == CRITICAL_SECTION_NOT_VISITED) + using CBasicTestMemOrderScope<HostAtomicType, HostDataType>::StartValue; + using CBasicTestMemOrderScope<HostAtomicType, HostDataType>::OldValueCheck; + using CBasicTestMemOrderScope<HostAtomicType, HostDataType>::MemoryOrder; + using CBasicTestMemOrderScope<HostAtomicType, HostDataType>::MemoryScopeStr; + using CBasicTestMemOrderScope<HostAtomicType, + HostDataType>::MemoryOrderScopeStr; + using CBasicTestMemOrderScope<HostAtomicType, HostDataType>::UseSVM; + using CBasicTestMemOrderScope<HostAtomicType, HostDataType>::LocalMemory; + CBasicTestFlag(TExplicitAtomicType dataType, bool useSVM) + : CBasicTestMemOrderScope<HostAtomicType, HostDataType>(dataType, + useSVM) + { + StartValue(0); + OldValueCheck(false); + } + virtual cl_uint NumResults(cl_uint threadCount, cl_device_id deviceID) + { + return threadCount; + } + TExplicitMemoryOrderType MemoryOrderForClear() + { + // Memory ordering for atomic_flag_clear function + // ("shall not be memory_order_acquire nor memory_order_acq_rel") + if (MemoryOrder() == MEMORY_ORDER_ACQUIRE) return MEMORY_ORDER_RELAXED; + if (MemoryOrder() == MEMORY_ORDER_ACQ_REL) return MEMORY_ORDER_RELEASE; + return MemoryOrder(); + } + std::string MemoryOrderScopeStrForClear() + { + std::string orderStr; + if (MemoryOrder() != 
MEMORY_ORDER_EMPTY) + orderStr = std::string(", ") + + get_memory_order_type_name(MemoryOrderForClear()); + return orderStr + MemoryScopeStr(); + } + + virtual int ExecuteSingleTest(cl_device_id deviceID, cl_context context, + cl_command_queue queue) + { + // This test assumes support for the memory_scope_device scope in the + // case that LocalMemory() == false. Therefore we should skip this test + // in that configuration on a 3.0 driver since supporting the + // memory_scope_device scope is optionaly. + if (get_device_cl_version(deviceID) >= Version{ 3, 0 }) { - oldValues[csIndex] = tid; // set the winner id for this critical section\n" - stop = 1; + if (!LocalMemory() + && !(gAtomicFenceCap & CL_DEVICE_ATOMIC_SCOPE_DEVICE)) + { + log_info("Skipping atomic_flag test due to use of " + "atomic_scope_device " + "which is optionally not supported on this device\n"); + return 0; // skip test - not applicable + } } - host_atomic_flag_clear(&destMemory[cnt], MemoryOrderForClear()); - } - } - } - virtual bool ExpectedValue(HostDataType &expected, cl_uint threadCount, HostDataType *startRefValues, cl_uint whichDestValue) - { - expected = StartValue(); - return true; - } - virtual bool GenerateRefs(cl_uint threadCount, HostDataType *startRefValues, MTdata d) - { - for(cl_uint i = 0 ; i < threadCount; i++) - startRefValues[i] = CRITICAL_SECTION_NOT_VISITED; - return true; - } - virtual bool VerifyRefs(bool &correct, cl_uint threadCount, HostDataType *refValues, HostAtomicType *finalValues) - { - correct = true; - /* We are expecting unique values from 0 to threadCount-1 (each critical section must be visited) */ - /* These values must be distributed across refValues array */ - std::vector<bool> tidFound(threadCount); - cl_uint i; - - for (i = 0; i < threadCount; i++) - { - cl_uint value = (cl_uint)refValues[i]; - if (value == CRITICAL_SECTION_NOT_VISITED) - { - // Special initial value - log_error("ERROR: Critical section %u not visited\n", i); - correct = false; + return 
CBasicTestMemOrderScope< + HostAtomicType, HostDataType>::ExecuteSingleTest(deviceID, context, + queue); + } + virtual std::string ProgramCore() + { + std::string memoryOrderScope = MemoryOrderScopeStr(); + std::string postfix(memoryOrderScope.empty() ? "" : "_explicit"); + std::string program = + " uint cnt, stop = 0;\n" + " for(cnt = 0; !stop && cnt < threadCount; cnt++) // each thread " + "must find critical section where it is the first visitor\n" + " {\n" + " bool set = atomic_flag_test_and_set" + + postfix + "(&destMemory[cnt]" + memoryOrderScope + ");\n"; + if (MemoryOrder() == MEMORY_ORDER_RELAXED + || MemoryOrder() == MEMORY_ORDER_RELEASE || LocalMemory()) + program += " atomic_work_item_fence(" + + std::string( + LocalMemory() + ? "CLK_LOCAL_MEM_FENCE | CLK_GLOBAL_MEM_FENCE, " + : "CLK_GLOBAL_MEM_FENCE, ") + + "memory_order_acquire," + + std::string(LocalMemory() + ? "memory_scope_work_group" + : (UseSVM() ? "memory_scope_all_svm_devices" + : "memory_scope_device")) + + ");\n"; + + program += " if (!set)\n" + " {\n"; + + if (LocalMemory()) + program += " uint csIndex = " + "get_enqueued_local_size(0)*get_group_id(0)+cnt;\n"; + else + program += " uint csIndex = cnt;\n"; + + std::ostringstream csNotVisited; + csNotVisited << CRITICAL_SECTION_NOT_VISITED; + program += " // verify that thread is the first visitor\n" + " if(oldValues[csIndex] == " + + csNotVisited.str() + + ")\n" + " {\n" + " oldValues[csIndex] = tid; // set the winner id for this " + "critical section\n" + " stop = 1;\n" + " }\n"; + + if (MemoryOrder() == MEMORY_ORDER_ACQUIRE + || MemoryOrder() == MEMORY_ORDER_RELAXED || LocalMemory()) + program += " atomic_work_item_fence(" + + std::string( + LocalMemory() + ? "CLK_LOCAL_MEM_FENCE | CLK_GLOBAL_MEM_FENCE, " + : "CLK_GLOBAL_MEM_FENCE, ") + + "memory_order_release," + + std::string(LocalMemory() + ? "memory_scope_work_group" + : (UseSVM() ? 
"memory_scope_all_svm_devices" + : "memory_scope_device")) + + ");\n"; + + program += " atomic_flag_clear" + postfix + "(&destMemory[cnt]" + + MemoryOrderScopeStrForClear() + + ");\n" + " }\n" + " }\n"; + return program; + } + virtual void HostFunction(cl_uint tid, cl_uint threadCount, + volatile HostAtomicType *destMemory, + HostDataType *oldValues) + { + cl_uint cnt, stop = 0; + for (cnt = 0; !stop && cnt < threadCount; + cnt++) // each thread must find critical section where it is the + // first visitor\n" + { + if (!host_atomic_flag_test_and_set(&destMemory[cnt], MemoryOrder())) + { + cl_uint csIndex = cnt; + // verify that thread is the first visitor\n" + if (oldValues[csIndex] == CRITICAL_SECTION_NOT_VISITED) + { + oldValues[csIndex] = + tid; // set the winner id for this critical section\n" + stop = 1; + } + host_atomic_flag_clear(&destMemory[cnt], MemoryOrderForClear()); + } + } + } + virtual bool ExpectedValue(HostDataType &expected, cl_uint threadCount, + HostDataType *startRefValues, + cl_uint whichDestValue) + { + expected = StartValue(); return true; - } - if (value >= threadCount) - { - log_error("ERROR: Reference value %u outside of valid range! 
(%u)\n", i, value); - correct = false; + } + virtual bool GenerateRefs(cl_uint threadCount, HostDataType *startRefValues, + MTdata d) + { + for (cl_uint i = 0; i < threadCount; i++) + startRefValues[i] = CRITICAL_SECTION_NOT_VISITED; return true; - } - if (tidFound[value]) - { - log_error("ERROR: Value (%u) occurred more thane once\n", value); - correct = false; + } + virtual bool VerifyRefs(bool &correct, cl_uint threadCount, + HostDataType *refValues, + HostAtomicType *finalValues) + { + correct = true; + /* We are expecting unique values from 0 to threadCount-1 (each critical + * section must be visited) */ + /* These values must be distributed across refValues array */ + std::vector<bool> tidFound(threadCount); + cl_uint i; + + for (i = 0; i < threadCount; i++) + { + cl_uint value = (cl_uint)refValues[i]; + if (value == CRITICAL_SECTION_NOT_VISITED) + { + // Special initial value + log_error("ERROR: Critical section %u not visited\n", i); + correct = false; + return true; + } + if (value >= threadCount) + { + log_error( + "ERROR: Reference value %u outside of valid range! 
(%u)\n", + i, value); + correct = false; + return true; + } + if (tidFound[value]) + { + log_error("ERROR: Value (%u) occurred more thane once\n", + value); + correct = false; + return true; + } + tidFound[value] = true; + } return true; - } - tidFound[value] = true; } - return true; - } }; -int test_atomic_flag_generic(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements, bool useSVM) +int test_atomic_flag_generic(cl_device_id deviceID, cl_context context, + cl_command_queue queue, int num_elements, + bool useSVM) { - int error = 0; - CBasicTestFlag<HOST_ATOMIC_FLAG, HOST_FLAG> test_flag(TYPE_ATOMIC_FLAG, useSVM); - EXECUTE_TEST(error, test_flag.Execute(deviceID, context, queue, num_elements)); - return error; + int error = 0; + CBasicTestFlag<HOST_ATOMIC_FLAG, HOST_FLAG> test_flag(TYPE_ATOMIC_FLAG, + useSVM); + EXECUTE_TEST(error, + test_flag.Execute(deviceID, context, queue, num_elements)); + return error; } -int test_atomic_flag(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements) +int test_atomic_flag(cl_device_id deviceID, cl_context context, + cl_command_queue queue, int num_elements) { - return test_atomic_flag_generic(deviceID, context, queue, num_elements, false); + return test_atomic_flag_generic(deviceID, context, queue, num_elements, + false); } -int test_svm_atomic_flag(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements) +int test_svm_atomic_flag(cl_device_id deviceID, cl_context context, + cl_command_queue queue, int num_elements) { - return test_atomic_flag_generic(deviceID, context, queue, num_elements, true); + return test_atomic_flag_generic(deviceID, context, queue, num_elements, + true); } -template<typename HostAtomicType, typename HostDataType> -class CBasicTestFence : public CBasicTestMemOrderScope<HostAtomicType, HostDataType> -{ - struct TestDefinition { - bool op1IsFence; - TExplicitMemoryOrderType op1MemOrder; - bool op2IsFence; - 
TExplicitMemoryOrderType op2MemOrder; - }; -public: - using CBasicTestMemOrderScope<HostAtomicType, HostDataType>::StartValue; - using CBasicTestMemOrderScope<HostAtomicType, HostDataType>::OldValueCheck; - using CBasicTestMemOrderScope<HostAtomicType, HostDataType>::MemoryOrder; - using CBasicTestMemOrderScope<HostAtomicType, HostDataType>::MemoryScope; - using CBasicTestMemOrderScope<HostAtomicType, HostDataType>::MemoryScopeStr; - using CBasicTestMemOrderScope<HostAtomicType, HostDataType>::DeclaredInProgram; - using CBasicTestMemOrderScope<HostAtomicType, HostDataType>::UsedInFunction; - using CBasicTestMemOrderScope<HostAtomicType, HostDataType>::DataType; - using CBasicTestMemOrderScope<HostAtomicType, HostDataType>::CurrentGroupSize; - using CBasicTestMemOrderScope<HostAtomicType, HostDataType>::UseSVM; - using CBasicTestMemOrderScope<HostAtomicType, HostDataType>::LocalMemory; - using CBasicTestMemOrderScope<HostAtomicType, HostDataType>::LocalRefValues; - CBasicTestFence(TExplicitAtomicType dataType, bool useSVM) : CBasicTestMemOrderScope<HostAtomicType, HostDataType>(dataType, useSVM) - { - StartValue(0); - OldValueCheck(false); - } - virtual cl_uint NumResults(cl_uint threadCount, cl_device_id deviceID) - { - return threadCount; - } - virtual cl_uint NumNonAtomicVariablesPerThread() - { - if (MemoryOrder() == MEMORY_ORDER_SEQ_CST) - return 1; - if (LocalMemory()) - { - if (gIsEmbedded) - { - if (CurrentGroupSize() > 1024) - CurrentGroupSize(1024); - return 1; //1KB of local memory required by spec. Clamp group size to 1k and allow 1 variable per thread - } - else - return 32 * 1024 / 8 / CurrentGroupSize() - 1; //32KB of local memory required by spec - } - return 256; - } - virtual std::string SingleTestName() - { - std::string testName; - if (MemoryOrder() == MEMORY_ORDER_SEQ_CST) - testName += "seq_cst fence, "; - else - testName += std::string(get_memory_order_type_name(_subCase.op1MemOrder)).substr(sizeof("memory_order")) - + (_subCase.op1IsFence ? 
" fence" : " atomic") + " synchronizes-with " - + std::string(get_memory_order_type_name(_subCase.op2MemOrder)).substr(sizeof("memory_order")) - + (_subCase.op2IsFence ? " fence" : " atomic") + ", "; - testName += CBasicTest<HostAtomicType, HostDataType>::SingleTestName(); - testName += std::string(", ") + std::string(get_memory_scope_type_name(MemoryScope())).substr(sizeof("memory")); - return testName; - } - virtual bool SVMDataBufferAllSVMConsistent() - { - // Although memory_scope_all_devices doesn't mention SVM it is just an - // alias for memory_scope_all_svm_devices. So both scopes interact with - // SVM allocations, on devices that support those, just the same. - return MemoryScope() == MEMORY_SCOPE_ALL_DEVICES - || MemoryScope() == MEMORY_SCOPE_ALL_SVM_DEVICES; - } - virtual int ExecuteForEachParameterSet(cl_device_id deviceID, cl_context context, cl_command_queue queue) - { - int error = 0; - // execute 3 (maximum) sub cases for each memory order - for (_subCaseId = 0; _subCaseId < 3; _subCaseId++) +template <typename HostAtomicType, typename HostDataType> +class CBasicTestFence + : public CBasicTestMemOrderScope<HostAtomicType, HostDataType> { + struct TestDefinition { - EXECUTE_TEST(error, (CBasicTestMemOrderScope<HostAtomicType, HostDataType>::ExecuteForEachParameterSet(deviceID, context, queue))); - } - return error; - } - virtual int ExecuteSingleTest(cl_device_id deviceID, cl_context context, cl_command_queue queue) - { - if(DeclaredInProgram() || UsedInFunction()) - return 0; //skip test - not applicable - no overloaded fence functions for different address spaces - if(MemoryOrder() == MEMORY_ORDER_EMPTY || - MemoryScope() == MEMORY_SCOPE_EMPTY) // empty 'scope' not required since opencl20-openclc-rev15 - return 0; //skip test - not applicable - if((UseSVM() || gHost) - && LocalMemory()) - return 0; // skip test - not applicable for SVM and local memory - struct TestDefinition acqTests[] = { - // {op1IsFence, op1MemOrder, op2IsFence, op2MemOrder} - 
{ false, MEMORY_ORDER_RELEASE, true, MEMORY_ORDER_ACQUIRE }, - { true, MEMORY_ORDER_RELEASE, true, MEMORY_ORDER_ACQUIRE }, - { true, MEMORY_ORDER_ACQ_REL, true, MEMORY_ORDER_ACQUIRE } + bool op1IsFence; + TExplicitMemoryOrderType op1MemOrder; + bool op2IsFence; + TExplicitMemoryOrderType op2MemOrder; }; - struct TestDefinition relTests[] = { - { true, MEMORY_ORDER_RELEASE, false, MEMORY_ORDER_ACQUIRE }, - { true, MEMORY_ORDER_RELEASE, true, MEMORY_ORDER_ACQ_REL } - }; - struct TestDefinition arTests[] = { - { false, MEMORY_ORDER_RELEASE, true, MEMORY_ORDER_ACQ_REL }, - { true, MEMORY_ORDER_ACQ_REL, false, MEMORY_ORDER_ACQUIRE }, - { true, MEMORY_ORDER_ACQ_REL, true, MEMORY_ORDER_ACQ_REL } - }; - switch (MemoryOrder()) - { - case MEMORY_ORDER_ACQUIRE: - if (_subCaseId >= sizeof(acqTests) / sizeof(struct TestDefinition)) - return 0; - _subCase = acqTests[_subCaseId]; - break; - case MEMORY_ORDER_RELEASE: - if (_subCaseId >= sizeof(relTests) / sizeof(struct TestDefinition)) - return 0; - _subCase = relTests[_subCaseId]; - break; - case MEMORY_ORDER_ACQ_REL: - if (_subCaseId >= sizeof(arTests) / sizeof(struct TestDefinition)) - return 0; - _subCase = arTests[_subCaseId]; - break; - case MEMORY_ORDER_SEQ_CST: - if (_subCaseId != 0) // one special case only - return 0; - break; - default: - return 0; - } - LocalRefValues(LocalMemory()); - return CBasicTestMemOrderScope<HostAtomicType, HostDataType>::ExecuteSingleTest(deviceID, context, queue); - } - virtual std::string ProgramHeader(cl_uint maxNumDestItems) - { - std::string header; - if(gOldAPI) - { - if(MemoryScope() == MEMORY_SCOPE_EMPTY) - { - header += "#define atomic_work_item_fence(x,y) mem_fence(x)\n"; - } - else - { - header += "#define atomic_work_item_fence(x,y,z) mem_fence(x)\n"; - } - } - return header+CBasicTestMemOrderScope<HostAtomicType, HostDataType>::ProgramHeader(maxNumDestItems); - } - virtual std::string ProgramCore() - { - std::ostringstream naValues; - naValues << NumNonAtomicVariablesPerThread(); 
- std::string program, fenceType, nonAtomic; - if (LocalMemory()) - { - program = " size_t myId = get_local_id(0), hisId = get_local_size(0)-1-myId;\n"; - fenceType = "CLK_LOCAL_MEM_FENCE"; - nonAtomic = "localValues"; - } - else + +public: + using CBasicTestMemOrderScope<HostAtomicType, HostDataType>::StartValue; + using CBasicTestMemOrderScope<HostAtomicType, HostDataType>::OldValueCheck; + using CBasicTestMemOrderScope<HostAtomicType, HostDataType>::MemoryOrder; + using CBasicTestMemOrderScope<HostAtomicType, HostDataType>::MemoryScope; + using CBasicTestMemOrderScope<HostAtomicType, HostDataType>::MemoryScopeStr; + using CBasicTestMemOrderScope<HostAtomicType, + HostDataType>::DeclaredInProgram; + using CBasicTestMemOrderScope<HostAtomicType, HostDataType>::UsedInFunction; + using CBasicTestMemOrderScope<HostAtomicType, HostDataType>::DataType; + using CBasicTestMemOrderScope<HostAtomicType, + HostDataType>::CurrentGroupSize; + using CBasicTestMemOrderScope<HostAtomicType, HostDataType>::UseSVM; + using CBasicTestMemOrderScope<HostAtomicType, HostDataType>::LocalMemory; + using CBasicTestMemOrderScope<HostAtomicType, HostDataType>::LocalRefValues; + CBasicTestFence(TExplicitAtomicType dataType, bool useSVM) + : CBasicTestMemOrderScope<HostAtomicType, HostDataType>(dataType, + useSVM) { - program = " size_t myId = tid, hisId = threadCount-1-tid;\n"; - fenceType = "CLK_GLOBAL_MEM_FENCE"; - nonAtomic = "oldValues"; - } - if (MemoryOrder() == MEMORY_ORDER_SEQ_CST) - { - // All threads are divided into pairs. 
- // Each thread has its own atomic variable and performs the following actions: - // - increments its own variable - // - performs fence operation to propagate its value and to see value from other thread - // - reads value from other thread's variable - // - repeats the above steps when both values are the same (and less than 1000000) - // - stores the last value read from other thread (in additional variable) - // At the end of execution at least one thread should know the last value from other thread - program += std::string("") + - " " + DataType().RegularTypeName() + " myValue = 0, hisValue; \n" - " do {\n" - " myValue++;\n" - " atomic_store_explicit(&destMemory[myId], myValue, memory_order_relaxed" + MemoryScopeStr() + ");\n" - " atomic_work_item_fence(" + fenceType + ", memory_order_seq_cst" + MemoryScopeStr() + "); \n" - " hisValue = atomic_load_explicit(&destMemory[hisId], memory_order_relaxed" + MemoryScopeStr() + ");\n" - " } while(myValue == hisValue && myValue < 1000000);\n" - " " + nonAtomic + "[myId] = hisValue; \n"; + StartValue(0); + OldValueCheck(false); } - else + virtual cl_uint NumResults(cl_uint threadCount, cl_device_id deviceID) { - // Each thread modifies one of its non-atomic variables, increments value of its atomic variable - // and reads values from another thread in typical synchronizes-with scenario with: - // - non-atomic variable (at index A) modification (value change from 0 to A) - // - release operation (additional fence or within atomic) + atomic variable modification (value A) - // - atomic variable read (value B) + acquire operation (additional fence or within atomic) - // - non-atomic variable (at index B) read (value C) - // Each thread verifies dependency between atomic and non-atomic value read from another thread - // The following condition must be true: B == C - program += std::string("") + - " " + DataType().RegularTypeName() + " myValue = 0, hisAtomicValue, hisValue; \n" - " do {\n" - " myValue++;\n" - " " + 
nonAtomic + "[myId*" + naValues.str() +"+myValue] = myValue;\n"; - if (_subCase.op1IsFence) - program += std::string("") + - " atomic_work_item_fence(" + fenceType + ", " + get_memory_order_type_name(_subCase.op1MemOrder) + MemoryScopeStr() + "); \n" - " atomic_store_explicit(&destMemory[myId], myValue, memory_order_relaxed" + MemoryScopeStr() + ");\n"; - else - program += std::string("") + - " atomic_store_explicit(&destMemory[myId], myValue, " + get_memory_order_type_name(_subCase.op1MemOrder) + MemoryScopeStr() + ");\n"; - if (_subCase.op2IsFence) - program += std::string("") + - " hisAtomicValue = atomic_load_explicit(&destMemory[hisId], memory_order_relaxed" + MemoryScopeStr() + ");\n" - " atomic_work_item_fence(" + fenceType + ", " + get_memory_order_type_name(_subCase.op2MemOrder) + MemoryScopeStr() + "); \n"; - else - program += std::string("") + - " hisAtomicValue = atomic_load_explicit(&destMemory[hisId], " + get_memory_order_type_name(_subCase.op2MemOrder) + MemoryScopeStr() + ");\n"; - program += - " hisValue = " + nonAtomic + "[hisId*" + naValues.str() + "+hisAtomicValue]; \n"; - if (LocalMemory()) - program += " hisId = (hisId+1)%get_local_size(0);\n"; - else - program += " hisId = (hisId+1)%threadCount;\n"; - program += - " } while(hisAtomicValue == hisValue && myValue < "+naValues.str()+"-1);\n" - " if(hisAtomicValue != hisValue)\n" - " { // fail\n" - " atomic_store(&destMemory[myId], myValue-1);\n"; - if (LocalMemory()) - program += " hisId = (hisId+get_local_size(0)-1)%get_local_size(0);\n"; - else - program += " hisId = (hisId+threadCount-1)%threadCount;\n"; - program += - " if(myValue+1 < " + naValues.str() + ")\n" - " " + nonAtomic + "[myId*" + naValues.str() + "+myValue+1] = hisId;\n" - " if(myValue+2 < " + naValues.str() + ")\n" - " " + nonAtomic + "[myId*" + naValues.str() + "+myValue+2] = hisAtomicValue;\n" - " if(myValue+3 < " + naValues.str() + ")\n" - " " + nonAtomic + "[myId*" + naValues.str() + "+myValue+3] = hisValue;\n"; - if 
(gDebug) - { - program += - " printf(\"WI %d: atomic value (%d) at index %d is different than non-atomic value (%d)\\n\", tid, hisAtomicValue, hisId, hisValue);\n"; - } - program += - " }\n"; - } - return program; - } - virtual void HostFunction(cl_uint tid, cl_uint threadCount, volatile HostAtomicType *destMemory, HostDataType *oldValues) - { - size_t myId = tid, hisId = threadCount - 1 - tid; - if (MemoryOrder() == MEMORY_ORDER_SEQ_CST) - { - HostDataType myValue = 0, hisValue; - // CPU thread typically starts faster - wait for GPU thread - myValue++; - host_atomic_store<HostAtomicType, HostDataType>(&destMemory[myId], myValue, MEMORY_ORDER_SEQ_CST); - while (host_atomic_load<HostAtomicType, HostDataType>(&destMemory[hisId], MEMORY_ORDER_SEQ_CST) == 0); - do { - myValue++; - host_atomic_store<HostAtomicType, HostDataType>(&destMemory[myId], myValue, MEMORY_ORDER_RELAXED); - host_atomic_thread_fence(MemoryOrder()); - hisValue = host_atomic_load<HostAtomicType, HostDataType>(&destMemory[hisId], MEMORY_ORDER_RELAXED); - } while (myValue == hisValue && hisValue < 1000000); - oldValues[tid] = hisValue; + return threadCount; } - else + virtual cl_uint NumNonAtomicVariablesPerThread() { - HostDataType myValue = 0, hisAtomicValue, hisValue; - do { - myValue++; - oldValues[myId*NumNonAtomicVariablesPerThread()+myValue] = myValue; - if (_subCase.op1IsFence) + if (MemoryOrder() == MEMORY_ORDER_SEQ_CST) return 1; + if (LocalMemory()) { - host_atomic_thread_fence(_subCase.op1MemOrder); - host_atomic_store<HostAtomicType, HostDataType>(&destMemory[myId], myValue, MEMORY_ORDER_RELAXED); + if (gIsEmbedded) + { + if (CurrentGroupSize() > 512) CurrentGroupSize(512); + return 2; // 1KB of local memory required by spec. 
Clamp group + // size to 512 and allow 2 variables per thread + } + else + return 32 * 1024 / 8 / CurrentGroupSize() + - 1; // 32KB of local memory required by spec } + return 256; + } + virtual std::string SingleTestName() + { + std::string testName; + if (MemoryOrder() == MEMORY_ORDER_SEQ_CST) + testName += "seq_cst fence, "; else - host_atomic_store<HostAtomicType, HostDataType>(&destMemory[myId], myValue, _subCase.op1MemOrder); - if (_subCase.op2IsFence) + testName += + std::string(get_memory_order_type_name(_subCase.op1MemOrder)) + .substr(sizeof("memory_order")) + + (_subCase.op1IsFence ? " fence" : " atomic") + + " synchronizes-with " + + std::string(get_memory_order_type_name(_subCase.op2MemOrder)) + .substr(sizeof("memory_order")) + + (_subCase.op2IsFence ? " fence" : " atomic") + ", "; + testName += CBasicTest<HostAtomicType, HostDataType>::SingleTestName(); + testName += std::string(", ") + + std::string(get_memory_scope_type_name(MemoryScope())) + .substr(sizeof("memory")); + return testName; + } + virtual bool SVMDataBufferAllSVMConsistent() + { + // Although memory_scope_all_devices doesn't mention SVM it is just an + // alias for memory_scope_all_svm_devices. So both scopes interact with + // SVM allocations, on devices that support those, just the same. 
+ return MemoryScope() == MEMORY_SCOPE_ALL_DEVICES + || MemoryScope() == MEMORY_SCOPE_ALL_SVM_DEVICES; + } + virtual int ExecuteForEachParameterSet(cl_device_id deviceID, + cl_context context, + cl_command_queue queue) + { + int error = 0; + // execute 3 (maximum) sub cases for each memory order + for (_subCaseId = 0; _subCaseId < 3; _subCaseId++) { - hisAtomicValue = host_atomic_load<HostAtomicType, HostDataType>(&destMemory[hisId], MEMORY_ORDER_RELAXED); - host_atomic_thread_fence(_subCase.op2MemOrder); + EXECUTE_TEST( + error, + (CBasicTestMemOrderScope<HostAtomicType, HostDataType>:: + ExecuteForEachParameterSet(deviceID, context, queue))); } - else - hisAtomicValue = host_atomic_load<HostAtomicType, HostDataType>(&destMemory[hisId], _subCase.op2MemOrder); - hisValue = oldValues[hisId*NumNonAtomicVariablesPerThread() + hisAtomicValue]; - hisId = (hisId + 1) % threadCount; - } while(hisAtomicValue == hisValue && myValue < (HostDataType)NumNonAtomicVariablesPerThread()-1); - if(hisAtomicValue != hisValue) - { // fail - host_atomic_store<HostAtomicType, HostDataType>(&destMemory[myId], myValue-1, MEMORY_ORDER_SEQ_CST); - if (gDebug) + return error; + } + virtual int ExecuteSingleTest(cl_device_id deviceID, cl_context context, + cl_command_queue queue) + { + if (DeclaredInProgram() || UsedInFunction()) + return 0; // skip test - not applicable - no overloaded fence + // functions for different address spaces + if (MemoryOrder() == MEMORY_ORDER_EMPTY + || MemoryScope() + == MEMORY_SCOPE_EMPTY) // empty 'scope' not required since + // opencl20-openclc-rev15 + return 0; // skip test - not applicable + if ((UseSVM() || gHost) && LocalMemory()) + return 0; // skip test - not applicable for SVM and local memory + struct TestDefinition acqTests[] = { + // {op1IsFence, op1MemOrder, op2IsFence, op2MemOrder} + { false, MEMORY_ORDER_RELEASE, true, MEMORY_ORDER_ACQUIRE }, + { true, MEMORY_ORDER_RELEASE, true, MEMORY_ORDER_ACQUIRE }, + { true, MEMORY_ORDER_ACQ_REL, true, 
MEMORY_ORDER_ACQUIRE } + }; + struct TestDefinition relTests[] = { + { true, MEMORY_ORDER_RELEASE, false, MEMORY_ORDER_ACQUIRE }, + { true, MEMORY_ORDER_RELEASE, true, MEMORY_ORDER_ACQ_REL } + }; + struct TestDefinition arTests[] = { + { false, MEMORY_ORDER_RELEASE, true, MEMORY_ORDER_ACQ_REL }, + { true, MEMORY_ORDER_ACQ_REL, false, MEMORY_ORDER_ACQUIRE }, + { true, MEMORY_ORDER_ACQ_REL, true, MEMORY_ORDER_ACQ_REL } + }; + switch (MemoryOrder()) { - hisId = (hisId + threadCount - 1) % threadCount; - printf("WI %d: atomic value (%d) at index %d is different than non-atomic value (%d)\n", tid, hisAtomicValue, hisId, hisValue); + case MEMORY_ORDER_ACQUIRE: + if (_subCaseId + >= sizeof(acqTests) / sizeof(struct TestDefinition)) + return 0; + _subCase = acqTests[_subCaseId]; + break; + case MEMORY_ORDER_RELEASE: + if (_subCaseId + >= sizeof(relTests) / sizeof(struct TestDefinition)) + return 0; + _subCase = relTests[_subCaseId]; + break; + case MEMORY_ORDER_ACQ_REL: + if (_subCaseId + >= sizeof(arTests) / sizeof(struct TestDefinition)) + return 0; + _subCase = arTests[_subCaseId]; + break; + case MEMORY_ORDER_SEQ_CST: + if (_subCaseId != 0) // one special case only + return 0; + break; + default: return 0; } - } - } - } - virtual bool GenerateRefs(cl_uint threadCount, HostDataType *startRefValues, MTdata d) - { - for(cl_uint i = 0 ; i < threadCount*NumNonAtomicVariablesPerThread(); i++) - startRefValues[i] = 0; - return true; - } - virtual bool VerifyRefs(bool &correct, cl_uint threadCount, HostDataType *refValues, HostAtomicType *finalValues) - { - correct = true; - cl_uint workSize = LocalMemory() ? 
CurrentGroupSize() : threadCount; - for(cl_uint workOffset = 0; workOffset < threadCount; workOffset+= workSize) - { - if(workOffset+workSize > threadCount) - // last workgroup (host threads) - workSize = threadCount-workOffset; - for(cl_uint i = 0 ; i < workSize && workOffset+i < threadCount; i++) - { - HostAtomicType myValue = finalValues[workOffset + i]; - if (MemoryOrder() == MEMORY_ORDER_SEQ_CST) + LocalRefValues(LocalMemory()); + return CBasicTestMemOrderScope< + HostAtomicType, HostDataType>::ExecuteSingleTest(deviceID, context, + queue); + } + virtual std::string ProgramHeader(cl_uint maxNumDestItems) + { + std::string header; + if (gOldAPI) { - HostDataType hisValue = refValues[workOffset + i]; - if (myValue == hisValue) - { - // a draw - both threads should reach final value 1000000 - if (myValue != 1000000) + if (MemoryScope() == MEMORY_SCOPE_EMPTY) { - log_error("ERROR: Invalid reference value #%u (%d instead of 1000000)\n", workOffset + i, myValue); - correct = false; - return true; + header += "#define atomic_work_item_fence(x,y) " + " mem_fence(x)\n"; } - } - else - { - //slower thread (in total order of seq_cst operations) must know last value written by faster thread - HostAtomicType hisRealValue = finalValues[workOffset + workSize - 1 - i]; - HostDataType myValueReadByHim = refValues[workOffset + workSize - 1 - i]; - - // who is the winner? 
- thread with lower private counter value - if (myValue == hisRealValue) // forbidden result - fence doesn't work + else { - log_error("ERROR: Atomic counter values #%u and #%u are the same (%u)\n", workOffset + i, workOffset + workSize - 1 - i, myValue); - log_error("ERROR: Both threads have outdated values read from another thread (%u and %u)\n", hisValue, myValueReadByHim); - correct = false; - return true; + header += "#define atomic_work_item_fence(x,y,z) " + " mem_fence(x)\n"; } - if (myValue > hisRealValue) // I'm slower + } + return header + + CBasicTestMemOrderScope<HostAtomicType, HostDataType>:: + ProgramHeader(maxNumDestItems); + } + virtual std::string ProgramCore() + { + std::ostringstream naValues; + naValues << NumNonAtomicVariablesPerThread(); + std::string program, fenceType, nonAtomic; + if (LocalMemory()) + { + program = " size_t myId = get_local_id(0), hisId = " + "get_local_size(0)-1-myId;\n"; + fenceType = "CLK_LOCAL_MEM_FENCE"; + nonAtomic = "localValues"; + } + else + { + program = " size_t myId = tid, hisId = threadCount-1-tid;\n"; + fenceType = "CLK_GLOBAL_MEM_FENCE"; + nonAtomic = "oldValues"; + } + if (MemoryOrder() == MEMORY_ORDER_SEQ_CST) + { + // All threads are divided into pairs. 
+ // Each thread has its own atomic variable and performs the + // following actions: + // - increments its own variable + // - performs fence operation to propagate its value and to see + // value from other thread + // - reads value from other thread's variable + // - repeats the above steps when both values are the same (and less + // than 1000000) + // - stores the last value read from other thread (in additional + // variable) At the end of execution at least one thread should know + // the last value from other thread + program += std::string("") + " " + DataType().RegularTypeName() + + " myValue = 0, hisValue; \n" + " do {\n" + " myValue++;\n" + " atomic_store_explicit(&destMemory[myId], myValue, " + "memory_order_relaxed" + + MemoryScopeStr() + + ");\n" + " atomic_work_item_fence(" + + fenceType + ", memory_order_seq_cst" + MemoryScopeStr() + + "); \n" + " hisValue = atomic_load_explicit(&destMemory[hisId], " + "memory_order_relaxed" + + MemoryScopeStr() + + ");\n" + " } while(myValue == hisValue && myValue < 1000000);\n" + " " + + nonAtomic + "[myId] = hisValue; \n"; + } + else + { + // Each thread modifies one of its non-atomic variables, increments + // value of its atomic variable and reads values from another thread + // in typical synchronizes-with scenario with: + // - non-atomic variable (at index A) modification (value change + // from 0 to A) + // - release operation (additional fence or within atomic) + atomic + // variable modification (value A) + // - atomic variable read (value B) + acquire operation (additional + // fence or within atomic) + // - non-atomic variable (at index B) read (value C) + // Each thread verifies dependency between atomic and non-atomic + // value read from another thread The following condition must be + // true: B == C + program += std::string("") + " " + DataType().RegularTypeName() + + " myValue = 0, hisAtomicValue, hisValue; \n" + " do {\n" + " myValue++;\n" + " " + + nonAtomic + "[myId*" + naValues.str() + + 
"+myValue] = myValue;\n"; + if (_subCase.op1IsFence) + program += std::string("") + " atomic_work_item_fence(" + + fenceType + ", " + + get_memory_order_type_name(_subCase.op1MemOrder) + + MemoryScopeStr() + + "); \n" + " atomic_store_explicit(&destMemory[myId], myValue, " + "memory_order_relaxed" + + MemoryScopeStr() + ");\n"; + else + program += std::string("") + + " atomic_store_explicit(&destMemory[myId], myValue, " + + get_memory_order_type_name(_subCase.op1MemOrder) + + MemoryScopeStr() + ");\n"; + if (_subCase.op2IsFence) + program += std::string("") + + " hisAtomicValue = " + "atomic_load_explicit(&destMemory[hisId], " + "memory_order_relaxed" + + MemoryScopeStr() + + ");\n" + " atomic_work_item_fence(" + + fenceType + ", " + + get_memory_order_type_name(_subCase.op2MemOrder) + + MemoryScopeStr() + "); \n"; + else + program += std::string("") + + " hisAtomicValue = " + "atomic_load_explicit(&destMemory[hisId], " + + get_memory_order_type_name(_subCase.op2MemOrder) + + MemoryScopeStr() + ");\n"; + program += " hisValue = " + nonAtomic + "[hisId*" + + naValues.str() + "+hisAtomicValue]; \n"; + if (LocalMemory()) + program += " hisId = (hisId+1)%get_local_size(0);\n"; + else + program += " hisId = (hisId+1)%threadCount;\n"; + program += " } while(hisAtomicValue == hisValue && myValue < " + + naValues.str() + + "-1);\n" + " if(hisAtomicValue != hisValue)\n" + " { // fail\n" + " atomic_store(&destMemory[myId], myValue-1);\n"; + if (LocalMemory()) + program += " hisId = " + "(hisId+get_local_size(0)-1)%get_local_size(0);\n"; + else + program += " hisId = (hisId+threadCount-1)%threadCount;\n"; + program += " if(myValue+1 < " + naValues.str() + + ")\n" + " " + + nonAtomic + "[myId*" + naValues.str() + + "+myValue+1] = hisId;\n" + " if(myValue+2 < " + + naValues.str() + + ")\n" + " " + + nonAtomic + "[myId*" + naValues.str() + + "+myValue+2] = hisAtomicValue;\n" + " if(myValue+3 < " + + naValues.str() + + ")\n" + " " + + nonAtomic + "[myId*" + naValues.str() + + 
"+myValue+3] = hisValue;\n"; + if (gDebug) { - if (hisRealValue != hisValue) - { - log_error("ERROR: Invalid reference value #%u (%d instead of %d)\n", workOffset + i, hisValue, hisRealValue); - log_error("ERROR: Slower thread #%u should know value written by faster thread #%u\n", workOffset + i, workOffset + workSize - 1 - i); - correct = false; - return true; - } + program += " printf(\"WI %d: atomic value (%d) at index %d " + "is different than non-atomic value (%d)\\n\", tid, " + "hisAtomicValue, hisId, hisValue);\n"; } - else // I'm faster + program += " }\n"; + } + return program; + } + virtual void HostFunction(cl_uint tid, cl_uint threadCount, + volatile HostAtomicType *destMemory, + HostDataType *oldValues) + { + size_t myId = tid, hisId = threadCount - 1 - tid; + if (MemoryOrder() == MEMORY_ORDER_SEQ_CST) + { + HostDataType myValue = 0, hisValue; + // CPU thread typically starts faster - wait for GPU thread + myValue++; + host_atomic_store<HostAtomicType, HostDataType>( + &destMemory[myId], myValue, MEMORY_ORDER_SEQ_CST); + while (host_atomic_load<HostAtomicType, HostDataType>( + &destMemory[hisId], MEMORY_ORDER_SEQ_CST) + == 0) + ; + do { - if (myValueReadByHim != myValue) - { - log_error("ERROR: Invalid reference value #%u (%d instead of %d)\n", workOffset + workSize - 1 - i, myValueReadByHim, myValue); - log_error("ERROR: Slower thread #%u should know value written by faster thread #%u\n", workOffset + workSize - 1 - i, workOffset + i); - correct = false; - return true; - } - } - } + myValue++; + host_atomic_store<HostAtomicType, HostDataType>( + &destMemory[myId], myValue, MEMORY_ORDER_RELAXED); + host_atomic_thread_fence(MemoryOrder()); + hisValue = host_atomic_load<HostAtomicType, HostDataType>( + &destMemory[hisId], MEMORY_ORDER_RELAXED); + } while (myValue == hisValue && hisValue < 1000000); + oldValues[tid] = hisValue; } else { - if (myValue != NumNonAtomicVariablesPerThread()-1) - { - log_error("ERROR: Invalid atomic value #%u (%d instead of 
%d)\n", workOffset + i, myValue, NumNonAtomicVariablesPerThread()-1); - log_error("ERROR: Thread #%u observed invalid values in other thread's variables\n", workOffset + i, myValue); - correct = false; - return true; - } + HostDataType myValue = 0, hisAtomicValue, hisValue; + do + { + myValue++; + oldValues[myId * NumNonAtomicVariablesPerThread() + myValue] = + myValue; + if (_subCase.op1IsFence) + { + host_atomic_thread_fence(_subCase.op1MemOrder); + host_atomic_store<HostAtomicType, HostDataType>( + &destMemory[myId], myValue, MEMORY_ORDER_RELAXED); + } + else + host_atomic_store<HostAtomicType, HostDataType>( + &destMemory[myId], myValue, _subCase.op1MemOrder); + if (_subCase.op2IsFence) + { + hisAtomicValue = + host_atomic_load<HostAtomicType, HostDataType>( + &destMemory[hisId], MEMORY_ORDER_RELAXED); + host_atomic_thread_fence(_subCase.op2MemOrder); + } + else + hisAtomicValue = + host_atomic_load<HostAtomicType, HostDataType>( + &destMemory[hisId], _subCase.op2MemOrder); + hisValue = oldValues[hisId * NumNonAtomicVariablesPerThread() + + hisAtomicValue]; + hisId = (hisId + 1) % threadCount; + } while (hisAtomicValue == hisValue + && myValue + < (HostDataType)NumNonAtomicVariablesPerThread() - 1); + if (hisAtomicValue != hisValue) + { // fail + host_atomic_store<HostAtomicType, HostDataType>( + &destMemory[myId], myValue - 1, MEMORY_ORDER_SEQ_CST); + if (gDebug) + { + hisId = (hisId + threadCount - 1) % threadCount; + printf("WI %d: atomic value (%d) at index %d is different " + "than non-atomic value (%d)\n", + tid, hisAtomicValue, hisId, hisValue); + } + } + } + } + virtual bool GenerateRefs(cl_uint threadCount, HostDataType *startRefValues, + MTdata d) + { + for (cl_uint i = 0; i < threadCount * NumNonAtomicVariablesPerThread(); + i++) + startRefValues[i] = 0; + return true; + } + virtual bool VerifyRefs(bool &correct, cl_uint threadCount, + HostDataType *refValues, + HostAtomicType *finalValues) + { + correct = true; + cl_uint workSize = LocalMemory() ? 
CurrentGroupSize() : threadCount; + for (cl_uint workOffset = 0; workOffset < threadCount; + workOffset += workSize) + { + if (workOffset + workSize > threadCount) + // last workgroup (host threads) + workSize = threadCount - workOffset; + for (cl_uint i = 0; i < workSize && workOffset + i < threadCount; + i++) + { + HostAtomicType myValue = finalValues[workOffset + i]; + if (MemoryOrder() == MEMORY_ORDER_SEQ_CST) + { + HostDataType hisValue = refValues[workOffset + i]; + if (myValue == hisValue) + { + // a draw - both threads should reach final value + // 1000000 + if (myValue != 1000000) + { + log_error("ERROR: Invalid reference value #%u (%d " + "instead of 1000000)\n", + workOffset + i, myValue); + correct = false; + return true; + } + } + else + { + // slower thread (in total order of seq_cst operations) + // must know last value written by faster thread + HostAtomicType hisRealValue = + finalValues[workOffset + workSize - 1 - i]; + HostDataType myValueReadByHim = + refValues[workOffset + workSize - 1 - i]; + + // who is the winner? 
- thread with lower private + // counter value + if (myValue == hisRealValue) // forbidden result - fence + // doesn't work + { + log_error("ERROR: Atomic counter values #%u and " + "#%u are the same (%u)\n", + workOffset + i, + workOffset + workSize - 1 - i, myValue); + log_error( + "ERROR: Both threads have outdated values read " + "from another thread (%u and %u)\n", + hisValue, myValueReadByHim); + correct = false; + return true; + } + if (myValue > hisRealValue) // I'm slower + { + if (hisRealValue != hisValue) + { + log_error("ERROR: Invalid reference value #%u " + "(%d instead of %d)\n", + workOffset + i, hisValue, + hisRealValue); + log_error( + "ERROR: Slower thread #%u should know " + "value written by faster thread #%u\n", + workOffset + i, + workOffset + workSize - 1 - i); + correct = false; + return true; + } + } + else // I'm faster + { + if (myValueReadByHim != myValue) + { + log_error("ERROR: Invalid reference value #%u " + "(%d instead of %d)\n", + workOffset + workSize - 1 - i, + myValueReadByHim, myValue); + log_error( + "ERROR: Slower thread #%u should know " + "value written by faster thread #%u\n", + workOffset + workSize - 1 - i, + workOffset + i); + correct = false; + return true; + } + } + } + } + else + { + if (myValue != NumNonAtomicVariablesPerThread() - 1) + { + log_error("ERROR: Invalid atomic value #%u (%d instead " + "of %d)\n", + workOffset + i, myValue, + NumNonAtomicVariablesPerThread() - 1); + log_error("ERROR: Thread #%u observed invalid values " + "in other thread's variables\n", + workOffset + i, myValue); + correct = false; + return true; + } + } + } } - } + return true; } - return true; - } + private: - int _subCaseId; - struct TestDefinition _subCase; + int _subCaseId; + struct TestDefinition _subCase; }; -int test_atomic_fence_generic(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements, bool useSVM) +int test_atomic_fence_generic(cl_device_id deviceID, cl_context context, + cl_command_queue 
queue, int num_elements, + bool useSVM) { - int error = 0; - CBasicTestFence<HOST_ATOMIC_INT, HOST_INT> test_int(TYPE_ATOMIC_INT, useSVM); - EXECUTE_TEST(error, test_int.Execute(deviceID, context, queue, num_elements)); - CBasicTestFence<HOST_ATOMIC_UINT, HOST_UINT> test_uint(TYPE_ATOMIC_UINT, useSVM); - EXECUTE_TEST(error, test_uint.Execute(deviceID, context, queue, num_elements)); - CBasicTestFence<HOST_ATOMIC_LONG, HOST_LONG> test_long(TYPE_ATOMIC_LONG, useSVM); - EXECUTE_TEST(error, test_long.Execute(deviceID, context, queue, num_elements)); - CBasicTestFence<HOST_ATOMIC_ULONG, HOST_ULONG> test_ulong(TYPE_ATOMIC_ULONG, useSVM); - EXECUTE_TEST(error, test_ulong.Execute(deviceID, context, queue, num_elements)); - if (AtomicTypeInfo(TYPE_ATOMIC_SIZE_T).Size(deviceID) == 4) - { - CBasicTestFence<HOST_ATOMIC_INTPTR_T32, HOST_INTPTR_T32> test_intptr_t(TYPE_ATOMIC_INTPTR_T, useSVM); - EXECUTE_TEST(error, test_intptr_t.Execute(deviceID, context, queue, num_elements)); - CBasicTestFence<HOST_ATOMIC_UINTPTR_T32, HOST_UINTPTR_T32> test_uintptr_t(TYPE_ATOMIC_UINTPTR_T, useSVM); - EXECUTE_TEST(error, test_uintptr_t.Execute(deviceID, context, queue, num_elements)); - CBasicTestFence<HOST_ATOMIC_SIZE_T32, HOST_SIZE_T32> test_size_t(TYPE_ATOMIC_SIZE_T, useSVM); - EXECUTE_TEST(error, test_size_t.Execute(deviceID, context, queue, num_elements)); - CBasicTestFence<HOST_ATOMIC_PTRDIFF_T32, HOST_PTRDIFF_T32> test_ptrdiff_t(TYPE_ATOMIC_PTRDIFF_T, useSVM); - EXECUTE_TEST(error, test_ptrdiff_t.Execute(deviceID, context, queue, num_elements)); - } - else - { - CBasicTestFence<HOST_ATOMIC_INTPTR_T64, HOST_INTPTR_T64> test_intptr_t(TYPE_ATOMIC_INTPTR_T, useSVM); - EXECUTE_TEST(error, test_intptr_t.Execute(deviceID, context, queue, num_elements)); - CBasicTestFence<HOST_ATOMIC_UINTPTR_T64, HOST_UINTPTR_T64> test_uintptr_t(TYPE_ATOMIC_UINTPTR_T, useSVM); - EXECUTE_TEST(error, test_uintptr_t.Execute(deviceID, context, queue, num_elements)); - CBasicTestFence<HOST_ATOMIC_SIZE_T64, 
HOST_SIZE_T64> test_size_t(TYPE_ATOMIC_SIZE_T, useSVM); - EXECUTE_TEST(error, test_size_t.Execute(deviceID, context, queue, num_elements)); - CBasicTestFence<HOST_ATOMIC_PTRDIFF_T64, HOST_PTRDIFF_T64> test_ptrdiff_t(TYPE_ATOMIC_PTRDIFF_T, useSVM); - EXECUTE_TEST(error, test_ptrdiff_t.Execute(deviceID, context, queue, num_elements)); - } - return error; + int error = 0; + CBasicTestFence<HOST_ATOMIC_INT, HOST_INT> test_int(TYPE_ATOMIC_INT, + useSVM); + EXECUTE_TEST(error, + test_int.Execute(deviceID, context, queue, num_elements)); + CBasicTestFence<HOST_ATOMIC_UINT, HOST_UINT> test_uint(TYPE_ATOMIC_UINT, + useSVM); + EXECUTE_TEST(error, + test_uint.Execute(deviceID, context, queue, num_elements)); + CBasicTestFence<HOST_ATOMIC_LONG, HOST_LONG> test_long(TYPE_ATOMIC_LONG, + useSVM); + EXECUTE_TEST(error, + test_long.Execute(deviceID, context, queue, num_elements)); + CBasicTestFence<HOST_ATOMIC_ULONG, HOST_ULONG> test_ulong(TYPE_ATOMIC_ULONG, + useSVM); + EXECUTE_TEST(error, + test_ulong.Execute(deviceID, context, queue, num_elements)); + if (AtomicTypeInfo(TYPE_ATOMIC_SIZE_T).Size(deviceID) == 4) + { + CBasicTestFence<HOST_ATOMIC_INTPTR_T32, HOST_INTPTR_T32> test_intptr_t( + TYPE_ATOMIC_INTPTR_T, useSVM); + EXECUTE_TEST( + error, + test_intptr_t.Execute(deviceID, context, queue, num_elements)); + CBasicTestFence<HOST_ATOMIC_UINTPTR_T32, HOST_UINTPTR_T32> + test_uintptr_t(TYPE_ATOMIC_UINTPTR_T, useSVM); + EXECUTE_TEST( + error, + test_uintptr_t.Execute(deviceID, context, queue, num_elements)); + CBasicTestFence<HOST_ATOMIC_SIZE_T32, HOST_SIZE_T32> test_size_t( + TYPE_ATOMIC_SIZE_T, useSVM); + EXECUTE_TEST( + error, test_size_t.Execute(deviceID, context, queue, num_elements)); + CBasicTestFence<HOST_ATOMIC_PTRDIFF_T32, HOST_PTRDIFF_T32> + test_ptrdiff_t(TYPE_ATOMIC_PTRDIFF_T, useSVM); + EXECUTE_TEST( + error, + test_ptrdiff_t.Execute(deviceID, context, queue, num_elements)); + } + else + { + CBasicTestFence<HOST_ATOMIC_INTPTR_T64, HOST_INTPTR_T64> test_intptr_t( + 
TYPE_ATOMIC_INTPTR_T, useSVM); + EXECUTE_TEST( + error, + test_intptr_t.Execute(deviceID, context, queue, num_elements)); + CBasicTestFence<HOST_ATOMIC_UINTPTR_T64, HOST_UINTPTR_T64> + test_uintptr_t(TYPE_ATOMIC_UINTPTR_T, useSVM); + EXECUTE_TEST( + error, + test_uintptr_t.Execute(deviceID, context, queue, num_elements)); + CBasicTestFence<HOST_ATOMIC_SIZE_T64, HOST_SIZE_T64> test_size_t( + TYPE_ATOMIC_SIZE_T, useSVM); + EXECUTE_TEST( + error, test_size_t.Execute(deviceID, context, queue, num_elements)); + CBasicTestFence<HOST_ATOMIC_PTRDIFF_T64, HOST_PTRDIFF_T64> + test_ptrdiff_t(TYPE_ATOMIC_PTRDIFF_T, useSVM); + EXECUTE_TEST( + error, + test_ptrdiff_t.Execute(deviceID, context, queue, num_elements)); + } + return error; } -int test_atomic_fence(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements) +int test_atomic_fence(cl_device_id deviceID, cl_context context, + cl_command_queue queue, int num_elements) { - return test_atomic_fence_generic(deviceID, context, queue, num_elements, false); + return test_atomic_fence_generic(deviceID, context, queue, num_elements, + false); } -int test_svm_atomic_fence(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements) +int test_svm_atomic_fence(cl_device_id deviceID, cl_context context, + cl_command_queue queue, int num_elements) { - return test_atomic_fence_generic(deviceID, context, queue, num_elements, true); + return test_atomic_fence_generic(deviceID, context, queue, num_elements, + true); } diff --git a/test_conformance/commonfns/test_sign.cpp b/test_conformance/commonfns/test_sign.cpp index 1b842e35..6dba58da 100644 --- a/test_conformance/commonfns/test_sign.cpp +++ b/test_conformance/commonfns/test_sign.cpp @@ -223,14 +223,13 @@ test_sign(cl_device_id device, cl_context context, cl_command_queue queue, int n free(input_ptr[0]); free(output_ptr); - if(err) - return err; + if (err) return err; - if( ! 
is_extension_available( device, "cl_khr_fp64")) - { - log_info( "skipping double test -- cl_khr_fp64 not supported.\n" ); - return 0; - } + if (!is_extension_available(device, "cl_khr_fp64")) + { + log_info("skipping double test -- cl_khr_fp64 not supported.\n"); + return 0; + } return test_sign_double( device, context, queue, n_elems); } diff --git a/test_conformance/commonfns/test_step.cpp b/test_conformance/commonfns/test_step.cpp index 0e3cfe07..330083b2 100644 --- a/test_conformance/commonfns/test_step.cpp +++ b/test_conformance/commonfns/test_step.cpp @@ -158,23 +158,20 @@ test_step(cl_device_id device, cl_context context, cl_command_queue queue, int n } err = create_single_kernel_helper( context, &program[0], &kernel[0], 1, &step_kernel_code, "test_step" ); - if (err) - return -1; + if (err) return -1; err = create_single_kernel_helper( context, &program[1], &kernel[1], 1, &step2_kernel_code, "test_step2" ); - if (err) - return -1; + if (err) return -1; err = create_single_kernel_helper( context, &program[2], &kernel[2], 1, &step4_kernel_code, "test_step4" ); - if (err) - return -1; - err = create_single_kernel_helper( context, &program[3], &kernel[3], 1, &step8_kernel_code, "test_step8" ); - if (err) - return -1; - err = create_single_kernel_helper( context, &program[4], &kernel[4], 1, &step16_kernel_code, "test_step16" ); - if (err) - return -1; - err = create_single_kernel_helper( context, &program[5], &kernel[5], 1, &step3_kernel_code, "test_step3" ); - if (err) - return -1; + if (err) return -1; + err = create_single_kernel_helper(context, &program[3], &kernel[3], 1, + &step8_kernel_code, "test_step8"); + if (err) return -1; + err = create_single_kernel_helper(context, &program[4], &kernel[4], 1, + &step16_kernel_code, "test_step16"); + if (err) return -1; + err = create_single_kernel_helper(context, &program[5], &kernel[5], 1, + &step3_kernel_code, "test_step3"); + if (err) return -1; values[0] = streams[0]; values[1] = streams[1]; diff --git 
a/test_conformance/compiler/test_compiler_defines_for_extensions.cpp b/test_conformance/compiler/test_compiler_defines_for_extensions.cpp index 483adac9..b95b0f53 100644 --- a/test_conformance/compiler/test_compiler_defines_for_extensions.cpp +++ b/test_conformance/compiler/test_compiler_defines_for_extensions.cpp @@ -20,7 +20,7 @@ #include <unistd.h> #endif - +// List should follow order in the extension spec const char *known_extensions[] = { "cl_khr_byte_addressable_store", "cl_khr_3d_image_writes", @@ -42,6 +42,7 @@ const char *known_extensions[] = { "cl_khr_mipmap_image_writes", "cl_khr_srgb_image_writes", "cl_khr_subgroup_named_barrier", + "cl_khr_extended_async_copies", "cl_khr_subgroup_extended_types", "cl_khr_subgroup_non_uniform_vote", "cl_khr_subgroup_ballot", @@ -49,7 +50,9 @@ const char *known_extensions[] = { "cl_khr_subgroup_shuffle", "cl_khr_subgroup_shuffle_relative", "cl_khr_subgroup_clustered_reduce", - + "cl_khr_extended_bit_ops", + "cl_khr_integer_dot_product", + "cl_khr_subgroup_rotate", // API-only extensions after this point. If you add above here, modify // first_API_extension below. 
"cl_khr_icd", @@ -71,10 +74,23 @@ const char *known_extensions[] = { "cl_khr_spirv_no_integer_wrap_decoration", "cl_khr_extended_versioning", "cl_khr_device_uuid", + "cl_khr_pci_bus_info", + "cl_khr_suggested_local_work_size", + "cl_khr_spirv_linkonce_odr", + "cl_khr_semaphore", + "cl_khr_external_semaphore", + "cl_khr_external_semaphore_win32", + "cl_khr_external_semaphore_sync_fd", + "cl_khr_external_semaphore_opaque_fd", + "cl_khr_external_memory", + "cl_khr_external_memory_win32", + "cl_khr_external_memory_opaque_fd", + "cl_khr_command_buffer", + "cl_khr_command_buffer_mutable_dispatch", }; -size_t num_known_extensions = sizeof(known_extensions)/sizeof(char*); -size_t first_API_extension = 27; +size_t num_known_extensions = ARRAY_SIZE(known_extensions); +size_t first_API_extension = 31; const char *known_embedded_extensions[] = { "cles_khr_int64", @@ -314,8 +330,15 @@ int test_compiler_defines_for_extensions(cl_device_id device, cl_context context } // Build the kernel - char *kernel_code = (char*)malloc(1025*256*(num_not_supported_extensions+num_of_supported_extensions)); - memset(kernel_code, 0, 1025*256*(num_not_supported_extensions+num_of_supported_extensions)); + char *kernel_code = (char *)malloc( + 1 + + 1025 * 256 + * (num_not_supported_extensions + num_of_supported_extensions)); + memset( + kernel_code, 0, + 1 + + 1025 * 256 + * (num_not_supported_extensions + num_of_supported_extensions)); int i, index = 0; strcat(kernel_code, kernel_strings[0]); @@ -340,8 +363,6 @@ int test_compiler_defines_for_extensions(cl_device_id device, cl_context context clProgramWrapper program; clKernelWrapper kernel; - Version version = get_device_cl_version(device); - error = create_single_kernel_helper(context, &program, &kernel, 1, (const char **)&kernel_code, "test"); test_error(error, "create_single_kernel_helper failed"); diff --git a/test_conformance/compiler/test_feature_macro.cpp b/test_conformance/compiler/test_feature_macro.cpp index ac355dd4..ef3c0028 100644 --- 
a/test_conformance/compiler/test_feature_macro.cpp +++ b/test_conformance/compiler/test_feature_macro.cpp @@ -579,6 +579,78 @@ int test_feature_macro_fp64(cl_device_id deviceID, cl_context context, compiler_status, supported); } +int test_feature_macro_integer_dot_product_input_4x8bit_packed( + cl_device_id deviceID, cl_context context, std::string test_macro_name, + cl_bool& supported) +{ + cl_int error = TEST_FAIL; + cl_bool api_status; + cl_bool compiler_status; + log_info("\n%s ...\n", test_macro_name.c_str()); + + if (!is_extension_available(deviceID, "cl_khr_integer_dot_product")) + { + supported = false; + return TEST_PASS; + } + + error = check_api_feature_info_capabilities< + cl_device_integer_dot_product_capabilities_khr>( + deviceID, context, api_status, + CL_DEVICE_INTEGER_DOT_PRODUCT_CAPABILITIES_KHR, + CL_DEVICE_INTEGER_DOT_PRODUCT_INPUT_4x8BIT_PACKED_KHR); + if (error != CL_SUCCESS) + { + return error; + } + + error = check_compiler_feature_info(deviceID, context, test_macro_name, + compiler_status); + if (error != CL_SUCCESS) + { + return error; + } + + return feature_macro_verify_results(test_macro_name, api_status, + compiler_status, supported); +} + +int test_feature_macro_integer_dot_product_input_4x8bit( + cl_device_id deviceID, cl_context context, std::string test_macro_name, + cl_bool& supported) +{ + cl_int error = TEST_FAIL; + cl_bool api_status; + cl_bool compiler_status; + log_info("\n%s ...\n", test_macro_name.c_str()); + + if (!is_extension_available(deviceID, "cl_khr_integer_dot_product")) + { + supported = false; + return TEST_PASS; + } + + error = check_api_feature_info_capabilities< + cl_device_integer_dot_product_capabilities_khr>( + deviceID, context, api_status, + CL_DEVICE_INTEGER_DOT_PRODUCT_CAPABILITIES_KHR, + CL_DEVICE_INTEGER_DOT_PRODUCT_INPUT_4x8BIT_KHR); + if (error != CL_SUCCESS) + { + return error; + } + + error = check_compiler_feature_info(deviceID, context, test_macro_name, + compiler_status); + if (error != 
CL_SUCCESS) + { + return error; + } + + return feature_macro_verify_results(test_macro_name, api_status, + compiler_status, supported); +} + int test_feature_macro_int64(cl_device_id deviceID, cl_context context, std::string test_macro_name, cl_bool& supported) { @@ -686,15 +758,6 @@ int test_consistency_c_features_list(cl_device_id deviceID, sort(vec_to_cmp.begin(), vec_to_cmp.end()); sort(vec_device_feature_names.begin(), vec_device_feature_names.end()); - if (vec_device_feature_names == vec_to_cmp) - { - log_info("Comparison list of features - passed\n"); - } - else - { - log_info("Comparison list of features - failed\n"); - error = TEST_FAIL; - } log_info( "Supported features based on CL_DEVICE_OPENCL_C_FEATURES API query:\n"); for (auto each_f : vec_device_feature_names) @@ -703,11 +766,26 @@ int test_consistency_c_features_list(cl_device_id deviceID, } log_info("\nSupported features based on queries to API/compiler :\n"); + for (auto each_f : vec_to_cmp) { log_info("%s\n", each_f.c_str()); } + for (auto each_f : vec_to_cmp) + { + if (find(vec_device_feature_names.begin(), + vec_device_feature_names.end(), each_f) + == vec_device_feature_names.end()) + { + log_info("Comparison list of features - failed - missing %s\n", + each_f.c_str()); + return TEST_FAIL; + } + } + + log_info("Comparison list of features - passed\n"); + return error; } @@ -748,6 +826,8 @@ int test_features_macro(cl_device_id deviceID, cl_context context, NEW_FEATURE_MACRO_TEST(images); NEW_FEATURE_MACRO_TEST(fp64); NEW_FEATURE_MACRO_TEST(int64); + NEW_FEATURE_MACRO_TEST(integer_dot_product_input_4x8bit); + NEW_FEATURE_MACRO_TEST(integer_dot_product_input_4x8bit_packed); error |= test_consistency_c_features_list(deviceID, supported_features_vec); diff --git a/test_conformance/computeinfo/CMakeLists.txt b/test_conformance/computeinfo/CMakeLists.txt index 207223a3..06f0599c 100644 --- a/test_conformance/computeinfo/CMakeLists.txt +++ b/test_conformance/computeinfo/CMakeLists.txt @@ -5,6 +5,7 @@ 
set(${MODULE_NAME}_SOURCES device_uuid.cpp extended_versioning.cpp conforming_version.cpp + pci_bus_info.cpp ) include(../CMakeCommon.txt) diff --git a/test_conformance/computeinfo/device_uuid.cpp b/test_conformance/computeinfo/device_uuid.cpp index 1ef9dad2..7f29d0b6 100644 --- a/test_conformance/computeinfo/device_uuid.cpp +++ b/test_conformance/computeinfo/device_uuid.cpp @@ -105,7 +105,7 @@ int test_device_uuid(cl_device_id deviceID, cl_context context, if (!is_extension_available(deviceID, "cl_khr_device_uuid")) { log_info("cl_khr_device_uuid not supported. Skipping test...\n"); - return 0; + return TEST_SKIPPED_ITSELF; } int total_errors = 0; diff --git a/test_conformance/computeinfo/main.cpp b/test_conformance/computeinfo/main.cpp index 4860b445..382cd6a3 100644 --- a/test_conformance/computeinfo/main.cpp +++ b/test_conformance/computeinfo/main.cpp @@ -95,8 +95,8 @@ typedef struct _version version_t; struct _extensions { - int cl_khr_fp64; - int cl_khr_fp16; + int has_cl_khr_fp64; + int has_cl_khr_fp16; }; typedef struct _extensions extensions_t; @@ -908,12 +908,6 @@ void dumpConfigInfo(config_info* info) { cl_name_version new_version_item = info->config.cl_name_version_array[f]; - cl_version new_version_major = - CL_VERSION_MAJOR_KHR(new_version_item.version); - cl_version new_version_minor = - CL_VERSION_MINOR_KHR(new_version_item.version); - cl_version new_version_patch = - CL_VERSION_PATCH_KHR(new_version_item.version); log_info("\t\t\"%s\" %d.%d.%d\n", new_version_item.name, CL_VERSION_MAJOR_KHR(new_version_item.version), CL_VERSION_MINOR_KHR(new_version_item.version), @@ -1069,11 +1063,11 @@ int parseExtensions(char const* str, extensions_t* extensions) } if (strncmp(begin, "cl_khr_fp64", length) == 0) { - extensions->cl_khr_fp64 = 1; + extensions->has_cl_khr_fp64 = 1; } if (strncmp(begin, "cl_khr_fp16", length) == 0) { - extensions->cl_khr_fp16 = 1; + extensions->has_cl_khr_fp16 = 1; } begin += length; // Skip word. 
if (begin[0] == ' ') @@ -1112,13 +1106,13 @@ int getConfigInfos(cl_device_id device) // version 1.1, we have to check doubles are sopported. In // OpenCL 1.2 CL_DEVICE_DOUBLE_FP_CONFIG should be reported // unconditionally. - get = extensions.cl_khr_fp64; + get = extensions.has_cl_khr_fp64; }; if (info.opcode == CL_DEVICE_HALF_FP_CONFIG) { // CL_DEVICE_HALF_FP_CONFIG should be reported only when cl_khr_fp16 // extension is available - get = extensions.cl_khr_fp16; + get = extensions.has_cl_khr_fp16; }; if (get) { @@ -1421,15 +1415,16 @@ int test_computeinfo(cl_device_id deviceID, cl_context context, extern int test_extended_versioning(cl_device_id, cl_context, cl_command_queue, int); extern int test_device_uuid(cl_device_id, cl_context, cl_command_queue, int); - extern int test_conformance_version(cl_device_id, cl_context, cl_command_queue, int); +extern int test_pci_bus_info(cl_device_id, cl_context, cl_command_queue, int); test_definition test_list[] = { ADD_TEST(computeinfo), ADD_TEST(extended_versioning), ADD_TEST(device_uuid), ADD_TEST_VERSION(conformance_version, Version(3, 0)), + ADD_TEST(pci_bus_info), }; const int test_num = ARRAY_SIZE(test_list); diff --git a/test_conformance/computeinfo/pci_bus_info.cpp b/test_conformance/computeinfo/pci_bus_info.cpp new file mode 100644 index 00000000..cd62ca05 --- /dev/null +++ b/test_conformance/computeinfo/pci_bus_info.cpp @@ -0,0 +1,53 @@ +// +// Copyright (c) 2021 The Khronos Group Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. +// +#include "harness/compat.h" + +#include <array> +#include <bitset> + +#include "harness/testHarness.h" +#include "harness/deviceInfo.h" + +int test_pci_bus_info(cl_device_id deviceID, cl_context context, + cl_command_queue ignoreQueue, int num_elements) +{ + if (!is_extension_available(deviceID, "cl_khr_pci_bus_info")) + { + log_info("cl_khr_pci_bus_info not supported. Skipping test...\n"); + return TEST_SKIPPED_ITSELF; + } + + cl_int error; + + cl_device_pci_bus_info_khr info; + + size_t size_ret; + error = clGetDeviceInfo(deviceID, CL_DEVICE_PCI_BUS_INFO_KHR, 0, NULL, + &size_ret); + test_error(error, "Unable to query CL_DEVICE_PCI_BUS_INFO_KHR size"); + test_assert_error( + size_ret == sizeof(info), + "Query for CL_DEVICE_PCI_BUS_INFO_KHR returned an unexpected size"); + + error = clGetDeviceInfo(deviceID, CL_DEVICE_PCI_BUS_INFO_KHR, sizeof(info), + &info, NULL); + test_error(error, "Unable to query CL_DEVICE_PCI_BUS_INFO_KHR"); + + log_info("\tPCI Bus Info: %04x:%02x:%02x.%x\n", info.pci_domain, + info.pci_bus, info.pci_device, info.pci_function); + + return TEST_PASS; +} diff --git a/test_conformance/contractions/contractions.cpp b/test_conformance/contractions/contractions.cpp index dddebb40..474fd364 100644 --- a/test_conformance/contractions/contractions.cpp +++ b/test_conformance/contractions/contractions.cpp @@ -1,6 +1,6 @@ // // Copyright (c) 2017 The Khronos Group Inc. -// +// // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
// You may obtain a copy of the License at @@ -434,7 +434,6 @@ static int ParseArgs( int argc, const char **argv ) gArgCount++; } } - vlog( "\n\nTest binary built %s %s\n", __DATE__, __TIME__ ); PrintArch(); diff --git a/test_conformance/conversions/basic_test_conversions.cpp b/test_conformance/conversions/basic_test_conversions.cpp index 32998841..3ee072da 100644 --- a/test_conformance/conversions/basic_test_conversions.cpp +++ b/test_conformance/conversions/basic_test_conversions.cpp @@ -696,7 +696,8 @@ static void int2short( void *out, void *in){ ((cl_short*) out)[0] = ((cl_int*) i static void int2uint( void *out, void *in){ ((cl_uint*) out)[0] = ((cl_int*) in)[0]; } static void int2float( void *out, void *in) { - cl_int l = ((cl_int*) in)[0]; + // Use volatile to prevent optimization by Clang compiler + volatile cl_int l = ((cl_int *)in)[0]; ((float*) out)[0] = (l == 0 ? 0.0f : (float) l); // Per IEEE-754-2008 5.4.1, 0's always convert to +0.0 } static void int2double( void *out, void *in) diff --git a/test_conformance/conversions/fplib.cpp b/test_conformance/conversions/fplib.cpp index e739b9ae..3b19b56d 100644 --- a/test_conformance/conversions/fplib.cpp +++ b/test_conformance/conversions/fplib.cpp @@ -79,7 +79,6 @@ float qcom_s64_2_f32(int64_t data, bool sat, roundingMode rnd) uint32_t mantissa; if (mantShift >= 0){ uint64_t temp = (uint64_t)data >> mantShift; - uint64_t mask = (1 << mantShift) - 1; if ((temp << mantShift) != data) inExact = 1; mantissa = (uint32_t)temp; @@ -124,7 +123,6 @@ float qcom_s64_2_f32(int64_t data, bool sat, roundingMode rnd) uint32_t mantissa; if (mantShift >= 0){ uint64_t temp = (uint64_t)data >> mantShift; - uint64_t mask = (1 << mantShift) - 1; if (temp << mantShift != data) inExact = 1; mantissa = (uint32_t)temp; @@ -183,7 +181,6 @@ float qcom_u64_2_f32(uint64_t data, bool sat, roundingMode rnd) uint32_t mantissa; if (mantShift >= 0){ uint64_t temp = data >> mantShift; - uint64_t mask = (1 << mantShift) - 1; if (temp << 
mantShift != data) inExact = 1; mantissa = (uint32_t)temp; @@ -209,7 +206,6 @@ float qcom_u64_2_f32(uint64_t data, bool sat, roundingMode rnd) uint32_t mantissa; if (mantShift >= 0){ uint64_t temp = (uint64_t)data >> mantShift; - uint64_t mask = (1 << mantShift) - 1; if (temp << mantShift != data) inExact = 1; mantissa = (uint32_t)temp; diff --git a/test_conformance/conversions/test_conversions.cpp b/test_conformance/conversions/test_conversions.cpp index 87b8ead7..2b18b925 100644 --- a/test_conformance/conversions/test_conversions.cpp +++ b/test_conformance/conversions/test_conversions.cpp @@ -1,6 +1,6 @@ // // Copyright (c) 2017 The Khronos Group Inc. -// +// // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at @@ -38,6 +38,7 @@ #include <sys/param.h> #endif +#include <sstream> #include <stdarg.h> #include <stdio.h> #include <string.h> @@ -47,6 +48,8 @@ #endif #include <time.h> +#include <algorithm> + #include "Sleep.h" #include "basic_test_conversions.h" @@ -340,7 +343,7 @@ int main (int argc, const char **argv ) static int ParseArgs( int argc, const char **argv ) { int i; - argList = (const char **)calloc( argc - 1, sizeof( char*) ); + argList = (const char **)calloc(argc, sizeof(char *)); argCount = 0; if( NULL == argList && argc > 1 ) @@ -481,8 +484,6 @@ static int ParseArgs( int argc, const char **argv ) vlog( "\n" ); - vlog( "Test binary built %s %s\n", __DATE__, __TIME__ ); - PrintArch(); if( gWimpyMode ) @@ -1003,7 +1004,8 @@ static int DoTest( cl_device_id device, Type outType, Type inType, SaturationMod uint64_t i; gTestCount++; - size_t blockCount = BUFFER_SIZE / MAX( gTypeSizes[ inType ], gTypeSizes[ outType ] ); + size_t blockCount = + BUFFER_SIZE / std::max(gTypeSizes[inType], gTypeSizes[outType]); size_t step = blockCount; uint64_t lastCase = 1ULL << (8*gTypeSizes[ inType ]); cl_event writeInputBuffer = NULL; @@ -1078,7 
+1080,7 @@ static int DoTest( cl_device_id device, Type outType, Type inType, SaturationMod fflush(stdout); } - cl_uint count = (uint32_t) MIN( blockCount, lastCase - i ); + cl_uint count = (uint32_t)std::min((uint64_t)blockCount, lastCase - i); writeInputBufferInfo.count = count; // Crate a user event to represent the status of the reference value computation completion @@ -1556,84 +1558,40 @@ static cl_program MakeProgram( Type outType, Type inType, SaturationMode sat, cl_program program; char testName[256]; int error = 0; - const char **strings; - size_t stringCount = 0; + + std::ostringstream source; + if (outType == kdouble || inType == kdouble) + source << "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n"; // Create the program. This is a bit complicated because we are trying to avoid byte and short stores. if (0 == vectorSize) { + // Create the type names. char inName[32]; char outName[32]; - const char *programSource[] = - { - "", // optional pragma - "__kernel void ", testName, "( __global ", inName, " *src, __global ", outName, " *dest )\n" - "{\n" - " size_t i = get_global_id(0);\n" - " dest[i] = src[i];\n" - "}\n" - }; - stringCount = sizeof(programSource) / sizeof(programSource[0]); - strings = programSource; - - if (outType == kdouble || inType == kdouble) - programSource[0] = "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n"; - - //create the type name strncpy(inName, gTypeNames[inType], sizeof(inName)); strncpy(outName, gTypeNames[outType], sizeof(outName)); sprintf(testName, "test_implicit_%s_%s", outName, inName); - vlog("Building implicit %s -> %s conversion test\n", gTypeNames[inType], gTypeNames[outType]); + + source << "__kernel void " << testName << "( __global " << inName + << " *src, __global " << outName << " *dest )\n"; + source << "{\n"; + source << " size_t i = get_global_id(0);\n"; + source << " dest[i] = src[i];\n"; + source << "}\n"; + + vlog("Building implicit %s -> %s conversion test\n", gTypeNames[inType], + gTypeNames[outType]); 
fflush(stdout); } else { int vectorSizetmp = vectorSizes[vectorSize]; + // Create the type names. char convertString[128]; char inName[32]; char outName[32]; - const char *programSource[] = - { - "", // optional pragma - "__kernel void ", testName, "( __global ", inName, " *src, __global ", outName, " *dest )\n" - "{\n" - " size_t i = get_global_id(0);\n" - " dest[i] = ", convertString, "( src[i] );\n" - "}\n" - }; - const char *programSourceV3[] = - { - "", // optional pragma - "__kernel void ", testName, "( __global ", inName, " *src, __global ", outName, " *dest )\n" - "{\n" - " size_t i = get_global_id(0);\n" - " if( i + 1 < get_global_size(0))\n" - " vstore3( ", convertString, "( vload3( i, src)), i, dest );\n" - " else\n" - " {\n" - " ", inName, "3 in;\n" - " ", outName, "3 out;\n" - " if( 0 == (i & 1) )\n" - " in.y = src[3*i+1];\n" - " in.x = src[3*i];\n" - " out = ", convertString, "( in ); \n" - " dest[3*i] = out.x;\n" - " if( 0 == (i & 1) )\n" - " dest[3*i+1] = out.y;\n" - " }\n" - "}\n" - }; - stringCount = 3 == vectorSizetmp ? sizeof(programSourceV3) / sizeof(programSourceV3[0]) : - sizeof(programSource) / sizeof(programSource[0]); - strings = 3 == vectorSizetmp ? 
programSourceV3 : programSource; - - if (outType == kdouble || inType == kdouble) { - programSource[0] = "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n"; - programSourceV3[0] = "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n"; - } - - //create the type name switch (vectorSizetmp) { case 1: @@ -1658,8 +1616,40 @@ static cl_program MakeProgram( Type outType, Type inType, SaturationMode sat, vlog("Building %s( %s ) test\n", convertString, inName); break; } - fflush(stdout); + + if (vectorSizetmp == 3) + { + source << "__kernel void " << testName << "( __global " << inName + << " *src, __global " << outName << " *dest )\n"; + source << "{\n"; + source << " size_t i = get_global_id(0);\n"; + source << " if( i + 1 < get_global_size(0))\n"; + source << " vstore3( " << convertString + << "( vload3( i, src)), i, dest );\n"; + source << " else\n"; + source << " {\n"; + source << " " << inName << "3 in;\n"; + source << " " << outName << "3 out;\n"; + source << " if( 0 == (i & 1) )\n"; + source << " in.y = src[3*i+1];\n"; + source << " in.x = src[3*i];\n"; + source << " out = " << convertString << "( in ); \n"; + source << " dest[3*i] = out.x;\n"; + source << " if( 0 == (i & 1) )\n"; + source << " dest[3*i+1] = out.y;\n"; + source << " }\n"; + source << "}\n"; + } + else + { + source << "__kernel void " << testName << "( __global " << inName + << " *src, __global " << outName << " *dest )\n"; + source << "{\n"; + source << " size_t i = get_global_id(0);\n"; + source << " dest[i] = " << convertString << "( src[i] );\n"; + source << "}\n"; + } } *outKernel = NULL; @@ -1668,11 +1658,12 @@ static cl_program MakeProgram( Type outType, Type inType, SaturationMode sat, flags = "-cl-denorms-are-zero"; // build it - error = create_single_kernel_helper(gContext, &program, outKernel, (cl_uint)stringCount, strings, testName, flags); + std::string sourceString = source.str(); + const char *programSource = sourceString.c_str(); + error = create_single_kernel_helper(gContext, &program, 
outKernel, 1, + &programSource, testName, flags); if (error) { - char buffer[2048] = ""; - vlog_error("Failed to build kernel/program.\n", error); clReleaseProgram(program); return NULL; diff --git a/test_conformance/device_execution/enqueue_ndrange.cpp b/test_conformance/device_execution/enqueue_ndrange.cpp index 8ced6629..f228f063 100644 --- a/test_conformance/device_execution/enqueue_ndrange.cpp +++ b/test_conformance/device_execution/enqueue_ndrange.cpp @@ -18,6 +18,7 @@ #include "harness/testHarness.h" #include "harness/typeWrappers.h" +#include <algorithm> #include <vector> #include "procs.h" @@ -645,7 +646,7 @@ int test_enqueue_ndrange(cl_device_id device, cl_context context, cl_command_que max_local_size = (max_local_size > MAX_GWS)? MAX_GWS: max_local_size; if(gWimpyMode) { - max_local_size = MIN(8, max_local_size); + max_local_size = std::min((size_t)8, max_local_size); } cl_uint num = 10; diff --git a/test_conformance/device_execution/host_queue_order.cpp b/test_conformance/device_execution/host_queue_order.cpp index 2b5688d1..5376ea40 100644 --- a/test_conformance/device_execution/host_queue_order.cpp +++ b/test_conformance/device_execution/host_queue_order.cpp @@ -18,6 +18,7 @@ #include "harness/testHarness.h" #include "harness/typeWrappers.h" +#include <algorithm> #include <vector> #include "procs.h" @@ -124,7 +125,7 @@ int test_host_queue_order(cl_device_id device, cl_context context, cl_command_qu cl_uint num = arr_size(result); if( gWimpyMode ) { - num = MAX(num / 16, 4); + num = std::max(num / 16, 4U); } clMemWrapper res_mem; diff --git a/test_conformance/events/action_classes.cpp b/test_conformance/events/action_classes.cpp index d70d76bd..a84be6b6 100644 --- a/test_conformance/events/action_classes.cpp +++ b/test_conformance/events/action_classes.cpp @@ -1,6 +1,6 @@ // // Copyright (c) 2017 The Khronos Group Inc. 
-// +// // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at @@ -19,7 +19,8 @@ const cl_uint BufferSizeReductionFactor = 20; -cl_int Action::IGetPreferredImageSize2D( cl_device_id device, size_t &outWidth, size_t &outHeight ) +cl_int Action::IGetPreferredImageSize2D(cl_device_id device, size_t &outWidth, + size_t &outHeight) { cl_ulong maxAllocSize; size_t maxWidth, maxHeight; @@ -27,23 +28,27 @@ cl_int Action::IGetPreferredImageSize2D( cl_device_id device, size_t &outWidt // Get the largest possible buffer we could allocate - error = clGetDeviceInfo( device, CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof( maxAllocSize ), &maxAllocSize, NULL ); - error |= clGetDeviceInfo( device, CL_DEVICE_IMAGE2D_MAX_WIDTH, sizeof( maxWidth ), &maxWidth, NULL ); - error |= clGetDeviceInfo( device, CL_DEVICE_IMAGE2D_MAX_HEIGHT, sizeof( maxHeight ), &maxHeight, NULL ); - test_error( error, "Unable to get device config" ); + error = clGetDeviceInfo(device, CL_DEVICE_MAX_MEM_ALLOC_SIZE, + sizeof(maxAllocSize), &maxAllocSize, NULL); + error |= clGetDeviceInfo(device, CL_DEVICE_IMAGE2D_MAX_WIDTH, + sizeof(maxWidth), &maxWidth, NULL); + error |= clGetDeviceInfo(device, CL_DEVICE_IMAGE2D_MAX_HEIGHT, + sizeof(maxHeight), &maxHeight, NULL); + test_error(error, "Unable to get device config"); // Create something of a decent size - if( maxWidth * maxHeight * 4 > maxAllocSize / BufferSizeReductionFactor ) + if (maxWidth * maxHeight * 4 > maxAllocSize / BufferSizeReductionFactor) { - float rootSize = sqrtf( (float)( maxAllocSize / ( BufferSizeReductionFactor * 4 ) ) ); + float rootSize = + sqrtf((float)(maxAllocSize / (BufferSizeReductionFactor * 4))); - if( (size_t)rootSize > maxWidth ) + if ((size_t)rootSize > maxWidth) outWidth = maxWidth; else outWidth = (size_t)rootSize; - outHeight = (size_t)( ( maxAllocSize / ( BufferSizeReductionFactor * 4 ) ) / outWidth ); - if( outHeight 
> maxHeight ) - outHeight = maxHeight; + outHeight = (size_t)((maxAllocSize / (BufferSizeReductionFactor * 4)) + / outWidth); + if (outHeight > maxHeight) outHeight = maxHeight; } else { @@ -51,19 +56,18 @@ cl_int Action::IGetPreferredImageSize2D( cl_device_id device, size_t &outWidt outHeight = maxHeight; } - outWidth /=2; - outHeight /=2; + outWidth /= 2; + outHeight /= 2; - if (outWidth > 2048) - outWidth = 2048; - if (outHeight > 2048) - outHeight = 2048; + if (outWidth > 2048) outWidth = 2048; + if (outHeight > 2048) outHeight = 2048; log_info("\tImage size: %d x %d (%gMB)\n", (int)outWidth, (int)outHeight, - (double)((int)outWidth*(int)outHeight*4)/(1024.0*1024.0)); + (double)((int)outWidth * (int)outHeight * 4) / (1024.0 * 1024.0)); return CL_SUCCESS; } -cl_int Action::IGetPreferredImageSize3D( cl_device_id device, size_t &outWidth, size_t &outHeight, size_t &outDepth ) +cl_int Action::IGetPreferredImageSize3D(cl_device_id device, size_t &outWidth, + size_t &outHeight, size_t &outDepth) { cl_ulong maxAllocSize; size_t maxWidth, maxHeight, maxDepth; @@ -71,28 +75,34 @@ cl_int Action::IGetPreferredImageSize3D( cl_device_id device, size_t &outWidt // Get the largest possible buffer we could allocate - error = clGetDeviceInfo( device, CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof( maxAllocSize ), &maxAllocSize, NULL ); - error |= clGetDeviceInfo( device, CL_DEVICE_IMAGE3D_MAX_WIDTH, sizeof( maxWidth ), &maxWidth, NULL ); - error |= clGetDeviceInfo( device, CL_DEVICE_IMAGE3D_MAX_HEIGHT, sizeof( maxHeight ), &maxHeight, NULL ); - error |= clGetDeviceInfo( device, CL_DEVICE_IMAGE3D_MAX_DEPTH, sizeof( maxDepth ), &maxDepth, NULL ); - test_error( error, "Unable to get device config" ); + error = clGetDeviceInfo(device, CL_DEVICE_MAX_MEM_ALLOC_SIZE, + sizeof(maxAllocSize), &maxAllocSize, NULL); + error |= clGetDeviceInfo(device, CL_DEVICE_IMAGE3D_MAX_WIDTH, + sizeof(maxWidth), &maxWidth, NULL); + error |= clGetDeviceInfo(device, CL_DEVICE_IMAGE3D_MAX_HEIGHT, + 
sizeof(maxHeight), &maxHeight, NULL); + error |= clGetDeviceInfo(device, CL_DEVICE_IMAGE3D_MAX_DEPTH, + sizeof(maxDepth), &maxDepth, NULL); + test_error(error, "Unable to get device config"); // Create something of a decent size - if( (cl_ulong)maxWidth * maxHeight * maxDepth > maxAllocSize / ( BufferSizeReductionFactor * 4 ) ) + if ((cl_ulong)maxWidth * maxHeight * maxDepth + > maxAllocSize / (BufferSizeReductionFactor * 4)) { - float rootSize = cbrtf( (float)( maxAllocSize / ( BufferSizeReductionFactor * 4 ) ) ); + float rootSize = + cbrtf((float)(maxAllocSize / (BufferSizeReductionFactor * 4))); - if( (size_t)rootSize > maxWidth ) + if ((size_t)rootSize > maxWidth) outWidth = maxWidth; else outWidth = (size_t)rootSize; - if( (size_t)rootSize > maxHeight ) + if ((size_t)rootSize > maxHeight) outHeight = maxHeight; else outHeight = (size_t)rootSize; - outDepth = (size_t)( ( maxAllocSize / ( BufferSizeReductionFactor * 4 ) ) / ( outWidth * outHeight ) ); - if( outDepth > maxDepth ) - outDepth = maxDepth; + outDepth = (size_t)((maxAllocSize / (BufferSizeReductionFactor * 4)) + / (outWidth * outHeight)); + if (outDepth > maxDepth) outDepth = maxDepth; } else { @@ -101,25 +111,25 @@ cl_int Action::IGetPreferredImageSize3D( cl_device_id device, size_t &outWidt outDepth = maxDepth; } - outWidth /=2; - outHeight /=2; - outDepth /=2; + outWidth /= 2; + outHeight /= 2; + outDepth /= 2; - if (outWidth > 512) - outWidth = 512; - if (outHeight > 512) - outHeight = 512; - if (outDepth > 512) - outDepth = 512; - log_info("\tImage size: %d x %d x %d (%gMB)\n", (int)outWidth, (int)outHeight, (int)outDepth, - (double)((int)outWidth*(int)outHeight*(int)outDepth*4)/(1024.0*1024.0)); + if (outWidth > 512) outWidth = 512; + if (outHeight > 512) outHeight = 512; + if (outDepth > 512) outDepth = 512; + log_info("\tImage size: %d x %d x %d (%gMB)\n", (int)outWidth, + (int)outHeight, (int)outDepth, + (double)((int)outWidth * (int)outHeight * (int)outDepth * 4) + / (1024.0 * 1024.0)); 
return CL_SUCCESS; } #pragma mark -------------------- Execution Sub-Classes ------------------------- -cl_int NDRangeKernelAction::Setup( cl_device_id device, cl_context context, cl_command_queue queue ) +cl_int NDRangeKernelAction::Setup(cl_device_id device, cl_context context, + cl_command_queue queue) { const char *long_kernel[] = { "__kernel void sample_test(__global float *src, __global int *dst)\n" @@ -132,101 +142,116 @@ cl_int NDRangeKernelAction::Setup( cl_device_id device, cl_context context, cl_c " dst[tid] = (int)src[tid] * 3;\n" " }\n" "\n" - "}\n" }; + "}\n" + }; size_t threads[1] = { 1000 }; int error; - if( create_single_kernel_helper( context, &mProgram, &mKernel, 1, long_kernel, "sample_test" ) ) + if (create_single_kernel_helper(context, &mProgram, &mKernel, 1, + long_kernel, "sample_test")) { return -1; } - error = get_max_common_work_group_size( context, mKernel, threads[0], &mLocalThreads[0] ); - test_error( error, "Unable to get work group size to use" ); + error = get_max_common_work_group_size(context, mKernel, threads[0], + &mLocalThreads[0]); + test_error(error, "Unable to get work group size to use"); mStreams[0] = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(cl_float) * 1000, NULL, &error); - test_error( error, "Creating test array failed" ); + test_error(error, "Creating test array failed"); mStreams[1] = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(cl_int) * 1000, NULL, &error); - test_error( error, "Creating test array failed" ); + test_error(error, "Creating test array failed"); /* Set the arguments */ - error = clSetKernelArg( mKernel, 0, sizeof( mStreams[0] ), &mStreams[0] ); - test_error( error, "Unable to set kernel arguments" ); - error = clSetKernelArg( mKernel, 1, sizeof( mStreams[1] ), &mStreams[1] ); - test_error( error, "Unable to set kernel arguments" ); + error = clSetKernelArg(mKernel, 0, sizeof(mStreams[0]), &mStreams[0]); + test_error(error, "Unable to set kernel arguments"); + error = 
clSetKernelArg(mKernel, 1, sizeof(mStreams[1]), &mStreams[1]); + test_error(error, "Unable to set kernel arguments"); return CL_SUCCESS; } -cl_int NDRangeKernelAction::Execute( cl_command_queue queue, cl_uint numWaits, cl_event *waits, cl_event *outEvent ) +cl_int NDRangeKernelAction::Execute(cl_command_queue queue, cl_uint numWaits, + cl_event *waits, cl_event *outEvent) { size_t threads[1] = { 1000 }; - cl_int error = clEnqueueNDRangeKernel( queue, mKernel, 1, NULL, threads, mLocalThreads, numWaits, waits, outEvent ); - test_error( error, "Unable to execute kernel" ); + cl_int error = + clEnqueueNDRangeKernel(queue, mKernel, 1, NULL, threads, mLocalThreads, + numWaits, waits, outEvent); + test_error(error, "Unable to execute kernel"); return CL_SUCCESS; } #pragma mark -------------------- Buffer Sub-Classes ------------------------- -cl_int BufferAction::Setup( cl_device_id device, cl_context context, cl_command_queue queue, bool allocate ) +cl_int BufferAction::Setup(cl_device_id device, cl_context context, + cl_command_queue queue, bool allocate) { cl_int error; cl_ulong maxAllocSize; // Get the largest possible buffer we could allocate - error = clGetDeviceInfo( device, CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof( maxAllocSize ), &maxAllocSize, NULL ); + error = clGetDeviceInfo(device, CL_DEVICE_MAX_MEM_ALLOC_SIZE, + sizeof(maxAllocSize), &maxAllocSize, NULL); - // Don't create a buffer quite that big, just so we have some space left over for other work - mSize = (size_t)( maxAllocSize / BufferSizeReductionFactor ); + // Don't create a buffer quite that big, just so we have some space left + // over for other work + mSize = (size_t)(maxAllocSize / BufferSizeReductionFactor); // Cap at 128M so tests complete in a reasonable amount of time. 
- if (mSize > 128 << 20) - mSize = 128 << 20; + if (mSize > 128 << 20) mSize = 128 << 20; - mSize /=2; + mSize /= 2; - log_info("\tBuffer size: %gMB\n", (double)mSize/(1024.0*1024.0)); + log_info("\tBuffer size: %gMB\n", (double)mSize / (1024.0 * 1024.0)); - mBuffer = clCreateBuffer( context, CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, mSize, NULL, &error ); - test_error( error, "Unable to create buffer to test against" ); + mBuffer = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, + mSize, NULL, &error); + test_error(error, "Unable to create buffer to test against"); - mOutBuffer = malloc( mSize ); - if( mOutBuffer == NULL ) + mOutBuffer = malloc(mSize); + if (mOutBuffer == NULL) { - log_error( "ERROR: Unable to allocate temp buffer (out of memory)\n" ); + log_error("ERROR: Unable to allocate temp buffer (out of memory)\n"); return CL_OUT_OF_RESOURCES; } return CL_SUCCESS; } -cl_int ReadBufferAction::Setup( cl_device_id device, cl_context context, cl_command_queue queue ) +cl_int ReadBufferAction::Setup(cl_device_id device, cl_context context, + cl_command_queue queue) { - return BufferAction::Setup( device, context, queue, true ); + return BufferAction::Setup(device, context, queue, true); } -cl_int ReadBufferAction::Execute( cl_command_queue queue, cl_uint numWaits, cl_event *waits, cl_event *outEvent ) +cl_int ReadBufferAction::Execute(cl_command_queue queue, cl_uint numWaits, + cl_event *waits, cl_event *outEvent) { - cl_int error = clEnqueueReadBuffer( queue, mBuffer, CL_FALSE, 0, mSize, mOutBuffer, numWaits, waits, outEvent ); - test_error( error, "Unable to enqueue buffer read" ); + cl_int error = clEnqueueReadBuffer(queue, mBuffer, CL_FALSE, 0, mSize, + mOutBuffer, numWaits, waits, outEvent); + test_error(error, "Unable to enqueue buffer read"); return CL_SUCCESS; } -cl_int WriteBufferAction::Setup( cl_device_id device, cl_context context, cl_command_queue queue ) +cl_int WriteBufferAction::Setup(cl_device_id device, cl_context context, + 
cl_command_queue queue) { - return BufferAction::Setup( device, context, queue, true ); + return BufferAction::Setup(device, context, queue, true); } -cl_int WriteBufferAction::Execute( cl_command_queue queue, cl_uint numWaits, cl_event *waits, cl_event *outEvent ) +cl_int WriteBufferAction::Execute(cl_command_queue queue, cl_uint numWaits, + cl_event *waits, cl_event *outEvent) { - cl_int error = clEnqueueWriteBuffer( queue, mBuffer, CL_FALSE, 0, mSize, mOutBuffer, numWaits, waits, outEvent ); - test_error( error, "Unable to enqueue buffer write" ); + cl_int error = clEnqueueWriteBuffer(queue, mBuffer, CL_FALSE, 0, mSize, + mOutBuffer, numWaits, waits, outEvent); + test_error(error, "Unable to enqueue buffer write"); return CL_SUCCESS; } @@ -234,40 +259,46 @@ cl_int WriteBufferAction::Execute( cl_command_queue queue, cl_uint numWaits, cl_ MapBufferAction::~MapBufferAction() { if (mQueue) - clEnqueueUnmapMemObject( mQueue, mBuffer, mMappedPtr, 0, NULL, NULL ); + clEnqueueUnmapMemObject(mQueue, mBuffer, mMappedPtr, 0, NULL, NULL); } -cl_int MapBufferAction::Setup( cl_device_id device, cl_context context, cl_command_queue queue ) +cl_int MapBufferAction::Setup(cl_device_id device, cl_context context, + cl_command_queue queue) { - return BufferAction::Setup( device, context, queue, false ); + return BufferAction::Setup(device, context, queue, false); } -cl_int MapBufferAction::Execute( cl_command_queue queue, cl_uint numWaits, cl_event *waits, cl_event *outEvent ) +cl_int MapBufferAction::Execute(cl_command_queue queue, cl_uint numWaits, + cl_event *waits, cl_event *outEvent) { cl_int error; mQueue = queue; - mMappedPtr = clEnqueueMapBuffer( queue, mBuffer, CL_FALSE, CL_MAP_READ, 0, mSize, numWaits, waits, outEvent, &error ); - test_error( error, "Unable to enqueue buffer map" ); + mMappedPtr = clEnqueueMapBuffer(queue, mBuffer, CL_FALSE, CL_MAP_READ, 0, + mSize, numWaits, waits, outEvent, &error); + test_error(error, "Unable to enqueue buffer map"); return 
CL_SUCCESS; } -cl_int UnmapBufferAction::Setup( cl_device_id device, cl_context context, cl_command_queue queue ) +cl_int UnmapBufferAction::Setup(cl_device_id device, cl_context context, + cl_command_queue queue) { - cl_int error = BufferAction::Setup( device, context, queue, false ); - if( error != CL_SUCCESS ) - return error; + cl_int error = BufferAction::Setup(device, context, queue, false); + if (error != CL_SUCCESS) return error; - mMappedPtr = clEnqueueMapBuffer( queue, mBuffer, CL_TRUE, CL_MAP_READ, 0, mSize, 0, NULL, NULL, &error ); - test_error( error, "Unable to enqueue buffer map" ); + mMappedPtr = clEnqueueMapBuffer(queue, mBuffer, CL_TRUE, CL_MAP_READ, 0, + mSize, 0, NULL, NULL, &error); + test_error(error, "Unable to enqueue buffer map"); return CL_SUCCESS; } -cl_int UnmapBufferAction::Execute( cl_command_queue queue, cl_uint numWaits, cl_event *waits, cl_event *outEvent ) +cl_int UnmapBufferAction::Execute(cl_command_queue queue, cl_uint numWaits, + cl_event *waits, cl_event *outEvent) { - cl_int error = clEnqueueUnmapMemObject( queue, mBuffer, mMappedPtr, numWaits, waits, outEvent ); - test_error( error, "Unable to enqueue buffer unmap" ); + cl_int error = clEnqueueUnmapMemObject(queue, mBuffer, mMappedPtr, numWaits, + waits, outEvent); + test_error(error, "Unable to enqueue buffer unmap"); return CL_SUCCESS; } @@ -275,349 +306,410 @@ cl_int UnmapBufferAction::Execute( cl_command_queue queue, cl_uint numWaits, cl_ #pragma mark -------------------- Read/Write Image Classes ------------------------- -cl_int ReadImage2DAction::Setup( cl_device_id device, cl_context context, cl_command_queue queue ) +cl_int ReadImage2DAction::Setup(cl_device_id device, cl_context context, + cl_command_queue queue) { cl_int error; - if( ( error = IGetPreferredImageSize2D( device, mWidth, mHeight ) ) ) + if ((error = IGetPreferredImageSize2D(device, mWidth, mHeight))) return error; cl_image_format format = { CL_RGBA, CL_SIGNED_INT8 }; - mImage = create_image_2d( 
context, CL_MEM_READ_ONLY, &format, mWidth, mHeight, 0, NULL, &error ); + mImage = create_image_2d(context, CL_MEM_READ_ONLY, &format, mWidth, + mHeight, 0, NULL, &error); - test_error( error, "Unable to create image to test against" ); + test_error(error, "Unable to create image to test against"); - mOutput = malloc( mWidth * mHeight * 4 ); - if( mOutput == NULL ) + mOutput = malloc(mWidth * mHeight * 4); + if (mOutput == NULL) { - log_error( "ERROR: Unable to allocate buffer: out of memory\n" ); + log_error("ERROR: Unable to allocate buffer: out of memory\n"); return CL_OUT_OF_RESOURCES; } return CL_SUCCESS; } -cl_int ReadImage2DAction::Execute( cl_command_queue queue, cl_uint numWaits, cl_event *waits, cl_event *outEvent ) +cl_int ReadImage2DAction::Execute(cl_command_queue queue, cl_uint numWaits, + cl_event *waits, cl_event *outEvent) { - size_t origin[ 3 ] = { 0, 0, 0 }, region[ 3 ] = { mWidth, mHeight, 1 }; + size_t origin[3] = { 0, 0, 0 }, region[3] = { mWidth, mHeight, 1 }; - cl_int error = clEnqueueReadImage( queue, mImage, CL_FALSE, origin, region, 0, 0, mOutput, numWaits, waits, outEvent ); - test_error( error, "Unable to enqueue image read" ); + cl_int error = clEnqueueReadImage(queue, mImage, CL_FALSE, origin, region, + 0, 0, mOutput, numWaits, waits, outEvent); + test_error(error, "Unable to enqueue image read"); return CL_SUCCESS; } -cl_int ReadImage3DAction::Setup( cl_device_id device, cl_context context, cl_command_queue queue ) +cl_int ReadImage3DAction::Setup(cl_device_id device, cl_context context, + cl_command_queue queue) { cl_int error; - if( ( error = IGetPreferredImageSize3D( device, mWidth, mHeight, mDepth ) ) ) + if ((error = IGetPreferredImageSize3D(device, mWidth, mHeight, mDepth))) return error; cl_image_format format = { CL_RGBA, CL_SIGNED_INT8 }; - mImage = create_image_3d( context, CL_MEM_READ_ONLY, &format, mWidth, mHeight, mDepth, 0, 0, NULL, &error ); - test_error( error, "Unable to create image to test against" ); + mImage = 
create_image_3d(context, CL_MEM_READ_ONLY, &format, mWidth, + mHeight, mDepth, 0, 0, NULL, &error); + test_error(error, "Unable to create image to test against"); - mOutput = malloc( mWidth * mHeight * mDepth * 4 ); - if( mOutput == NULL ) + mOutput = malloc(mWidth * mHeight * mDepth * 4); + if (mOutput == NULL) { - log_error( "ERROR: Unable to allocate buffer: out of memory\n" ); + log_error("ERROR: Unable to allocate buffer: out of memory\n"); return CL_OUT_OF_RESOURCES; } return CL_SUCCESS; } -cl_int ReadImage3DAction::Execute( cl_command_queue queue, cl_uint numWaits, cl_event *waits, cl_event *outEvent ) +cl_int ReadImage3DAction::Execute(cl_command_queue queue, cl_uint numWaits, + cl_event *waits, cl_event *outEvent) { - size_t origin[ 3 ] = { 0, 0, 0 }, region[ 3 ] = { mWidth, mHeight, mDepth }; + size_t origin[3] = { 0, 0, 0 }, region[3] = { mWidth, mHeight, mDepth }; - cl_int error = clEnqueueReadImage( queue, mImage, CL_FALSE, origin, region, 0, 0, mOutput, numWaits, waits, outEvent ); - test_error( error, "Unable to enqueue image read" ); + cl_int error = clEnqueueReadImage(queue, mImage, CL_FALSE, origin, region, + 0, 0, mOutput, numWaits, waits, outEvent); + test_error(error, "Unable to enqueue image read"); return CL_SUCCESS; } -cl_int WriteImage2DAction::Setup( cl_device_id device, cl_context context, cl_command_queue queue ) +cl_int WriteImage2DAction::Setup(cl_device_id device, cl_context context, + cl_command_queue queue) { cl_int error; - if( ( error = IGetPreferredImageSize2D( device, mWidth, mHeight ) ) ) + if ((error = IGetPreferredImageSize2D(device, mWidth, mHeight))) return error; cl_image_format format = { CL_RGBA, CL_SIGNED_INT8 }; - mImage = create_image_2d( context, CL_MEM_WRITE_ONLY, &format, mWidth, mHeight, 0, NULL, &error ); - test_error( error, "Unable to create image to test against" ); + mImage = create_image_2d(context, CL_MEM_WRITE_ONLY, &format, mWidth, + mHeight, 0, NULL, &error); + test_error(error, "Unable to create image 
to test against"); - mOutput = malloc( mWidth * mHeight * 4 ); - if( mOutput == NULL ) + mOutput = malloc(mWidth * mHeight * 4); + if (mOutput == NULL) { - log_error( "ERROR: Unable to allocate buffer: out of memory\n" ); + log_error("ERROR: Unable to allocate buffer: out of memory\n"); return CL_OUT_OF_RESOURCES; } return CL_SUCCESS; } -cl_int WriteImage2DAction::Execute( cl_command_queue queue, cl_uint numWaits, cl_event *waits, cl_event *outEvent ) +cl_int WriteImage2DAction::Execute(cl_command_queue queue, cl_uint numWaits, + cl_event *waits, cl_event *outEvent) { - size_t origin[ 3 ] = { 0, 0, 0 }, region[ 3 ] = { mWidth, mHeight, 1 }; + size_t origin[3] = { 0, 0, 0 }, region[3] = { mWidth, mHeight, 1 }; - cl_int error = clEnqueueWriteImage( queue, mImage, CL_FALSE, origin, region, 0, 0, mOutput, numWaits, waits, outEvent ); - test_error( error, "Unable to enqueue image write" ); + cl_int error = + clEnqueueWriteImage(queue, mImage, CL_FALSE, origin, region, 0, 0, + mOutput, numWaits, waits, outEvent); + test_error(error, "Unable to enqueue image write"); return CL_SUCCESS; } -cl_int WriteImage3DAction::Setup( cl_device_id device, cl_context context, cl_command_queue queue ) +cl_int WriteImage3DAction::Setup(cl_device_id device, cl_context context, + cl_command_queue queue) { cl_int error; - if( ( error = IGetPreferredImageSize3D( device, mWidth, mHeight, mDepth ) ) ) + if ((error = IGetPreferredImageSize3D(device, mWidth, mHeight, mDepth))) return error; cl_image_format format = { CL_RGBA, CL_SIGNED_INT8 }; - mImage = create_image_3d( context, CL_MEM_READ_ONLY, &format, mWidth, mHeight, mDepth, 0, 0, NULL, &error ); - test_error( error, "Unable to create image to test against" ); + mImage = create_image_3d(context, CL_MEM_READ_ONLY, &format, mWidth, + mHeight, mDepth, 0, 0, NULL, &error); + test_error(error, "Unable to create image to test against"); - mOutput = malloc( mWidth * mHeight * mDepth * 4 ); - if( mOutput == NULL ) + mOutput = malloc(mWidth * 
mHeight * mDepth * 4); + if (mOutput == NULL) { - log_error( "ERROR: Unable to allocate buffer: out of memory\n" ); + log_error("ERROR: Unable to allocate buffer: out of memory\n"); return CL_OUT_OF_RESOURCES; } return CL_SUCCESS; } -cl_int WriteImage3DAction::Execute( cl_command_queue queue, cl_uint numWaits, cl_event *waits, cl_event *outEvent ) +cl_int WriteImage3DAction::Execute(cl_command_queue queue, cl_uint numWaits, + cl_event *waits, cl_event *outEvent) { - size_t origin[ 3 ] = { 0, 0, 0 }, region[ 3 ] = { mWidth, mHeight, mDepth }; + size_t origin[3] = { 0, 0, 0 }, region[3] = { mWidth, mHeight, mDepth }; - cl_int error = clEnqueueWriteImage( queue, mImage, CL_FALSE, origin, region, 0, 0, mOutput, numWaits, waits, outEvent ); - test_error( error, "Unable to enqueue image write" ); + cl_int error = + clEnqueueWriteImage(queue, mImage, CL_FALSE, origin, region, 0, 0, + mOutput, numWaits, waits, outEvent); + test_error(error, "Unable to enqueue image write"); return CL_SUCCESS; } #pragma mark -------------------- Copy Image Classes ------------------------- -cl_int CopyImageAction::Execute( cl_command_queue queue, cl_uint numWaits, cl_event *waits, cl_event *outEvent ) +cl_int CopyImageAction::Execute(cl_command_queue queue, cl_uint numWaits, + cl_event *waits, cl_event *outEvent) { - size_t origin[ 3 ] = { 0, 0, 0 }, region[ 3 ] = { mWidth, mHeight, mDepth }; + size_t origin[3] = { 0, 0, 0 }, region[3] = { mWidth, mHeight, mDepth }; - cl_int error = clEnqueueCopyImage( queue, mSrcImage, mDstImage, origin, origin, region, numWaits, waits, outEvent ); - test_error( error, "Unable to enqueue image copy" ); + cl_int error = + clEnqueueCopyImage(queue, mSrcImage, mDstImage, origin, origin, region, + numWaits, waits, outEvent); + test_error(error, "Unable to enqueue image copy"); return CL_SUCCESS; } -cl_int CopyImage2Dto2DAction::Setup( cl_device_id device, cl_context context, cl_command_queue queue ) +cl_int CopyImage2Dto2DAction::Setup(cl_device_id device, 
cl_context context, + cl_command_queue queue) { cl_int error; - if( ( error = IGetPreferredImageSize2D( device, mWidth, mHeight ) ) ) + if ((error = IGetPreferredImageSize2D(device, mWidth, mHeight))) return error; mWidth /= 2; cl_image_format format = { CL_RGBA, CL_SIGNED_INT8 }; - mSrcImage = create_image_2d( context, CL_MEM_READ_ONLY, &format, mWidth, mHeight, 0, NULL, &error ); - test_error( error, "Unable to create image to test against" ); + mSrcImage = create_image_2d(context, CL_MEM_READ_ONLY, &format, mWidth, + mHeight, 0, NULL, &error); + test_error(error, "Unable to create image to test against"); - mDstImage = create_image_2d( context, CL_MEM_WRITE_ONLY, &format, mWidth, mHeight, 0, NULL, &error ); - test_error( error, "Unable to create image to test against" ); + mDstImage = create_image_2d(context, CL_MEM_WRITE_ONLY, &format, mWidth, + mHeight, 0, NULL, &error); + test_error(error, "Unable to create image to test against"); mDepth = 1; return CL_SUCCESS; } -cl_int CopyImage2Dto3DAction::Setup( cl_device_id device, cl_context context, cl_command_queue queue ) +cl_int CopyImage2Dto3DAction::Setup(cl_device_id device, cl_context context, + cl_command_queue queue) { cl_int error; - if( ( error = IGetPreferredImageSize3D( device, mWidth, mHeight, mDepth ) ) ) + if ((error = IGetPreferredImageSize3D(device, mWidth, mHeight, mDepth))) return error; mDepth /= 2; cl_image_format format = { CL_RGBA, CL_SIGNED_INT8 }; - mSrcImage = create_image_2d( context, CL_MEM_READ_ONLY, &format, mWidth, mHeight, 0, NULL, &error ); - test_error( error, "Unable to create image to test against" ); + mSrcImage = create_image_2d(context, CL_MEM_READ_ONLY, &format, mWidth, + mHeight, 0, NULL, &error); + test_error(error, "Unable to create image to test against"); - mDstImage = create_image_3d( context, CL_MEM_READ_ONLY, &format, mWidth, mHeight, mDepth, 0, 0, NULL, &error ); - test_error( error, "Unable to create image to test against" ); + mDstImage = create_image_3d(context, 
CL_MEM_READ_ONLY, &format, mWidth, + mHeight, mDepth, 0, 0, NULL, &error); + test_error(error, "Unable to create image to test against"); mDepth = 1; return CL_SUCCESS; } -cl_int CopyImage3Dto2DAction::Setup( cl_device_id device, cl_context context, cl_command_queue queue ) +cl_int CopyImage3Dto2DAction::Setup(cl_device_id device, cl_context context, + cl_command_queue queue) { cl_int error; - if( ( error = IGetPreferredImageSize3D( device, mWidth, mHeight, mDepth ) ) ) + if ((error = IGetPreferredImageSize3D(device, mWidth, mHeight, mDepth))) return error; mDepth /= 2; cl_image_format format = { CL_RGBA, CL_SIGNED_INT8 }; - mSrcImage = create_image_3d( context, CL_MEM_READ_ONLY, &format, mWidth, mHeight, mDepth, 0, 0, NULL, &error ); - test_error( error, "Unable to create image to test against" ); + mSrcImage = create_image_3d(context, CL_MEM_READ_ONLY, &format, mWidth, + mHeight, mDepth, 0, 0, NULL, &error); + test_error(error, "Unable to create image to test against"); - mDstImage = create_image_2d( context, CL_MEM_WRITE_ONLY, &format, mWidth, mHeight, 0, NULL, &error ); - test_error( error, "Unable to create image to test against" ); + mDstImage = create_image_2d(context, CL_MEM_WRITE_ONLY, &format, mWidth, + mHeight, 0, NULL, &error); + test_error(error, "Unable to create image to test against"); mDepth = 1; return CL_SUCCESS; } -cl_int CopyImage3Dto3DAction::Setup( cl_device_id device, cl_context context, cl_command_queue queue ) +cl_int CopyImage3Dto3DAction::Setup(cl_device_id device, cl_context context, + cl_command_queue queue) { cl_int error; - if( ( error = IGetPreferredImageSize3D( device, mWidth, mHeight, mDepth ) ) ) + if ((error = IGetPreferredImageSize3D(device, mWidth, mHeight, mDepth))) return error; mDepth /= 2; cl_image_format format = { CL_RGBA, CL_SIGNED_INT8 }; - mSrcImage = create_image_3d( context, CL_MEM_READ_ONLY, &format, mWidth, mHeight, mDepth, 0, 0, NULL, &error ); - test_error( error, "Unable to create image to test against" ); + 
mSrcImage = create_image_3d(context, CL_MEM_READ_ONLY, &format, mWidth, + mHeight, mDepth, 0, 0, NULL, &error); + test_error(error, "Unable to create image to test against"); - mDstImage = create_image_3d( context, CL_MEM_READ_ONLY, &format, mWidth, mHeight, mDepth, 0, 0, NULL, &error ); - test_error( error, "Unable to create image to test against" ); + mDstImage = create_image_3d(context, CL_MEM_READ_ONLY, &format, mWidth, + mHeight, mDepth, 0, 0, NULL, &error); + test_error(error, "Unable to create image to test against"); return CL_SUCCESS; } #pragma mark -------------------- Copy Image/Buffer Classes ------------------------- -cl_int Copy2DImageToBufferAction::Setup( cl_device_id device, cl_context context, cl_command_queue queue ) +cl_int Copy2DImageToBufferAction::Setup(cl_device_id device, cl_context context, + cl_command_queue queue) { cl_int error; - if( ( error = IGetPreferredImageSize2D( device, mWidth, mHeight ) ) ) + if ((error = IGetPreferredImageSize2D(device, mWidth, mHeight))) return error; mWidth /= 2; cl_image_format format = { CL_RGBA, CL_SIGNED_INT8 }; - mSrcImage = create_image_2d( context, CL_MEM_READ_ONLY, &format, mWidth, mHeight, 0, NULL, &error ); - test_error( error, "Unable to create image to test against" ); + mSrcImage = create_image_2d(context, CL_MEM_READ_ONLY, &format, mWidth, + mHeight, 0, NULL, &error); + test_error(error, "Unable to create image to test against"); - mDstBuffer = clCreateBuffer( context, CL_MEM_WRITE_ONLY, mWidth * mHeight * 4, NULL, &error ); - test_error( error, "Unable to create buffer to test against" ); + mDstBuffer = clCreateBuffer(context, CL_MEM_WRITE_ONLY, + mWidth * mHeight * 4, NULL, &error); + test_error(error, "Unable to create buffer to test against"); return CL_SUCCESS; } -cl_int Copy2DImageToBufferAction::Execute( cl_command_queue queue, cl_uint numWaits, cl_event *waits, cl_event *outEvent ) +cl_int Copy2DImageToBufferAction::Execute(cl_command_queue queue, + cl_uint numWaits, cl_event *waits, + 
cl_event *outEvent) { - size_t origin[ 3 ] = { 0, 0, 0 }, region[ 3 ] = { mWidth, mHeight, 1 }; + size_t origin[3] = { 0, 0, 0 }, region[3] = { mWidth, mHeight, 1 }; - cl_int error = clEnqueueCopyImageToBuffer( queue, mSrcImage, mDstBuffer, origin, region, 0, numWaits, waits, outEvent ); - test_error( error, "Unable to enqueue image to buffer copy" ); + cl_int error = + clEnqueueCopyImageToBuffer(queue, mSrcImage, mDstBuffer, origin, region, + 0, numWaits, waits, outEvent); + test_error(error, "Unable to enqueue image to buffer copy"); return CL_SUCCESS; } -cl_int Copy3DImageToBufferAction::Setup( cl_device_id device, cl_context context, cl_command_queue queue ) +cl_int Copy3DImageToBufferAction::Setup(cl_device_id device, cl_context context, + cl_command_queue queue) { cl_int error; - if( ( error = IGetPreferredImageSize3D( device, mWidth, mHeight, mDepth ) ) ) + if ((error = IGetPreferredImageSize3D(device, mWidth, mHeight, mDepth))) return error; mDepth /= 2; cl_image_format format = { CL_RGBA, CL_SIGNED_INT8 }; - mSrcImage = create_image_3d( context, CL_MEM_READ_ONLY, &format, mWidth, mHeight, mDepth, 0, 0, NULL, &error ); - test_error( error, "Unable to create image to test against" ); + mSrcImage = create_image_3d(context, CL_MEM_READ_ONLY, &format, mWidth, + mHeight, mDepth, 0, 0, NULL, &error); + test_error(error, "Unable to create image to test against"); - mDstBuffer = clCreateBuffer( context, CL_MEM_WRITE_ONLY, mWidth * mHeight * mDepth * 4, NULL, &error ); - test_error( error, "Unable to create buffer to test against" ); + mDstBuffer = clCreateBuffer(context, CL_MEM_WRITE_ONLY, + mWidth * mHeight * mDepth * 4, NULL, &error); + test_error(error, "Unable to create buffer to test against"); return CL_SUCCESS; } -cl_int Copy3DImageToBufferAction::Execute( cl_command_queue queue, cl_uint numWaits, cl_event *waits, cl_event *outEvent ) +cl_int Copy3DImageToBufferAction::Execute(cl_command_queue queue, + cl_uint numWaits, cl_event *waits, + cl_event *outEvent) 
{ - size_t origin[ 3 ] = { 0, 0, 0 }, region[ 3 ] = { mWidth, mHeight, mDepth }; + size_t origin[3] = { 0, 0, 0 }, region[3] = { mWidth, mHeight, mDepth }; - cl_int error = clEnqueueCopyImageToBuffer( queue, mSrcImage, mDstBuffer, origin, region, 0, numWaits, waits, outEvent ); - test_error( error, "Unable to enqueue image to buffer copy" ); + cl_int error = + clEnqueueCopyImageToBuffer(queue, mSrcImage, mDstBuffer, origin, region, + 0, numWaits, waits, outEvent); + test_error(error, "Unable to enqueue image to buffer copy"); return CL_SUCCESS; } -cl_int CopyBufferTo2DImageAction::Setup( cl_device_id device, cl_context context, cl_command_queue queue ) +cl_int CopyBufferTo2DImageAction::Setup(cl_device_id device, cl_context context, + cl_command_queue queue) { cl_int error; - if( ( error = IGetPreferredImageSize2D( device, mWidth, mHeight ) ) ) + if ((error = IGetPreferredImageSize2D(device, mWidth, mHeight))) return error; mWidth /= 2; cl_image_format format = { CL_RGBA, CL_SIGNED_INT8 }; - mSrcBuffer = clCreateBuffer( context, CL_MEM_READ_ONLY, mWidth * mHeight * 4, NULL, &error ); - test_error( error, "Unable to create buffer to test against" ); + mSrcBuffer = clCreateBuffer(context, CL_MEM_READ_ONLY, mWidth * mHeight * 4, + NULL, &error); + test_error(error, "Unable to create buffer to test against"); - mDstImage = create_image_2d( context, CL_MEM_WRITE_ONLY, &format, mWidth, mHeight, 0, NULL, &error ); - test_error( error, "Unable to create image to test against" ); + mDstImage = create_image_2d(context, CL_MEM_WRITE_ONLY, &format, mWidth, + mHeight, 0, NULL, &error); + test_error(error, "Unable to create image to test against"); return CL_SUCCESS; } -cl_int CopyBufferTo2DImageAction::Execute( cl_command_queue queue, cl_uint numWaits, cl_event *waits, cl_event *outEvent ) +cl_int CopyBufferTo2DImageAction::Execute(cl_command_queue queue, + cl_uint numWaits, cl_event *waits, + cl_event *outEvent) { - size_t origin[ 3 ] = { 0, 0, 0 }, region[ 3 ] = { mWidth, 
mHeight, 1 }; + size_t origin[3] = { 0, 0, 0 }, region[3] = { mWidth, mHeight, 1 }; - cl_int error = clEnqueueCopyBufferToImage( queue, mSrcBuffer, mDstImage, 0, origin, region, numWaits, waits, outEvent ); - test_error( error, "Unable to enqueue buffer to image copy" ); + cl_int error = + clEnqueueCopyBufferToImage(queue, mSrcBuffer, mDstImage, 0, origin, + region, numWaits, waits, outEvent); + test_error(error, "Unable to enqueue buffer to image copy"); return CL_SUCCESS; } -cl_int CopyBufferTo3DImageAction::Setup( cl_device_id device, cl_context context, cl_command_queue queue ) +cl_int CopyBufferTo3DImageAction::Setup(cl_device_id device, cl_context context, + cl_command_queue queue) { cl_int error; - if( ( error = IGetPreferredImageSize3D( device, mWidth, mHeight, mDepth ) ) ) + if ((error = IGetPreferredImageSize3D(device, mWidth, mHeight, mDepth))) return error; mDepth /= 2; - mSrcBuffer = clCreateBuffer( context, CL_MEM_READ_ONLY, mWidth * mHeight * mDepth * 4, NULL, &error ); - test_error( error, "Unable to create buffer to test against" ); + mSrcBuffer = clCreateBuffer(context, CL_MEM_READ_ONLY, + mWidth * mHeight * mDepth * 4, NULL, &error); + test_error(error, "Unable to create buffer to test against"); cl_image_format format = { CL_RGBA, CL_SIGNED_INT8 }; - mDstImage = create_image_3d( context, CL_MEM_READ_ONLY, &format, mWidth, mHeight, mDepth, 0, 0, NULL, &error ); - test_error( error, "Unable to create image to test against" ); + mDstImage = create_image_3d(context, CL_MEM_READ_ONLY, &format, mWidth, + mHeight, mDepth, 0, 0, NULL, &error); + test_error(error, "Unable to create image to test against"); return CL_SUCCESS; } -cl_int CopyBufferTo3DImageAction::Execute( cl_command_queue queue, cl_uint numWaits, cl_event *waits, cl_event *outEvent ) +cl_int CopyBufferTo3DImageAction::Execute(cl_command_queue queue, + cl_uint numWaits, cl_event *waits, + cl_event *outEvent) { - size_t origin[ 3 ] = { 0, 0, 0 }, region[ 3 ] = { mWidth, mHeight, mDepth }; + 
size_t origin[3] = { 0, 0, 0 }, region[3] = { mWidth, mHeight, mDepth }; - cl_int error = clEnqueueCopyBufferToImage( queue, mSrcBuffer, mDstImage, 0, origin, region, numWaits, waits, outEvent ); - test_error( error, "Unable to enqueue buffer to image copy" ); + cl_int error = + clEnqueueCopyBufferToImage(queue, mSrcBuffer, mDstImage, 0, origin, + region, numWaits, waits, outEvent); + test_error(error, "Unable to enqueue buffer to image copy"); return CL_SUCCESS; } @@ -627,34 +719,39 @@ cl_int CopyBufferTo3DImageAction::Execute( cl_command_queue queue, cl_uint numWa MapImageAction::~MapImageAction() { if (mQueue) - clEnqueueUnmapMemObject( mQueue, mImage, mMappedPtr, 0, NULL, NULL ); + clEnqueueUnmapMemObject(mQueue, mImage, mMappedPtr, 0, NULL, NULL); } -cl_int MapImageAction::Setup( cl_device_id device, cl_context context, cl_command_queue queue ) +cl_int MapImageAction::Setup(cl_device_id device, cl_context context, + cl_command_queue queue) { cl_int error; - if( ( error = IGetPreferredImageSize2D( device, mWidth, mHeight ) ) ) + if ((error = IGetPreferredImageSize2D(device, mWidth, mHeight))) return error; cl_image_format format = { CL_RGBA, CL_SIGNED_INT8 }; - mImage = create_image_2d( context, CL_MEM_READ_ONLY | CL_MEM_ALLOC_HOST_PTR, &format, mWidth, mHeight, 0, NULL, &error ); - test_error( error, "Unable to create image to test against" ); + mImage = create_image_2d(context, CL_MEM_READ_ONLY | CL_MEM_ALLOC_HOST_PTR, + &format, mWidth, mHeight, 0, NULL, &error); + test_error(error, "Unable to create image to test against"); return CL_SUCCESS; } -cl_int MapImageAction::Execute( cl_command_queue queue, cl_uint numWaits, cl_event *waits, cl_event *outEvent ) +cl_int MapImageAction::Execute(cl_command_queue queue, cl_uint numWaits, + cl_event *waits, cl_event *outEvent) { cl_int error; - size_t origin[ 3 ] = { 0, 0, 0 }, region[ 3 ] = { mWidth, mHeight, 1 }; + size_t origin[3] = { 0, 0, 0 }, region[3] = { mWidth, mHeight, 1 }; size_t outPitch; mQueue = queue; - 
mMappedPtr = clEnqueueMapImage( queue, mImage, CL_FALSE, CL_MAP_READ, origin, region, &outPitch, NULL, numWaits, waits, outEvent, &error ); - test_error( error, "Unable to enqueue image map" ); + mMappedPtr = + clEnqueueMapImage(queue, mImage, CL_FALSE, CL_MAP_READ, origin, region, + &outPitch, NULL, numWaits, waits, outEvent, &error); + test_error(error, "Unable to enqueue image map"); return CL_SUCCESS; } diff --git a/test_conformance/events/action_classes.h b/test_conformance/events/action_classes.h index 069ed346..e528f11a 100644 --- a/test_conformance/events/action_classes.h +++ b/test_conformance/events/action_classes.h @@ -1,6 +1,6 @@ // // Copyright (c) 2017 The Khronos Group Inc. -// +// // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at @@ -23,303 +23,319 @@ // it would potentially be possible for an implementation to make actions // wait on one another based on their shared I/O, not because of their // wait lists! 
-class Action -{ - public: - Action() {} - virtual ~Action() {} - - virtual cl_int Setup( cl_device_id device, cl_context context, cl_command_queue queue ) = 0; - virtual cl_int Execute( cl_command_queue queue, cl_uint numWaits, cl_event *waits, cl_event *outEvent ) = 0; - - virtual const char * GetName( void ) const = 0; - - protected: - - cl_int IGetPreferredImageSize2D( cl_device_id device, size_t &outWidth, size_t &outHeight ); - cl_int IGetPreferredImageSize3D( cl_device_id device, size_t &outWidth, size_t &outHeight, size_t &outDepth ); +class Action { +public: + Action() {} + virtual ~Action() {} + + virtual cl_int Setup(cl_device_id device, cl_context context, + cl_command_queue queue) = 0; + virtual cl_int Execute(cl_command_queue queue, cl_uint numWaits, + cl_event *waits, cl_event *outEvent) = 0; + + virtual const char *GetName(void) const = 0; + +protected: + cl_int IGetPreferredImageSize2D(cl_device_id device, size_t &outWidth, + size_t &outHeight); + cl_int IGetPreferredImageSize3D(cl_device_id device, size_t &outWidth, + size_t &outHeight, size_t &outDepth); }; // Simple NDRangeKernel execution that takes a noticable amount of time -class NDRangeKernelAction : public Action -{ - public: - NDRangeKernelAction() {} - virtual ~NDRangeKernelAction() {} - - size_t mLocalThreads[ 1 ]; - clMemWrapper mStreams[ 2 ]; - clProgramWrapper mProgram; - clKernelWrapper mKernel; - - virtual cl_int Setup( cl_device_id device, cl_context context, cl_command_queue queue ); - virtual cl_int Execute( cl_command_queue queue, cl_uint numWaits, cl_event *waits, cl_event *outEvent ); - - virtual const char * GetName( void ) const { return "NDRangeKernel"; } +class NDRangeKernelAction : public Action { +public: + NDRangeKernelAction() {} + virtual ~NDRangeKernelAction() {} + + size_t mLocalThreads[1]; + clMemWrapper mStreams[2]; + clProgramWrapper mProgram; + clKernelWrapper mKernel; + + virtual cl_int Setup(cl_device_id device, cl_context context, + cl_command_queue queue); 
+ virtual cl_int Execute(cl_command_queue queue, cl_uint numWaits, + cl_event *waits, cl_event *outEvent); + + virtual const char *GetName(void) const { return "NDRangeKernel"; } }; // Base action for buffer actions -class BufferAction : public Action -{ - public: - clMemWrapper mBuffer; - size_t mSize; - void *mOutBuffer; +class BufferAction : public Action { +public: + clMemWrapper mBuffer; + size_t mSize; + void *mOutBuffer; - BufferAction() { mOutBuffer = NULL; } - virtual ~BufferAction() { free( mOutBuffer ); } + BufferAction() { mOutBuffer = NULL; } + virtual ~BufferAction() { free(mOutBuffer); } - virtual cl_int Setup( cl_device_id device, cl_context context, cl_command_queue queue, bool allocate ); + virtual cl_int Setup(cl_device_id device, cl_context context, + cl_command_queue queue, bool allocate); }; -class ReadBufferAction : public BufferAction -{ - public: - ReadBufferAction() {} - virtual ~ReadBufferAction() {} +class ReadBufferAction : public BufferAction { +public: + ReadBufferAction() {} + virtual ~ReadBufferAction() {} - virtual cl_int Setup( cl_device_id device, cl_context context, cl_command_queue queue ); - virtual cl_int Execute( cl_command_queue queue, cl_uint numWaits, cl_event *waits, cl_event *outEvent ); + virtual cl_int Setup(cl_device_id device, cl_context context, + cl_command_queue queue); + virtual cl_int Execute(cl_command_queue queue, cl_uint numWaits, + cl_event *waits, cl_event *outEvent); - virtual const char * GetName( void ) const { return "ReadBuffer"; } + virtual const char *GetName(void) const { return "ReadBuffer"; } }; -class WriteBufferAction : public BufferAction -{ - public: - WriteBufferAction() {} - virtual ~WriteBufferAction() {} +class WriteBufferAction : public BufferAction { +public: + WriteBufferAction() {} + virtual ~WriteBufferAction() {} - virtual cl_int Setup( cl_device_id device, cl_context context, cl_command_queue queue ); - virtual cl_int Execute( cl_command_queue queue, cl_uint numWaits, cl_event 
*waits, cl_event *outEvent ); + virtual cl_int Setup(cl_device_id device, cl_context context, + cl_command_queue queue); + virtual cl_int Execute(cl_command_queue queue, cl_uint numWaits, + cl_event *waits, cl_event *outEvent); - virtual const char * GetName( void ) const { return "WriteBuffer"; } + virtual const char *GetName(void) const { return "WriteBuffer"; } }; -class MapBufferAction : public BufferAction -{ - public: - MapBufferAction() : mQueue(0) {} +class MapBufferAction : public BufferAction { +public: + MapBufferAction(): mQueue(0) {} - cl_command_queue mQueue; - void *mMappedPtr; + cl_command_queue mQueue; + void *mMappedPtr; - virtual ~MapBufferAction(); - virtual cl_int Setup( cl_device_id device, cl_context context, cl_command_queue queue ); - virtual cl_int Execute( cl_command_queue queue, cl_uint numWaits, cl_event *waits, cl_event *outEvent ); + virtual ~MapBufferAction(); + virtual cl_int Setup(cl_device_id device, cl_context context, + cl_command_queue queue); + virtual cl_int Execute(cl_command_queue queue, cl_uint numWaits, + cl_event *waits, cl_event *outEvent); - virtual const char * GetName( void ) const { return "MapBuffer"; } + virtual const char *GetName(void) const { return "MapBuffer"; } }; -class UnmapBufferAction : public BufferAction -{ - public: - UnmapBufferAction() {} - virtual ~UnmapBufferAction() {} +class UnmapBufferAction : public BufferAction { +public: + UnmapBufferAction() {} + virtual ~UnmapBufferAction() {} - void *mMappedPtr; + void *mMappedPtr; - virtual cl_int Setup( cl_device_id device, cl_context context, cl_command_queue queue ); - virtual cl_int Execute( cl_command_queue queue, cl_uint numWaits, cl_event *waits, cl_event *outEvent ); + virtual cl_int Setup(cl_device_id device, cl_context context, + cl_command_queue queue); + virtual cl_int Execute(cl_command_queue queue, cl_uint numWaits, + cl_event *waits, cl_event *outEvent); - virtual const char * GetName( void ) const { return "UnmapBuffer"; } + virtual const 
char *GetName(void) const { return "UnmapBuffer"; } }; -class ReadImage2DAction : public Action -{ - public: - ReadImage2DAction() { mOutput = NULL; } - virtual ~ReadImage2DAction() { free( mOutput ); } +class ReadImage2DAction : public Action { +public: + ReadImage2DAction() { mOutput = NULL; } + virtual ~ReadImage2DAction() { free(mOutput); } - clMemWrapper mImage; - size_t mWidth, mHeight; - void *mOutput; + clMemWrapper mImage; + size_t mWidth, mHeight; + void *mOutput; - virtual cl_int Setup( cl_device_id device, cl_context context, cl_command_queue queue ); - virtual cl_int Execute( cl_command_queue queue, cl_uint numWaits, cl_event *waits, cl_event *outEvent ); + virtual cl_int Setup(cl_device_id device, cl_context context, + cl_command_queue queue); + virtual cl_int Execute(cl_command_queue queue, cl_uint numWaits, + cl_event *waits, cl_event *outEvent); - virtual const char * GetName( void ) const { return "ReadImage2D"; } + virtual const char *GetName(void) const { return "ReadImage2D"; } }; -class ReadImage3DAction : public Action -{ - public: - ReadImage3DAction() { mOutput = NULL; } - virtual ~ReadImage3DAction() { free( mOutput ); } +class ReadImage3DAction : public Action { +public: + ReadImage3DAction() { mOutput = NULL; } + virtual ~ReadImage3DAction() { free(mOutput); } - clMemWrapper mImage; - size_t mWidth, mHeight, mDepth; - void *mOutput; + clMemWrapper mImage; + size_t mWidth, mHeight, mDepth; + void *mOutput; - virtual cl_int Setup( cl_device_id device, cl_context context, cl_command_queue queue ); - virtual cl_int Execute( cl_command_queue queue, cl_uint numWaits, cl_event *waits, cl_event *outEvent ); + virtual cl_int Setup(cl_device_id device, cl_context context, + cl_command_queue queue); + virtual cl_int Execute(cl_command_queue queue, cl_uint numWaits, + cl_event *waits, cl_event *outEvent); - virtual const char * GetName( void ) const { return "ReadImage3D"; } + virtual const char *GetName(void) const { return "ReadImage3D"; } }; 
-class WriteImage2DAction : public Action -{ - public: - clMemWrapper mImage; - size_t mWidth, mHeight; - void *mOutput; +class WriteImage2DAction : public Action { +public: + clMemWrapper mImage; + size_t mWidth, mHeight; + void *mOutput; - WriteImage2DAction() { mOutput = NULL; } - virtual ~WriteImage2DAction() { free( mOutput ); } + WriteImage2DAction() { mOutput = NULL; } + virtual ~WriteImage2DAction() { free(mOutput); } - virtual cl_int Setup( cl_device_id device, cl_context context, cl_command_queue queue ); - virtual cl_int Execute( cl_command_queue queue, cl_uint numWaits, cl_event *waits, cl_event *outEvent ); + virtual cl_int Setup(cl_device_id device, cl_context context, + cl_command_queue queue); + virtual cl_int Execute(cl_command_queue queue, cl_uint numWaits, + cl_event *waits, cl_event *outEvent); - virtual const char * GetName( void ) const { return "WriteImage2D"; } + virtual const char *GetName(void) const { return "WriteImage2D"; } }; -class WriteImage3DAction : public Action -{ - public: - clMemWrapper mImage; - size_t mWidth, mHeight, mDepth; - void *mOutput; +class WriteImage3DAction : public Action { +public: + clMemWrapper mImage; + size_t mWidth, mHeight, mDepth; + void *mOutput; - WriteImage3DAction() { mOutput = NULL; } - virtual ~WriteImage3DAction() { free( mOutput ); } + WriteImage3DAction() { mOutput = NULL; } + virtual ~WriteImage3DAction() { free(mOutput); } - virtual cl_int Setup( cl_device_id device, cl_context context, cl_command_queue queue ); - virtual cl_int Execute( cl_command_queue queue, cl_uint numWaits, cl_event *waits, cl_event *outEvent ); + virtual cl_int Setup(cl_device_id device, cl_context context, + cl_command_queue queue); + virtual cl_int Execute(cl_command_queue queue, cl_uint numWaits, + cl_event *waits, cl_event *outEvent); - virtual const char * GetName( void ) const { return "WriteImage3D"; } + virtual const char *GetName(void) const { return "WriteImage3D"; } }; -class CopyImageAction : public Action -{ - 
public: - CopyImageAction() {} - virtual ~CopyImageAction() {} +class CopyImageAction : public Action { +public: + CopyImageAction() {} + virtual ~CopyImageAction() {} - clMemWrapper mSrcImage, mDstImage; - size_t mWidth, mHeight, mDepth; + clMemWrapper mSrcImage, mDstImage; + size_t mWidth, mHeight, mDepth; - virtual cl_int Execute( cl_command_queue queue, cl_uint numWaits, cl_event *waits, cl_event *outEvent ); + virtual cl_int Execute(cl_command_queue queue, cl_uint numWaits, + cl_event *waits, cl_event *outEvent); }; -class CopyImage2Dto2DAction : public CopyImageAction -{ - public: - CopyImage2Dto2DAction() {} - virtual ~CopyImage2Dto2DAction() {} +class CopyImage2Dto2DAction : public CopyImageAction { +public: + CopyImage2Dto2DAction() {} + virtual ~CopyImage2Dto2DAction() {} - virtual cl_int Setup( cl_device_id device, cl_context context, cl_command_queue queue ); + virtual cl_int Setup(cl_device_id device, cl_context context, + cl_command_queue queue); - virtual const char * GetName( void ) const { return "CopyImage2Dto2D"; } + virtual const char *GetName(void) const { return "CopyImage2Dto2D"; } }; -class CopyImage2Dto3DAction : public CopyImageAction -{ - public: - CopyImage2Dto3DAction() {} - virtual ~CopyImage2Dto3DAction() {} +class CopyImage2Dto3DAction : public CopyImageAction { +public: + CopyImage2Dto3DAction() {} + virtual ~CopyImage2Dto3DAction() {} - virtual cl_int Setup( cl_device_id device, cl_context context, cl_command_queue queue ); + virtual cl_int Setup(cl_device_id device, cl_context context, + cl_command_queue queue); - virtual const char * GetName( void ) const { return "CopyImage2Dto3D"; } + virtual const char *GetName(void) const { return "CopyImage2Dto3D"; } }; -class CopyImage3Dto2DAction : public CopyImageAction -{ - public: - CopyImage3Dto2DAction() {} - virtual ~CopyImage3Dto2DAction() {} +class CopyImage3Dto2DAction : public CopyImageAction { +public: + CopyImage3Dto2DAction() {} + virtual ~CopyImage3Dto2DAction() {} - virtual 
cl_int Setup( cl_device_id device, cl_context context, cl_command_queue queue ); + virtual cl_int Setup(cl_device_id device, cl_context context, + cl_command_queue queue); - virtual const char * GetName( void ) const { return "CopyImage3Dto2D"; } + virtual const char *GetName(void) const { return "CopyImage3Dto2D"; } }; -class CopyImage3Dto3DAction : public CopyImageAction -{ - public: - CopyImage3Dto3DAction() {} - virtual ~CopyImage3Dto3DAction() {} +class CopyImage3Dto3DAction : public CopyImageAction { +public: + CopyImage3Dto3DAction() {} + virtual ~CopyImage3Dto3DAction() {} - virtual cl_int Setup( cl_device_id device, cl_context context, cl_command_queue queue ); + virtual cl_int Setup(cl_device_id device, cl_context context, + cl_command_queue queue); - virtual const char * GetName( void ) const { return "CopyImage3Dto3D"; } + virtual const char *GetName(void) const { return "CopyImage3Dto3D"; } }; -class Copy2DImageToBufferAction : public Action -{ - public: - Copy2DImageToBufferAction() {} - virtual ~Copy2DImageToBufferAction() {} +class Copy2DImageToBufferAction : public Action { +public: + Copy2DImageToBufferAction() {} + virtual ~Copy2DImageToBufferAction() {} - clMemWrapper mSrcImage, mDstBuffer; - size_t mWidth, mHeight; + clMemWrapper mSrcImage, mDstBuffer; + size_t mWidth, mHeight; - virtual cl_int Setup( cl_device_id device, cl_context context, cl_command_queue queue ); - virtual cl_int Execute( cl_command_queue queue, cl_uint numWaits, cl_event *waits, cl_event *outEvent ); + virtual cl_int Setup(cl_device_id device, cl_context context, + cl_command_queue queue); + virtual cl_int Execute(cl_command_queue queue, cl_uint numWaits, + cl_event *waits, cl_event *outEvent); - virtual const char * GetName( void ) const { return "Copy2DImageToBuffer"; } + virtual const char *GetName(void) const { return "Copy2DImageToBuffer"; } }; -class Copy3DImageToBufferAction : public Action -{ - public: - Copy3DImageToBufferAction() {} - virtual 
~Copy3DImageToBufferAction() {} +class Copy3DImageToBufferAction : public Action { +public: + Copy3DImageToBufferAction() {} + virtual ~Copy3DImageToBufferAction() {} - clMemWrapper mSrcImage, mDstBuffer; - size_t mWidth, mHeight, mDepth; + clMemWrapper mSrcImage, mDstBuffer; + size_t mWidth, mHeight, mDepth; - virtual cl_int Setup( cl_device_id device, cl_context context, cl_command_queue queue ); - virtual cl_int Execute( cl_command_queue queue, cl_uint numWaits, cl_event *waits, cl_event *outEvent ); + virtual cl_int Setup(cl_device_id device, cl_context context, + cl_command_queue queue); + virtual cl_int Execute(cl_command_queue queue, cl_uint numWaits, + cl_event *waits, cl_event *outEvent); - virtual const char * GetName( void ) const { return "Copy3DImageToBuffer"; } + virtual const char *GetName(void) const { return "Copy3DImageToBuffer"; } }; -class CopyBufferTo2DImageAction : public Action -{ - public: - CopyBufferTo2DImageAction() {} - virtual ~CopyBufferTo2DImageAction() {} +class CopyBufferTo2DImageAction : public Action { +public: + CopyBufferTo2DImageAction() {} + virtual ~CopyBufferTo2DImageAction() {} - clMemWrapper mSrcBuffer, mDstImage; - size_t mWidth, mHeight; + clMemWrapper mSrcBuffer, mDstImage; + size_t mWidth, mHeight; - virtual cl_int Setup( cl_device_id device, cl_context context, cl_command_queue queue ); - virtual cl_int Execute( cl_command_queue queue, cl_uint numWaits, cl_event *waits, cl_event *outEvent ); + virtual cl_int Setup(cl_device_id device, cl_context context, + cl_command_queue queue); + virtual cl_int Execute(cl_command_queue queue, cl_uint numWaits, + cl_event *waits, cl_event *outEvent); - virtual const char * GetName( void ) const { return "CopyBufferTo2D"; } + virtual const char *GetName(void) const { return "CopyBufferTo2D"; } }; -class CopyBufferTo3DImageAction : public Action -{ - public: - CopyBufferTo3DImageAction() {} - virtual ~CopyBufferTo3DImageAction() {} +class CopyBufferTo3DImageAction : public Action { 
+public: + CopyBufferTo3DImageAction() {} + virtual ~CopyBufferTo3DImageAction() {} - clMemWrapper mSrcBuffer, mDstImage; - size_t mWidth, mHeight, mDepth; + clMemWrapper mSrcBuffer, mDstImage; + size_t mWidth, mHeight, mDepth; - virtual cl_int Setup( cl_device_id device, cl_context context, cl_command_queue queue ); - virtual cl_int Execute( cl_command_queue queue, cl_uint numWaits, cl_event *waits, cl_event *outEvent ); + virtual cl_int Setup(cl_device_id device, cl_context context, + cl_command_queue queue); + virtual cl_int Execute(cl_command_queue queue, cl_uint numWaits, + cl_event *waits, cl_event *outEvent); - virtual const char * GetName( void ) const { return "CopyBufferTo3D"; } + virtual const char *GetName(void) const { return "CopyBufferTo3D"; } }; -class MapImageAction : public Action -{ - public: - MapImageAction() : mQueue(0) {} +class MapImageAction : public Action { +public: + MapImageAction(): mQueue(0) {} - clMemWrapper mImage; - size_t mWidth, mHeight; - void *mMappedPtr; - cl_command_queue mQueue; + clMemWrapper mImage; + size_t mWidth, mHeight; + void *mMappedPtr; + cl_command_queue mQueue; - virtual ~MapImageAction(); - virtual cl_int Setup( cl_device_id device, cl_context context, cl_command_queue queue ); - virtual cl_int Execute( cl_command_queue queue, cl_uint numWaits, cl_event *waits, cl_event *outEvent ); + virtual ~MapImageAction(); + virtual cl_int Setup(cl_device_id device, cl_context context, + cl_command_queue queue); + virtual cl_int Execute(cl_command_queue queue, cl_uint numWaits, + cl_event *waits, cl_event *outEvent); - virtual const char * GetName( void ) const { return "MapImage"; } + virtual const char *GetName(void) const { return "MapImage"; } }; diff --git a/test_conformance/events/main.cpp b/test_conformance/events/main.cpp index 777d2d36..74682f99 100644 --- a/test_conformance/events/main.cpp +++ b/test_conformance/events/main.cpp @@ -1,6 +1,6 @@ // // Copyright (c) 2017 The Khronos Group Inc. 
-// +// // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at @@ -24,44 +24,44 @@ #endif test_definition test_list[] = { - ADD_TEST( event_get_execute_status ), - ADD_TEST( event_get_write_array_status ), - ADD_TEST( event_get_read_array_status ), - ADD_TEST( event_get_info ), - ADD_TEST( event_wait_for_execute ), - ADD_TEST( event_wait_for_array ), - ADD_TEST( event_flush ), - ADD_TEST( event_finish_execute ), - ADD_TEST( event_finish_array ), - ADD_TEST( event_release_before_done ), - ADD_TEST( event_enqueue_marker ), + ADD_TEST(event_get_execute_status), + ADD_TEST(event_get_write_array_status), + ADD_TEST(event_get_read_array_status), + ADD_TEST(event_get_info), + ADD_TEST(event_wait_for_execute), + ADD_TEST(event_wait_for_array), + ADD_TEST(event_flush), + ADD_TEST(event_finish_execute), + ADD_TEST(event_finish_array), + ADD_TEST(event_release_before_done), + ADD_TEST(event_enqueue_marker), #ifdef CL_VERSION_1_2 - ADD_TEST( event_enqueue_marker_with_event_list ), - ADD_TEST( event_enqueue_barrier_with_event_list ), + ADD_TEST(event_enqueue_marker_with_event_list), + ADD_TEST(event_enqueue_barrier_with_event_list), #endif - ADD_TEST( out_of_order_event_waitlist_single_queue ), - ADD_TEST( out_of_order_event_waitlist_multi_queue ), - ADD_TEST( out_of_order_event_waitlist_multi_queue_multi_device ), - ADD_TEST( out_of_order_event_enqueue_wait_for_events_single_queue ), - ADD_TEST( out_of_order_event_enqueue_wait_for_events_multi_queue ), - ADD_TEST( out_of_order_event_enqueue_wait_for_events_multi_queue_multi_device ), - ADD_TEST( out_of_order_event_enqueue_marker_single_queue ), - ADD_TEST( out_of_order_event_enqueue_marker_multi_queue ), - ADD_TEST( out_of_order_event_enqueue_marker_multi_queue_multi_device ), - ADD_TEST( out_of_order_event_enqueue_barrier_single_queue ), + ADD_TEST(out_of_order_event_waitlist_single_queue), + 
ADD_TEST(out_of_order_event_waitlist_multi_queue), + ADD_TEST(out_of_order_event_waitlist_multi_queue_multi_device), + ADD_TEST(out_of_order_event_enqueue_wait_for_events_single_queue), + ADD_TEST(out_of_order_event_enqueue_wait_for_events_multi_queue), + ADD_TEST( + out_of_order_event_enqueue_wait_for_events_multi_queue_multi_device), + ADD_TEST(out_of_order_event_enqueue_marker_single_queue), + ADD_TEST(out_of_order_event_enqueue_marker_multi_queue), + ADD_TEST(out_of_order_event_enqueue_marker_multi_queue_multi_device), + ADD_TEST(out_of_order_event_enqueue_barrier_single_queue), - ADD_TEST( waitlists ), - ADD_TEST( userevents ), - ADD_TEST( callbacks ), - ADD_TEST( callbacks_simultaneous ), - ADD_TEST( userevents_multithreaded ), + ADD_TEST(waitlists), + ADD_TEST(userevents), + ADD_TEST(callbacks), + ADD_TEST(callbacks_simultaneous), + ADD_TEST(userevents_multithreaded), }; -const int test_num = ARRAY_SIZE( test_list ); +const int test_num = ARRAY_SIZE(test_list); int main(int argc, const char *argv[]) { return runTestHarness(argc, argv, test_num, test_list, false, 0); } - diff --git a/test_conformance/events/procs.h b/test_conformance/events/procs.h index f077c247..97309db3 100644 --- a/test_conformance/events/procs.h +++ b/test_conformance/events/procs.h @@ -1,6 +1,6 @@ // // Copyright (c) 2017 The Khronos Group Inc. -// +// // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
// You may obtain a copy of the License at @@ -18,44 +18,101 @@ #include "harness/typeWrappers.h" #include "harness/clImageHelper.h" -extern float random_float(float low, float high); -extern float calculate_ulperror(float a, float b); - - -extern int test_event_get_execute_status(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements); -extern int test_event_get_write_array_status(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements); -extern int test_event_get_read_array_status(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements); -extern int test_event_get_info( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements); -extern int test_event_wait_for_execute(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements); -extern int test_event_wait_for_array(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements); -extern int test_event_flush(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements); -extern int test_event_finish_execute(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements); -extern int test_event_finish_array(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements); -extern int test_event_release_before_done(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements); -extern int test_event_enqueue_marker(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements); -#ifdef CL_VERSION_1_2 -extern int test_event_enqueue_marker_with_event_list(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements); -extern int test_event_enqueue_barrier_with_event_list(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements); -#endif +extern float random_float(float low, float high); +extern float 
calculate_ulperror(float a, float b); -extern int test_out_of_order_event_waitlist_single_queue(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements); -extern int test_out_of_order_event_waitlist_multi_queue( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements); -extern int test_out_of_order_event_waitlist_multi_queue_multi_device(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements); -extern int test_out_of_order_event_enqueue_wait_for_events_single_queue(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements); -extern int test_out_of_order_event_enqueue_wait_for_events_multi_queue( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements); -extern int test_out_of_order_event_enqueue_wait_for_events_multi_queue_multi_device(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements); +extern int test_event_get_execute_status(cl_device_id deviceID, + cl_context context, + cl_command_queue queue, + int num_elements); +extern int test_event_get_write_array_status(cl_device_id deviceID, + cl_context context, + cl_command_queue queue, + int num_elements); +extern int test_event_get_read_array_status(cl_device_id deviceID, + cl_context context, + cl_command_queue queue, + int num_elements); +extern int test_event_get_info(cl_device_id deviceID, cl_context context, + cl_command_queue queue, int num_elements); +extern int test_event_wait_for_execute(cl_device_id deviceID, + cl_context context, + cl_command_queue queue, + int num_elements); +extern int test_event_wait_for_array(cl_device_id deviceID, cl_context context, + cl_command_queue queue, int num_elements); +extern int test_event_flush(cl_device_id deviceID, cl_context context, + cl_command_queue queue, int num_elements); +extern int test_event_finish_execute(cl_device_id deviceID, cl_context context, + cl_command_queue queue, int 
num_elements); +extern int test_event_finish_array(cl_device_id deviceID, cl_context context, + cl_command_queue queue, int num_elements); +extern int test_event_release_before_done(cl_device_id deviceID, + cl_context context, + cl_command_queue queue, + int num_elements); +extern int test_event_enqueue_marker(cl_device_id deviceID, cl_context context, + cl_command_queue queue, int num_elements); +#ifdef CL_VERSION_1_2 +extern int test_event_enqueue_marker_with_event_list(cl_device_id deviceID, + cl_context context, + cl_command_queue queue, + int num_elements); +extern int test_event_enqueue_barrier_with_event_list(cl_device_id deviceID, + cl_context context, + cl_command_queue queue, + int num_elements); +#endif -extern int test_out_of_order_event_enqueue_barrier_single_queue(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements); +extern int test_out_of_order_event_waitlist_single_queue(cl_device_id deviceID, + cl_context context, + cl_command_queue queue, + int num_elements); +extern int test_out_of_order_event_waitlist_multi_queue(cl_device_id deviceID, + cl_context context, + cl_command_queue queue, + int num_elements); +extern int test_out_of_order_event_waitlist_multi_queue_multi_device( + cl_device_id deviceID, cl_context context, cl_command_queue queue, + int num_elements); -extern int test_out_of_order_event_enqueue_marker_single_queue(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements); -extern int test_out_of_order_event_enqueue_marker_multi_queue( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements); -extern int test_out_of_order_event_enqueue_marker_multi_queue_multi_device(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements); +extern int test_out_of_order_event_enqueue_wait_for_events_single_queue( + cl_device_id deviceID, cl_context context, cl_command_queue queue, + int num_elements); +extern int 
test_out_of_order_event_enqueue_wait_for_events_multi_queue( + cl_device_id deviceID, cl_context context, cl_command_queue queue, + int num_elements); +extern int +test_out_of_order_event_enqueue_wait_for_events_multi_queue_multi_device( + cl_device_id deviceID, cl_context context, cl_command_queue queue, + int num_elements); -extern int test_waitlists( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements ); -extern int test_userevents( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements ); -extern int test_callbacks( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements ); -extern int test_callbacks_simultaneous( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements ); -extern int test_userevents_multithreaded( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements ); +extern int test_out_of_order_event_enqueue_barrier_single_queue( + cl_device_id deviceID, cl_context context, cl_command_queue queue, + int num_elements); +extern int test_out_of_order_event_enqueue_marker_single_queue( + cl_device_id deviceID, cl_context context, cl_command_queue queue, + int num_elements); +extern int test_out_of_order_event_enqueue_marker_multi_queue( + cl_device_id deviceID, cl_context context, cl_command_queue queue, + int num_elements); +extern int test_out_of_order_event_enqueue_marker_multi_queue_multi_device( + cl_device_id deviceID, cl_context context, cl_command_queue queue, + int num_elements); +extern int test_waitlists(cl_device_id deviceID, cl_context context, + cl_command_queue queue, int num_elements); +extern int test_userevents(cl_device_id deviceID, cl_context context, + cl_command_queue queue, int num_elements); +extern int test_callbacks(cl_device_id deviceID, cl_context context, + cl_command_queue queue, int num_elements); +extern int test_callbacks_simultaneous(cl_device_id deviceID, + cl_context context, + 
cl_command_queue queue, + int num_elements); +extern int test_userevents_multithreaded(cl_device_id deviceID, + cl_context context, + cl_command_queue queue, + int num_elements); diff --git a/test_conformance/events/testBase.h b/test_conformance/events/testBase.h index 5b49bfd7..63086d7e 100644 --- a/test_conformance/events/testBase.h +++ b/test_conformance/events/testBase.h @@ -1,6 +1,6 @@ // // Copyright (c) 2017 The Khronos Group Inc. -// +// // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at @@ -26,6 +26,3 @@ #include "procs.h" #endif // _testBase_h - - - diff --git a/test_conformance/events/test_callbacks.cpp b/test_conformance/events/test_callbacks.cpp index 2ffb9ca7..04481dec 100644 --- a/test_conformance/events/test_callbacks.cpp +++ b/test_conformance/events/test_callbacks.cpp @@ -1,6 +1,6 @@ // // Copyright (c) 2017 The Khronos Group Inc. -// +// // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at @@ -18,28 +18,34 @@ #include "harness/conversions.h" #include "harness/ThreadPool.h" -#if !defined (_MSC_VER) +#if !defined(_MSC_VER) #include <unistd.h> #endif // !_MSC_VER -extern const char *IGetStatusString( cl_int status ); +extern const char *IGetStatusString(cl_int status); #define PRINT_OPS 0 -// Yes, this is somewhat nasty, in that we're relying on the CPU (the real CPU, not the OpenCL device) -// to be atomic w.r.t. boolean values. Although if it isn't, we'll just miss the check on this bool -// until the next time around, so it's not that big of a deal. Ideally, we'd be using a semaphore with -// a trywait on it, but then that introduces the fun issue of what to do on Win32, etc. This way is -// far more portable, and worst case of failure is a slightly longer test run. 
+// Yes, this is somewhat nasty, in that we're relying on the CPU (the real CPU, +// not the OpenCL device) to be atomic w.r.t. boolean values. Although if it +// isn't, we'll just miss the check on this bool until the next time around, so +// it's not that big of a deal. Ideally, we'd be using a semaphore with a +// trywait on it, but then that introduces the fun issue of what to do on Win32, +// etc. This way is far more portable, and worst case of failure is a slightly +// longer test run. static bool sCallbackTriggered = false; #define EVENT_CALLBACK_TYPE_TOTAL 3 -static bool sCallbackTriggered_flag[ EVENT_CALLBACK_TYPE_TOTAL ] ={ false,false, false }; -cl_int event_callback_types[EVENT_CALLBACK_TYPE_TOTAL] ={ CL_SUBMITTED, CL_RUNNING, CL_COMPLETE}; +static bool sCallbackTriggered_flag[EVENT_CALLBACK_TYPE_TOTAL] = { false, false, + false }; +cl_int event_callback_types[EVENT_CALLBACK_TYPE_TOTAL] = { CL_SUBMITTED, + CL_RUNNING, + CL_COMPLETE }; // Our callback function -/*void CL_CALLBACK single_event_callback_function( cl_event event, cl_int commandStatus, void * userData ) +/*void CL_CALLBACK single_event_callback_function( cl_event event, cl_int +commandStatus, void * userData ) { int i=*static_cast<int *>(userData); log_info( "\tEvent callback %d triggered\n", i); @@ -47,295 +53,322 @@ cl_int event_callback_types[EVENT_CALLBACK_TYPE_TOTAL] ={ CL_SUBMITTED, CL_RUNNI }*/ /* use struct as call back para */ -typedef struct { cl_int enevt_type; int index; } CALL_BACK_USER_DATA; +typedef struct +{ + cl_int event_type; + int index; +} CALL_BACK_USER_DATA; -void CL_CALLBACK single_event_callback_function_flags( cl_event event, cl_int commandStatus, void * userData ) +void CL_CALLBACK single_event_callback_function_flags(cl_event event, + cl_int commandStatus, + void *userData) { - // int i=*static_cast<int *>(userData); - CALL_BACK_USER_DATA *pdata= static_cast<CALL_BACK_USER_DATA *>(userData); + // int i=*static_cast<int *>(userData); + CALL_BACK_USER_DATA *pdata = 
static_cast<CALL_BACK_USER_DATA *>(userData); - log_info( "\tEvent callback %d of type %d triggered\n", pdata->index, pdata->enevt_type); - sCallbackTriggered_flag [pdata->index ] = true; + log_info("\tEvent callback %d of type %d triggered\n", pdata->index, + pdata->event_type); + sCallbackTriggered_flag[pdata->index] = true; } -int test_callback_event_single( cl_device_id device, cl_context context, cl_command_queue queue, Action *actionToTest ) +int test_callback_event_single(cl_device_id device, cl_context context, + cl_command_queue queue, Action *actionToTest) { - // Note: we don't use the waiting feature here. We just want to verify that we get a callback called - // when the given event finishes + // Note: we don't use the waiting feature here. We just want to verify that + // we get a callback called when the given event finishes - cl_int error = actionToTest->Setup( device, context, queue ); - test_error( error, "Unable to set up test action" ); + cl_int error = actionToTest->Setup(device, context, queue); + test_error(error, "Unable to set up test action"); // Set up a user event, which we use as a gate for the second event - clEventWrapper gateEvent = clCreateUserEvent( context, &error ); - test_error( error, "Unable to set up user gate event" ); + clEventWrapper gateEvent = clCreateUserEvent(context, &error); + test_error(error, "Unable to set up user gate event"); // Set up the execution of the action with its actual event clEventWrapper actualEvent; - error = actionToTest->Execute( queue, 1, &gateEvent, &actualEvent ); - test_error( error, "Unable to set up action execution" ); + error = actionToTest->Execute(queue, 1, &gateEvent, &actualEvent); + test_error(error, "Unable to set up action execution"); // Set up the callback on the actual event - /* use struct as call back para */ - CALL_BACK_USER_DATA user_data[EVENT_CALLBACK_TYPE_TOTAL]; - int index [EVENT_CALLBACK_TYPE_TOTAL]={ 0,1,2}; - for( int i=0;i< EVENT_CALLBACK_TYPE_TOTAL; i++) - { - 
user_data[i].enevt_type=event_callback_types[i]; - user_data[i].index =i; - error = clSetEventCallback( actualEvent, event_callback_types[i], single_event_callback_function_flags, user_data+i ); - - } + /* use struct as call back para */ + CALL_BACK_USER_DATA user_data[EVENT_CALLBACK_TYPE_TOTAL]; + for (int i = 0; i < EVENT_CALLBACK_TYPE_TOTAL; i++) + { + user_data[i].event_type = event_callback_types[i]; + user_data[i].index = i; + error = clSetEventCallback(actualEvent, event_callback_types[i], + single_event_callback_function_flags, + user_data + i); + } // Now release the user event, which will allow our actual action to run - error = clSetUserEventStatus( gateEvent, CL_COMPLETE ); - test_error( error, "Unable to trigger gate event" ); + error = clSetUserEventStatus(gateEvent, CL_COMPLETE); + test_error(error, "Unable to trigger gate event"); - // Now we wait for completion. Note that we can actually wait on the event itself, at least at first - error = clWaitForEvents( 1, &actualEvent ); - test_error( error, "Unable to wait for actual test event" ); + // Now we wait for completion. Note that we can actually wait on the event + // itself, at least at first + error = clWaitForEvents(1, &actualEvent); + test_error(error, "Unable to wait for actual test event"); - // Note: we can check our callback now, and it MIGHT have been triggered, but that's not guaranteed - if( sCallbackTriggered ) + // Note: we can check our callback now, and it MIGHT have been triggered, + // but that's not guaranteed + if (sCallbackTriggered) { // We're all good, so return success return 0; } - // The callback has not yet been called, but that doesn't mean it won't be. So wait for it - log_info( "\tWaiting for callback..." ); - fflush( stdout ); - for( int i = 0; i < 10 * 10; i++ ) + // The callback has not yet been called, but that doesn't mean it won't be. 
+ // So wait for it + log_info("\tWaiting for callback..."); + fflush(stdout); + for (int i = 0; i < 10 * 10; i++) { - usleep( 100000 ); // 1/10th second + usleep(100000); // 1/10th second - int cc=0; - for( int k=0;k< EVENT_CALLBACK_TYPE_TOTAL;k++) - if (sCallbackTriggered_flag[k]) { - cc++; - } + int cc = 0; + for (int k = 0; k < EVENT_CALLBACK_TYPE_TOTAL; k++) + if (sCallbackTriggered_flag[k]) + { + cc++; + } - if (cc== EVENT_CALLBACK_TYPE_TOTAL ) + if (cc == EVENT_CALLBACK_TYPE_TOTAL) { - log_info( "\n" ); + log_info("\n"); return 0; } - log_info( "." ); - fflush( stdout ); + log_info("."); + fflush(stdout); } // If we got here, we never got the callback - log_error( "\nCallback not called within 10 seconds! (assuming failure)\n" ); + log_error("\nCallback not called within 10 seconds! (assuming failure)\n"); return -1; } -#define TEST_ACTION( name ) \ -{ \ - name##Action action; \ - log_info( "-- Testing " #name "...\n" ); \ - if( ( error = test_callback_event_single( deviceID, context, queue, &action ) ) != CL_SUCCESS ) \ - retVal++; \ - clFinish( queue ); \ -} +#define TEST_ACTION(name) \ + { \ + name##Action action; \ + log_info("-- Testing " #name "...\n"); \ + if ((error = test_callback_event_single(deviceID, context, queue, \ + &action)) \ + != CL_SUCCESS) \ + retVal++; \ + clFinish(queue); \ + } -int test_callbacks( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements ) +int test_callbacks(cl_device_id deviceID, cl_context context, + cl_command_queue queue, int num_elements) { cl_int error; int retVal = 0; - log_info( "\n" ); + log_info("\n"); - TEST_ACTION( NDRangeKernel ) + TEST_ACTION(NDRangeKernel) - TEST_ACTION( ReadBuffer ) - TEST_ACTION( WriteBuffer ) - TEST_ACTION( MapBuffer ) - TEST_ACTION( UnmapBuffer ) + TEST_ACTION(ReadBuffer) + TEST_ACTION(WriteBuffer) + TEST_ACTION(MapBuffer) + TEST_ACTION(UnmapBuffer) - if( checkForImageSupport( deviceID ) == CL_IMAGE_FORMAT_NOT_SUPPORTED ) + if 
(checkForImageSupport(deviceID) == CL_IMAGE_FORMAT_NOT_SUPPORTED) { - log_info( "\nNote: device does not support images. Skipping remainder of callback tests...\n" ); + log_info("\nNote: device does not support images. Skipping remainder " + "of callback tests...\n"); } else { - TEST_ACTION( ReadImage2D ) - TEST_ACTION( WriteImage2D ) - TEST_ACTION( CopyImage2Dto2D ) - TEST_ACTION( Copy2DImageToBuffer ) - TEST_ACTION( CopyBufferTo2DImage ) - TEST_ACTION( MapImage ) - - if( checkFor3DImageSupport( deviceID ) == CL_IMAGE_FORMAT_NOT_SUPPORTED ) - log_info( "\nNote: device does not support 3D images. Skipping remainder of waitlist tests...\n" ); + TEST_ACTION(ReadImage2D) + TEST_ACTION(WriteImage2D) + TEST_ACTION(CopyImage2Dto2D) + TEST_ACTION(Copy2DImageToBuffer) + TEST_ACTION(CopyBufferTo2DImage) + TEST_ACTION(MapImage) + + if (checkFor3DImageSupport(deviceID) == CL_IMAGE_FORMAT_NOT_SUPPORTED) + log_info("\nNote: device does not support 3D images. Skipping " + "remainder of waitlist tests...\n"); else { - TEST_ACTION( ReadImage3D ) - TEST_ACTION( WriteImage3D ) - TEST_ACTION( CopyImage2Dto3D ) - TEST_ACTION( CopyImage3Dto2D ) - TEST_ACTION( CopyImage3Dto3D ) - TEST_ACTION( Copy3DImageToBuffer ) - TEST_ACTION( CopyBufferTo3DImage ) + TEST_ACTION(ReadImage3D) + TEST_ACTION(WriteImage3D) + TEST_ACTION(CopyImage2Dto3D) + TEST_ACTION(CopyImage3Dto2D) + TEST_ACTION(CopyImage3Dto3D) + TEST_ACTION(Copy3DImageToBuffer) + TEST_ACTION(CopyBufferTo3DImage) } } return retVal; } -#define SIMUTANEOUS_ACTION_TOTAL 18 -static bool sSimultaneousFlags[ 54 ];// for 18 actions with 3 callback status +#define SIMUTANEOUS_ACTION_TOTAL 18 +static bool sSimultaneousFlags[54]; // for 18 actions with 3 callback status static volatile int sSimultaneousCount; -Action * actions[ 19 ] = { 0 }; +Action *actions[19] = { 0 }; // Callback for the simultaneous tests -void CL_CALLBACK simultaneous_event_callback_function( cl_event event, cl_int commandStatus, void * userData ) +void CL_CALLBACK 
simultaneous_event_callback_function(cl_event event, + cl_int commandStatus, + void *userData) { int eventIndex = (int)(size_t)userData; - int actionIndex = eventIndex/EVENT_CALLBACK_TYPE_TOTAL; - int statusIndex = eventIndex%EVENT_CALLBACK_TYPE_TOTAL; - log_info( "\tEvent callback triggered for action %s callback type %s \n", actions[actionIndex]->GetName(), IGetStatusString(statusIndex) ); - sSimultaneousFlags[ actionIndex ] = true; - ThreadPool_AtomicAdd(&sSimultaneousCount,1); + int actionIndex = eventIndex / EVENT_CALLBACK_TYPE_TOTAL; + int statusIndex = eventIndex % EVENT_CALLBACK_TYPE_TOTAL; + log_info("\tEvent callback triggered for action %s callback type %s \n", + actions[actionIndex]->GetName(), IGetStatusString(statusIndex)); + sSimultaneousFlags[actionIndex] = true; + ThreadPool_AtomicAdd(&sSimultaneousCount, 1); } -int test_callbacks_simultaneous( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements ) +int test_callbacks_simultaneous(cl_device_id deviceID, cl_context context, + cl_command_queue queue, int num_elements) { cl_int error; - // Unlike the singles test, in this one, we run a bunch of events all at once, to verify that - // the callbacks do get called once-and-only-once for each event, even if the run out of order or - // are dependent on each other + // Unlike the singles test, in this one, we run a bunch of events all at + // once, to verify that the callbacks do get called once-and-only-once for + // each event, even if the run out of order or are dependent on each other // First, the list of actions to run int actionCount = 0, index = 0; - actions[ index++ ] = new NDRangeKernelAction(); - actions[ index++ ] = new ReadBufferAction(); - actions[ index++ ] = new WriteBufferAction(); - actions[ index++ ] = new MapBufferAction(); - actions[ index++ ] = new UnmapBufferAction(); + actions[index++] = new NDRangeKernelAction(); + actions[index++] = new ReadBufferAction(); + actions[index++] = new 
WriteBufferAction(); + actions[index++] = new MapBufferAction(); + actions[index++] = new UnmapBufferAction(); - if( checkForImageSupport( deviceID ) != CL_IMAGE_FORMAT_NOT_SUPPORTED ) + if (checkForImageSupport(deviceID) != CL_IMAGE_FORMAT_NOT_SUPPORTED) { - actions[ index++ ] = new ReadImage2DAction(); - actions[ index++ ] = new WriteImage2DAction(); - actions[ index++ ] = new CopyImage2Dto2DAction(); - actions[ index++ ] = new Copy2DImageToBufferAction(); - actions[ index++ ] = new CopyBufferTo2DImageAction(); - actions[ index++ ] = new MapImageAction(); - - if( checkFor3DImageSupport( deviceID ) != CL_IMAGE_FORMAT_NOT_SUPPORTED ) + actions[index++] = new ReadImage2DAction(); + actions[index++] = new WriteImage2DAction(); + actions[index++] = new CopyImage2Dto2DAction(); + actions[index++] = new Copy2DImageToBufferAction(); + actions[index++] = new CopyBufferTo2DImageAction(); + actions[index++] = new MapImageAction(); + + if (checkFor3DImageSupport(deviceID) != CL_IMAGE_FORMAT_NOT_SUPPORTED) { - actions[ index++ ] = new ReadImage3DAction(); - actions[ index++ ] = new WriteImage3DAction(); - actions[ index++ ] = new CopyImage2Dto3DAction(); - actions[ index++ ] = new CopyImage3Dto2DAction(); - actions[ index++ ] = new CopyImage3Dto3DAction(); - actions[ index++ ] = new Copy3DImageToBufferAction(); - actions[ index++ ] = new CopyBufferTo3DImageAction(); + actions[index++] = new ReadImage3DAction(); + actions[index++] = new WriteImage3DAction(); + actions[index++] = new CopyImage2Dto3DAction(); + actions[index++] = new CopyImage3Dto2DAction(); + actions[index++] = new CopyImage3Dto3DAction(); + actions[index++] = new Copy3DImageToBufferAction(); + actions[index++] = new CopyBufferTo3DImageAction(); } } actionCount = index; - actions[ index++ ] = NULL; + actions[index++] = NULL; // Now set them all up - log_info( "\tSetting up test events...\n" ); - for( index = 0; actions[ index ] != NULL; index++ ) + log_info("\tSetting up test events...\n"); + for (index = 0; 
actions[index] != NULL; index++) { - error = actions[ index ]->Setup( deviceID, context, queue ); - test_error( error, "Unable to set up test action" ); - sSimultaneousFlags[ index ] = false; + error = actions[index]->Setup(deviceID, context, queue); + test_error(error, "Unable to set up test action"); + sSimultaneousFlags[index] = false; } sSimultaneousCount = 0; // Set up the user event to start them all - clEventWrapper gateEvent = clCreateUserEvent( context, &error ); - test_error( error, "Unable to set up user gate event" ); + clEventWrapper gateEvent = clCreateUserEvent(context, &error); + test_error(error, "Unable to set up user gate event"); // Start executing, all tied to the gate event - //clEventWrapper actionEvents[ 18 ];// current actionCount is 18 - clEventWrapper *actionEvents= new clEventWrapper[actionCount]; + // clEventWrapper actionEvents[ 18 ];// current actionCount is 18 + clEventWrapper *actionEvents = new clEventWrapper[actionCount]; if (actionEvents == NULL) { log_error(" memory error in test_callbacks_simultaneous \n"); - for (size_t i=0;i<(sizeof(actions)/sizeof(actions[0]));++i) - if (actions[i]) delete actions[i]; - return -1; + for (size_t i = 0; i < (sizeof(actions) / sizeof(actions[0])); ++i) + if (actions[i]) delete actions[i]; + return -1; } - RandomSeed seed( gRandomSeed ); - for( index = 0; actions[ index ] != NULL; index++ ) + RandomSeed seed(gRandomSeed); + for (index = 0; actions[index] != NULL; index++) { // Randomly choose to wait on the gate, or wait on the previous event - cl_event * eventPtr = &gateEvent; - if( ( index > 0 ) && ( random_in_range( 0, 255, seed ) & 1 ) ) - eventPtr = &actionEvents[ index - 1 ]; - - error = actions[ index ]->Execute( queue, 1, eventPtr, &actionEvents[ index ] ); - test_error( error, "Unable to execute test action" ); + cl_event *eventPtr = &gateEvent; + if ((index > 0) && (random_in_range(0, 255, seed) & 1)) + eventPtr = &actionEvents[index - 1]; + error = + actions[index]->Execute(queue, 1, 
eventPtr, &actionEvents[index]); + test_error(error, "Unable to execute test action"); - for( int k=0; k< EVENT_CALLBACK_TYPE_TOTAL; k++) - { - error = clSetEventCallback( actionEvents[index], event_callback_types[k], simultaneous_event_callback_function, (void *)(size_t)(index*EVENT_CALLBACK_TYPE_TOTAL+k ) ); - test_error( error, "Unable to set event callback function" ); - } + for (int k = 0; k < EVENT_CALLBACK_TYPE_TOTAL; k++) + { + error = clSetEventCallback( + actionEvents[index], event_callback_types[k], + simultaneous_event_callback_function, + (void *)(size_t)(index * EVENT_CALLBACK_TYPE_TOTAL + k)); + test_error(error, "Unable to set event callback function"); + } } - int total_callbacks= actionCount * EVENT_CALLBACK_TYPE_TOTAL; + int total_callbacks = actionCount * EVENT_CALLBACK_TYPE_TOTAL; // Now release the user event, which will allow our actual action to run - error = clSetUserEventStatus( gateEvent, CL_COMPLETE ); - test_error( error, "Unable to trigger gate event" ); + error = clSetUserEventStatus(gateEvent, CL_COMPLETE); + test_error(error, "Unable to trigger gate event"); // Wait on the actual action events now - log_info( "\tWaiting for test completions...\n" ); - error = clWaitForEvents( actionCount, &actionEvents[ 0 ] ); - test_error( error, "Unable to wait for actual test events" ); - - // Note: we can check our callback now, and it MIGHT have been triggered, but that's not guaranteed - int last_count = 0; - if( ((last_count = sSimultaneousCount)) == total_callbacks) + log_info("\tWaiting for test completions...\n"); + error = clWaitForEvents(actionCount, &actionEvents[0]); + test_error(error, "Unable to wait for actual test events"); + + // Note: we can check our callback now, and it MIGHT have been triggered, + // but that's not guaranteed + int last_count = 0; + if (((last_count = sSimultaneousCount)) == total_callbacks) { // We're all good, so return success - log_info( "\t%d of %d callbacks received\n", sSimultaneousCount, 
total_callbacks ); + log_info("\t%d of %d callbacks received\n", sSimultaneousCount, + total_callbacks); - if (actionEvents) delete [] actionEvents; - for (size_t i=0;i<(sizeof(actions)/sizeof(actions[0]));++i) - if (actions[i]) delete actions[i]; + if (actionEvents) delete[] actionEvents; + for (size_t i = 0; i < (sizeof(actions) / sizeof(actions[0])); ++i) + if (actions[i]) delete actions[i]; return 0; } // We haven't gotten (all) of the callbacks, so wait for them - log_info( "\tWe've only received %d of the %d callbacks we expected; waiting for more...\n", last_count, total_callbacks ); + log_info("\tWe've only received %d of the %d callbacks we expected; " + "waiting for more...\n", + last_count, total_callbacks); - for( int i = 0; i < 10 * 10; i++ ) + for (int i = 0; i < 10 * 10; i++) { - usleep( 100000 ); // 1/10th second - if( ((last_count = sSimultaneousCount)) == total_callbacks ) + usleep(100000); // 1/10th second + if (((last_count = sSimultaneousCount)) == total_callbacks) { - // All of the callbacks were executed - if (actionEvents) delete [] actionEvents; - for (size_t i=0;i<(sizeof(actions)/sizeof(actions[0]));++i) - if (actions[i]) delete actions[i]; - return 0; + // All of the callbacks were executed + if (actionEvents) delete[] actionEvents; + for (size_t i = 0; i < (sizeof(actions) / sizeof(actions[0])); ++i) + if (actions[i]) delete actions[i]; + return 0; } } // If we got here, some of the callbacks did not occur in time - log_error( "\nError: We only ever received %d of our %d callbacks!\n", last_count, total_callbacks ); - log_error( "Events that did not receive callbacks:\n" ); - for( index = 0; actions[ index ] != NULL; index++ ) + log_error("\nError: We only ever received %d of our %d callbacks!\n", + last_count, total_callbacks); + log_error("Events that did not receive callbacks:\n"); + for (index = 0; actions[index] != NULL; index++) { - if( !sSimultaneousFlags[ index ] ) - log_error( "\t%s\n", actions[ index ]->GetName() ); + if 
(!sSimultaneousFlags[index]) + log_error("\t%s\n", actions[index]->GetName()); } - if (actionEvents) delete [] actionEvents; + if (actionEvents) delete[] actionEvents; return -1; - } - diff --git a/test_conformance/events/test_event_dependencies.cpp b/test_conformance/events/test_event_dependencies.cpp index 41136548..45b260a6 100644 --- a/test_conformance/events/test_event_dependencies.cpp +++ b/test_conformance/events/test_event_dependencies.cpp @@ -1,6 +1,6 @@ // // Copyright (c) 2017 The Khronos Group Inc. -// +// // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at @@ -39,61 +39,79 @@ const char *write_kernels[] = { /* Tests event dependencies by running two kernels that use the same buffer. If two_queues is set they are run in separate queues. - If test_enqueue_wait_for_events is set then clEnqueueWaitForEvent is called between them. - If test_barrier is set then clEnqueueBarrier is called between them (only for single queue). - If neither are set, nothing is done to prevent them from executing in the wrong order. This can be used for verification. + If test_enqueue_wait_for_events is set then clEnqueueWaitForEvent is called + between them. If test_barrier is set then clEnqueueBarrier is called between + them (only for single queue). If neither are set, nothing is done to prevent + them from executing in the wrong order. This can be used for verification. 
*/ -int test_event_enqueue_wait_for_events_run_test( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements, int two_queues, int two_devices, - int test_enqueue_wait_for_events, int test_barrier, int use_waitlist, int use_marker) +int test_event_enqueue_wait_for_events_run_test( + cl_device_id deviceID, cl_context context, cl_command_queue queue, + int num_elements, int two_queues, int two_devices, + int test_enqueue_wait_for_events, int test_barrier, int use_waitlist, + int use_marker) { cl_int error = CL_SUCCESS; - size_t threads[3] = {TEST_SIZE,0,0}; + size_t threads[3] = { TEST_SIZE, 0, 0 }; int i, loop_count, event_count, expected_value, failed; int expected_if_only_queue[2]; int max_count = TEST_SIZE; cl_platform_id platform; - cl_command_queue queues[2]; // Not a wrapper so we don't autorelease if they are the same - clCommandQueueWrapper queueWrappers[2]; // If they are different, we use the wrapper so it will auto release + cl_command_queue + queues[2]; // Not a wrapper so we don't autorelease if they are the same + clCommandQueueWrapper queueWrappers[2]; // If they are different, we use the + // wrapper so it will auto release clContextWrapper context_to_use; clMemWrapper data; clProgramWrapper program; clKernelWrapper kernel1[TEST_COUNT], kernel2[TEST_COUNT]; - clEventWrapper event[TEST_COUNT*4+2]; // If we usemarkers we get 2 more events per iteration + clEventWrapper event[TEST_COUNT * 4 + 2]; // If we usemarkers we get 2 more + // events per iteration if (test_enqueue_wait_for_events) - log_info("\tTesting with clEnqueueBarrierWithWaitList as barrier function.\n"); + log_info("\tTesting with clEnqueueBarrierWithWaitList as barrier " + "function.\n"); if (test_barrier) - log_info("\tTesting with clEnqueueBarrierWithWaitList as barrier function.\n"); + log_info("\tTesting with clEnqueueBarrierWithWaitList as barrier " + "function.\n"); if (use_waitlist) - log_info("\tTesting with waitlist-based depenednecies between 
kernels.\n"); + log_info( + "\tTesting with waitlist-based depenednecies between kernels.\n"); if (use_marker) log_info("\tTesting with clEnqueueMarker as a barrier function.\n"); - if (test_barrier && (two_queues || two_devices)) { - log_error("\tTest requested with clEnqueueBarrier across two queues. This is not a valid combination.\n"); + if (test_barrier && (two_queues || two_devices)) + { + log_error("\tTest requested with clEnqueueBarrier across two queues. " + "This is not a valid combination.\n"); return -1; } error = clGetPlatformIDs(1, &platform, NULL); test_error(error, "clGetPlatformIDs failed."); - // If we are to use two devices, then get them and create a context with both. + // If we are to use two devices, then get them and create a context with + // both. cl_device_id *two_device_ids; - if (two_devices) { - two_device_ids = (cl_device_id*)malloc(sizeof(cl_device_id)*2); + if (two_devices) + { + two_device_ids = (cl_device_id *)malloc(sizeof(cl_device_id) * 2); cl_uint number_returned; - error = clGetDeviceIDs(platform, CL_DEVICE_TYPE_ALL, 2, two_device_ids, &number_returned); - test_error( error, "clGetDeviceIDs for CL_DEVICE_TYPE_ALL failed."); - if (number_returned != 2) { + error = clGetDeviceIDs(platform, CL_DEVICE_TYPE_ALL, 2, two_device_ids, + &number_returned); + test_error(error, "clGetDeviceIDs for CL_DEVICE_TYPE_ALL failed."); + if (number_returned != 2) + { log_info("Failed to obtain two devices. 
Test can not run.\n"); free(two_device_ids); return 0; } - for (i=0; i<2; i++) { + for (i = 0; i < 2; i++) + { cl_device_type type; - error = clGetDeviceInfo(two_device_ids[i], CL_DEVICE_TYPE, sizeof(cl_device_type), &type, NULL); - test_error( error, "clGetDeviceInfo failed."); + error = clGetDeviceInfo(two_device_ids[i], CL_DEVICE_TYPE, + sizeof(cl_device_type), &type, NULL); + test_error(error, "clGetDeviceInfo failed."); if (type & CL_DEVICE_TYPE_CPU) log_info("\tDevice %d is CL_DEVICE_TYPE_CPU.\n", i); if (type & CL_DEVICE_TYPE_GPU) @@ -104,12 +122,16 @@ int test_event_enqueue_wait_for_events_run_test( cl_device_id deviceID, cl_conte log_info("\tDevice %d is CL_DEVICE_TYPE_DEFAULT.\n", i); } - context_to_use = clCreateContext(NULL, 2, two_device_ids, notify_callback, NULL, &error); + context_to_use = clCreateContext(NULL, 2, two_device_ids, + notify_callback, NULL, &error); test_error(error, "clCreateContext failed for two devices."); log_info("\tTesting with two devices.\n"); - } else { - context_to_use = clCreateContext(NULL, 1, &deviceID, NULL, NULL, &error); + } + else + { + context_to_use = + clCreateContext(NULL, 1, &deviceID, NULL, NULL, &error); test_error(error, "clCreateContext failed for one device."); log_info("\tTesting with one device.\n"); @@ -117,41 +139,55 @@ int test_event_enqueue_wait_for_events_run_test( cl_device_id deviceID, cl_conte // If we are using two queues then create them cl_command_queue_properties props = CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE; - if (two_queues) { + if (two_queues) + { // Get a second queue if (two_devices) { - if( !checkDeviceForQueueSupport( two_device_ids[ 0 ], props ) || - !checkDeviceForQueueSupport( two_device_ids[ 1 ], props ) ) + if (!checkDeviceForQueueSupport(two_device_ids[0], props) + || !checkDeviceForQueueSupport(two_device_ids[1], props)) { - log_info( "WARNING: One or more device for multi-device test does not support out-of-order exec mode; skipping test.\n" ); + log_info( + "WARNING: One or 
more device for multi-device test does " + "not support out-of-order exec mode; skipping test.\n"); return -1942; } - queueWrappers[0] = clCreateCommandQueue(context_to_use, two_device_ids[0], props, &error); - test_error(error, "clCreateCommandQueue for first queue on first device failed."); - queueWrappers[1] = clCreateCommandQueue(context_to_use, two_device_ids[1], props, &error); - test_error(error, "clCreateCommandQueue for second queue on second device failed."); - + queueWrappers[0] = clCreateCommandQueue( + context_to_use, two_device_ids[0], props, &error); + test_error( + error, + "clCreateCommandQueue for first queue on first device failed."); + queueWrappers[1] = clCreateCommandQueue( + context_to_use, two_device_ids[1], props, &error); + test_error(error, + "clCreateCommandQueue for second queue on second device " + "failed."); } else { - // Single device has already been checked for out-of-order exec support - queueWrappers[0] = clCreateCommandQueue(context_to_use, deviceID, props, &error); + // Single device has already been checked for out-of-order exec + // support + queueWrappers[0] = + clCreateCommandQueue(context_to_use, deviceID, props, &error); test_error(error, "clCreateCommandQueue for first queue failed."); - queueWrappers[1] = clCreateCommandQueue(context_to_use, deviceID, props, &error); + queueWrappers[1] = + clCreateCommandQueue(context_to_use, deviceID, props, &error); test_error(error, "clCreateCommandQueue for second queue failed."); } - // Ugly hack to make sure we only have the wrapper auto-release if they are different queues + // Ugly hack to make sure we only have the wrapper auto-release if they + // are different queues queues[0] = queueWrappers[0]; queues[1] = queueWrappers[1]; log_info("\tTesting with two queues.\n"); } else { - // (Note: single device has already been checked for out-of-order exec support) - // Otherwise create one queue and have the second one be the same - queueWrappers[0] = 
clCreateCommandQueue(context_to_use, deviceID, props, &error); + // (Note: single device has already been checked for out-of-order exec + // support) Otherwise create one queue and have the second one be the + // same + queueWrappers[0] = + clCreateCommandQueue(context_to_use, deviceID, props, &error); test_error(error, "clCreateCommandQueue for first queue failed."); queues[0] = queueWrappers[0]; queues[1] = (cl_command_queue)queues[0]; @@ -160,236 +196,346 @@ int test_event_enqueue_wait_for_events_run_test( cl_device_id deviceID, cl_conte // Setup - create a buffer and the two kernels - data = clCreateBuffer(context_to_use, CL_MEM_READ_WRITE, TEST_SIZE*sizeof(cl_int), NULL, &error); - test_error( error, "clCreateBuffer failed"); + data = clCreateBuffer(context_to_use, CL_MEM_READ_WRITE, + TEST_SIZE * sizeof(cl_int), NULL, &error); + test_error(error, "clCreateBuffer failed"); // Initialize the values to zero - cl_int *values = (cl_int*)malloc(TEST_SIZE*sizeof(cl_int)); - for (i=0; i<(int)TEST_SIZE; i++) - values[i] = 0; - error = clEnqueueWriteBuffer(queues[0], data, CL_TRUE, 0, TEST_SIZE*sizeof(cl_int), values, 0, NULL, NULL); - test_error( error, "clEnqueueWriteBuffer failed"); + cl_int *values = (cl_int *)malloc(TEST_SIZE * sizeof(cl_int)); + for (i = 0; i < (int)TEST_SIZE; i++) values[i] = 0; + error = + clEnqueueWriteBuffer(queues[0], data, CL_TRUE, 0, + TEST_SIZE * sizeof(cl_int), values, 0, NULL, NULL); + test_error(error, "clEnqueueWriteBuffer failed"); expected_value = 0; // Build the kernels - if (create_single_kernel_helper( context_to_use, &program, &kernel1[0], 1, write_kernels, "write_up" )) + if (create_single_kernel_helper(context_to_use, &program, &kernel1[0], 1, + write_kernels, "write_up")) return -1; error = clSetKernelArg(kernel1[0], 0, sizeof(data), &data); error |= clSetKernelArg(kernel1[0], 1, sizeof(max_count), &max_count); - test_error( error, "clSetKernelArg 1 failed"); + test_error(error, "clSetKernelArg 1 failed"); - for (i=1; 
i<TEST_COUNT; i++) { + for (i = 1; i < TEST_COUNT; i++) + { kernel1[i] = clCreateKernel(program, "write_up", &error); - test_error( error, "clCreateKernel 1 failed"); + test_error(error, "clCreateKernel 1 failed"); error = clSetKernelArg(kernel1[i], 0, sizeof(data), &data); error |= clSetKernelArg(kernel1[i], 1, sizeof(max_count), &max_count); - test_error( error, "clSetKernelArg 1 failed"); + test_error(error, "clSetKernelArg 1 failed"); } - for (i=0; i<TEST_COUNT; i++) { + for (i = 0; i < TEST_COUNT; i++) + { kernel2[i] = clCreateKernel(program, "write_down", &error); - test_error( error, "clCreateKernel 2 failed"); + test_error(error, "clCreateKernel 2 failed"); error = clSetKernelArg(kernel2[i], 0, sizeof(data), &data); error |= clSetKernelArg(kernel2[i], 1, sizeof(max_count), &max_count); - test_error( error, "clSetKernelArg 2 failed"); + test_error(error, "clSetKernelArg 2 failed"); } - // Execution - run the first kernel, then enqueue the wait on the events, then the second kernel - // If clEnqueueBarrierWithWaitList works, the buffer will be filled with 1s, then multiplied by 4s, - // then incremented to 5s, repeatedly. Otherwise the values may be 2s (if the first one doesn't work) or 8s - // (if the second one doesn't work). + // Execution - run the first kernel, then enqueue the wait on the events, + // then the second kernel If clEnqueueBarrierWithWaitList works, the buffer + // will be filled with 1s, then multiplied by 4s, then incremented to 5s, + // repeatedly. Otherwise the values may be 2s (if the first one doesn't + // work) or 8s (if the second one doesn't work). 
if (RANDOMIZE) log_info("Queues chosen randomly for each kernel execution.\n"); else log_info("Queues chosen alternatily for each kernel execution.\n"); event_count = 0; - for (i=0; i<(int)TEST_SIZE; i++) - values[i] = 1; - error = clEnqueueWriteBuffer(queues[0], data, CL_FALSE, 0, TEST_SIZE*sizeof(cl_int), values, 0, NULL, &event[event_count]); - test_error( error, "clEnqueueWriteBuffer 2 failed"); + for (i = 0; i < (int)TEST_SIZE; i++) values[i] = 1; + error = clEnqueueWriteBuffer(queues[0], data, CL_FALSE, 0, + TEST_SIZE * sizeof(cl_int), values, 0, NULL, + &event[event_count]); + test_error(error, "clEnqueueWriteBuffer 2 failed"); expected_value = 1; expected_if_only_queue[0] = 1; expected_if_only_queue[1] = 1; int queue_to_use = 1; - if (test_enqueue_wait_for_events) { - error = clEnqueueBarrierWithWaitList(queues[queue_to_use], 1, &event[event_count], NULL ); - test_error( error, "Unable to queue wait for events" ); - } else if (test_barrier) { - error = clEnqueueBarrierWithWaitList(queues[queue_to_use], 0, NULL, NULL); - test_error( error, "Unable to queue barrier" ); + if (test_enqueue_wait_for_events) + { + error = clEnqueueBarrierWithWaitList(queues[queue_to_use], 1, + &event[event_count], NULL); + test_error(error, "Unable to queue wait for events"); + } + else if (test_barrier) + { + error = + clEnqueueBarrierWithWaitList(queues[queue_to_use], 0, NULL, NULL); + test_error(error, "Unable to queue barrier"); } - for (loop_count=0; loop_count<TEST_COUNT; loop_count++) { + for (loop_count = 0; loop_count < TEST_COUNT; loop_count++) + { // Execute kernel 1 event_count++; - if (use_waitlist | use_marker) { - if (DEBUG_OUT) log_info("clEnqueueNDRangeKernel(queues[%d], kernel1[%d], 1, NULL, threads, NULL, 1, &event[%d], &event[%d])\n", queue_to_use, loop_count, event_count-1, event_count); - error = clEnqueueNDRangeKernel(queues[queue_to_use], kernel1[loop_count], 1, NULL, threads, NULL, 1, &event[event_count-1], &event[event_count]); - } else { - if 
(DEBUG_OUT) log_info("clEnqueueNDRangeKernel(queues[%d], kernel1[%d], 1, NULL, threads, NULL, 0, NULL, &event[%d])\n", queue_to_use, loop_count, event_count); - error = clEnqueueNDRangeKernel(queues[queue_to_use], kernel1[loop_count], 1, NULL, threads, NULL, 0, NULL, &event[event_count]); + if (use_waitlist | use_marker) + { + if (DEBUG_OUT) + log_info("clEnqueueNDRangeKernel(queues[%d], kernel1[%d], 1, " + "NULL, threads, NULL, 1, &event[%d], &event[%d])\n", + queue_to_use, loop_count, event_count - 1, + event_count); + error = clEnqueueNDRangeKernel( + queues[queue_to_use], kernel1[loop_count], 1, NULL, threads, + NULL, 1, &event[event_count - 1], &event[event_count]); } - if (error) { + else + { + if (DEBUG_OUT) + log_info("clEnqueueNDRangeKernel(queues[%d], kernel1[%d], 1, " + "NULL, threads, NULL, 0, NULL, &event[%d])\n", + queue_to_use, loop_count, event_count); + error = clEnqueueNDRangeKernel( + queues[queue_to_use], kernel1[loop_count], 1, NULL, threads, + NULL, 0, NULL, &event[event_count]); + } + if (error) + { log_info("\tLoop count %d\n", loop_count); - print_error( error, "clEnqueueNDRangeKernel for kernel 1 failed"); + print_error(error, "clEnqueueNDRangeKernel for kernel 1 failed"); return error; } expected_value *= 2; expected_if_only_queue[queue_to_use] *= 2; // If we are using a marker, it needs to go in the same queue - if (use_marker) { + if (use_marker) + { event_count++; - if (DEBUG_OUT) log_info("clEnqueueMarker(queues[%d], event[%d])\n", queue_to_use, event_count); - - #ifdef CL_VERSION_1_2 - error = clEnqueueMarkerWithWaitList(queues[queue_to_use], 0, NULL, &event[event_count]); - #else - error = clEnqueueMarker(queues[queue_to_use], &event[event_count]); - #endif - + if (DEBUG_OUT) + log_info("clEnqueueMarker(queues[%d], event[%d])\n", + queue_to_use, event_count); + +#ifdef CL_VERSION_1_2 + error = clEnqueueMarkerWithWaitList(queues[queue_to_use], 0, NULL, + &event[event_count]); +#else + error = clEnqueueMarker(queues[queue_to_use], 
&event[event_count]); +#endif } // Pick the next queue to run if (RANDOMIZE) - queue_to_use = rand()%2; + queue_to_use = rand() % 2; else - queue_to_use = (queue_to_use + 1)%2; + queue_to_use = (queue_to_use + 1) % 2; // Put in a barrier if requested - if (test_enqueue_wait_for_events) { - if (DEBUG_OUT) log_info("clEnqueueBarrierWithWaitList(queues[%d], 1, &event[%d], NULL)\n", queue_to_use, event_count); - error = clEnqueueBarrierWithWaitList(queues[queue_to_use], 1, &event[event_count], NULL); - test_error( error, "Unable to queue wait for events" ); - } else if (test_barrier) { - if (DEBUG_OUT) log_info("clEnqueueBarrierWithWaitList(queues[%d])\n", queue_to_use); - error = clEnqueueBarrierWithWaitList(queues[queue_to_use], 0, NULL, NULL); - test_error( error, "Unable to queue barrier" ); + if (test_enqueue_wait_for_events) + { + if (DEBUG_OUT) + log_info("clEnqueueBarrierWithWaitList(queues[%d], 1, " + "&event[%d], NULL)\n", + queue_to_use, event_count); + error = clEnqueueBarrierWithWaitList(queues[queue_to_use], 1, + &event[event_count], NULL); + test_error(error, "Unable to queue wait for events"); + } + else if (test_barrier) + { + if (DEBUG_OUT) + log_info("clEnqueueBarrierWithWaitList(queues[%d])\n", + queue_to_use); + error = clEnqueueBarrierWithWaitList(queues[queue_to_use], 0, NULL, + NULL); + test_error(error, "Unable to queue barrier"); } // Execute Kernel 2 event_count++; - if (use_waitlist | use_marker) { - if (DEBUG_OUT) log_info("clEnqueueNDRangeKernel(queues[%d], kernel2[%d], 1, NULL, threads, NULL, 1, &event[%d], &event[%d])\n", queue_to_use, loop_count, event_count-1, event_count); - error = clEnqueueNDRangeKernel(queues[queue_to_use], kernel2[loop_count], 1, NULL, threads, NULL, 1, &event[event_count-1], &event[event_count]); - } else { - if (DEBUG_OUT) log_info("clEnqueueNDRangeKernel(queues[%d], kernel2[%d], 1, NULL, threads, NULL, 0, NULL, &event[%d])\n", queue_to_use, loop_count, event_count); - error = 
clEnqueueNDRangeKernel(queues[queue_to_use], kernel2[loop_count], 1, NULL, threads, NULL, 0, NULL, &event[event_count]); + if (use_waitlist | use_marker) + { + if (DEBUG_OUT) + log_info("clEnqueueNDRangeKernel(queues[%d], kernel2[%d], 1, " + "NULL, threads, NULL, 1, &event[%d], &event[%d])\n", + queue_to_use, loop_count, event_count - 1, + event_count); + error = clEnqueueNDRangeKernel( + queues[queue_to_use], kernel2[loop_count], 1, NULL, threads, + NULL, 1, &event[event_count - 1], &event[event_count]); + } + else + { + if (DEBUG_OUT) + log_info("clEnqueueNDRangeKernel(queues[%d], kernel2[%d], 1, " + "NULL, threads, NULL, 0, NULL, &event[%d])\n", + queue_to_use, loop_count, event_count); + error = clEnqueueNDRangeKernel( + queues[queue_to_use], kernel2[loop_count], 1, NULL, threads, + NULL, 0, NULL, &event[event_count]); } - if (error) { + if (error) + { log_info("\tLoop count %d\n", loop_count); - print_error( error, "clEnqueueNDRangeKernel for kernel 2 failed"); + print_error(error, "clEnqueueNDRangeKernel for kernel 2 failed"); return error; } expected_value--; expected_if_only_queue[queue_to_use]--; // If we are using a marker, it needs to go in the same queue - if (use_marker) { + if (use_marker) + { event_count++; - if (DEBUG_OUT) log_info("clEnqueueMarker(queues[%d], event[%d])\n", queue_to_use, event_count); - - #ifdef CL_VERSION_1_2 - error = clEnqueueMarkerWithWaitList(queues[queue_to_use], 0, NULL, &event[event_count]); - #else + if (DEBUG_OUT) + log_info("clEnqueueMarker(queues[%d], event[%d])\n", + queue_to_use, event_count); + +#ifdef CL_VERSION_1_2 + error = clEnqueueMarkerWithWaitList(queues[queue_to_use], 0, NULL, + &event[event_count]); +#else error = clEnqueueMarker(queues[queue_to_use], &event[event_count]); - #endif +#endif } // Pick the next queue to run if (RANDOMIZE) - queue_to_use = rand()%2; + queue_to_use = rand() % 2; else - queue_to_use = (queue_to_use + 1)%2; + queue_to_use = (queue_to_use + 1) % 2; // Put in a barrier if requested - 
if (test_enqueue_wait_for_events) { - if (DEBUG_OUT) log_info("clEnqueueBarrierWithWaitList(queues[%d], 1, &event[%d], NULL)\n", queue_to_use, event_count); - error = clEnqueueBarrierWithWaitList(queues[queue_to_use], 1, &event[event_count], NULL ); - test_error( error, "Unable to queue wait for events" ); - } else if (test_barrier) { - if (DEBUG_OUT) log_info("clEnqueueBarrierWithWaitList(queues[%d])\n", queue_to_use); - error = clEnqueueBarrierWithWaitList(queues[queue_to_use], 0, NULL, NULL); - test_error( error, "Unable to queue barrier" ); + if (test_enqueue_wait_for_events) + { + if (DEBUG_OUT) + log_info("clEnqueueBarrierWithWaitList(queues[%d], 1, " + "&event[%d], NULL)\n", + queue_to_use, event_count); + error = clEnqueueBarrierWithWaitList(queues[queue_to_use], 1, + &event[event_count], NULL); + test_error(error, "Unable to queue wait for events"); + } + else if (test_barrier) + { + if (DEBUG_OUT) + log_info("clEnqueueBarrierWithWaitList(queues[%d])\n", + queue_to_use); + error = clEnqueueBarrierWithWaitList(queues[queue_to_use], 0, NULL, + NULL); + test_error(error, "Unable to queue barrier"); } } // Now finish up everything - if (two_queues) { + if (two_queues) + { error = clFlush(queues[1]); - test_error( error, "clFlush[1] failed"); + test_error(error, "clFlush[1] failed"); } - error = clEnqueueReadBuffer(queues[0], data, CL_TRUE, 0, TEST_SIZE*sizeof(cl_int), values, 1, &event[event_count], NULL); + error = clEnqueueReadBuffer(queues[0], data, CL_TRUE, 0, + TEST_SIZE * sizeof(cl_int), values, 1, + &event[event_count], NULL); test_error(error, "clEnqueueReadBuffer failed"); failed = 0; - for (i=0; i<(int)TEST_SIZE; i++) - if (values[i] != expected_value) { + for (i = 0; i < (int)TEST_SIZE; i++) + if (values[i] != expected_value) + { failed = 1; - log_info("\tvalues[%d] = %d, expected %d (If only queue 1 accessed memory: %d only queue 2 accessed memory: %d)\n", - i, values[i], expected_value, expected_if_only_queue[0], expected_if_only_queue[1]); + 
log_info("\tvalues[%d] = %d, expected %d (If only queue 1 accessed " + "memory: %d only queue 2 accessed memory: %d)\n", + i, values[i], expected_value, expected_if_only_queue[0], + expected_if_only_queue[1]); break; } free(values); - if (two_devices) - free(two_device_ids); + if (two_devices) free(two_device_ids); return failed; } -int test( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements, - int two_queues, int two_devices, - int test_enqueue_wait_for_events, int test_barrier, int use_waitlists, int use_marker) +int test(cl_device_id deviceID, cl_context context, cl_command_queue queue, + int num_elements, int two_queues, int two_devices, + int test_enqueue_wait_for_events, int test_barrier, int use_waitlists, + int use_marker) { - if( !checkDeviceForQueueSupport( deviceID, CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE ) ) + if (!checkDeviceForQueueSupport(deviceID, + CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE)) { - log_info( "WARNING: Device does not support out-of-order exec mode; skipping test.\n" ); + log_info("WARNING: Device does not support out-of-order exec mode; " + "skipping test.\n"); return 0; } - log_info("Running test for baseline results to determine if out-of-order execution can be detected...\n"); - int baseline_results = test_event_enqueue_wait_for_events_run_test(deviceID, context, queue, num_elements, two_queues, two_devices, 0, 0, 0, 0); - if (baseline_results == 0) { + log_info("Running test for baseline results to determine if out-of-order " + "execution can be detected...\n"); + int baseline_results = test_event_enqueue_wait_for_events_run_test( + deviceID, context, queue, num_elements, two_queues, two_devices, 0, 0, + 0, 0); + if (baseline_results == 0) + { if (test_enqueue_wait_for_events) - log_info("WARNING: could not detect any out-of-order execution without using clEnqueueBarrierWithWaitList, so this test is not a valid test of out-of-order event dependencies.\n"); + log_info( + "WARNING: could not detect any 
out-of-order execution without " + "using clEnqueueBarrierWithWaitList, so this test is not a " + "valid test of out-of-order event dependencies.\n"); if (test_barrier) - log_info("WARNING: could not detect any out-of-order execution without using clEnqueueBarrierWithWaitList, so this test is not a valid test of out-of-order event dependencies.\n"); + log_info( + "WARNING: could not detect any out-of-order execution without " + "using clEnqueueBarrierWithWaitList, so this test is not a " + "valid test of out-of-order event dependencies.\n"); if (use_waitlists) - log_info("WARNING: could not detect any out-of-order execution without using waitlists, so this test is not a valid test of out-of-order event dependencies.\n"); + log_info("WARNING: could not detect any out-of-order execution " + "without using waitlists, so this test is not a valid " + "test of out-of-order event dependencies.\n"); if (use_marker) - log_info("WARNING: could not detect any out-of-order execution without using clEnqueueMarker, so this test is not a valid test of out-of-order event dependencies.\n"); - } else if (baseline_results == 1) { + log_info("WARNING: could not detect any out-of-order execution " + "without using clEnqueueMarker, so this test is not a " + "valid test of out-of-order event dependencies.\n"); + } + else if (baseline_results == 1) + { if (test_enqueue_wait_for_events) - log_info("Detected incorrect execution (possibly out-of-order) without clEnqueueBarrierWithWaitList. Test can be a valid test of out-of-order event dependencies.\n"); + log_info("Detected incorrect execution (possibly out-of-order) " + "without clEnqueueBarrierWithWaitList. Test can be a " + "valid test of out-of-order event dependencies.\n"); if (test_barrier) - log_info("Detected incorrect execution (possibly out-of-order) without clEnqueueBarrierWithWaitList. 
Test can be a valid test of out-of-order event dependencies.\n"); + log_info("Detected incorrect execution (possibly out-of-order) " + "without clEnqueueBarrierWithWaitList. Test can be a " + "valid test of out-of-order event dependencies.\n"); if (use_waitlists) - log_info("Detected incorrect execution (possibly out-of-order) without waitlists. Test can be a valid test of out-of-order event dependencies.\n"); + log_info("Detected incorrect execution (possibly out-of-order) " + "without waitlists. Test can be a valid test of " + "out-of-order event dependencies.\n"); if (use_marker) - log_info("Detected incorrect execution (possibly out-of-order) without clEnqueueMarker. Test can be a valid test of out-of-order event dependencies.\n"); - } else if( baseline_results == -1942 ) { + log_info("Detected incorrect execution (possibly out-of-order) " + "without clEnqueueMarker. Test can be a valid test of " + "out-of-order event dependencies.\n"); + } + else if (baseline_results == -1942) + { // Just ignore and return (out-of-order exec mode not supported) return 0; - } else { + } + else + { print_error(baseline_results, "Baseline run failed"); return baseline_results; } log_info("Running test for actual results...\n"); - return test_event_enqueue_wait_for_events_run_test(deviceID, context, queue, num_elements, two_queues, two_devices, - test_enqueue_wait_for_events, test_barrier, use_waitlists, use_marker); + return test_event_enqueue_wait_for_events_run_test( + deviceID, context, queue, num_elements, two_queues, two_devices, + test_enqueue_wait_for_events, test_barrier, use_waitlists, use_marker); } -int test_out_of_order_event_waitlist_single_queue( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements) +int test_out_of_order_event_waitlist_single_queue(cl_device_id deviceID, + cl_context context, + cl_command_queue queue, + int num_elements) { int two_queues = 0; int two_devices = 0; @@ -397,10 +543,15 @@ int 
test_out_of_order_event_waitlist_single_queue( cl_device_id deviceID, cl_con int test_barrier = 0; int use_waitlists = 1; int use_marker = 0; - return test(deviceID, context, queue, num_elements, two_queues, two_devices, test_enqueue_wait_for_events, test_barrier, use_waitlists, use_marker); + return test(deviceID, context, queue, num_elements, two_queues, two_devices, + test_enqueue_wait_for_events, test_barrier, use_waitlists, + use_marker); } -int test_out_of_order_event_waitlist_multi_queue( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements) +int test_out_of_order_event_waitlist_multi_queue(cl_device_id deviceID, + cl_context context, + cl_command_queue queue, + int num_elements) { int two_queues = 1; int two_devices = 0; @@ -408,10 +559,14 @@ int test_out_of_order_event_waitlist_multi_queue( cl_device_id deviceID, cl_cont int test_barrier = 0; int use_waitlists = 1; int use_marker = 0; - return test(deviceID, context, queue, num_elements, two_queues, two_devices, test_enqueue_wait_for_events, test_barrier, use_waitlists, use_marker); + return test(deviceID, context, queue, num_elements, two_queues, two_devices, + test_enqueue_wait_for_events, test_barrier, use_waitlists, + use_marker); } -int test_out_of_order_event_waitlist_multi_queue_multi_device( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements) +int test_out_of_order_event_waitlist_multi_queue_multi_device( + cl_device_id deviceID, cl_context context, cl_command_queue queue, + int num_elements) { int two_queues = 1; int two_devices = 1; @@ -419,11 +574,15 @@ int test_out_of_order_event_waitlist_multi_queue_multi_device( cl_device_id devi int test_barrier = 0; int use_waitlists = 1; int use_marker = 0; - return test(deviceID, context, queue, num_elements, two_queues, two_devices, test_enqueue_wait_for_events, test_barrier, use_waitlists, use_marker); + return test(deviceID, context, queue, num_elements, two_queues, two_devices, + 
test_enqueue_wait_for_events, test_barrier, use_waitlists, + use_marker); } -int test_out_of_order_event_enqueue_wait_for_events_single_queue( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements) +int test_out_of_order_event_enqueue_wait_for_events_single_queue( + cl_device_id deviceID, cl_context context, cl_command_queue queue, + int num_elements) { int two_queues = 0; int two_devices = 0; @@ -431,10 +590,14 @@ int test_out_of_order_event_enqueue_wait_for_events_single_queue( cl_device_id d int test_barrier = 0; int use_waitlists = 0; int use_marker = 0; - return test(deviceID, context, queue, num_elements, two_queues, two_devices, test_enqueue_wait_for_events, test_barrier, use_waitlists, use_marker); + return test(deviceID, context, queue, num_elements, two_queues, two_devices, + test_enqueue_wait_for_events, test_barrier, use_waitlists, + use_marker); } -int test_out_of_order_event_enqueue_wait_for_events_multi_queue( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements) +int test_out_of_order_event_enqueue_wait_for_events_multi_queue( + cl_device_id deviceID, cl_context context, cl_command_queue queue, + int num_elements) { int two_queues = 1; int two_devices = 0; @@ -442,11 +605,15 @@ int test_out_of_order_event_enqueue_wait_for_events_multi_queue( cl_device_id de int test_barrier = 0; int use_waitlists = 0; int use_marker = 0; - return test(deviceID, context, queue, num_elements, two_queues, two_devices, test_enqueue_wait_for_events, test_barrier, use_waitlists, use_marker); + return test(deviceID, context, queue, num_elements, two_queues, two_devices, + test_enqueue_wait_for_events, test_barrier, use_waitlists, + use_marker); } -int test_out_of_order_event_enqueue_wait_for_events_multi_queue_multi_device( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements) +int test_out_of_order_event_enqueue_wait_for_events_multi_queue_multi_device( + cl_device_id deviceID, 
cl_context context, cl_command_queue queue, + int num_elements) { int two_queues = 1; int two_devices = 1; @@ -454,13 +621,16 @@ int test_out_of_order_event_enqueue_wait_for_events_multi_queue_multi_device( cl int test_barrier = 0; int use_waitlists = 0; int use_marker = 0; - return test(deviceID, context, queue, num_elements, two_queues, two_devices, test_enqueue_wait_for_events, test_barrier, use_waitlists, use_marker); + return test(deviceID, context, queue, num_elements, two_queues, two_devices, + test_enqueue_wait_for_events, test_barrier, use_waitlists, + use_marker); } - - -int test_out_of_order_event_enqueue_barrier_single_queue( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements) +int test_out_of_order_event_enqueue_barrier_single_queue(cl_device_id deviceID, + cl_context context, + cl_command_queue queue, + int num_elements) { int two_queues = 0; int two_devices = 0; @@ -468,11 +638,16 @@ int test_out_of_order_event_enqueue_barrier_single_queue( cl_device_id deviceID, int test_barrier = 1; int use_waitlists = 0; int use_marker = 0; - return test(deviceID, context, queue, num_elements, two_queues, two_devices, test_enqueue_wait_for_events, test_barrier, use_waitlists, use_marker); + return test(deviceID, context, queue, num_elements, two_queues, two_devices, + test_enqueue_wait_for_events, test_barrier, use_waitlists, + use_marker); } -int test_out_of_order_event_enqueue_marker_single_queue( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements) +int test_out_of_order_event_enqueue_marker_single_queue(cl_device_id deviceID, + cl_context context, + cl_command_queue queue, + int num_elements) { int two_queues = 0; int two_devices = 0; @@ -480,10 +655,15 @@ int test_out_of_order_event_enqueue_marker_single_queue( cl_device_id deviceID, int test_barrier = 0; int use_waitlists = 0; int use_marker = 1; - return test(deviceID, context, queue, num_elements, two_queues, two_devices, 
test_enqueue_wait_for_events, test_barrier, use_waitlists, use_marker); + return test(deviceID, context, queue, num_elements, two_queues, two_devices, + test_enqueue_wait_for_events, test_barrier, use_waitlists, + use_marker); } -int test_out_of_order_event_enqueue_marker_multi_queue( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements) +int test_out_of_order_event_enqueue_marker_multi_queue(cl_device_id deviceID, + cl_context context, + cl_command_queue queue, + int num_elements) { int two_queues = 1; int two_devices = 0; @@ -491,11 +671,15 @@ int test_out_of_order_event_enqueue_marker_multi_queue( cl_device_id deviceID, c int test_barrier = 0; int use_waitlists = 0; int use_marker = 1; - return test(deviceID, context, queue, num_elements, two_queues, two_devices, test_enqueue_wait_for_events, test_barrier, use_waitlists, use_marker); + return test(deviceID, context, queue, num_elements, two_queues, two_devices, + test_enqueue_wait_for_events, test_barrier, use_waitlists, + use_marker); } -int test_out_of_order_event_enqueue_marker_multi_queue_multi_device( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements) +int test_out_of_order_event_enqueue_marker_multi_queue_multi_device( + cl_device_id deviceID, cl_context context, cl_command_queue queue, + int num_elements) { int two_queues = 1; int two_devices = 1; @@ -503,7 +687,7 @@ int test_out_of_order_event_enqueue_marker_multi_queue_multi_device( cl_device_i int test_barrier = 0; int use_waitlists = 0; int use_marker = 1; - return test(deviceID, context, queue, num_elements, two_queues, two_devices, test_enqueue_wait_for_events, test_barrier, use_waitlists, use_marker); + return test(deviceID, context, queue, num_elements, two_queues, two_devices, + test_enqueue_wait_for_events, test_barrier, use_waitlists, + use_marker); } - - diff --git a/test_conformance/events/test_events.cpp b/test_conformance/events/test_events.cpp index 26693f99..34157fa0 100644 
--- a/test_conformance/events/test_events.cpp +++ b/test_conformance/events/test_events.cpp @@ -1,6 +1,6 @@ // // Copyright (c) 2017 The Khronos Group Inc. -// +// // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at @@ -15,97 +15,112 @@ // #include "testBase.h" -#if ! defined( _WIN32 ) - #include "unistd.h" // for "sleep" used in the "while (1)" busy wait loop in +#if !defined(_WIN32) +#include "unistd.h" // for "sleep" used in the "while (1)" busy wait loop in #endif // test_event_flush const char *sample_long_test_kernel[] = { -"__kernel void sample_test(__global float *src, __global int *dst)\n" -"{\n" -" int tid = get_global_id(0);\n" -" int i;\n" -"\n" -" for( i = 0; i < 10000; i++ )\n" -" {\n" -" dst[tid] = (int)src[tid] * 3;\n" -" }\n" -"\n" -"}\n" }; - -int create_and_execute_kernel( cl_context inContext, cl_command_queue inQueue, cl_program *outProgram, cl_kernel *outKernel, cl_mem *streams, - unsigned int lineCount, const char **lines, const char *kernelName, cl_event *outEvent ) + "__kernel void sample_test(__global float *src, __global int *dst)\n" + "{\n" + " int tid = get_global_id(0);\n" + " int i;\n" + "\n" + " for( i = 0; i < 10000; i++ )\n" + " {\n" + " dst[tid] = (int)src[tid] * 3;\n" + " }\n" + "\n" + "}\n" +}; + +int create_and_execute_kernel(cl_context inContext, cl_command_queue inQueue, + cl_program *outProgram, cl_kernel *outKernel, + cl_mem *streams, unsigned int lineCount, + const char **lines, const char *kernelName, + cl_event *outEvent) { size_t threads[1] = { 1000 }, localThreads[1]; int error; - if( create_single_kernel_helper( inContext, outProgram, outKernel, lineCount, lines, kernelName ) ) + if (create_single_kernel_helper(inContext, outProgram, outKernel, lineCount, + lines, kernelName)) { return -1; } - error = get_max_common_work_group_size( inContext, *outKernel, threads[0], &localThreads[0] ); - 
test_error( error, "Unable to get work group size to use" ); + error = get_max_common_work_group_size(inContext, *outKernel, threads[0], + &localThreads[0]); + test_error(error, "Unable to get work group size to use"); streams[0] = clCreateBuffer(inContext, CL_MEM_READ_WRITE, sizeof(cl_float) * 1000, NULL, &error); - test_error( error, "Creating test array failed" ); + test_error(error, "Creating test array failed"); streams[1] = clCreateBuffer(inContext, CL_MEM_READ_WRITE, sizeof(cl_int) * 1000, NULL, &error); - test_error( error, "Creating test array failed" ); + test_error(error, "Creating test array failed"); /* Set the arguments */ - error = clSetKernelArg( *outKernel, 0, sizeof( streams[0] ), &streams[0] ); - test_error( error, "Unable to set kernel arguments" ); - error = clSetKernelArg( *outKernel, 1, sizeof( streams[1] ), &streams[1] ); - test_error( error, "Unable to set kernel arguments" ); + error = clSetKernelArg(*outKernel, 0, sizeof(streams[0]), &streams[0]); + test_error(error, "Unable to set kernel arguments"); + error = clSetKernelArg(*outKernel, 1, sizeof(streams[1]), &streams[1]); + test_error(error, "Unable to set kernel arguments"); - error = clEnqueueNDRangeKernel(inQueue, *outKernel, 1, NULL, threads, localThreads, 0, NULL, outEvent); - test_error( error, "Unable to execute test kernel" ); + error = clEnqueueNDRangeKernel(inQueue, *outKernel, 1, NULL, threads, + localThreads, 0, NULL, outEvent); + test_error(error, "Unable to execute test kernel"); return 0; } -#define SETUP_EVENT( c, q ) \ -clProgramWrapper program; \ -clKernelWrapper kernel; \ -clMemWrapper streams[2]; \ -clEventWrapper event; \ -int error; \ -if( create_and_execute_kernel( c, q, &program, &kernel, &streams[0], 1, sample_long_test_kernel, "sample_test", &event ) ) return -1; +#define SETUP_EVENT(c, q) \ + clProgramWrapper program; \ + clKernelWrapper kernel; \ + clMemWrapper streams[2]; \ + clEventWrapper event; \ + int error; \ + if (create_and_execute_kernel(c, q, 
&program, &kernel, &streams[0], 1, \ + sample_long_test_kernel, "sample_test", \ + &event)) \ + return -1; #define FINISH_EVENT(_q) clFinish(_q) -const char *IGetStatusString( cl_int status ) +const char *IGetStatusString(cl_int status) { - static char tempString[ 128 ]; - switch( status ) + static char tempString[128]; + switch (status) { - case CL_COMPLETE: return "CL_COMPLETE"; - case CL_RUNNING: return "CL_RUNNING"; - case CL_QUEUED: return "CL_QUEUED"; - case CL_SUBMITTED: return "CL_SUBMITTED"; + case CL_COMPLETE: return "CL_COMPLETE"; + case CL_RUNNING: return "CL_RUNNING"; + case CL_QUEUED: return "CL_QUEUED"; + case CL_SUBMITTED: return "CL_SUBMITTED"; default: - sprintf( tempString, "<unknown: %d>", (int)status ); + sprintf(tempString, "<unknown: %d>", (int)status); return tempString; } } /* Note: tests clGetEventStatus and clReleaseEvent (implicitly) */ -int test_event_get_execute_status( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements) +int test_event_get_execute_status(cl_device_id deviceID, cl_context context, + cl_command_queue queue, int num_elements) { cl_int status; - SETUP_EVENT( context, queue ); + SETUP_EVENT(context, queue); /* Now wait for it to be done */ - error = clWaitForEvents( 1, &event ); - test_error( error, "Unable to wait for event" ); - - error = clGetEventInfo( event, CL_EVENT_COMMAND_EXECUTION_STATUS, sizeof( status ), &status, NULL ); - test_error( error, "Calling clGetEventStatus to wait for event completion failed" ); - if( status != CL_COMPLETE ) + error = clWaitForEvents(1, &event); + test_error(error, "Unable to wait for event"); + + error = clGetEventInfo(event, CL_EVENT_COMMAND_EXECUTION_STATUS, + sizeof(status), &status, NULL); + test_error(error, + "Calling clGetEventStatus to wait for event completion failed"); + if (status != CL_COMPLETE) { - log_error( "ERROR: Incorrect status returned from clGetErrorStatus after event complete (%d:%s)\n", status, IGetStatusString( status ) ); + 
log_error("ERROR: Incorrect status returned from clGetErrorStatus " + "after event complete (%d:%s)\n", + status, IGetStatusString(status)); return -1; } @@ -113,57 +128,75 @@ int test_event_get_execute_status( cl_device_id deviceID, cl_context context, cl return 0; } -int test_event_get_info( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements) +int test_event_get_info(cl_device_id deviceID, cl_context context, + cl_command_queue queue, int num_elements) { - SETUP_EVENT( context, queue ); + SETUP_EVENT(context, queue); /* Verify parameters of clGetEventInfo not already tested by other tests */ cl_command_queue otherQueue; size_t size; - error = clGetEventInfo( event, CL_EVENT_COMMAND_QUEUE, sizeof( otherQueue ), &otherQueue, &size ); - test_error( error, "Unable to get event info!" ); - // We can not check if this is the right queue because this is an opaque object. - if( size != sizeof( queue ) ) + error = clGetEventInfo(event, CL_EVENT_COMMAND_QUEUE, sizeof(otherQueue), + &otherQueue, &size); + test_error(error, "Unable to get event info!"); + // We can not check if this is the right queue because this is an opaque + // object. + if (size != sizeof(queue)) { - log_error( "ERROR: Returned command queue size does not validate (expected %d, got %d)\n", (int)sizeof( queue ), (int)size ); + log_error("ERROR: Returned command queue size does not validate " + "(expected %d, got %d)\n", + (int)sizeof(queue), (int)size); return -1; } cl_command_type type; - error = clGetEventInfo( event, CL_EVENT_COMMAND_TYPE, sizeof( type ), &type, &size ); - test_error( error, "Unable to get event info!" 
); - if( type != CL_COMMAND_NDRANGE_KERNEL ) + error = clGetEventInfo(event, CL_EVENT_COMMAND_TYPE, sizeof(type), &type, + &size); + test_error(error, "Unable to get event info!"); + if (type != CL_COMMAND_NDRANGE_KERNEL) { - log_error( "ERROR: Returned command type does not validate (expected %d, got %d)\n", (int)CL_COMMAND_NDRANGE_KERNEL, (int)type ); + log_error("ERROR: Returned command type does not validate (expected " + "%d, got %d)\n", + (int)CL_COMMAND_NDRANGE_KERNEL, (int)type); return -1; } - if( size != sizeof( type ) ) + if (size != sizeof(type)) { - log_error( "ERROR: Returned command type size does not validate (expected %d, got %d)\n", (int)sizeof( type ), (int)size ); + log_error("ERROR: Returned command type size does not validate " + "(expected %d, got %d)\n", + (int)sizeof(type), (int)size); return -1; } cl_uint count; - error = clGetEventInfo( event, CL_EVENT_REFERENCE_COUNT, sizeof( count ), &count, &size ); - test_error( error, "Unable to get event info for CL_EVENT_REFERENCE_COUNT!" ); - if( size != sizeof( count ) ) + error = clGetEventInfo(event, CL_EVENT_REFERENCE_COUNT, sizeof(count), + &count, &size); + test_error(error, "Unable to get event info for CL_EVENT_REFERENCE_COUNT!"); + if (size != sizeof(count)) { - log_error( "ERROR: Returned command type size does not validate (expected %d, got %d)\n", (int)sizeof( type ), (int)size ); + log_error("ERROR: Returned command type size does not validate " + "(expected %d, got %d)\n", + (int)sizeof(type), (int)size); return -1; } cl_context testCtx; - error = clGetEventInfo( event, CL_EVENT_CONTEXT, sizeof( testCtx ), &testCtx, &size ); - test_error( error, "Unable to get event context info!" 
); - if( size != sizeof( context ) ) + error = clGetEventInfo(event, CL_EVENT_CONTEXT, sizeof(testCtx), &testCtx, + &size); + test_error(error, "Unable to get event context info!"); + if (size != sizeof(context)) { - log_error( "ERROR: Returned context size does not validate (expected %d, got %d)\n", (int)sizeof( context ), (int)size ); + log_error("ERROR: Returned context size does not validate (expected " + "%d, got %d)\n", + (int)sizeof(context), (int)size); return -1; } - if( testCtx != context ) + if (testCtx != context) { - log_error( "ERROR: Returned context does not match (expected %p, got %p)\n", (void *)context, (void *)testCtx ); + log_error( + "ERROR: Returned context does not match (expected %p, got %p)\n", + (void *)context, (void *)testCtx); return -1; } @@ -171,10 +204,11 @@ int test_event_get_info( cl_device_id deviceID, cl_context context, cl_command_q return 0; } -int test_event_get_write_array_status( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements) +int test_event_get_write_array_status(cl_device_id deviceID, cl_context context, + cl_command_queue queue, int num_elements) { cl_mem stream; - cl_float testArray[ 1024 * 32 ]; + cl_float testArray[1024 * 32]; cl_event event; int error; cl_int status; @@ -182,34 +216,41 @@ int test_event_get_write_array_status( cl_device_id deviceID, cl_context context stream = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(cl_float) * 1024 * 32, NULL, &error); - test_error( error, "Creating test array failed" ); + test_error(error, "Creating test array failed"); - error = clEnqueueWriteBuffer(queue, stream, CL_FALSE, 0, sizeof(cl_float)*1024*32, (void *)testArray, 0, NULL, &event); - test_error( error, "Unable to set testing kernel data" ); + error = clEnqueueWriteBuffer(queue, stream, CL_FALSE, 0, + sizeof(cl_float) * 1024 * 32, + (void *)testArray, 0, NULL, &event); + test_error(error, "Unable to set testing kernel data"); /* Now wait for it to be done */ - error = 
clWaitForEvents( 1, &event ); - test_error( error, "Unable to wait for event" ); - - error = clGetEventInfo( event, CL_EVENT_COMMAND_EXECUTION_STATUS, sizeof( status ), &status, NULL ); - test_error( error, "Calling clGetEventStatus to wait for event completion failed" ); - if( status != CL_COMPLETE ) + error = clWaitForEvents(1, &event); + test_error(error, "Unable to wait for event"); + + error = clGetEventInfo(event, CL_EVENT_COMMAND_EXECUTION_STATUS, + sizeof(status), &status, NULL); + test_error(error, + "Calling clGetEventStatus to wait for event completion failed"); + if (status != CL_COMPLETE) { - log_error( "ERROR: Incorrect status returned from clGetErrorStatus after array write complete (%d:%s)\n", status, IGetStatusString( status ) ); + log_error("ERROR: Incorrect status returned from clGetErrorStatus " + "after array write complete (%d:%s)\n", + status, IGetStatusString(status)); return -1; } - clReleaseMemObject( stream ); - clReleaseEvent( event ); + clReleaseMemObject(stream); + clReleaseEvent(event); return 0; } -int test_event_get_read_array_status( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements) +int test_event_get_read_array_status(cl_device_id deviceID, cl_context context, + cl_command_queue queue, int num_elements) { cl_mem stream; - cl_float testArray[ 1024 * 32 ]; + cl_float testArray[1024 * 32]; cl_event event; int error; cl_int status; @@ -217,58 +258,72 @@ int test_event_get_read_array_status( cl_device_id deviceID, cl_context context, stream = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(cl_float) * 1024 * 32, NULL, &error); - test_error( error, "Creating test array failed" ); + test_error(error, "Creating test array failed"); - error = clEnqueueReadBuffer(queue, stream, CL_FALSE, 0, sizeof(cl_float)*1024*32, (void *)testArray, 0, NULL, &event); - test_error( error, "Unable to get testing kernel data" ); + error = clEnqueueReadBuffer(queue, stream, CL_FALSE, 0, + sizeof(cl_float) * 1024 * 32, 
(void *)testArray, + 0, NULL, &event); + test_error(error, "Unable to get testing kernel data"); /* It should still be running... */ - error = clGetEventInfo( event, CL_EVENT_COMMAND_EXECUTION_STATUS, sizeof( status ), &status, NULL ); - test_error( error, "Calling clGetEventStatus didn't work!" ); + error = clGetEventInfo(event, CL_EVENT_COMMAND_EXECUTION_STATUS, + sizeof(status), &status, NULL); + test_error(error, "Calling clGetEventStatus didn't work!"); - if( status != CL_RUNNING && status != CL_QUEUED && status != CL_SUBMITTED && status != CL_COMPLETE) + if (status != CL_RUNNING && status != CL_QUEUED && status != CL_SUBMITTED + && status != CL_COMPLETE) { - log_error( "ERROR: Incorrect status returned from clGetErrorStatus during array read (%d:%s)\n", status, IGetStatusString( status ) ); + log_error("ERROR: Incorrect status returned from clGetErrorStatus " + "during array read (%d:%s)\n", + status, IGetStatusString(status)); return -1; } /* Now wait for it to be done */ - error = clWaitForEvents( 1, &event ); - test_error( error, "Unable to wait for event" ); - - error = clGetEventInfo( event, CL_EVENT_COMMAND_EXECUTION_STATUS, sizeof( status ), &status, NULL ); - test_error( error, "Calling clGetEventStatus to wait for event completion failed" ); - if( status != CL_COMPLETE ) + error = clWaitForEvents(1, &event); + test_error(error, "Unable to wait for event"); + + error = clGetEventInfo(event, CL_EVENT_COMMAND_EXECUTION_STATUS, + sizeof(status), &status, NULL); + test_error(error, + "Calling clGetEventStatus to wait for event completion failed"); + if (status != CL_COMPLETE) { - log_error( "ERROR: Incorrect status returned from clGetErrorStatus after array read complete (%d:%s)\n", status, IGetStatusString( status ) ); + log_error("ERROR: Incorrect status returned from clGetErrorStatus " + "after array read complete (%d:%s)\n", + status, IGetStatusString(status)); return -1; } - clReleaseMemObject( stream ); - clReleaseEvent( event ); + 
clReleaseMemObject(stream); + clReleaseEvent(event); return 0; } /* clGetEventStatus not implemented yet */ -int test_event_wait_for_execute( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements) +int test_event_wait_for_execute(cl_device_id deviceID, cl_context context, + cl_command_queue queue, int num_elements) { cl_int status; - SETUP_EVENT( context, queue ); + SETUP_EVENT(context, queue); /* Now we wait for it to be done, then test the status again */ - error = clWaitForEvents( 1, &event ); - test_error( error, "Unable to wait for execute event" ); + error = clWaitForEvents(1, &event); + test_error(error, "Unable to wait for execute event"); /* Make sure it worked */ - error = clGetEventInfo( event, CL_EVENT_COMMAND_EXECUTION_STATUS, sizeof( status ), &status, NULL ); - test_error( error, "Calling clGetEventStatus didn't work!" ); - if( status != CL_COMPLETE ) + error = clGetEventInfo(event, CL_EVENT_COMMAND_EXECUTION_STATUS, + sizeof(status), &status, NULL); + test_error(error, "Calling clGetEventStatus didn't work!"); + if (status != CL_COMPLETE) { - log_error( "ERROR: Incorrect status returned from clGetErrorStatus after event complete (%d:%s)\n", status, IGetStatusString( status ) ); + log_error("ERROR: Incorrect status returned from clGetErrorStatus " + "after event complete (%d:%s)\n", + status, IGetStatusString(status)); return -1; } @@ -276,11 +331,12 @@ int test_event_wait_for_execute( cl_device_id deviceID, cl_context context, cl_c return 0; } -int test_event_wait_for_array( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements) +int test_event_wait_for_array(cl_device_id deviceID, cl_context context, + cl_command_queue queue, int num_elements) { cl_mem streams[2]; - cl_float readArray[ 1024 * 32 ]; - cl_float writeArray[ 1024 * 32 ]; + cl_float readArray[1024 * 32]; + cl_float writeArray[1024 * 32]; cl_event events[2]; int error; cl_int status; @@ -288,128 +344,155 @@ int 
test_event_wait_for_array( cl_device_id deviceID, cl_context context, cl_com streams[0] = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(cl_float) * 1024 * 32, NULL, &error); - test_error( error, "Creating test array failed" ); + test_error(error, "Creating test array failed"); streams[1] = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(cl_float) * 1024 * 32, NULL, &error); - test_error( error, "Creating test array failed" ); + test_error(error, "Creating test array failed"); - error = clEnqueueReadBuffer(queue, streams[0], CL_FALSE, 0, sizeof(cl_float)*1024*32, (void *)readArray, 0, NULL, &events[0]); - test_error( error, "Unable to read testing kernel data" ); + error = clEnqueueReadBuffer(queue, streams[0], CL_FALSE, 0, + sizeof(cl_float) * 1024 * 32, (void *)readArray, + 0, NULL, &events[0]); + test_error(error, "Unable to read testing kernel data"); - error = clEnqueueWriteBuffer(queue, streams[1], CL_FALSE, 0, sizeof(cl_float)*1024*32, (void *)writeArray, 0, NULL, &events[1]); - test_error( error, "Unable to write testing kernel data" ); + error = clEnqueueWriteBuffer(queue, streams[1], CL_FALSE, 0, + sizeof(cl_float) * 1024 * 32, + (void *)writeArray, 0, NULL, &events[1]); + test_error(error, "Unable to write testing kernel data"); /* Both should still be running */ - error = clGetEventInfo( events[0], CL_EVENT_COMMAND_EXECUTION_STATUS, sizeof( status ), &status, NULL ); - test_error( error, "Calling clGetEventStatus didn't work!" 
); - if( status != CL_RUNNING && status != CL_QUEUED && status != CL_SUBMITTED && status != CL_COMPLETE) + error = clGetEventInfo(events[0], CL_EVENT_COMMAND_EXECUTION_STATUS, + sizeof(status), &status, NULL); + test_error(error, "Calling clGetEventStatus didn't work!"); + if (status != CL_RUNNING && status != CL_QUEUED && status != CL_SUBMITTED + && status != CL_COMPLETE) { - log_error( "ERROR: Incorrect status returned from clGetErrorStatus during array read (%d:%s)\n", status, IGetStatusString( status ) ); + log_error("ERROR: Incorrect status returned from clGetErrorStatus " + "during array read (%d:%s)\n", + status, IGetStatusString(status)); return -1; } - error = clGetEventInfo( events[1], CL_EVENT_COMMAND_EXECUTION_STATUS, sizeof( status ), &status, NULL ); - test_error( error, "Calling clGetEventStatus didn't work!" ); - if( status != CL_RUNNING && status != CL_QUEUED && status != CL_SUBMITTED && status != CL_COMPLETE) + error = clGetEventInfo(events[1], CL_EVENT_COMMAND_EXECUTION_STATUS, + sizeof(status), &status, NULL); + test_error(error, "Calling clGetEventStatus didn't work!"); + if (status != CL_RUNNING && status != CL_QUEUED && status != CL_SUBMITTED + && status != CL_COMPLETE) { - log_error( "ERROR: Incorrect status returned from clGetErrorStatus during array write (%d:%s)\n", status, IGetStatusString( status ) ); + log_error("ERROR: Incorrect status returned from clGetErrorStatus " + "during array write (%d:%s)\n", + status, IGetStatusString(status)); return -1; } /* Now try waiting for both */ - error = clWaitForEvents( 2, events ); - test_error( error, "Unable to wait for array events" ); + error = clWaitForEvents(2, events); + test_error(error, "Unable to wait for array events"); /* Double check status on both */ - error = clGetEventInfo( events[0], CL_EVENT_COMMAND_EXECUTION_STATUS, sizeof( status ), &status, NULL ); - test_error( error, "Calling clGetEventStatus didn't work!" 
); - if( status != CL_COMPLETE ) + error = clGetEventInfo(events[0], CL_EVENT_COMMAND_EXECUTION_STATUS, + sizeof(status), &status, NULL); + test_error(error, "Calling clGetEventStatus didn't work!"); + if (status != CL_COMPLETE) { - log_error( "ERROR: Incorrect status returned from clGetErrorStatus after array read complete (%d:%s)\n", status, IGetStatusString( status ) ); + log_error("ERROR: Incorrect status returned from clGetErrorStatus " + "after array read complete (%d:%s)\n", + status, IGetStatusString(status)); return -1; } - error = clGetEventInfo( events[1], CL_EVENT_COMMAND_EXECUTION_STATUS, sizeof( status ), &status, NULL ); - test_error( error, "Calling clGetEventStatus didn't work!" ); - if( status != CL_COMPLETE ) + error = clGetEventInfo(events[1], CL_EVENT_COMMAND_EXECUTION_STATUS, + sizeof(status), &status, NULL); + test_error(error, "Calling clGetEventStatus didn't work!"); + if (status != CL_COMPLETE) { - log_error( "ERROR: Incorrect status returned from clGetErrorStatus after array write complete (%d:%s)\n", status, IGetStatusString( status ) ); + log_error("ERROR: Incorrect status returned from clGetErrorStatus " + "after array write complete (%d:%s)\n", + status, IGetStatusString(status)); return -1; } - clReleaseMemObject( streams[0] ); - clReleaseMemObject( streams[1] ); - clReleaseEvent( events[0] ); - clReleaseEvent( events[1] ); + clReleaseMemObject(streams[0]); + clReleaseMemObject(streams[1]); + clReleaseEvent(events[0]); + clReleaseEvent(events[1]); return 0; } -int test_event_flush( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements) +int test_event_flush(cl_device_id deviceID, cl_context context, + cl_command_queue queue, int num_elements) { int loopCount = 0; cl_int status; - SETUP_EVENT( context, queue ); + SETUP_EVENT(context, queue); - /* Now flush. 
Note that we can't guarantee this actually lets the op finish, but we can guarantee it's no longer queued */ - error = clFlush( queue ); - test_error( error, "Unable to flush events" ); + /* Now flush. Note that we can't guarantee this actually lets the op finish, + * but we can guarantee it's no longer queued */ + error = clFlush(queue); + test_error(error, "Unable to flush events"); /* Make sure it worked */ - while (1) { - error = clGetEventInfo( event, CL_EVENT_COMMAND_EXECUTION_STATUS, - sizeof( status ), &status, NULL ); - test_error( error, "Calling clGetEventStatus didn't work!" ); + while (1) + { + error = clGetEventInfo(event, CL_EVENT_COMMAND_EXECUTION_STATUS, + sizeof(status), &status, NULL); + test_error(error, "Calling clGetEventStatus didn't work!"); - if( status != CL_QUEUED ) - break; + if (status != CL_QUEUED) break; -#if ! defined( _WIN32 ) +#if !defined(_WIN32) sleep(1); // give it some time here. #else // _WIN32 - Sleep(1000); + Sleep(1000); #endif ++loopCount; - } - -/* -CL_QUEUED (command has been enqueued in the command-queue), -CL_SUBMITTED (enqueued command has been submitted by the host to the device associated with the command-queue), -CL_RUNNING (device is currently executing this command), -CL_COMPLETE (the command has completed), or -Error code given by a negative integer value. (command was abnormally terminated – this may be caused by a bad memory access etc.). 
-*/ - if(status != CL_COMPLETE && status != CL_SUBMITTED && - status != CL_RUNNING && status != CL_COMPLETE) - { - log_error( "ERROR: Incorrect status returned from clGetErrorStatus after event flush (%d:%s)\n", status, IGetStatusString( status ) ); + } + + /* + CL_QUEUED (command has been enqueued in the command-queue), + CL_SUBMITTED (enqueued command has been submitted by the host to the device + associated with the command-queue), CL_RUNNING (device is currently + executing this command), CL_COMPLETE (the command has completed), or Error + code given by a negative integer value. (command was abnormally terminated – + this may be caused by a bad memory access etc.). + */ + if (status != CL_COMPLETE && status != CL_SUBMITTED && status != CL_RUNNING + && status != CL_COMPLETE) + { + log_error("ERROR: Incorrect status returned from clGetErrorStatus " + "after event flush (%d:%s)\n", + status, IGetStatusString(status)); return -1; } /* Now wait */ - error = clFinish( queue ); - test_error( error, "Unable to finish events" ); + error = clFinish(queue); + test_error(error, "Unable to finish events"); FINISH_EVENT(queue); return 0; } -int test_event_finish_execute( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements) +int test_event_finish_execute(cl_device_id deviceID, cl_context context, + cl_command_queue queue, int num_elements) { cl_int status; - SETUP_EVENT( context, queue ); + SETUP_EVENT(context, queue); /* Now flush and finish all ops */ - error = clFinish( queue ); - test_error( error, "Unable to finish all events" ); + error = clFinish(queue); + test_error(error, "Unable to finish all events"); /* Make sure it worked */ - error = clGetEventInfo( event, CL_EVENT_COMMAND_EXECUTION_STATUS, sizeof( status ), &status, NULL ); - test_error( error, "Calling clGetEventStatus didn't work!" 
); - if( status != CL_COMPLETE ) + error = clGetEventInfo(event, CL_EVENT_COMMAND_EXECUTION_STATUS, + sizeof(status), &status, NULL); + test_error(error, "Calling clGetEventStatus didn't work!"); + if (status != CL_COMPLETE) { - log_error( "ERROR: Incorrect status returned from clGetErrorStatus after event complete (%d:%s)\n", status, IGetStatusString( status ) ); + log_error("ERROR: Incorrect status returned from clGetErrorStatus " + "after event complete (%d:%s)\n", + status, IGetStatusString(status)); return -1; } @@ -417,11 +500,12 @@ int test_event_finish_execute( cl_device_id deviceID, cl_context context, cl_com return 0; } -int test_event_finish_array( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements) +int test_event_finish_array(cl_device_id deviceID, cl_context context, + cl_command_queue queue, int num_elements) { cl_mem streams[2]; - cl_float readArray[ 1024 * 32 ]; - cl_float writeArray[ 1024 * 32 ]; + cl_float readArray[1024 * 32]; + cl_float writeArray[1024 * 32]; cl_event events[2]; int error; cl_int status; @@ -429,59 +513,77 @@ int test_event_finish_array( cl_device_id deviceID, cl_context context, cl_comma streams[0] = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(cl_float) * 1024 * 32, NULL, &error); - test_error( error, "Creating test array failed" ); + test_error(error, "Creating test array failed"); streams[1] = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(cl_float) * 1024 * 32, NULL, &error); - test_error( error, "Creating test array failed" ); + test_error(error, "Creating test array failed"); - error = clEnqueueReadBuffer(queue, streams[0], CL_FALSE, 0, sizeof(cl_float)*1024*32, (void *)readArray, 0, NULL, &events[0]); - test_error( error, "Unable to read testing kernel data" ); + error = clEnqueueReadBuffer(queue, streams[0], CL_FALSE, 0, + sizeof(cl_float) * 1024 * 32, (void *)readArray, + 0, NULL, &events[0]); + test_error(error, "Unable to read testing kernel data"); - error = 
clEnqueueWriteBuffer(queue, streams[1], CL_FALSE, 0, sizeof(cl_float)*1024*32, (void *)writeArray, 0, NULL, &events[1]); - test_error( error, "Unable to write testing kernel data" ); + error = clEnqueueWriteBuffer(queue, streams[1], CL_FALSE, 0, + sizeof(cl_float) * 1024 * 32, + (void *)writeArray, 0, NULL, &events[1]); + test_error(error, "Unable to write testing kernel data"); /* Both should still be running */ - error = clGetEventInfo( events[0], CL_EVENT_COMMAND_EXECUTION_STATUS, sizeof( status ), &status, NULL ); - test_error( error, "Calling clGetEventStatus didn't work!" ); - if( status != CL_RUNNING && status != CL_QUEUED && status != CL_SUBMITTED && status != CL_COMPLETE) + error = clGetEventInfo(events[0], CL_EVENT_COMMAND_EXECUTION_STATUS, + sizeof(status), &status, NULL); + test_error(error, "Calling clGetEventStatus didn't work!"); + if (status != CL_RUNNING && status != CL_QUEUED && status != CL_SUBMITTED + && status != CL_COMPLETE) { - log_error( "ERROR: Incorrect status returned from clGetErrorStatus during array read (%d:%s)\n", status, IGetStatusString( status ) ); + log_error("ERROR: Incorrect status returned from clGetErrorStatus " + "during array read (%d:%s)\n", + status, IGetStatusString(status)); return -1; } - error = clGetEventInfo( events[1], CL_EVENT_COMMAND_EXECUTION_STATUS, sizeof( status ), &status, NULL ); - test_error( error, "Calling clGetEventStatus didn't work!" 
); - if( status != CL_RUNNING && status != CL_QUEUED && status != CL_SUBMITTED && status != CL_COMPLETE) + error = clGetEventInfo(events[1], CL_EVENT_COMMAND_EXECUTION_STATUS, + sizeof(status), &status, NULL); + test_error(error, "Calling clGetEventStatus didn't work!"); + if (status != CL_RUNNING && status != CL_QUEUED && status != CL_SUBMITTED + && status != CL_COMPLETE) { - log_error( "ERROR: Incorrect status returned from clGetErrorStatus during array write (%d:%s)\n", status, IGetStatusString( status ) ); + log_error("ERROR: Incorrect status returned from clGetErrorStatus " + "during array write (%d:%s)\n", + status, IGetStatusString(status)); return -1; } /* Now try finishing all ops */ - error = clFinish( queue ); - test_error( error, "Unable to finish all events" ); + error = clFinish(queue); + test_error(error, "Unable to finish all events"); /* Double check status on both */ - error = clGetEventInfo( events[0], CL_EVENT_COMMAND_EXECUTION_STATUS, sizeof( status ), &status, NULL ); - test_error( error, "Calling clGetEventStatus didn't work!" ); - if( status != CL_COMPLETE ) + error = clGetEventInfo(events[0], CL_EVENT_COMMAND_EXECUTION_STATUS, + sizeof(status), &status, NULL); + test_error(error, "Calling clGetEventStatus didn't work!"); + if (status != CL_COMPLETE) { - log_error( "ERROR: Incorrect status returned from clGetErrorStatus after array read complete (%d:%s)\n", status, IGetStatusString( status ) ); + log_error("ERROR: Incorrect status returned from clGetErrorStatus " + "after array read complete (%d:%s)\n", + status, IGetStatusString(status)); return -1; } - error = clGetEventInfo( events[1], CL_EVENT_COMMAND_EXECUTION_STATUS, sizeof( status ), &status, NULL ); - test_error( error, "Calling clGetEventStatus didn't work!" 
); - if( status != CL_COMPLETE ) + error = clGetEventInfo(events[1], CL_EVENT_COMMAND_EXECUTION_STATUS, + sizeof(status), &status, NULL); + test_error(error, "Calling clGetEventStatus didn't work!"); + if (status != CL_COMPLETE) { - log_error( "ERROR: Incorrect status returned from clGetErrorStatus after array write complete (%d:%s)\n", status, IGetStatusString( status ) ); + log_error("ERROR: Incorrect status returned from clGetErrorStatus " + "after array write complete (%d:%s)\n", + status, IGetStatusString(status)); return -1; } - clReleaseMemObject( streams[0] ); - clReleaseMemObject( streams[1] ); - clReleaseEvent( events[0] ); - clReleaseEvent( events[1] ); + clReleaseMemObject(streams[0]); + clReleaseMemObject(streams[1]); + clReleaseEvent(events[0]); + clReleaseEvent(events[1]); return 0; } @@ -489,7 +591,8 @@ int test_event_finish_array( cl_device_id deviceID, cl_context context, cl_comma #define NUM_EVENT_RUNS 100 -int test_event_release_before_done( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements) +int test_event_release_before_done(cl_device_id deviceID, cl_context context, + cl_command_queue queue, int num_elements) { // Create a kernel to run clProgramWrapper program; @@ -501,21 +604,24 @@ int test_event_release_before_done( cl_device_id deviceID, cl_context context, c int error, i; // Create a kernel - if( create_single_kernel_helper( context, &program, &kernel[0], 1, sample_long_test_kernel, "sample_test" ) ) + if (create_single_kernel_helper(context, &program, &kernel[0], 1, + sample_long_test_kernel, "sample_test")) { return -1; } - for( i = 1; i < NUM_EVENT_RUNS; i++ ) { - kernel[i] = clCreateKernel(program, "sample_test", &error); - test_error(error, "Unable to create kernel"); - } + for (i = 1; i < NUM_EVENT_RUNS; i++) + { + kernel[i] = clCreateKernel(program, "sample_test", &error); + test_error(error, "Unable to create kernel"); + } - error = get_max_common_work_group_size( context, kernel[0], 1024, 
&threads[0] ); - test_error( error, "Unable to get work group size to use" ); + error = + get_max_common_work_group_size(context, kernel[0], 1024, &threads[0]); + test_error(error, "Unable to get work group size to use"); // Create a set of streams to use as arguments - for( i = 0; i < NUM_EVENT_RUNS; i++ ) + for (i = 0; i < NUM_EVENT_RUNS; i++) { streams[i][0] = clCreateBuffer(context, CL_MEM_READ_WRITE, @@ -523,77 +629,89 @@ int test_event_release_before_done( cl_device_id deviceID, cl_context context, c streams[i][1] = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(cl_int) * threads[0], NULL, &error); - if( ( streams[i][0] == NULL ) || ( streams[i][1] == NULL ) ) + if ((streams[i][0] == NULL) || (streams[i][1] == NULL)) { - log_error( "ERROR: Unable to allocate testing streams" ); + log_error("ERROR: Unable to allocate testing streams"); return -1; } } - // Execute the kernels one by one, hopefully making sure they won't be done by the time we get to the end - for( i = 0; i < NUM_EVENT_RUNS; i++ ) + // Execute the kernels one by one, hopefully making sure they won't be done + // by the time we get to the end + for (i = 0; i < NUM_EVENT_RUNS; i++) { - error = clSetKernelArg( kernel[i], 0, sizeof( cl_mem ), &streams[i][0] ); - error |= clSetKernelArg( kernel[i], 1, sizeof( cl_mem ), &streams[i][1] ); - test_error( error, "Unable to set kernel arguments" ); + error = clSetKernelArg(kernel[i], 0, sizeof(cl_mem), &streams[i][0]); + error |= clSetKernelArg(kernel[i], 1, sizeof(cl_mem), &streams[i][1]); + test_error(error, "Unable to set kernel arguments"); - error = clEnqueueNDRangeKernel( queue, kernel[i], 1, NULL, threads, threads, 0, NULL, &events[i]); - test_error( error, "Unable to execute test kernel" ); + error = clEnqueueNDRangeKernel(queue, kernel[i], 1, NULL, threads, + threads, 0, NULL, &events[i]); + test_error(error, "Unable to execute test kernel"); } // Free all but the last event - for( i = 0; i < NUM_EVENT_RUNS - 1; i++ ) + for (i = 0; i < 
NUM_EVENT_RUNS - 1; i++) { - clReleaseEvent( events[ i ] ); + clReleaseEvent(events[i]); } // Get status on the last one, then free it - error = clGetEventInfo( events[ NUM_EVENT_RUNS - 1 ], CL_EVENT_COMMAND_EXECUTION_STATUS, sizeof( status ), &status, NULL ); - test_error( error, "Unable to get event status" ); + error = clGetEventInfo(events[NUM_EVENT_RUNS - 1], + CL_EVENT_COMMAND_EXECUTION_STATUS, sizeof(status), + &status, NULL); + test_error(error, "Unable to get event status"); - clReleaseEvent( events[ NUM_EVENT_RUNS - 1 ] ); + clReleaseEvent(events[NUM_EVENT_RUNS - 1]); // Was the status still-running? - if( status == CL_COMPLETE ) + if (status == CL_COMPLETE) { - log_info( "WARNING: Events completed before they could be released, so test is a null-op. Increase workload and try again." ); + log_info("WARNING: Events completed before they could be released, so " + "test is a null-op. Increase workload and try again."); } - else if( status == CL_RUNNING || status == CL_QUEUED || status == CL_SUBMITTED ) + else if (status == CL_RUNNING || status == CL_QUEUED + || status == CL_SUBMITTED) { - log_info( "Note: Event status was running or queued when released, so test was good.\n" ); + log_info("Note: Event status was running or queued when released, so " + "test was good.\n"); } // If we didn't crash by now, the test succeeded - clFinish( queue ); + clFinish(queue); return 0; } -int test_event_enqueue_marker( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements) +int test_event_enqueue_marker(cl_device_id deviceID, cl_context context, + cl_command_queue queue, int num_elements) { cl_int status; - SETUP_EVENT( context, queue ); + SETUP_EVENT(context, queue); - /* Now we queue a marker and wait for that, which--since it queues afterwards--should guarantee the execute finishes too */ + /* Now we queue a marker and wait for that, which--since it queues + * afterwards--should guarantee the execute finishes too */ clEventWrapper 
markerEvent; - //error = clEnqueueMarker( queue, &markerEvent ); + // error = clEnqueueMarker( queue, &markerEvent ); #ifdef CL_VERSION_1_2 - error = clEnqueueMarkerWithWaitList(queue, 0, NULL, &markerEvent ); + error = clEnqueueMarkerWithWaitList(queue, 0, NULL, &markerEvent); #else - error = clEnqueueMarker( queue, &markerEvent ); + error = clEnqueueMarker(queue, &markerEvent); #endif - test_error( error, "Unable to queue marker" ); + test_error(error, "Unable to queue marker"); /* Now we wait for it to be done, then test the status again */ - error = clWaitForEvents( 1, &markerEvent ); - test_error( error, "Unable to wait for marker event" ); + error = clWaitForEvents(1, &markerEvent); + test_error(error, "Unable to wait for marker event"); /* Check the status of the first event */ - error = clGetEventInfo( event, CL_EVENT_COMMAND_EXECUTION_STATUS, sizeof( status ), &status, NULL ); - test_error( error, "Calling clGetEventInfo didn't work!" ); - if( status != CL_COMPLETE ) + error = clGetEventInfo(event, CL_EVENT_COMMAND_EXECUTION_STATUS, + sizeof(status), &status, NULL); + test_error(error, "Calling clGetEventInfo didn't work!"); + if (status != CL_COMPLETE) { - log_error( "ERROR: Incorrect status returned from clGetEventInfo after event complete (%d:%s)\n", status, IGetStatusString( status ) ); + log_error("ERROR: Incorrect status returned from clGetEventInfo after " + "event complete (%d:%s)\n", + status, IGetStatusString(status)); return -1; } @@ -602,85 +720,101 @@ int test_event_enqueue_marker( cl_device_id deviceID, cl_context context, cl_com } #ifdef CL_VERSION_1_2 -int test_event_enqueue_marker_with_event_list( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements) +int test_event_enqueue_marker_with_event_list(cl_device_id deviceID, + cl_context context, + cl_command_queue queue, + int num_elements) { + SETUP_EVENT(context, queue); + cl_event event_list[3] = { NULL, NULL, NULL }; - cl_int status; - SETUP_EVENT( context, 
queue ); - cl_event event_list[3]={ NULL, NULL, NULL}; + size_t threads[1] = { 10 }, localThreads[1] = { 1 }; + cl_uint event_count = 2; + error = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, threads, + localThreads, 0, NULL, &event_list[0]); + test_error(error, " clEnqueueMarkerWithWaitList 1 "); - size_t threads[1] = { 10 }, localThreads[1]={1}; - cl_uint event_count=2; - error= clEnqueueNDRangeKernel( queue,kernel,1,NULL, threads, localThreads, 0, NULL, &event_list[0]); - test_error( error, " clEnqueueMarkerWithWaitList 1 " ); + error = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, threads, + localThreads, 0, NULL, &event_list[1]); + test_error(error, " clEnqueueMarkerWithWaitList 2"); - error= clEnqueueNDRangeKernel( queue,kernel,1,NULL, threads, localThreads, 0, NULL, &event_list[1]); - test_error( error, " clEnqueueMarkerWithWaitList 2" ); - - error= clEnqueueNDRangeKernel( queue,kernel,1,NULL, threads, localThreads, 0, NULL, NULL); - test_error( error, " clEnqueueMarkerWithWaitList 3" ); + error = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, threads, + localThreads, 0, NULL, NULL); + test_error(error, " clEnqueueMarkerWithWaitList 3"); // test the case event returned - error =clEnqueueMarkerWithWaitList(queue, event_count, event_list, &event_list[2]); - test_error( error, " clEnqueueMarkerWithWaitList " ); + error = clEnqueueMarkerWithWaitList(queue, event_count, event_list, + &event_list[2]); + test_error(error, " clEnqueueMarkerWithWaitList "); error = clReleaseEvent(event_list[0]); error |= clReleaseEvent(event_list[1]); - test_error( error, "clReleaseEvent" ); + test_error(error, "clReleaseEvent"); - error= clEnqueueNDRangeKernel( queue,kernel,1,NULL, threads, localThreads, 0, NULL, &event_list[0]); - test_error( error, " clEnqueueMarkerWithWaitList 1 -1 " ); + error = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, threads, + localThreads, 0, NULL, &event_list[0]); + test_error(error, " clEnqueueMarkerWithWaitList 1 -1 "); - error= 
clEnqueueNDRangeKernel( queue,kernel,1,NULL, threads, localThreads, 0, NULL, &event_list[1]); - test_error( error, " clEnqueueMarkerWithWaitList 2-2" ); + error = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, threads, + localThreads, 0, NULL, &event_list[1]); + test_error(error, " clEnqueueMarkerWithWaitList 2-2"); - // test the case event =NULL, caused [CL_INVALID_VALUE] : OpenCL Error : clEnqueueMarkerWithWaitList failed: event is a NULL value - error =clEnqueueMarkerWithWaitList(queue, event_count, event_list, NULL); - test_error( error, " clEnqueueMarkerWithWaitList " ); + // test the case event =NULL, caused [CL_INVALID_VALUE] : OpenCL Error : + // clEnqueueMarkerWithWaitList failed: event is a NULL value + error = clEnqueueMarkerWithWaitList(queue, event_count, event_list, NULL); + test_error(error, " clEnqueueMarkerWithWaitList "); error = clReleaseEvent(event_list[0]); error |= clReleaseEvent(event_list[1]); error |= clReleaseEvent(event_list[2]); - test_error( error, "clReleaseEvent" ); + test_error(error, "clReleaseEvent"); FINISH_EVENT(queue); return 0; } -int test_event_enqueue_barrier_with_event_list( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements) +int test_event_enqueue_barrier_with_event_list(cl_device_id deviceID, + cl_context context, + cl_command_queue queue, + int num_elements) { + SETUP_EVENT(context, queue); + cl_event event_list[3] = { NULL, NULL, NULL }; - cl_int status; - SETUP_EVENT( context, queue ); - cl_event event_list[3]={ NULL, NULL, NULL}; - - size_t threads[1] = { 10 }, localThreads[1]={1}; - cl_uint event_count=2; - error= clEnqueueNDRangeKernel( queue,kernel,1,NULL, threads, localThreads, 0, NULL, &event_list[0]); - test_error( error, " clEnqueueBarrierWithWaitList 1 " ); + size_t threads[1] = { 10 }, localThreads[1] = { 1 }; + cl_uint event_count = 2; + error = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, threads, + localThreads, 0, NULL, &event_list[0]); + test_error(error, " 
clEnqueueBarrierWithWaitList 1 "); - error= clEnqueueNDRangeKernel( queue,kernel,1,NULL, threads, localThreads, 0, NULL, &event_list[1]); - test_error( error, " clEnqueueBarrierWithWaitList 2" ); + error = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, threads, + localThreads, 0, NULL, &event_list[1]); + test_error(error, " clEnqueueBarrierWithWaitList 2"); - error= clEnqueueNDRangeKernel( queue,kernel,1,NULL, threads, localThreads, 0, NULL, NULL); - test_error( error, " clEnqueueBarrierWithWaitList 20" ); + error = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, threads, + localThreads, 0, NULL, NULL); + test_error(error, " clEnqueueBarrierWithWaitList 20"); // test the case event returned - error =clEnqueueBarrierWithWaitList(queue, event_count, event_list, &event_list[2]); - test_error( error, " clEnqueueBarrierWithWaitList " ); + error = clEnqueueBarrierWithWaitList(queue, event_count, event_list, + &event_list[2]); + test_error(error, " clEnqueueBarrierWithWaitList "); clReleaseEvent(event_list[0]); clReleaseEvent(event_list[1]); - error= clEnqueueNDRangeKernel( queue,kernel,1,NULL, threads, localThreads, 0, NULL, &event_list[0]); - test_error( error, " clEnqueueBarrierWithWaitList 1 " ); + error = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, threads, + localThreads, 0, NULL, &event_list[0]); + test_error(error, " clEnqueueBarrierWithWaitList 1 "); - error= clEnqueueNDRangeKernel( queue,kernel,1,NULL, threads, localThreads, 0, NULL, &event_list[1]); - test_error( error, " clEnqueueBarrierWithWaitList 2" ); + error = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, threads, + localThreads, 0, NULL, &event_list[1]); + test_error(error, " clEnqueueBarrierWithWaitList 2"); - // test the case event =NULL, caused [CL_INVALID_VALUE] : OpenCL Error : clEnqueueMarkerWithWaitList failed: event is a NULL value - error = clEnqueueBarrierWithWaitList(queue, event_count, event_list, NULL); - test_error( error, " clEnqueueBarrierWithWaitList " ); + // test the case event 
=NULL, caused [CL_INVALID_VALUE] : OpenCL Error : + // clEnqueueMarkerWithWaitList failed: event is a NULL value + error = clEnqueueBarrierWithWaitList(queue, event_count, event_list, NULL); + test_error(error, " clEnqueueBarrierWithWaitList "); clReleaseEvent(event_list[0]); clReleaseEvent(event_list[1]); diff --git a/test_conformance/events/test_userevents.cpp b/test_conformance/events/test_userevents.cpp index 0a4954f9..1fdb4ea4 100644 --- a/test_conformance/events/test_userevents.cpp +++ b/test_conformance/events/test_userevents.cpp @@ -1,6 +1,6 @@ // // Copyright (c) 2017 The Khronos Group Inc. -// +// // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at @@ -14,11 +14,11 @@ // limitations under the License. // #if defined(__APPLE__) - #include <OpenCL/opencl.h> - #include <mach/mach_time.h> +#include <OpenCL/opencl.h> +#include <mach/mach_time.h> #else - #include <CL/cl.h> - #include <malloc.h> +#include <CL/cl.h> +#include <malloc.h> #endif #include <assert.h> #include <stdio.h> @@ -29,189 +29,261 @@ // CL error checking. #if defined(_MSC_VER) -#define CL_EXIT_ERROR(cmd,...) \ -{ \ -if ((cmd) != CL_SUCCESS) { \ -log_error("CL ERROR: %s %u: ", __FILE__,__LINE__);\ -log_error(## __VA_ARGS__ );\ -log_error("\n");\ -return -1;\ -}\ -} +#define CL_EXIT_ERROR(cmd, ...) \ + { \ + if ((cmd) != CL_SUCCESS) \ + { \ + log_error("CL ERROR: %s %u: ", __FILE__, __LINE__); \ + log_error(##__VA_ARGS__); \ + log_error("\n"); \ + return -1; \ + } \ + } #else -#define CL_EXIT_ERROR(cmd,format,...) \ -{ \ -if ((cmd) != CL_SUCCESS) { \ -log_error("CL ERROR: %s %u: ", __FILE__,__LINE__);\ -log_error(format,## __VA_ARGS__ );\ -log_error("\n");\ -return -1;\ -}\ -} -#endif - -#define CL_EXIT_BUILD_ERROR(cmd,program,format,...) 
\ -{ \ -if ((cmd) != CL_SUCCESS) { \ -cl_uint num_devices_;\ -clGetProgramInfo(program,CL_PROGRAM_NUM_DEVICES,sizeof(num_devices_),&num_devices_,NULL);\ -cl_device_id *device_list;\ -device_list=(cl_device_id *)malloc(num_devices_*sizeof(cl_device_id));\ -clGetProgramInfo(program,CL_PROGRAM_DEVICES,num_devices_*sizeof(cl_device_id),device_list,NULL);\ -for (unsigned i=0;i<num_devices_;++i) {\ -size_t len;\ -char buffer[2048];\ -clGetProgramBuildInfo(program,device_list[i],CL_PROGRAM_BUILD_LOG,sizeof(buffer),buffer,&len);\ -log_error("DEVICE %u CL BUILD ERROR: %s(%u): ",i,__FILE__,__LINE__);\ -log_error(format,## __VA_ARGS__ );\ -log_error("\n");\ -}\ -free(device_list);\ -return -1;\ -}\ -} - -const char* src[] = { - "__kernel void simple_task(__global float* output) {\n" - " output[0] += 1;\n" - "}\n" -}; - -enum { MaxDevices = 8 }; - -int test_userevents( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements) -{ - - cl_int err; - - cl_event u1 = clCreateUserEvent( context, &err ); - CL_EXIT_ERROR(err,"clCreateUserEvent failed"); - - // Test event properties. - cl_int s; - size_t sizeofs; - CL_EXIT_ERROR(clGetEventInfo(u1, CL_EVENT_COMMAND_EXECUTION_STATUS, sizeof s, &s, &sizeofs),"clGetEventInfo failed"); - CL_EXIT_ERROR((sizeof s == sizeofs) ? CL_SUCCESS : -1,"clGetEventInfo returned wrong size for CL_EVENT_COMMAND_EXECUTION_STATUS"); - CL_EXIT_ERROR((s == CL_SUBMITTED) ? CL_SUCCESS : -1,"clGetEventInfo returned wrong value for CL_EVENT_COMMAND_EXECUTION_STATUS"); - - cl_command_type t; - size_t sizeoft; - CL_EXIT_ERROR(clGetEventInfo(u1, CL_EVENT_COMMAND_TYPE, sizeof t, &t, &sizeoft),"clGetEventInfo failed"); - CL_EXIT_ERROR((sizeof t == sizeoft) ? CL_SUCCESS : -1,"clGetEventInfo returned wrong size for CL_EVENT_COMMAND_TYPE"); - CL_EXIT_ERROR((t == CL_COMMAND_USER) ? 
CL_SUCCESS : -1,"clGetEventInfo returned wrong value for CL_EVENT_COMMAND_TYPE"); - - cl_command_queue q; - size_t sizeofq; - CL_EXIT_ERROR(clGetEventInfo(u1, CL_EVENT_COMMAND_QUEUE, sizeof q, &q, &sizeofq),"clGetEventInfo failed"); - CL_EXIT_ERROR((sizeof q == sizeofq) ? CL_SUCCESS : -1,"clGetEventInfo returned wrong size for CL_EVENT_COMMAND_QUEUE"); - CL_EXIT_ERROR((q == NULL) ? CL_SUCCESS : -1,"clGetEventInfo returned wrong value for CL_EVENT_COMMAND_QUEUE"); - - cl_context c; - size_t sizeofc; - CL_EXIT_ERROR(clGetEventInfo(u1, CL_EVENT_CONTEXT, sizeof c, &c, &sizeofc),"clGetEventInfo failed"); - CL_EXIT_ERROR((sizeof c == sizeofc) ? CL_SUCCESS : -1,"clGetEventInfo returned wrong size for CL_EVENT_CONTEXT"); - CL_EXIT_ERROR((c == context) ? CL_SUCCESS : -1,"clGetEventInfo returned wrong value for CL_EVENT_CONTEXT"); - - cl_ulong p; - err = clGetEventProfilingInfo(u1,CL_PROFILING_COMMAND_QUEUED,sizeof p,&p,0); - CL_EXIT_ERROR((err != CL_SUCCESS) ? CL_SUCCESS : -1,"clGetEventProfilingInfo returned wrong error."); - - // Test semantics. - cl_program program; - err = create_single_kernel_helper_create_program(context, &program, 1, src); - CL_EXIT_ERROR(err,"clCreateProgramWithSource failed"); - - CL_EXIT_BUILD_ERROR(clBuildProgram(program,0,NULL,"",NULL,NULL),program,"Building program from inline src:\t%s",src[0]); - - cl_kernel k0 = clCreateKernel(program,"simple_task",&err); - CL_EXIT_ERROR(err,"clCreateKernel failed"); - - float buffer[1]; - cl_mem output = clCreateBuffer(context,CL_MEM_USE_HOST_PTR,sizeof buffer, buffer, &err); - CL_EXIT_ERROR(err,"clCreateBuffer failed."); - - CL_EXIT_ERROR(clSetKernelArg(k0,0,sizeof(output),&output),"clSetKernelArg failed"); - - - // Successful case. 
////////////////////////////////////////////////////////////////////////////////////// - { - cl_event e[4]; - cl_uint N = sizeof e / sizeof(cl_event); - - log_info("Enqueuing tasks\n"); - for (cl_uint i = 0; i != N; ++i) - CL_EXIT_ERROR(clEnqueueTask(queue,k0,1,&u1,&e[i]),"clEnqueueTaskFailed"); - - log_info("Checking task status before setting user event status\n"); - for (cl_uint i = 0; i != N; ++i) { - CL_EXIT_ERROR(clGetEventInfo(e[i],CL_EVENT_COMMAND_EXECUTION_STATUS,sizeof s,&s,0),"clGetEventInfo failed"); - CL_EXIT_ERROR((s >= CL_SUBMITTED) ? CL_SUCCESS : -1,"clGetEventInfo %u returned wrong status before user event",i); +#define CL_EXIT_ERROR(cmd, format, ...) \ + { \ + if ((cmd) != CL_SUCCESS) \ + { \ + log_error("CL ERROR: %s %u: ", __FILE__, __LINE__); \ + log_error(format, ##__VA_ARGS__); \ + log_error("\n"); \ + return -1; \ + } \ } +#endif - log_info("Setting user event status to complete\n"); - CL_EXIT_ERROR(clSetUserEventStatus(u1,CL_COMPLETE),"clSetUserEventStatus failed"); - - log_info("Waiting for tasks to finish executing\n"); - CL_EXIT_ERROR(clWaitForEvents( 1, &e[N-1] ),"clWaitForEvent failed"); - - log_info("Checking task status after setting user event status\n"); - for (cl_uint i = 0; i != N; ++i) { - CL_EXIT_ERROR(clGetEventInfo(e[i],CL_EVENT_COMMAND_EXECUTION_STATUS,sizeof s,&s,0),"clGetEventInfo failed"); - CL_EXIT_ERROR((s != CL_QUEUED) ? CL_SUCCESS : -1,"clGetEventInfo %u returned wrong status %04x after successful user event",i,s); +#define CL_EXIT_BUILD_ERROR(cmd, program, format, ...) 
\ + { \ + if ((cmd) != CL_SUCCESS) \ + { \ + cl_uint num_devices_; \ + clGetProgramInfo(program, CL_PROGRAM_NUM_DEVICES, \ + sizeof(num_devices_), &num_devices_, NULL); \ + cl_device_id *device_list; \ + device_list = \ + (cl_device_id *)malloc(num_devices_ * sizeof(cl_device_id)); \ + clGetProgramInfo(program, CL_PROGRAM_DEVICES, \ + num_devices_ * sizeof(cl_device_id), device_list, \ + NULL); \ + for (unsigned i = 0; i < num_devices_; ++i) \ + { \ + size_t len; \ + char buffer[2048]; \ + clGetProgramBuildInfo(program, device_list[i], \ + CL_PROGRAM_BUILD_LOG, sizeof(buffer), \ + buffer, &len); \ + log_error("DEVICE %u CL BUILD ERROR: %s(%u): ", i, __FILE__, \ + __LINE__); \ + log_error(format, ##__VA_ARGS__); \ + log_error("\n"); \ + } \ + free(device_list); \ + return -1; \ + } \ } - CL_EXIT_ERROR(clReleaseEvent(u1),"clReleaseEvent failed"); - - for (cl_uint i = 0; i != N; ++i) - CL_EXIT_ERROR(clReleaseEvent(e[i]),"clReleaseEvent failed"); - - log_info("Successful user event case passed.\n"); - - } +const char *src[] = { "__kernel void simple_task(__global float* output) {\n" + " output[0] += 1;\n" + "}\n" }; - // Test unsuccessful user event case. /////////////////////////////////////////////////////////////////// - { - cl_event u2 = clCreateUserEvent( context, &err ); - CL_EXIT_ERROR(err,"clCreateUserEvent failed"); - - cl_event e[4]; - cl_uint N = sizeof e / sizeof(cl_event); +enum +{ + MaxDevices = 8 +}; - log_info("Enqueuing tasks\n"); - for (cl_uint i = 0; i != N; ++i) - CL_EXIT_ERROR(clEnqueueTask(queue,k0,1,&u2,&e[i]),"clEnqueueTaskFailed"); +int test_userevents(cl_device_id deviceID, cl_context context, + cl_command_queue queue, int num_elements) +{ - log_info("Checking task status before setting user event status\n"); - for (cl_uint i = 0; i != N; ++i) { - CL_EXIT_ERROR(clGetEventInfo(e[i],CL_EVENT_COMMAND_EXECUTION_STATUS,sizeof s,&s,0),"clGetEventInfo failed"); - CL_EXIT_ERROR((s == CL_QUEUED || s == CL_SUBMITTED) ? 
CL_SUCCESS : -1,"clGetEventInfo %u returned wrong status %d before user event",i, (int) s); + cl_int err; + + cl_event u1 = clCreateUserEvent(context, &err); + CL_EXIT_ERROR(err, "clCreateUserEvent failed"); + + // Test event properties. + cl_int s; + size_t sizeofs; + CL_EXIT_ERROR(clGetEventInfo(u1, CL_EVENT_COMMAND_EXECUTION_STATUS, + sizeof s, &s, &sizeofs), + "clGetEventInfo failed"); + CL_EXIT_ERROR((sizeof s == sizeofs) ? CL_SUCCESS : -1, + "clGetEventInfo returned wrong size for " + "CL_EVENT_COMMAND_EXECUTION_STATUS"); + CL_EXIT_ERROR((s == CL_SUBMITTED) ? CL_SUCCESS : -1, + "clGetEventInfo returned wrong value for " + "CL_EVENT_COMMAND_EXECUTION_STATUS"); + + cl_command_type t; + size_t sizeoft; + CL_EXIT_ERROR( + clGetEventInfo(u1, CL_EVENT_COMMAND_TYPE, sizeof t, &t, &sizeoft), + "clGetEventInfo failed"); + CL_EXIT_ERROR( + (sizeof t == sizeoft) ? CL_SUCCESS : -1, + "clGetEventInfo returned wrong size for CL_EVENT_COMMAND_TYPE"); + CL_EXIT_ERROR( + (t == CL_COMMAND_USER) ? CL_SUCCESS : -1, + "clGetEventInfo returned wrong value for CL_EVENT_COMMAND_TYPE"); + + cl_command_queue q; + size_t sizeofq; + CL_EXIT_ERROR( + clGetEventInfo(u1, CL_EVENT_COMMAND_QUEUE, sizeof q, &q, &sizeofq), + "clGetEventInfo failed"); + CL_EXIT_ERROR( + (sizeof q == sizeofq) ? CL_SUCCESS : -1, + "clGetEventInfo returned wrong size for CL_EVENT_COMMAND_QUEUE"); + CL_EXIT_ERROR( + (q == NULL) ? CL_SUCCESS : -1, + "clGetEventInfo returned wrong value for CL_EVENT_COMMAND_QUEUE"); + + cl_context c; + size_t sizeofc; + CL_EXIT_ERROR(clGetEventInfo(u1, CL_EVENT_CONTEXT, sizeof c, &c, &sizeofc), + "clGetEventInfo failed"); + CL_EXIT_ERROR((sizeof c == sizeofc) ? CL_SUCCESS : -1, + "clGetEventInfo returned wrong size for CL_EVENT_CONTEXT"); + CL_EXIT_ERROR((c == context) ? 
CL_SUCCESS : -1, + "clGetEventInfo returned wrong value for CL_EVENT_CONTEXT"); + + cl_ulong p; + err = clGetEventProfilingInfo(u1, CL_PROFILING_COMMAND_QUEUED, sizeof p, &p, + 0); + CL_EXIT_ERROR((err != CL_SUCCESS) ? CL_SUCCESS : -1, + "clGetEventProfilingInfo returned wrong error."); + + // Test semantics. + cl_program program; + err = create_single_kernel_helper_create_program(context, &program, 1, src); + CL_EXIT_ERROR(err, "clCreateProgramWithSource failed"); + + CL_EXIT_BUILD_ERROR(clBuildProgram(program, 0, NULL, "", NULL, NULL), + program, "Building program from inline src:\t%s", + src[0]); + + cl_kernel k0 = clCreateKernel(program, "simple_task", &err); + CL_EXIT_ERROR(err, "clCreateKernel failed"); + + float buffer[1]; + cl_mem output = clCreateBuffer(context, CL_MEM_USE_HOST_PTR, sizeof buffer, + buffer, &err); + CL_EXIT_ERROR(err, "clCreateBuffer failed."); + + CL_EXIT_ERROR(clSetKernelArg(k0, 0, sizeof(output), &output), + "clSetKernelArg failed"); + + + // Successful case. + // ////////////////////////////////////////////////////////////////////////////////////// + { + cl_event e[4]; + cl_uint N = sizeof e / sizeof(cl_event); + + log_info("Enqueuing tasks\n"); + for (cl_uint i = 0; i != N; ++i) + CL_EXIT_ERROR(clEnqueueTask(queue, k0, 1, &u1, &e[i]), + "clEnqueueTaskFailed"); + + log_info("Checking task status before setting user event status\n"); + for (cl_uint i = 0; i != N; ++i) + { + CL_EXIT_ERROR(clGetEventInfo(e[i], + CL_EVENT_COMMAND_EXECUTION_STATUS, + sizeof s, &s, 0), + "clGetEventInfo failed"); + CL_EXIT_ERROR( + (s >= CL_SUBMITTED) ? 
CL_SUCCESS : -1, + "clGetEventInfo %u returned wrong status before user event", i); + } + + log_info("Setting user event status to complete\n"); + CL_EXIT_ERROR(clSetUserEventStatus(u1, CL_COMPLETE), + "clSetUserEventStatus failed"); + + log_info("Waiting for tasks to finish executing\n"); + CL_EXIT_ERROR(clWaitForEvents(1, &e[N - 1]), "clWaitForEvent failed"); + + log_info("Checking task status after setting user event status\n"); + for (cl_uint i = 0; i != N; ++i) + { + CL_EXIT_ERROR(clGetEventInfo(e[i], + CL_EVENT_COMMAND_EXECUTION_STATUS, + sizeof s, &s, 0), + "clGetEventInfo failed"); + CL_EXIT_ERROR((s != CL_QUEUED) ? CL_SUCCESS : -1, + "clGetEventInfo %u returned wrong status %04x after " + "successful user event", + i, s); + } + + CL_EXIT_ERROR(clReleaseEvent(u1), "clReleaseEvent failed"); + + for (cl_uint i = 0; i != N; ++i) + CL_EXIT_ERROR(clReleaseEvent(e[i]), "clReleaseEvent failed"); + + log_info("Successful user event case passed.\n"); } - log_info("Setting user event status to unsuccessful result\n"); - CL_EXIT_ERROR(clSetUserEventStatus(u2,-1),"clSetUserEventStatus failed"); - - log_info("Waiting for tasks to finish executing\n"); - CL_EXIT_ERROR((clWaitForEvents( N, &e[0] )!=CL_SUCCESS) ? CL_SUCCESS : -1,"clWaitForEvent succeeded when it should have failed"); - - log_info("Checking task status after setting user event status\n"); - for (cl_uint i = 0; i != N; ++i) { - CL_EXIT_ERROR(clGetEventInfo(e[i],CL_EVENT_COMMAND_EXECUTION_STATUS,sizeof s,&s,0),"clGetEventInfo failed"); - CL_EXIT_ERROR((s != CL_QUEUED) ? CL_SUCCESS : -1,"clGetEventInfo %u returned wrong status %04x after unsuccessful user event",i,s); + // Test unsuccessful user event case. 
+ // /////////////////////////////////////////////////////////////////// + { + cl_event u2 = clCreateUserEvent(context, &err); + CL_EXIT_ERROR(err, "clCreateUserEvent failed"); + + cl_event e[4]; + cl_uint N = sizeof e / sizeof(cl_event); + + log_info("Enqueuing tasks\n"); + for (cl_uint i = 0; i != N; ++i) + CL_EXIT_ERROR(clEnqueueTask(queue, k0, 1, &u2, &e[i]), + "clEnqueueTaskFailed"); + + log_info("Checking task status before setting user event status\n"); + for (cl_uint i = 0; i != N; ++i) + { + CL_EXIT_ERROR(clGetEventInfo(e[i], + CL_EVENT_COMMAND_EXECUTION_STATUS, + sizeof s, &s, 0), + "clGetEventInfo failed"); + CL_EXIT_ERROR( + (s == CL_QUEUED || s == CL_SUBMITTED) ? CL_SUCCESS : -1, + "clGetEventInfo %u returned wrong status %d before user event", + i, (int)s); + } + + log_info("Setting user event status to unsuccessful result\n"); + CL_EXIT_ERROR(clSetUserEventStatus(u2, -1), + "clSetUserEventStatus failed"); + + log_info("Waiting for tasks to finish executing\n"); + CL_EXIT_ERROR((clWaitForEvents(N, &e[0]) != CL_SUCCESS) ? CL_SUCCESS + : -1, + "clWaitForEvent succeeded when it should have failed"); + + log_info("Checking task status after setting user event status\n"); + for (cl_uint i = 0; i != N; ++i) + { + CL_EXIT_ERROR(clGetEventInfo(e[i], + CL_EVENT_COMMAND_EXECUTION_STATUS, + sizeof s, &s, 0), + "clGetEventInfo failed"); + CL_EXIT_ERROR((s != CL_QUEUED) ? 
CL_SUCCESS : -1, + "clGetEventInfo %u returned wrong status %04x after " + "unsuccessful user event", + i, s); + } + + CL_EXIT_ERROR(clReleaseEvent(u2), "clReleaseEvent failed"); + + for (cl_uint i = 0; i != N; ++i) + CL_EXIT_ERROR(clReleaseEvent(e[i]), "clReleaseEvent failed"); + + log_info("Unsuccessful user event case passed.\n"); } - CL_EXIT_ERROR(clReleaseEvent(u2),"clReleaseEvent failed"); - - for (cl_uint i = 0; i != N; ++i) - CL_EXIT_ERROR(clReleaseEvent(e[i]),"clReleaseEvent failed"); - - log_info("Unsuccessful user event case passed.\n"); - } - - clReleaseKernel(k0); - clReleaseProgram(program); - clReleaseMemObject(output); - - return 0; + clReleaseKernel(k0); + clReleaseProgram(program); + clReleaseMemObject(output); + return 0; } - diff --git a/test_conformance/events/test_userevents_multithreaded.cpp b/test_conformance/events/test_userevents_multithreaded.cpp index 51ef2226..a7845bf1 100644 --- a/test_conformance/events/test_userevents_multithreaded.cpp +++ b/test_conformance/events/test_userevents_multithreaded.cpp @@ -1,6 +1,6 @@ // // Copyright (c) 2017 The Khronos Group Inc. -// +// // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
// You may obtain a copy of the License at @@ -19,8 +19,8 @@ #include <thread> -#if !defined (_MSC_VER) - #include <unistd.h> +#if !defined(_MSC_VER) +#include <unistd.h> #endif // !_MSC_VER void trigger_user_event(cl_event *event) @@ -30,44 +30,44 @@ void trigger_user_event(cl_event *event) clSetUserEventStatus(*event, CL_COMPLETE); } -int test_userevents_multithreaded( cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements ) +int test_userevents_multithreaded(cl_device_id deviceID, cl_context context, + cl_command_queue queue, int num_elements) { cl_int error; // Set up a user event to act as a gate - clEventWrapper gateEvent = clCreateUserEvent( context, &error ); - test_error( error, "Unable to create user gate event" ); + clEventWrapper gateEvent = clCreateUserEvent(context, &error); + test_error(error, "Unable to create user gate event"); // Set up a few actions gated on the user event NDRangeKernelAction action1; ReadBufferAction action2; WriteBufferAction action3; - clEventWrapper actionEvents[ 3 ]; - Action * actions[] = { &action1, &action2, &action3, NULL }; + clEventWrapper actionEvents[3]; + Action *actions[] = { &action1, &action2, &action3, NULL }; - for( int i = 0; actions[ i ] != NULL; i++ ) + for (int i = 0; actions[i] != NULL; i++) { - error = actions[ i ]->Setup( deviceID, context, queue ); - test_error( error, "Unable to set up test action" ); + error = actions[i]->Setup(deviceID, context, queue); + test_error(error, "Unable to set up test action"); - error = actions[ i ]->Execute( queue, 1, &gateEvent, &actionEvents[ i ] ); - test_error( error, "Unable to execute test action" ); + error = actions[i]->Execute(queue, 1, &gateEvent, &actionEvents[i]); + test_error(error, "Unable to execute test action"); } // Now, instead of releasing the gate, we spawn a separate thread to do so - log_info( "\tStarting trigger thread...\n" ); + log_info("\tStarting trigger thread...\n"); std::thread thread(trigger_user_event, 
&gateEvent); - log_info( "\tWaiting for actions...\n" ); - error = clWaitForEvents( 3, &actionEvents[ 0 ] ); - test_error( error, "Unable to wait for action events" ); + log_info("\tWaiting for actions...\n"); + error = clWaitForEvents(3, &actionEvents[0]); + test_error(error, "Unable to wait for action events"); thread.join(); - log_info( "\tActions completed.\n" ); + log_info("\tActions completed.\n"); // If we got here without error, we're good return 0; } - diff --git a/test_conformance/events/test_waitlists.cpp b/test_conformance/events/test_waitlists.cpp index e23cacf4..6036451f 100644 --- a/test_conformance/events/test_waitlists.cpp +++ b/test_conformance/events/test_waitlists.cpp @@ -1,6 +1,6 @@ // // Copyright (c) 2017 The Khronos Group Inc. -// +// // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at @@ -17,306 +17,374 @@ #include "action_classes.h" -extern const char *IGetStatusString( cl_int status ); +extern const char *IGetStatusString(cl_int status); #define PRINT_OPS 0 -int test_waitlist( cl_device_id device, cl_context context, cl_command_queue queue, Action *actionToTest, bool multiple ) +int test_waitlist(cl_device_id device, cl_context context, + cl_command_queue queue, Action *actionToTest, bool multiple) { - NDRangeKernelAction actions[ 2 ]; - clEventWrapper events[ 3 ]; - cl_int status[ 3 ]; + NDRangeKernelAction actions[2]; + clEventWrapper events[3]; + cl_int status[3]; cl_int error; - if (multiple) - log_info("\tExecuting reference event 0, then reference event 1 with reference event 0 in its waitlist, then test event 2 with reference events 0 and 1 in its waitlist.\n"); - else - log_info("\tExecuting reference event 0, then test event 2 with reference event 0 in its waitlist.\n"); + if (multiple) + log_info("\tExecuting reference event 0, then reference event 1 with " + "reference event 0 in its waitlist, then test 
event 2 with " + "reference events 0 and 1 in its waitlist.\n"); + else + log_info("\tExecuting reference event 0, then test event 2 with " + "reference event 0 in its waitlist.\n"); // Set up the first base action to wait against - error = actions[ 0 ].Setup( device, context, queue ); - test_error( error, "Unable to setup base event to wait against" ); + error = actions[0].Setup(device, context, queue); + test_error(error, "Unable to setup base event to wait against"); - if( multiple ) + if (multiple) { // Set up a second event to wait against - error = actions[ 1 ].Setup( device, context, queue ); - test_error( error, "Unable to setup second base event to wait against" ); + error = actions[1].Setup(device, context, queue); + test_error(error, "Unable to setup second base event to wait against"); } // Now set up the actual action to test - error = actionToTest->Setup( device, context, queue ); - test_error( error, "Unable to set up test event" ); + error = actionToTest->Setup(device, context, queue); + test_error(error, "Unable to set up test event"); // Execute all events now - if (PRINT_OPS) log_info("\tExecuting action 0...\n"); - error = actions[ 0 ].Execute( queue, 0, NULL, &events[ 0 ] ); - test_error( error, "Unable to execute first event" ); + if (PRINT_OPS) log_info("\tExecuting action 0...\n"); + error = actions[0].Execute(queue, 0, NULL, &events[0]); + test_error(error, "Unable to execute first event"); - if( multiple ) + if (multiple) { - if (PRINT_OPS) log_info("\tExecuting action 1...\n"); - error = actions[ 1 ].Execute( queue, 1, &events[0], &events[ 1 ] ); - test_error( error, "Unable to execute second event" ); + if (PRINT_OPS) log_info("\tExecuting action 1...\n"); + error = actions[1].Execute(queue, 1, &events[0], &events[1]); + test_error(error, "Unable to execute second event"); } // Sanity check - if( multiple ) { - if (PRINT_OPS) log_info("\tChecking status of action 1...\n"); - error = clGetEventInfo( events[ 1 ], 
CL_EVENT_COMMAND_EXECUTION_STATUS, sizeof( status[ 1 ] ), &status[ 1 ], NULL ); - test_error( error, "Unable to get event status" ); - } - if (PRINT_OPS) log_info("\tChecking status of action 0...\n"); - error = clGetEventInfo( events[ 0 ], CL_EVENT_COMMAND_EXECUTION_STATUS, sizeof( status[ 0 ] ), &status[ 0 ], NULL ); - test_error( error, "Unable to get event status" ); - - log_info("\t\tEvent status after starting reference events: reference event 0: %s, reference event 1: %s, test event 2: %s.\n", - IGetStatusString( status[ 0 ] ), (multiple ? IGetStatusString( status[ 1 ] ) : "N/A"), "N/A"); - - if( ( status[ 0 ] == CL_COMPLETE ) || ( multiple && status[ 1 ] == CL_COMPLETE ) ) + if (multiple) + { + if (PRINT_OPS) log_info("\tChecking status of action 1...\n"); + error = clGetEventInfo(events[1], CL_EVENT_COMMAND_EXECUTION_STATUS, + sizeof(status[1]), &status[1], NULL); + test_error(error, "Unable to get event status"); + } + if (PRINT_OPS) log_info("\tChecking status of action 0...\n"); + error = clGetEventInfo(events[0], CL_EVENT_COMMAND_EXECUTION_STATUS, + sizeof(status[0]), &status[0], NULL); + test_error(error, "Unable to get event status"); + + log_info("\t\tEvent status after starting reference events: reference " + "event 0: %s, reference event 1: %s, test event 2: %s.\n", + IGetStatusString(status[0]), + (multiple ? IGetStatusString(status[1]) : "N/A"), "N/A"); + + if ((status[0] == CL_COMPLETE) || (multiple && status[1] == CL_COMPLETE)) { - log_info( "WARNING: Reference event(s) already completed before we could execute test event! Possible that the reference event blocked (implicitly passing)\n" ); + log_info("WARNING: Reference event(s) already completed before we " + "could execute test event! Possible that the reference event " + "blocked (implicitly passing)\n"); return 0; } - if (PRINT_OPS) log_info("\tExecuting action to test...\n"); - error = actionToTest->Execute( queue, ( multiple ) ? 
2 : 1, &events[ 0 ], &events[ 2 ] ); - test_error( error, "Unable to execute test event" ); + if (PRINT_OPS) log_info("\tExecuting action to test...\n"); + error = actionToTest->Execute(queue, (multiple) ? 2 : 1, &events[0], + &events[2]); + test_error(error, "Unable to execute test event"); // Hopefully, the first event is still running - if (PRINT_OPS) log_info("\tChecking status of action to test 2...\n"); - error = clGetEventInfo( events[ 2 ], CL_EVENT_COMMAND_EXECUTION_STATUS, sizeof( status[ 2 ] ), &status[ 2 ], NULL ); - test_error( error, "Unable to get event status" ); - if( multiple ) { - if (PRINT_OPS) log_info("\tChecking status of action 1...\n"); - error = clGetEventInfo( events[ 1 ], CL_EVENT_COMMAND_EXECUTION_STATUS, sizeof( status[ 1 ] ), &status[ 1 ], NULL ); - test_error( error, "Unable to get event status" ); - } - if (PRINT_OPS) log_info("\tChecking status of action 0...\n"); - error = clGetEventInfo( events[ 0 ], CL_EVENT_COMMAND_EXECUTION_STATUS, sizeof( status[ 0 ] ), &status[ 0 ], NULL ); - test_error( error, "Unable to get event status" ); - - log_info("\t\tEvent status after starting test event: reference event 0: %s, reference event 1: %s, test event 2: %s.\n", - IGetStatusString( status[ 0 ] ), (multiple ? 
IGetStatusString( status[ 1 ] ) : "N/A"), IGetStatusString( status[ 2 ] )); - - if( multiple ) + if (PRINT_OPS) log_info("\tChecking status of action to test 2...\n"); + error = clGetEventInfo(events[2], CL_EVENT_COMMAND_EXECUTION_STATUS, + sizeof(status[2]), &status[2], NULL); + test_error(error, "Unable to get event status"); + if (multiple) + { + if (PRINT_OPS) log_info("\tChecking status of action 1...\n"); + error = clGetEventInfo(events[1], CL_EVENT_COMMAND_EXECUTION_STATUS, + sizeof(status[1]), &status[1], NULL); + test_error(error, "Unable to get event status"); + } + if (PRINT_OPS) log_info("\tChecking status of action 0...\n"); + error = clGetEventInfo(events[0], CL_EVENT_COMMAND_EXECUTION_STATUS, + sizeof(status[0]), &status[0], NULL); + test_error(error, "Unable to get event status"); + + log_info("\t\tEvent status after starting test event: reference event 0: " + "%s, reference event 1: %s, test event 2: %s.\n", + IGetStatusString(status[0]), + (multiple ? IGetStatusString(status[1]) : "N/A"), + IGetStatusString(status[2])); + + if (multiple) { - if( status[ 0 ] == CL_COMPLETE && status[ 1 ] == CL_COMPLETE ) + if (status[0] == CL_COMPLETE && status[1] == CL_COMPLETE) { - log_info( "WARNING: Both events completed, so unable to test further (implicitly passing).\n" ); - clFinish( queue ); + log_info("WARNING: Both events completed, so unable to test " + "further (implicitly passing).\n"); + clFinish(queue); return 0; } - if(status[1] == CL_COMPLETE && status[0] != CL_COMPLETE) - { - log_error("ERROR: Test failed because the second wait event is complete and the first is not.(status: 0: %s and 1: %s)\n", IGetStatusString( status[ 0 ] ), IGetStatusString( status[ 1 ] ) ); - clFinish( queue ); + if (status[1] == CL_COMPLETE && status[0] != CL_COMPLETE) + { + log_error( + "ERROR: Test failed because the second wait event is complete " + "and the first is not.(status: 0: %s and 1: %s)\n", + IGetStatusString(status[0]), IGetStatusString(status[1])); + 
clFinish(queue); return -1; - } + } } else { - if( status[ 0 ] == CL_COMPLETE ) + if (status[0] == CL_COMPLETE) { - log_info( "WARNING: Reference event completed, so unable to test further (implicitly passing).\n" ); - clFinish( queue ); + log_info("WARNING: Reference event completed, so unable to test " + "further (implicitly passing).\n"); + clFinish(queue); return 0; } - if( status[ 0 ] != CL_RUNNING && status[ 0 ] != CL_QUEUED && status[ 0 ] != CL_SUBMITTED ) + if (status[0] != CL_RUNNING && status[0] != CL_QUEUED + && status[0] != CL_SUBMITTED) { - log_error( "ERROR: Test failed because first wait event is not currently running, queued, or submitted! (status: 0: %s)\n", IGetStatusString( status[ 0 ] ) ); - clFinish( queue ); + log_error( + "ERROR: Test failed because first wait event is not currently " + "running, queued, or submitted! (status: 0: %s)\n", + IGetStatusString(status[0])); + clFinish(queue); return -1; } } - if( status[ 2 ] != CL_QUEUED && status[ 2 ] != CL_SUBMITTED ) + if (status[2] != CL_QUEUED && status[2] != CL_SUBMITTED) { - log_error( "ERROR: Test event is not waiting to run! (status: 2: %s)\n", IGetStatusString( status[ 2 ] ) ); - clFinish( queue ); + log_error("ERROR: Test event is not waiting to run! 
(status: 2: %s)\n", + IGetStatusString(status[2])); + clFinish(queue); return -1; } // Now wait for the first reference event - if (PRINT_OPS) log_info("\tWaiting for action 1 to finish...\n"); - error = clWaitForEvents( 1, &events[ 0 ] ); - test_error( error, "Unable to wait for reference event" ); + if (PRINT_OPS) log_info("\tWaiting for action 1 to finish...\n"); + error = clWaitForEvents(1, &events[0]); + test_error(error, "Unable to wait for reference event"); // Grab statuses again - if (PRINT_OPS) log_info("\tChecking status of action to test 2...\n"); - error = clGetEventInfo( events[ 2 ], CL_EVENT_COMMAND_EXECUTION_STATUS, sizeof( status[ 2 ] ), &status[ 2 ], NULL ); - test_error( error, "Unable to get event status" ); - if( multiple ) { - if (PRINT_OPS) log_info("\tChecking status of action 1...\n"); - error = clGetEventInfo( events[ 1 ], CL_EVENT_COMMAND_EXECUTION_STATUS, sizeof( status[ 1 ] ), &status[ 1 ], NULL ); - test_error( error, "Unable to get event status" ); - } - if (PRINT_OPS) log_info("\tChecking status of action 0...\n"); - error = clGetEventInfo( events[ 0 ], CL_EVENT_COMMAND_EXECUTION_STATUS, sizeof( status[ 0 ] ), &status[ 0 ], NULL ); - test_error( error, "Unable to get event status" ); - - log_info("\t\tEvent status after waiting for reference event 0: reference event 0: %s, reference event 1: %s, test event 2: %s.\n", - IGetStatusString( status[ 0 ] ), (multiple ? 
IGetStatusString( status[ 1 ] ) : "N/A"), IGetStatusString( status[ 2 ] )); + if (PRINT_OPS) log_info("\tChecking status of action to test 2...\n"); + error = clGetEventInfo(events[2], CL_EVENT_COMMAND_EXECUTION_STATUS, + sizeof(status[2]), &status[2], NULL); + test_error(error, "Unable to get event status"); + if (multiple) + { + if (PRINT_OPS) log_info("\tChecking status of action 1...\n"); + error = clGetEventInfo(events[1], CL_EVENT_COMMAND_EXECUTION_STATUS, + sizeof(status[1]), &status[1], NULL); + test_error(error, "Unable to get event status"); + } + if (PRINT_OPS) log_info("\tChecking status of action 0...\n"); + error = clGetEventInfo(events[0], CL_EVENT_COMMAND_EXECUTION_STATUS, + sizeof(status[0]), &status[0], NULL); + test_error(error, "Unable to get event status"); + + log_info("\t\tEvent status after waiting for reference event 0: reference " + "event 0: %s, reference event 1: %s, test event 2: %s.\n", + IGetStatusString(status[0]), + (multiple ? IGetStatusString(status[1]) : "N/A"), + IGetStatusString(status[2])); // Sanity - if( status[ 0 ] != CL_COMPLETE ) + if (status[0] != CL_COMPLETE) { - log_error( "ERROR: Waited for first event but it's not complete (status: 0: %s)\n", IGetStatusString( status[ 0 ] ) ); - clFinish( queue ); + log_error("ERROR: Waited for first event but it's not complete " + "(status: 0: %s)\n", + IGetStatusString(status[0])); + clFinish(queue); return -1; } - // If we're multiple, and the second event isn't complete, then our test event should still be queued - if( multiple && status[ 1 ] != CL_COMPLETE ) + // If we're multiple, and the second event isn't complete, then our test + // event should still be queued + if (multiple && status[1] != CL_COMPLETE) { - if( status[ 1 ] == CL_RUNNING && status[ 2 ] == CL_RUNNING ) { - log_error("ERROR: Test event and second event are both running.\n"); - clFinish( queue ); - return -1; - } - if( status[ 2 ] != CL_QUEUED && status[ 2 ] != CL_SUBMITTED ) + if (status[1] == CL_RUNNING && 
status[2] == CL_RUNNING) + { + log_error("ERROR: Test event and second event are both running.\n"); + clFinish(queue); + return -1; + } + if (status[2] != CL_QUEUED && status[2] != CL_SUBMITTED) { - log_error( "ERROR: Test event did not wait for second event before starting! (status of ref: 1: %s, of test: 2: %s)\n", IGetStatusString( status[ 1 ] ), IGetStatusString( status[ 2 ] ) ); - clFinish( queue ); + log_error("ERROR: Test event did not wait for second event before " + "starting! (status of ref: 1: %s, of test: 2: %s)\n", + IGetStatusString(status[1]), IGetStatusString(status[2])); + clFinish(queue); return -1; } // Now wait for second event to complete, too - if (PRINT_OPS) log_info("\tWaiting for action 1 to finish...\n"); - error = clWaitForEvents( 1, &events[ 1 ] ); - test_error( error, "Unable to wait for second reference event" ); + if (PRINT_OPS) log_info("\tWaiting for action 1 to finish...\n"); + error = clWaitForEvents(1, &events[1]); + test_error(error, "Unable to wait for second reference event"); // Grab statuses again - if (PRINT_OPS) log_info("\tChecking status of action to test 2...\n"); - error = clGetEventInfo( events[ 2 ], CL_EVENT_COMMAND_EXECUTION_STATUS, sizeof( status[ 2 ] ), &status[ 2 ], NULL ); - test_error( error, "Unable to get event status" ); - if( multiple ) { - if (PRINT_OPS) log_info("\tChecking status of action 1...\n"); - error = clGetEventInfo( events[ 1 ], CL_EVENT_COMMAND_EXECUTION_STATUS, sizeof( status[ 1 ] ), &status[ 1 ], NULL ); - test_error( error, "Unable to get event status" ); - } - if (PRINT_OPS) log_info("\tChecking status of action 0...\n"); - error = clGetEventInfo( events[ 0 ], CL_EVENT_COMMAND_EXECUTION_STATUS, sizeof( status[ 0 ] ), &status[ 0 ], NULL ); - test_error( error, "Unable to get event status" ); - - log_info("\t\tEvent status after waiting for reference event 1: reference event 0: %s, reference event 1: %s, test event 2: %s.\n", - IGetStatusString( status[ 0 ] ), (multiple ? 
IGetStatusString( status[ 1 ] ) : "N/A"), IGetStatusString( status[ 2 ] )); + if (PRINT_OPS) log_info("\tChecking status of action to test 2...\n"); + error = clGetEventInfo(events[2], CL_EVENT_COMMAND_EXECUTION_STATUS, + sizeof(status[2]), &status[2], NULL); + test_error(error, "Unable to get event status"); + if (multiple) + { + if (PRINT_OPS) log_info("\tChecking status of action 1...\n"); + error = clGetEventInfo(events[1], CL_EVENT_COMMAND_EXECUTION_STATUS, + sizeof(status[1]), &status[1], NULL); + test_error(error, "Unable to get event status"); + } + if (PRINT_OPS) log_info("\tChecking status of action 0...\n"); + error = clGetEventInfo(events[0], CL_EVENT_COMMAND_EXECUTION_STATUS, + sizeof(status[0]), &status[0], NULL); + test_error(error, "Unable to get event status"); + + log_info( + "\t\tEvent status after waiting for reference event 1: reference " + "event 0: %s, reference event 1: %s, test event 2: %s.\n", + IGetStatusString(status[0]), + (multiple ? IGetStatusString(status[1]) : "N/A"), + IGetStatusString(status[2])); // Sanity - if( status[ 1 ] != CL_COMPLETE ) + if (status[1] != CL_COMPLETE) { - log_error( "ERROR: Waited for second reference event but it didn't complete (status: 1: %s)\n", IGetStatusString( status[ 1 ] ) ); - clFinish( queue ); + log_error("ERROR: Waited for second reference event but it didn't " + "complete (status: 1: %s)\n", + IGetStatusString(status[1])); + clFinish(queue); return -1; } } - // At this point, the test event SHOULD be running, but if it completed, we consider it a pass - if( status[ 2 ] == CL_COMPLETE ) + // At this point, the test event SHOULD be running, but if it completed, we + // consider it a pass + if (status[2] == CL_COMPLETE) { - log_info( "WARNING: Test event already completed. Assumed valid.\n" ); - clFinish( queue ); + log_info("WARNING: Test event already completed. 
Assumed valid.\n"); + clFinish(queue); return 0; } - if( status[ 2 ] != CL_RUNNING && status[ 2 ] != CL_SUBMITTED && status[ 2 ] != CL_QUEUED) + if (status[2] != CL_RUNNING && status[2] != CL_SUBMITTED + && status[2] != CL_QUEUED) { - log_error( "ERROR: Second event did not start running after reference event(s) completed! (status: 2: %s)\n", IGetStatusString( status[ 2 ] ) ); - clFinish( queue ); + log_error("ERROR: Second event did not start running after reference " + "event(s) completed! (status: 2: %s)\n", + IGetStatusString(status[2])); + clFinish(queue); return -1; } // Wait for the test event, then return - if (PRINT_OPS) log_info("\tWaiting for action 2 to test to finish...\n"); - error = clWaitForEvents( 1, &events[ 2 ] ); - test_error( error, "Unable to wait for test event" ); + if (PRINT_OPS) log_info("\tWaiting for action 2 to test to finish...\n"); + error = clWaitForEvents(1, &events[2]); + test_error(error, "Unable to wait for test event"); - error |= clGetEventInfo( events[ 2 ], CL_EVENT_COMMAND_EXECUTION_STATUS, sizeof( status[ 2 ] ), &status[ 2 ], NULL ); - test_error( error, "Unable to get event status" ); + error |= clGetEventInfo(events[2], CL_EVENT_COMMAND_EXECUTION_STATUS, + sizeof(status[2]), &status[2], NULL); + test_error(error, "Unable to get event status"); - log_info("\t\tEvent status after waiting for test event: reference event 0: %s, reference event 1: %s, test event 2: %s.\n", - IGetStatusString( status[ 0 ] ), (multiple ? IGetStatusString( status[ 1 ] ) : "N/A"), IGetStatusString( status[ 2 ] )); + log_info("\t\tEvent status after waiting for test event: reference event " + "0: %s, reference event 1: %s, test event 2: %s.\n", + IGetStatusString(status[0]), + (multiple ? 
IGetStatusString(status[1]) : "N/A"), + IGetStatusString(status[2])); - // Sanity - if( status[ 2 ] != CL_COMPLETE ) - { - log_error( "ERROR: Test event didn't complete (status: 2: %s)\n", IGetStatusString( status[ 2 ] ) ); - clFinish( queue ); - return -1; - } + // Sanity + if (status[2] != CL_COMPLETE) + { + log_error("ERROR: Test event didn't complete (status: 2: %s)\n", + IGetStatusString(status[2])); + clFinish(queue); + return -1; + } - clFinish(queue); + clFinish(queue); return 0; } -#define TEST_ACTION( name ) \ - { \ - name##Action action; \ - log_info( "-- Testing " #name " (waiting on 1 event)...\n" ); \ - if( ( error = test_waitlist( deviceID, context, queue, &action, false ) ) != CL_SUCCESS ) \ - retVal++; \ - clFinish( queue ); \ - } \ - if( error == CL_SUCCESS ) /* Only run multiples test if single test passed */ \ - { \ - name##Action action; \ - log_info( "-- Testing " #name " (waiting on 2 events)...\n" ); \ - if( ( error = test_waitlist( deviceID, context, queue, &action, true ) ) != CL_SUCCESS ) \ - retVal++; \ - clFinish( queue ); \ +#define TEST_ACTION(name) \ + { \ + name##Action action; \ + log_info("-- Testing " #name " (waiting on 1 event)...\n"); \ + if ((error = test_waitlist(deviceID, context, queue, &action, false)) \ + != CL_SUCCESS) \ + retVal++; \ + clFinish(queue); \ + } \ + if (error \ + == CL_SUCCESS) /* Only run multiples test if single test passed */ \ + { \ + name##Action action; \ + log_info("-- Testing " #name " (waiting on 2 events)...\n"); \ + if ((error = test_waitlist(deviceID, context, queue, &action, true)) \ + != CL_SUCCESS) \ + retVal++; \ + clFinish(queue); \ } -int test_waitlists( cl_device_id deviceID, cl_context context, cl_command_queue oldQueue, int num_elements ) +int test_waitlists(cl_device_id deviceID, cl_context context, + cl_command_queue oldQueue, int num_elements) { cl_int error; int retVal = 0; cl_command_queue_properties props = CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE; - if( 
!checkDeviceForQueueSupport( deviceID, props ) ) + if (!checkDeviceForQueueSupport(deviceID, props)) { - log_info( "WARNING: Device does not support out-of-order exec mode; skipping test.\n" ); + log_info("WARNING: Device does not support out-of-order exec mode; " + "skipping test.\n"); return 0; } - clCommandQueueWrapper queue = clCreateCommandQueue( context, deviceID, props, &error ); + clCommandQueueWrapper queue = + clCreateCommandQueue(context, deviceID, props, &error); test_error(error, "Unable to create out-of-order queue"); - log_info( "\n" ); + log_info("\n"); - TEST_ACTION( NDRangeKernel ) + TEST_ACTION(NDRangeKernel) - TEST_ACTION( ReadBuffer ) - TEST_ACTION( WriteBuffer ) - TEST_ACTION( MapBuffer ) - TEST_ACTION( UnmapBuffer ) + TEST_ACTION(ReadBuffer) + TEST_ACTION(WriteBuffer) + TEST_ACTION(MapBuffer) + TEST_ACTION(UnmapBuffer) - if( checkForImageSupport( deviceID ) == CL_IMAGE_FORMAT_NOT_SUPPORTED ) + if (checkForImageSupport(deviceID) == CL_IMAGE_FORMAT_NOT_SUPPORTED) { - log_info( "\nNote: device does not support images. Skipping remainder of waitlist tests...\n" ); + log_info("\nNote: device does not support images. Skipping remainder " + "of waitlist tests...\n"); } else { - TEST_ACTION( ReadImage2D ) - TEST_ACTION( WriteImage2D ) - TEST_ACTION( CopyImage2Dto2D ) - TEST_ACTION( Copy2DImageToBuffer ) - TEST_ACTION( CopyBufferTo2DImage ) - TEST_ACTION( MapImage ) - - if( checkFor3DImageSupport( deviceID ) == CL_IMAGE_FORMAT_NOT_SUPPORTED ) - log_info("Device does not support 3D images. Skipping remainder of waitlist tests...\n"); + TEST_ACTION(ReadImage2D) + TEST_ACTION(WriteImage2D) + TEST_ACTION(CopyImage2Dto2D) + TEST_ACTION(Copy2DImageToBuffer) + TEST_ACTION(CopyBufferTo2DImage) + TEST_ACTION(MapImage) + + if (checkFor3DImageSupport(deviceID) == CL_IMAGE_FORMAT_NOT_SUPPORTED) + log_info("Device does not support 3D images. 
Skipping remainder of " + "waitlist tests...\n"); else { - TEST_ACTION( ReadImage3D ) - TEST_ACTION( WriteImage3D ) - TEST_ACTION( CopyImage2Dto3D ) - TEST_ACTION( CopyImage3Dto2D ) - TEST_ACTION( CopyImage3Dto3D ) - TEST_ACTION( Copy3DImageToBuffer ) - TEST_ACTION( CopyBufferTo3DImage ) + TEST_ACTION(ReadImage3D) + TEST_ACTION(WriteImage3D) + TEST_ACTION(CopyImage2Dto3D) + TEST_ACTION(CopyImage3Dto2D) + TEST_ACTION(CopyImage3Dto3D) + TEST_ACTION(Copy3DImageToBuffer) + TEST_ACTION(CopyBufferTo3DImage) } } return retVal; } - diff --git a/test_conformance/extensions/CMakeLists.txt b/test_conformance/extensions/CMakeLists.txt index 53d77ee5..d95d29aa 100644 --- a/test_conformance/extensions/CMakeLists.txt +++ b/test_conformance/extensions/CMakeLists.txt @@ -1,2 +1,3 @@ add_subdirectory( cl_ext_cxx_for_opencl ) +add_subdirectory( cl_khr_command_buffer ) add_subdirectory( cl_khr_dx9_media_sharing ) diff --git a/test_conformance/extensions/cl_khr_command_buffer/CMakeLists.txt b/test_conformance/extensions/cl_khr_command_buffer/CMakeLists.txt new file mode 100644 index 00000000..ac259f6d --- /dev/null +++ b/test_conformance/extensions/cl_khr_command_buffer/CMakeLists.txt @@ -0,0 +1,8 @@ +set(MODULE_NAME CL_KHR_COMMAND_BUFFER) + +set(${MODULE_NAME}_SOURCES + main.cpp + basic_command_buffer.cpp +) + +include(../../CMakeCommon.txt) diff --git a/test_conformance/extensions/cl_khr_command_buffer/basic_command_buffer.cpp b/test_conformance/extensions/cl_khr_command_buffer/basic_command_buffer.cpp new file mode 100644 index 00000000..62a02d83 --- /dev/null +++ b/test_conformance/extensions/cl_khr_command_buffer/basic_command_buffer.cpp @@ -0,0 +1,588 @@ +// +// Copyright (c) 2022 The Khronos Group Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +#include "command_buffer_test_base.h" +#include "procs.h" +#include "harness/typeWrappers.h" + +#include <algorithm> +#include <cstring> +#include <vector> + +#define CHECK_VERIFICATION_ERROR(reference, result, index) \ + { \ + if (reference != result) \ + { \ + log_error("Expected %d was %d at index %u\n", reference, result, \ + index); \ + return TEST_FAIL; \ + } \ + } + +namespace { + +// Helper test fixture for constructing OpenCL objects used in testing +// a variety of simple command-buffer enqueue scenarios. +struct BasicCommandBufferTest : CommandBufferTestBase +{ + + BasicCommandBufferTest(cl_device_id device, cl_context context, + cl_command_queue queue) + : CommandBufferTestBase(device), context(context), queue(queue), + command_buffer(this), simultaneous_use(false), + out_of_order_support(false), num_elements(0) + {} + + virtual bool Skip() + { + cl_command_queue_properties required_properties; + cl_int error = clGetDeviceInfo( + device, CL_DEVICE_COMMAND_BUFFER_REQUIRED_QUEUE_PROPERTIES_KHR, + sizeof(required_properties), &required_properties, NULL); + test_error(error, + "Unable to query " + "CL_DEVICE_COMMAND_BUFFER_REQUIRED_QUEUE_PROPERTIES_KHR"); + + cl_command_queue_properties queue_properties; + + error = clGetCommandQueueInfo(queue, CL_QUEUE_PROPERTIES, + sizeof(queue_properties), + &queue_properties, NULL); + test_error(error, "Unable to query CL_QUEUE_PROPERTIES"); + + // Skip if queue properties don't contain those required + return required_properties != (required_properties & queue_properties); + } + + virtual cl_int 
SetUp(int elements) + { + cl_int error = init_extension_functions(); + if (error != CL_SUCCESS) + { + return error; + } + + // Query if device supports simultaneous use + cl_device_command_buffer_capabilities_khr capabilities; + error = + clGetDeviceInfo(device, CL_DEVICE_COMMAND_BUFFER_CAPABILITIES_KHR, + sizeof(capabilities), &capabilities, NULL); + test_error(error, + "Unable to query CL_DEVICE_COMMAND_BUFFER_CAPABILITIES_KHR"); + simultaneous_use = + capabilities & CL_COMMAND_BUFFER_CAPABILITY_SIMULTANEOUS_USE_KHR; + out_of_order_support = + capabilities & CL_COMMAND_BUFFER_CAPABILITY_OUT_OF_ORDER_KHR; + + if (elements <= 0) + { + return CL_INVALID_VALUE; + } + num_elements = static_cast<size_t>(elements); + + // Kernel performs a parallel copy from an input buffer to output buffer + // is created. + const char *kernel_str = + R"( + __kernel void copy(__global int* in, __global int* out) { + size_t id = get_global_id(0); + out[id] = in[id]; + })"; + + error = create_single_kernel_helper_create_program(context, &program, 1, + &kernel_str); + test_error(error, "Failed to create program with source"); + + error = clBuildProgram(program, 1, &device, nullptr, nullptr, nullptr); + test_error(error, "Failed to build program"); + + in_mem = clCreateBuffer(context, CL_MEM_READ_ONLY, + sizeof(cl_int) * num_elements, nullptr, &error); + test_error(error, "clCreateBuffer failed"); + + out_mem = + clCreateBuffer(context, CL_MEM_WRITE_ONLY, + sizeof(cl_int) * num_elements, nullptr, &error); + test_error(error, "clCreateBuffer failed"); + + kernel = clCreateKernel(program, "copy", &error); + test_error(error, "Failed to create copy kernel"); + + error = clSetKernelArg(kernel, 0, sizeof(in_mem), &in_mem); + test_error(error, "clSetKernelArg failed"); + + error = clSetKernelArg(kernel, 1, sizeof(out_mem), &out_mem); + test_error(error, "clSetKernelArg failed"); + + if (simultaneous_use) + { + cl_command_buffer_properties_khr properties[3] = { + CL_COMMAND_BUFFER_FLAGS_KHR, + 
CL_COMMAND_BUFFER_SIMULTANEOUS_USE_KHR, 0 + }; + command_buffer = + clCreateCommandBufferKHR(1, &queue, properties, &error); + } + else + { + command_buffer = + clCreateCommandBufferKHR(1, &queue, nullptr, &error); + } + test_error(error, "clCreateCommandBufferKHR failed"); + + return CL_SUCCESS; + } + + // Test body returning an OpenCL error code + virtual cl_int Run() = 0; + + +protected: + size_t data_size() const { return num_elements * sizeof(cl_int); } + + cl_context context; + cl_command_queue queue; + clCommandBufferWrapper command_buffer; + clProgramWrapper program; + clKernelWrapper kernel; + clMemWrapper in_mem, out_mem; + size_t num_elements; + + // Device support query results + bool simultaneous_use; + bool out_of_order_support; +}; + +// Test enqueuing a command-buffer containing a single NDRange command once +struct BasicEnqueueTest : public BasicCommandBufferTest +{ + using BasicCommandBufferTest::BasicCommandBufferTest; + + cl_int Run() override + { + cl_int error = clCommandNDRangeKernelKHR( + command_buffer, nullptr, nullptr, kernel, 1, nullptr, &num_elements, + nullptr, 0, nullptr, nullptr, nullptr); + test_error(error, "clCommandNDRangeKernelKHR failed"); + + error = clFinalizeCommandBufferKHR(command_buffer); + test_error(error, "clFinalizeCommandBufferKHR failed"); + + const cl_int pattern = 42; + error = clEnqueueFillBuffer(queue, in_mem, &pattern, sizeof(cl_int), 0, + data_size(), 0, nullptr, nullptr); + test_error(error, "clEnqueueFillBuffer failed"); + + error = clEnqueueCommandBufferKHR(0, nullptr, command_buffer, 0, + nullptr, nullptr); + test_error(error, "clEnqueueCommandBufferKHR failed"); + + std::vector<cl_int> output_data(num_elements); + error = clEnqueueReadBuffer(queue, out_mem, CL_TRUE, 0, data_size(), + output_data.data(), 0, nullptr, nullptr); + test_error(error, "clEnqueueReadBuffer failed"); + + for (size_t i = 0; i < num_elements; i++) + { + CHECK_VERIFICATION_ERROR(pattern, output_data[i], i); + } + + return CL_SUCCESS; 
+ } +}; + +// Test enqueuing a command-buffer containing multiple command, including +// operations other than NDRange kernel execution. +struct MixedCommandsTest : public BasicCommandBufferTest +{ + using BasicCommandBufferTest::BasicCommandBufferTest; + + cl_int Run() override + { + cl_int error; + const size_t iterations = 4; + clMemWrapper result_mem = + clCreateBuffer(context, CL_MEM_READ_WRITE, + sizeof(cl_int) * iterations, nullptr, &error); + test_error(error, "clCreateBuffer failed"); + + const cl_int pattern_base = 42; + for (size_t i = 0; i < iterations; i++) + { + const cl_int pattern = pattern_base + i; + cl_int error = clCommandFillBufferKHR( + command_buffer, nullptr, in_mem, &pattern, sizeof(cl_int), 0, + data_size(), 0, nullptr, nullptr, nullptr); + test_error(error, "clCommandFillBufferKHR failed"); + + error = clCommandNDRangeKernelKHR( + command_buffer, nullptr, nullptr, kernel, 1, nullptr, + &num_elements, nullptr, 0, nullptr, nullptr, nullptr); + test_error(error, "clCommandNDRangeKernelKHR failed"); + + const size_t result_offset = i * sizeof(cl_int); + error = clCommandCopyBufferKHR( + command_buffer, nullptr, out_mem, result_mem, 0, result_offset, + sizeof(cl_int), 0, nullptr, nullptr, nullptr); + test_error(error, "clCommandCopyBufferKHR failed"); + } + + error = clFinalizeCommandBufferKHR(command_buffer); + test_error(error, "clFinalizeCommandBufferKHR failed"); + + error = clEnqueueCommandBufferKHR(0, nullptr, command_buffer, 0, + nullptr, nullptr); + test_error(error, "clEnqueueCommandBufferKHR failed"); + + std::vector<cl_int> result_data(num_elements); + error = clEnqueueReadBuffer(queue, result_mem, CL_TRUE, 0, + iterations * sizeof(cl_int), + result_data.data(), 0, nullptr, nullptr); + test_error(error, "clEnqueueReadBuffer failed"); + + for (size_t i = 0; i < iterations; i++) + { + const cl_int ref = pattern_base + i; + CHECK_VERIFICATION_ERROR(ref, result_data[i], i); + } + + return CL_SUCCESS; + } +}; + +// Test enqueueing a 
command-buffer blocked on a user-event +struct UserEventTest : public BasicCommandBufferTest +{ + using BasicCommandBufferTest::BasicCommandBufferTest; + + cl_int Run() override + { + cl_int error = clCommandNDRangeKernelKHR( + command_buffer, nullptr, nullptr, kernel, 1, nullptr, &num_elements, + nullptr, 0, nullptr, nullptr, nullptr); + test_error(error, "clCommandNDRangeKernelKHR failed"); + + error = clFinalizeCommandBufferKHR(command_buffer); + test_error(error, "clFinalizeCommandBufferKHR failed"); + + clEventWrapper user_event = clCreateUserEvent(context, &error); + test_error(error, "clCreateUserEvent failed"); + + const cl_int pattern = 42; + error = clEnqueueFillBuffer(queue, in_mem, &pattern, sizeof(cl_int), 0, + data_size(), 0, nullptr, nullptr); + test_error(error, "clEnqueueFillBuffer failed"); + + error = clEnqueueCommandBufferKHR(0, nullptr, command_buffer, 1, + &user_event, nullptr); + test_error(error, "clEnqueueCommandBufferKHR failed"); + + std::vector<cl_int> output_data(num_elements); + error = clEnqueueReadBuffer(queue, out_mem, CL_FALSE, 0, data_size(), + output_data.data(), 0, nullptr, nullptr); + test_error(error, "clEnqueueReadBuffer failed"); + + error = clSetUserEventStatus(user_event, CL_COMPLETE); + test_error(error, "clSetUserEventStatus failed"); + + error = clFinish(queue); + test_error(error, "clFinish failed"); + + for (size_t i = 0; i < num_elements; i++) + { + CHECK_VERIFICATION_ERROR(pattern, output_data[i], i); + } + + return CL_SUCCESS; + } +}; + +// Test flushing the command-queue between command-buffer enqueues +struct ExplicitFlushTest : public BasicCommandBufferTest +{ + using BasicCommandBufferTest::BasicCommandBufferTest; + + cl_int Run() override + { + cl_int error = clCommandNDRangeKernelKHR( + command_buffer, nullptr, nullptr, kernel, 1, nullptr, &num_elements, + nullptr, 0, nullptr, nullptr, nullptr); + test_error(error, "clCommandNDRangeKernelKHR failed"); + + error = clFinalizeCommandBufferKHR(command_buffer); + 
test_error(error, "clFinalizeCommandBufferKHR failed"); + + const cl_int pattern_A = 42; + error = clEnqueueFillBuffer(queue, in_mem, &pattern_A, sizeof(cl_int), + 0, data_size(), 0, nullptr, nullptr); + test_error(error, "clEnqueueFillBuffer failed"); + + error = clFlush(queue); + test_error(error, "clFlush failed"); + + error = clEnqueueCommandBufferKHR(0, nullptr, command_buffer, 0, + nullptr, nullptr); + test_error(error, "clEnqueueCommandBufferKHR failed"); + + std::vector<cl_int> output_data_A(num_elements); + error = clEnqueueReadBuffer(queue, out_mem, CL_FALSE, 0, data_size(), + output_data_A.data(), 0, nullptr, nullptr); + test_error(error, "clEnqueueReadBuffer failed"); + + const cl_int pattern_B = 0xA; + error = clEnqueueFillBuffer(queue, in_mem, &pattern_B, sizeof(cl_int), + 0, data_size(), 0, nullptr, nullptr); + test_error(error, "clEnqueueFillBuffer failed"); + + error = clFlush(queue); + test_error(error, "clFlush failed"); + + error = clEnqueueCommandBufferKHR(0, nullptr, command_buffer, 0, + nullptr, nullptr); + test_error(error, "clEnqueueCommandBufferKHR failed"); + + error = clFlush(queue); + test_error(error, "clFlush failed"); + + std::vector<cl_int> output_data_B(num_elements); + error = clEnqueueReadBuffer(queue, out_mem, CL_FALSE, 0, data_size(), + output_data_B.data(), 0, nullptr, nullptr); + test_error(error, "clEnqueueReadBuffer failed"); + + error = clFinish(queue); + test_error(error, "clFinish failed"); + + for (size_t i = 0; i < num_elements; i++) + { + CHECK_VERIFICATION_ERROR(pattern_A, output_data_A[i], i); + + CHECK_VERIFICATION_ERROR(pattern_B, output_data_B[i], i); + } + return CL_SUCCESS; + } + + bool Skip() override + { + return !simultaneous_use || BasicCommandBufferTest::Skip(); + } +}; + +// Test enqueueing a command-buffer twice separated by another enqueue operation +struct InterleavedEnqueueTest : public BasicCommandBufferTest +{ + using BasicCommandBufferTest::BasicCommandBufferTest; + + cl_int Run() override + { + 
cl_int error = clCommandNDRangeKernelKHR( + command_buffer, nullptr, nullptr, kernel, 1, nullptr, &num_elements, + nullptr, 0, nullptr, nullptr, nullptr); + test_error(error, "clCommandNDRangeKernelKHR failed"); + + error = clFinalizeCommandBufferKHR(command_buffer); + test_error(error, "clFinalizeCommandBufferKHR failed"); + + cl_int pattern = 42; + error = clEnqueueFillBuffer(queue, in_mem, &pattern, sizeof(cl_int), 0, + data_size(), 0, nullptr, nullptr); + test_error(error, "clEnqueueFillBuffer failed"); + + error = clEnqueueCommandBufferKHR(0, nullptr, command_buffer, 0, + nullptr, nullptr); + test_error(error, "clEnqueueCommandBufferKHR failed"); + + pattern = 0xABCD; + error = clEnqueueFillBuffer(queue, in_mem, &pattern, sizeof(cl_int), 0, + data_size(), 0, nullptr, nullptr); + test_error(error, "clEnqueueFillBuffer failed"); + + error = clEnqueueCommandBufferKHR(0, nullptr, command_buffer, 0, + nullptr, nullptr); + test_error(error, "clEnqueueCommandBufferKHR failed"); + + error = clEnqueueCopyBuffer(queue, in_mem, out_mem, 0, 0, data_size(), + 0, nullptr, nullptr); + test_error(error, "clEnqueueCopyBuffer failed"); + + std::vector<cl_int> output_data(num_elements); + error = clEnqueueReadBuffer(queue, out_mem, CL_TRUE, 0, data_size(), + output_data.data(), 0, nullptr, nullptr); + test_error(error, "clEnqueueReadBuffer failed"); + + for (size_t i = 0; i < num_elements; i++) + { + CHECK_VERIFICATION_ERROR(pattern, output_data[i], i); + } + + return CL_SUCCESS; + } + + bool Skip() override + { + return !simultaneous_use || BasicCommandBufferTest::Skip(); + } +}; + +// Test sync-points with an out-of-order command-buffer +struct OutOfOrderTest : public BasicCommandBufferTest +{ + using BasicCommandBufferTest::BasicCommandBufferTest; + OutOfOrderTest(cl_device_id device, cl_context context, + cl_command_queue queue) + : BasicCommandBufferTest(device, context, queue), + out_of_order_command_buffer(this), out_of_order_queue(nullptr), + event(nullptr) + {} + + 
cl_int Run() override + { + cl_sync_point_khr sync_points[2]; + + const cl_int pattern = 42; + cl_int error = + clCommandFillBufferKHR(out_of_order_command_buffer, nullptr, in_mem, + &pattern, sizeof(cl_int), 0, data_size(), 0, + nullptr, &sync_points[0], nullptr); + test_error(error, "clCommandFillBufferKHR failed"); + + const cl_int overwritten_pattern = 0xACDC; + error = clCommandFillBufferKHR(out_of_order_command_buffer, nullptr, + out_mem, &overwritten_pattern, + sizeof(cl_int), 0, data_size(), 0, + nullptr, &sync_points[1], nullptr); + test_error(error, "clCommandFillBufferKHR failed"); + + error = clCommandNDRangeKernelKHR( + out_of_order_command_buffer, nullptr, nullptr, kernel, 1, nullptr, + &num_elements, nullptr, 2, sync_points, nullptr, nullptr); + test_error(error, "clCommandNDRangeKernelKHR failed"); + + error = clFinalizeCommandBufferKHR(out_of_order_command_buffer); + test_error(error, "clFinalizeCommandBufferKHR failed"); + + error = clEnqueueCommandBufferKHR( + 0, nullptr, out_of_order_command_buffer, 0, nullptr, &event); + test_error(error, "clEnqueueCommandBufferKHR failed"); + + std::vector<cl_int> output_data(num_elements); + error = clEnqueueReadBuffer(out_of_order_queue, out_mem, CL_TRUE, 0, + data_size(), output_data.data(), 1, &event, + nullptr); + test_error(error, "clEnqueueReadBuffer failed"); + + for (size_t i = 0; i < num_elements; i++) + { + CHECK_VERIFICATION_ERROR(pattern, output_data[i], i); + } + + return CL_SUCCESS; + } + + cl_int SetUp(int elements) override + { + cl_int error = BasicCommandBufferTest::SetUp(elements); + test_error(error, "BasicCommandBufferTest::SetUp failed"); + + if (!out_of_order_support) + { + // Test will skip as device doesn't support out-of-order + // command-buffers + return CL_SUCCESS; + } + + out_of_order_queue = clCreateCommandQueue( + context, device, CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, &error); + test_error(error, "Unable to create command queue to test with"); + + out_of_order_command_buffer 
= + clCreateCommandBufferKHR(1, &out_of_order_queue, nullptr, &error); + test_error(error, "clCreateCommandBufferKHR failed"); + + return CL_SUCCESS; + } + + bool Skip() override + { + return !out_of_order_support || BasicCommandBufferTest::Skip(); + } + + clCommandQueueWrapper out_of_order_queue; + clCommandBufferWrapper out_of_order_command_buffer; + clEventWrapper event; +}; + +#undef CHECK_VERIFICATION_ERROR + +template <class T> +int MakeAndRunTest(cl_device_id device, cl_context context, + cl_command_queue queue, int num_elements) +{ + CHECK_COMMAND_BUFFER_EXTENSION_AVAILABLE(device); + + auto test_fixture = T(device, context, queue); + cl_int error = test_fixture.SetUp(num_elements); + test_error_ret(error, "Error in test initialization", TEST_FAIL); + + if (test_fixture.Skip()) + { + return TEST_SKIPPED_ITSELF; + } + + error = test_fixture.Run(); + test_error_ret(error, "Test Failed", TEST_FAIL); + + return TEST_PASS; +} +} // anonymous namespace + +int test_single_ndrange(cl_device_id device, cl_context context, + cl_command_queue queue, int num_elements) +{ + return MakeAndRunTest<BasicEnqueueTest>(device, context, queue, + num_elements); +} + +int test_interleaved_enqueue(cl_device_id device, cl_context context, + cl_command_queue queue, int num_elements) +{ + return MakeAndRunTest<InterleavedEnqueueTest>(device, context, queue, + num_elements); +} + +int test_mixed_commands(cl_device_id device, cl_context context, + cl_command_queue queue, int num_elements) +{ + return MakeAndRunTest<MixedCommandsTest>(device, context, queue, + num_elements); +} + +int test_explicit_flush(cl_device_id device, cl_context context, + cl_command_queue queue, int num_elements) +{ + return MakeAndRunTest<ExplicitFlushTest>(device, context, queue, + num_elements); +} + +int test_user_events(cl_device_id device, cl_context context, + cl_command_queue queue, int num_elements) +{ + return MakeAndRunTest<UserEventTest>(device, context, queue, num_elements); +} + +int 
test_out_of_order(cl_device_id device, cl_context context, + cl_command_queue queue, int num_elements) +{ + return MakeAndRunTest<OutOfOrderTest>(device, context, queue, num_elements); +} diff --git a/test_conformance/extensions/cl_khr_command_buffer/command_buffer_test_base.h b/test_conformance/extensions/cl_khr_command_buffer/command_buffer_test_base.h new file mode 100644 index 00000000..0fd2e4ec --- /dev/null +++ b/test_conformance/extensions/cl_khr_command_buffer/command_buffer_test_base.h @@ -0,0 +1,177 @@ +// +// Copyright (c) 2022 The Khronos Group Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef _CL_KHR_COMMAND_BUFFER_TEST_BASE_H +#define _CL_KHR_COMMAND_BUFFER_TEST_BASE_H + +#include <CL/cl_ext.h> +#include "harness/deviceInfo.h" +#include "harness/testHarness.h" + + +// Base class for setting function pointers to new extension entry points +struct CommandBufferTestBase +{ + CommandBufferTestBase(cl_device_id device): device(device) {} + + cl_int init_extension_functions() + { + cl_platform_id platform; + cl_int error = + clGetDeviceInfo(device, CL_DEVICE_PLATFORM, sizeof(cl_platform_id), + &platform, nullptr); + test_error(error, "clGetDeviceInfo for CL_DEVICE_PLATFORM failed"); + + // If it is supported get the addresses of all the APIs here. 
+#define GET_EXTENSION_ADDRESS(FUNC) \ + FUNC = reinterpret_cast<FUNC##_fn>( \ + clGetExtensionFunctionAddressForPlatform(platform, #FUNC)); \ + if (FUNC == nullptr) \ + { \ + log_error("ERROR: clGetExtensionFunctionAddressForPlatform failed" \ + " with " #FUNC "\n"); \ + return TEST_FAIL; \ + } + + GET_EXTENSION_ADDRESS(clCreateCommandBufferKHR); + GET_EXTENSION_ADDRESS(clReleaseCommandBufferKHR); + GET_EXTENSION_ADDRESS(clRetainCommandBufferKHR); + GET_EXTENSION_ADDRESS(clFinalizeCommandBufferKHR); + GET_EXTENSION_ADDRESS(clEnqueueCommandBufferKHR); + GET_EXTENSION_ADDRESS(clCommandBarrierWithWaitListKHR); + GET_EXTENSION_ADDRESS(clCommandCopyBufferKHR); + GET_EXTENSION_ADDRESS(clCommandCopyBufferRectKHR); + GET_EXTENSION_ADDRESS(clCommandCopyBufferToImageKHR); + GET_EXTENSION_ADDRESS(clCommandCopyImageKHR); + GET_EXTENSION_ADDRESS(clCommandCopyImageToBufferKHR); + GET_EXTENSION_ADDRESS(clCommandFillBufferKHR); + GET_EXTENSION_ADDRESS(clCommandFillImageKHR); + GET_EXTENSION_ADDRESS(clCommandNDRangeKernelKHR); + GET_EXTENSION_ADDRESS(clGetCommandBufferInfoKHR); +#undef GET_EXTENSION_ADDRESS + return CL_SUCCESS; + } + + clCreateCommandBufferKHR_fn clCreateCommandBufferKHR = nullptr; + clReleaseCommandBufferKHR_fn clReleaseCommandBufferKHR = nullptr; + clRetainCommandBufferKHR_fn clRetainCommandBufferKHR = nullptr; + clFinalizeCommandBufferKHR_fn clFinalizeCommandBufferKHR = nullptr; + clEnqueueCommandBufferKHR_fn clEnqueueCommandBufferKHR = nullptr; + clCommandBarrierWithWaitListKHR_fn clCommandBarrierWithWaitListKHR = + nullptr; + clCommandCopyBufferKHR_fn clCommandCopyBufferKHR = nullptr; + clCommandCopyBufferRectKHR_fn clCommandCopyBufferRectKHR = nullptr; + clCommandCopyBufferToImageKHR_fn clCommandCopyBufferToImageKHR = nullptr; + clCommandCopyImageKHR_fn clCommandCopyImageKHR = nullptr; + clCommandCopyImageToBufferKHR_fn clCommandCopyImageToBufferKHR = nullptr; + clCommandFillBufferKHR_fn clCommandFillBufferKHR = nullptr; + clCommandFillImageKHR_fn 
clCommandFillImageKHR = nullptr; + clCommandNDRangeKernelKHR_fn clCommandNDRangeKernelKHR = nullptr; + clGetCommandBufferInfoKHR_fn clGetCommandBufferInfoKHR = nullptr; + + cl_device_id device = nullptr; +}; + +// Wrapper class based off generic typeWrappers.h wrappers. However, because +// the release/retain functions are queried at runtime from the platform, +// rather than known at compile time we cannot link the instantiated template. +// Instead, pass an instance of `CommandBufferTestBase` on wrapper construction +// to access the release/retain functions. +class clCommandBufferWrapper { + cl_command_buffer_khr object = nullptr; + + void retain() + { + if (!object) return; + + auto err = base->clRetainCommandBufferKHR(object); + if (err != CL_SUCCESS) + { + print_error(err, "clRetainCommandBufferKHR() failed"); + std::abort(); + } + } + + void release() + { + if (!object) return; + + auto err = base->clReleaseCommandBufferKHR(object); + if (err != CL_SUCCESS) + { + print_error(err, "clReleaseCommandBufferKHR() failed"); + std::abort(); + } + } + + // Used to access release/retain functions + CommandBufferTestBase *base; + +public: + // We always want to have base available to dereference + clCommandBufferWrapper() = delete; + + clCommandBufferWrapper(CommandBufferTestBase *base): base(base) {} + + // On assignment, assume the object has a refcount of one. + clCommandBufferWrapper &operator=(cl_command_buffer_khr rhs) + { + reset(rhs); + return *this; + } + + // Copy semantics, increase retain count. + clCommandBufferWrapper(clCommandBufferWrapper const &w) { *this = w; } + clCommandBufferWrapper &operator=(clCommandBufferWrapper const &w) + { + reset(w.object); + retain(); + return *this; + } + + // Move semantics, directly take ownership. 
+ clCommandBufferWrapper(clCommandBufferWrapper &&w) { *this = std::move(w); } + clCommandBufferWrapper &operator=(clCommandBufferWrapper &&w) + { + reset(w.object); + w.object = nullptr; + return *this; + } + + ~clCommandBufferWrapper() { reset(); } + + // Release the existing object, if any, and own the new one, if any. + void reset(cl_command_buffer_khr new_object = nullptr) + { + release(); + object = new_object; + } + + operator cl_command_buffer_khr() const { return object; } +}; + +#define CHECK_COMMAND_BUFFER_EXTENSION_AVAILABLE(device) \ + { \ + if (!is_extension_available(device, "cl_khr_command_buffer")) \ + { \ + log_info( \ + "Device does not support 'cl_khr_command_buffer'. Skipping " \ + "the test.\n"); \ + return TEST_SKIPPED_ITSELF; \ + } \ + } + + +#endif // _CL_KHR_COMMAND_BUFFER_TEST_BASE_H diff --git a/test_conformance/extensions/cl_khr_command_buffer/main.cpp b/test_conformance/extensions/cl_khr_command_buffer/main.cpp new file mode 100644 index 00000000..4dece455 --- /dev/null +++ b/test_conformance/extensions/cl_khr_command_buffer/main.cpp @@ -0,0 +1,35 @@ +// Copyright (c) 2022 The Khronos Group Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+// +#include "procs.h" +#include "harness/testHarness.h" + +test_definition test_list[] = { + ADD_TEST(single_ndrange), ADD_TEST(interleaved_enqueue), + ADD_TEST(mixed_commands), ADD_TEST(explicit_flush), + ADD_TEST(user_events), ADD_TEST(out_of_order) +}; + + +int main(int argc, const char *argv[]) +{ + // A device may report the required properties of a queue that + // is compatible with command-buffers via the query + // CL_DEVICE_COMMAND_BUFFER_REQUIRED_QUEUE_PROPERTIES_KHR. We account + // for this in the tests themselves, rather than here, where we have a + // device to query. + const cl_command_queue_properties queue_properties = 0; + return runTestHarnessWithCheck(argc, argv, ARRAY_SIZE(test_list), test_list, + false, queue_properties, nullptr); +} diff --git a/test_conformance/extensions/cl_khr_command_buffer/procs.h b/test_conformance/extensions/cl_khr_command_buffer/procs.h new file mode 100644 index 00000000..58fd228f --- /dev/null +++ b/test_conformance/extensions/cl_khr_command_buffer/procs.h @@ -0,0 +1,35 @@ +// +// Copyright (c) 2022 The Khronos Group Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+// +#ifndef _CL_KHR_COMMAND_BUFFER_PROCS_H +#define _CL_KHR_COMMAND_BUFFER_PROCS_H + +#include <CL/cl.h> + +// Basic command-buffer tests +extern int test_single_ndrange(cl_device_id device, cl_context context, + cl_command_queue queue, int num_elements); +extern int test_interleaved_enqueue(cl_device_id device, cl_context context, + cl_command_queue queue, int num_elements); +extern int test_mixed_commands(cl_device_id device, cl_context context, + cl_command_queue queue, int num_elements); +extern int test_explicit_flush(cl_device_id device, cl_context context, + cl_command_queue queue, int num_elements); +extern int test_user_events(cl_device_id device, cl_context context, + cl_command_queue queue, int num_elements); +extern int test_out_of_order(cl_device_id device, cl_context context, + cl_command_queue queue, int num_elements); + +#endif /*_CL_KHR_COMMAND_BUFFER_PROCS_H*/ diff --git a/test_conformance/gl/common.h b/test_conformance/gl/common.h index 36221da1..d8587cf0 100644 --- a/test_conformance/gl/common.h +++ b/test_conformance/gl/common.h @@ -32,8 +32,8 @@ struct format { }; // These are the typically tested formats. 
- -static struct format common_formats[] = { +// clang-format off +static const format common_formats[] = { #ifdef __APPLE__ { GL_RGBA8, GL_BGRA, GL_UNSIGNED_INT_8_8_8_8, kUChar }, { GL_RGBA8, GL_BGRA, GL_UNSIGNED_INT_8_8_8_8_REV, kUChar }, @@ -53,25 +53,30 @@ static struct format common_formats[] = { }; #ifdef GL_VERSION_3_2 -static struct format depth_formats[] = { +static const format depth_formats[] = { { GL_DEPTH_COMPONENT16, GL_DEPTH_COMPONENT, GL_UNSIGNED_SHORT, kUShort }, { GL_DEPTH_COMPONENT32F, GL_DEPTH_COMPONENT, GL_FLOAT, kFloat }, { GL_DEPTH24_STENCIL8, GL_DEPTH_STENCIL, GL_UNSIGNED_INT_24_8, kUInt }, { GL_DEPTH32F_STENCIL8, GL_DEPTH_STENCIL, GL_FLOAT_32_UNSIGNED_INT_24_8_REV, kFloat }, }; #endif +// clang-format on int test_images_write_common(cl_device_id device, cl_context context, - cl_command_queue queue, struct format* formats, size_t nformats, - GLenum *targets, size_t ntargets, sizevec_t* sizes, size_t nsizes ); + cl_command_queue queue, const format *formats, + size_t nformats, GLenum *targets, size_t ntargets, + sizevec_t *sizes, size_t nsizes); -int test_images_read_common( cl_device_id device, cl_context context, - cl_command_queue queue, struct format* formats, size_t nformats, - GLenum *targets, size_t ntargets, sizevec_t *sizes, size_t nsizes ); +int test_images_read_common(cl_device_id device, cl_context context, + cl_command_queue queue, const format *formats, + size_t nformats, GLenum *targets, size_t ntargets, + sizevec_t *sizes, size_t nsizes); -int test_images_get_info_common( cl_device_id device, cl_context context, - cl_command_queue queue, struct format* formats, size_t nformats, - GLenum *targets, size_t ntargets, sizevec_t *sizes, size_t nsizes ); +int test_images_get_info_common(cl_device_id device, cl_context context, + cl_command_queue queue, const format *formats, + size_t nformats, GLenum *targets, + size_t ntargets, sizevec_t *sizes, + size_t nsizes); int is_rgb_101010_supported( cl_context context, GLenum gl_target ); 
diff --git a/test_conformance/gl/test_buffers.cpp b/test_conformance/gl/test_buffers.cpp index 35f01ee6..c61610d0 100644 --- a/test_conformance/gl/test_buffers.cpp +++ b/test_conformance/gl/test_buffers.cpp @@ -1,6 +1,6 @@ // // Copyright (c) 2017 The Khronos Group Inc. -// +// // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at @@ -17,126 +17,126 @@ #include "harness/conversions.h" #include "harness/typeWrappers.h" -#if !defined (__APPLE__) - #include <CL/cl_gl.h> +#if !defined(__APPLE__) +#include <CL/cl_gl.h> #endif static const char *bufferKernelPattern = -"__kernel void sample_test( __global %s%s *source, __global %s%s *clDest, __global %s%s *glDest )\n" -"{\n" -" int tid = get_global_id(0);\n" -" clDest[ tid ] = source[ tid ] + (%s%s)(1);\n" -" glDest[ tid ] = source[ tid ] + (%s%s)(2);\n" -"}\n"; - -#define TYPE_CASE( enum, type, range, offset ) \ - case enum: \ - { \ - cl_##type *ptr = (cl_##type *)outData; \ - for( i = 0; i < count; i++ ) \ - ptr[ i ] = (cl_##type)( ( genrand_int32(d) & range ) - offset ); \ - break; \ + "__kernel void sample_test( __global %s%s *source, __global %s%s *clDest, " + "__global %s%s *glDest )\n" + "{\n" + " int tid = get_global_id(0);\n" + " clDest[ tid ] = source[ tid ] + (%s%s)(1);\n" + " glDest[ tid ] = source[ tid ] + (%s%s)(2);\n" + "}\n"; + +#define TYPE_CASE(enum, type, range, offset) \ + case enum: { \ + cl_##type *ptr = (cl_##type *)outData; \ + for (i = 0; i < count; i++) \ + ptr[i] = (cl_##type)((genrand_int32(d) & range) - offset); \ + break; \ } -void gen_input_data( ExplicitType type, size_t count, MTdata d, void *outData ) +void gen_input_data(ExplicitType type, size_t count, MTdata d, void *outData) { size_t i; - switch( type ) + switch (type) { - case kBool: - { + case kBool: { bool *boolPtr = (bool *)outData; - for( i = 0; i < count; i++ ) + for (i = 0; i < count; i++) { - boolPtr[i] = 
( genrand_int32(d) & 1 ) ? true : false; + boolPtr[i] = (genrand_int32(d) & 1) ? true : false; } break; } - TYPE_CASE( kChar, char, 250, 127 ) - TYPE_CASE( kUChar, uchar, 250, 0 ) - TYPE_CASE( kShort, short, 65530, 32767 ) - TYPE_CASE( kUShort, ushort, 65530, 0 ) - TYPE_CASE( kInt, int, 0x0fffffff, 0x70000000 ) - TYPE_CASE( kUInt, uint, 0x0fffffff, 0 ) + TYPE_CASE(kChar, char, 250, 127) + TYPE_CASE(kUChar, uchar, 250, 0) + TYPE_CASE(kShort, short, 65530, 32767) + TYPE_CASE(kUShort, ushort, 65530, 0) + TYPE_CASE(kInt, int, 0x0fffffff, 0x70000000) + TYPE_CASE(kUInt, uint, 0x0fffffff, 0) - case kLong: - { + case kLong: { cl_long *longPtr = (cl_long *)outData; - for( i = 0; i < count; i++ ) + for (i = 0; i < count; i++) { - longPtr[i] = (cl_long)genrand_int32(d) | ( (cl_ulong)genrand_int32(d) << 32 ); + longPtr[i] = (cl_long)genrand_int32(d) + | ((cl_ulong)genrand_int32(d) << 32); } break; } - case kULong: - { + case kULong: { cl_ulong *ulongPtr = (cl_ulong *)outData; - for( i = 0; i < count; i++ ) + for (i = 0; i < count; i++) { - ulongPtr[i] = (cl_ulong)genrand_int32(d) | ( (cl_ulong)genrand_int32(d) << 32 ); + ulongPtr[i] = (cl_ulong)genrand_int32(d) + | ((cl_ulong)genrand_int32(d) << 32); } break; } - case kFloat: - { + case kFloat: { cl_float *floatPtr = (float *)outData; - for( i = 0; i < count; i++ ) - floatPtr[i] = get_random_float( -100000.f, 100000.f, d ); + for (i = 0; i < count; i++) + floatPtr[i] = get_random_float(-100000.f, 100000.f, d); break; } default: - log_error( "ERROR: Invalid type passed in to generate_random_data!\n" ); + log_error( + "ERROR: Invalid type passed in to generate_random_data!\n"); break; } } -#define INC_CASE( enum, type ) \ - case enum: \ - { \ - cl_##type *src = (cl_##type *)inData; \ - cl_##type *dst = (cl_##type *)outData; \ - *dst = *src + 1; \ - break; \ +#define INC_CASE(enum, type) \ + case enum: { \ + cl_##type *src = (cl_##type *)inData; \ + cl_##type *dst = (cl_##type *)outData; \ + *dst = *src + 1; \ + break; \ } -void 
get_incremented_value( void *inData, void *outData, ExplicitType type ) +void get_incremented_value(void *inData, void *outData, ExplicitType type) { - switch( type ) + switch (type) { - INC_CASE( kChar, char ) - INC_CASE( kUChar, uchar ) - INC_CASE( kShort, short ) - INC_CASE( kUShort, ushort ) - INC_CASE( kInt, int ) - INC_CASE( kUInt, uint ) - INC_CASE( kLong, long ) - INC_CASE( kULong, ulong ) - INC_CASE( kFloat, float ) - default: - break; + INC_CASE(kChar, char) + INC_CASE(kUChar, uchar) + INC_CASE(kShort, short) + INC_CASE(kUShort, ushort) + INC_CASE(kInt, int) + INC_CASE(kUInt, uint) + INC_CASE(kLong, long) + INC_CASE(kULong, ulong) + INC_CASE(kFloat, float) + default: break; } } -int test_buffer_kernel(cl_context context, cl_command_queue queue, ExplicitType vecType, size_t vecSize, int numElements, int validate_only, MTdata d) +int test_buffer_kernel(cl_context context, cl_command_queue queue, + ExplicitType vecType, size_t vecSize, int numElements, + int validate_only, MTdata d) { clProgramWrapper program; clKernelWrapper kernel; - clMemWrapper streams[ 3 ]; + clMemWrapper streams[3]; size_t dataSize = numElements * 16 * sizeof(cl_long); #if !(defined(_WIN32) && defined(_MSC_VER)) - cl_long inData[numElements * 16], outDataCL[numElements * 16], outDataGL[ numElements * 16 ]; + cl_long inData[numElements * 16], outDataCL[numElements * 16], + outDataGL[numElements * 16]; #else - cl_long* inData = (cl_long*)_malloca(dataSize); - cl_long* outDataCL = (cl_long*)_malloca(dataSize); - cl_long* outDataGL = (cl_long*)_malloca(dataSize); + cl_long *inData = (cl_long *)_malloca(dataSize); + cl_long *outDataCL = (cl_long *)_malloca(dataSize); + cl_long *outDataGL = (cl_long *)_malloca(dataSize); #endif glBufferWrapper inGLBuffer, outGLBuffer; - int i; + int i; size_t bufferSize; int error; @@ -146,210 +146,259 @@ int test_buffer_kernel(cl_context context, cl_command_queue queue, ExplicitType char sizeName[4]; /* Create the source */ - if( vecSize == 1 ) - sizeName[ 
0 ] = 0; + if (vecSize == 1) + sizeName[0] = 0; else - sprintf( sizeName, "%d", (int)vecSize ); + sprintf(sizeName, "%d", (int)vecSize); - sprintf( kernelSource, bufferKernelPattern, get_explicit_type_name( vecType ), sizeName, - get_explicit_type_name( vecType ), sizeName, - get_explicit_type_name( vecType ), sizeName, - get_explicit_type_name( vecType ), sizeName, - get_explicit_type_name( vecType ), sizeName ); + sprintf(kernelSource, bufferKernelPattern, get_explicit_type_name(vecType), + sizeName, get_explicit_type_name(vecType), sizeName, + get_explicit_type_name(vecType), sizeName, + get_explicit_type_name(vecType), sizeName, + get_explicit_type_name(vecType), sizeName); /* Create kernels */ programPtr = kernelSource; - if( create_single_kernel_helper( context, &program, &kernel, 1, (const char **)&programPtr, "sample_test" ) ) + if (create_single_kernel_helper(context, &program, &kernel, 1, + (const char **)&programPtr, "sample_test")) { return -1; } - bufferSize = numElements * vecSize * get_explicit_type_size( vecType ); + bufferSize = numElements * vecSize * get_explicit_type_size(vecType); /* Generate some almost-random input data */ - gen_input_data( vecType, vecSize * numElements, d, inData ); - memset( outDataCL, 0, dataSize ); - memset( outDataGL, 0, dataSize ); + gen_input_data(vecType, vecSize * numElements, d, inData); + memset(outDataCL, 0, dataSize); + memset(outDataGL, 0, dataSize); /* Generate some GL buffers to go against */ - glGenBuffers( 1, &inGLBuffer ); - glGenBuffers( 1, &outGLBuffer ); + glGenBuffers(1, &inGLBuffer); + glGenBuffers(1, &outGLBuffer); - glBindBuffer( GL_ARRAY_BUFFER, inGLBuffer ); - glBufferData( GL_ARRAY_BUFFER, bufferSize, inData, GL_STATIC_DRAW ); + glBindBuffer(GL_ARRAY_BUFFER, inGLBuffer); + glBufferData(GL_ARRAY_BUFFER, bufferSize, inData, GL_STATIC_DRAW); - // Note: we need to bind the output buffer, even though we don't care about its values yet, - // because CL needs it to get the buffer size - glBindBuffer( 
GL_ARRAY_BUFFER, outGLBuffer ); - glBufferData( GL_ARRAY_BUFFER, bufferSize, outDataGL, GL_STATIC_DRAW ); + // Note: we need to bind the output buffer, even though we don't care about + // its values yet, because CL needs it to get the buffer size + glBindBuffer(GL_ARRAY_BUFFER, outGLBuffer); + glBufferData(GL_ARRAY_BUFFER, bufferSize, outDataGL, GL_STATIC_DRAW); - glBindBuffer( GL_ARRAY_BUFFER, 0 ); + glBindBuffer(GL_ARRAY_BUFFER, 0); glFinish(); - /* Generate some streams. The first and last ones are GL, middle one just vanilla CL */ - streams[ 0 ] = (*clCreateFromGLBuffer_ptr)( context, CL_MEM_READ_ONLY, inGLBuffer, &error ); - test_error( error, "Unable to create input GL buffer" ); + /* Generate some streams. The first and last ones are GL, middle one just + * vanilla CL */ + streams[0] = (*clCreateFromGLBuffer_ptr)(context, CL_MEM_READ_ONLY, + inGLBuffer, &error); + test_error(error, "Unable to create input GL buffer"); - streams[ 1 ] = clCreateBuffer( context, CL_MEM_READ_WRITE, bufferSize, NULL, &error ); - test_error( error, "Unable to create output CL buffer" ); + streams[1] = + clCreateBuffer(context, CL_MEM_READ_WRITE, bufferSize, NULL, &error); + test_error(error, "Unable to create output CL buffer"); - streams[ 2 ] = (*clCreateFromGLBuffer_ptr)( context, CL_MEM_WRITE_ONLY, outGLBuffer, &error ); - test_error( error, "Unable to create output GL buffer" ); + streams[2] = (*clCreateFromGLBuffer_ptr)(context, CL_MEM_WRITE_ONLY, + outGLBuffer, &error); + test_error(error, "Unable to create output GL buffer"); - /* Validate the info */ - if (validate_only) { - int result = (CheckGLObjectInfo(streams[0], CL_GL_OBJECT_BUFFER, (GLuint)inGLBuffer, (GLenum)0, 0) | - CheckGLObjectInfo(streams[2], CL_GL_OBJECT_BUFFER, (GLuint)outGLBuffer, (GLenum)0, 0) ); - for(i=0;i<3;i++) + /* Validate the info */ + if (validate_only) { - clReleaseMemObject(streams[i]); - streams[i] = NULL; - } + int result = (CheckGLObjectInfo(streams[0], CL_GL_OBJECT_BUFFER, + 
(GLuint)inGLBuffer, (GLenum)0, 0) + | CheckGLObjectInfo(streams[2], CL_GL_OBJECT_BUFFER, + (GLuint)outGLBuffer, (GLenum)0, 0)); + for (i = 0; i < 3; i++) + { + streams[i].reset(); + } - glDeleteBuffers(1, &inGLBuffer); inGLBuffer = 0; - glDeleteBuffers(1, &outGLBuffer); outGLBuffer = 0; + glDeleteBuffers(1, &inGLBuffer); + inGLBuffer = 0; + glDeleteBuffers(1, &outGLBuffer); + outGLBuffer = 0; - return result; - } + return result; + } /* Assign streams and execute */ - for( int i = 0; i < 3; i++ ) + for (int i = 0; i < 3; i++) { - error = clSetKernelArg( kernel, i, sizeof( streams[ i ] ), &streams[ i ] ); - test_error( error, "Unable to set kernel arguments" ); + error = clSetKernelArg(kernel, i, sizeof(streams[i]), &streams[i]); + test_error(error, "Unable to set kernel arguments"); } - error = (*clEnqueueAcquireGLObjects_ptr)( queue, 1, &streams[ 0 ], 0, NULL, NULL); - test_error( error, "Unable to acquire GL obejcts"); - error = (*clEnqueueAcquireGLObjects_ptr)( queue, 1, &streams[ 2 ], 0, NULL, NULL); - test_error( error, "Unable to acquire GL obejcts"); + error = + (*clEnqueueAcquireGLObjects_ptr)(queue, 1, &streams[0], 0, NULL, NULL); + test_error(error, "Unable to acquire GL obejcts"); + error = + (*clEnqueueAcquireGLObjects_ptr)(queue, 1, &streams[2], 0, NULL, NULL); + test_error(error, "Unable to acquire GL obejcts"); /* Run the kernel */ threads[0] = numElements; - error = get_max_common_work_group_size( context, kernel, threads[0], &localThreads[0] ); - test_error( error, "Unable to get work group size to use" ); - - error = clEnqueueNDRangeKernel( queue, kernel, 1, NULL, threads, localThreads, 0, NULL, NULL ); - test_error( error, "Unable to execute test kernel" ); - - error = (*clEnqueueReleaseGLObjects_ptr)( queue, 1, &streams[ 0 ], 0, NULL, NULL ); - test_error(error, "clEnqueueReleaseGLObjects failed"); - error = (*clEnqueueReleaseGLObjects_ptr)( queue, 1, &streams[ 2 ], 0, NULL, NULL ); - test_error(error, "clEnqueueReleaseGLObjects failed"); - - // 
Get the results from both CL and GL and make sure everything looks correct - error = clEnqueueReadBuffer( queue, streams[ 1 ], CL_TRUE, 0, bufferSize, outDataCL, 0, NULL, NULL ); - test_error( error, "Unable to read output CL array!" ); - - glBindBuffer( GL_ARRAY_BUFFER, outGLBuffer ); - void *glMem = glMapBuffer( GL_ARRAY_BUFFER, GL_READ_ONLY ); - memcpy( outDataGL, glMem, bufferSize ); - glUnmapBuffer( GL_ARRAY_BUFFER ); - - char *inP = (char *)inData, *glP = (char *)outDataGL, *clP = (char *)outDataCL; + error = get_max_common_work_group_size(context, kernel, threads[0], + &localThreads[0]); + test_error(error, "Unable to get work group size to use"); + + error = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, threads, + localThreads, 0, NULL, NULL); + test_error(error, "Unable to execute test kernel"); + + error = + (*clEnqueueReleaseGLObjects_ptr)(queue, 1, &streams[0], 0, NULL, NULL); + test_error(error, "clEnqueueReleaseGLObjects failed"); + error = + (*clEnqueueReleaseGLObjects_ptr)(queue, 1, &streams[2], 0, NULL, NULL); + test_error(error, "clEnqueueReleaseGLObjects failed"); + + // Get the results from both CL and GL and make sure everything looks + // correct + error = clEnqueueReadBuffer(queue, streams[1], CL_TRUE, 0, bufferSize, + outDataCL, 0, NULL, NULL); + test_error(error, "Unable to read output CL array!"); + + glBindBuffer(GL_ARRAY_BUFFER, outGLBuffer); + void *glMem = glMapBuffer(GL_ARRAY_BUFFER, GL_READ_ONLY); + memcpy(outDataGL, glMem, bufferSize); + glUnmapBuffer(GL_ARRAY_BUFFER); + + char *inP = (char *)inData, *glP = (char *)outDataGL, + *clP = (char *)outDataCL; error = 0; - for( size_t i = 0; i < numElements * vecSize; i++ ) + for (size_t i = 0; i < numElements * vecSize; i++) { cl_long expectedCLValue, expectedGLValue; - get_incremented_value( inP, &expectedCLValue, vecType ); - get_incremented_value( &expectedCLValue, &expectedGLValue, vecType ); + get_incremented_value(inP, &expectedCLValue, vecType); + 
get_incremented_value(&expectedCLValue, &expectedGLValue, vecType); - if( memcmp( clP, &expectedCLValue, get_explicit_type_size( vecType ) ) != 0 ) + if (memcmp(clP, &expectedCLValue, get_explicit_type_size(vecType)) != 0) { - char scratch[ 64 ]; - log_error( "ERROR: Data sample %d from the CL output did not validate!\n", (int)i ); - log_error( "\t Input: %s\n", GetDataVectorString( inP, get_explicit_type_size( vecType ), 1, scratch ) ); - log_error( "\tExpected: %s\n", GetDataVectorString( &expectedCLValue, get_explicit_type_size( vecType ), 1, scratch ) ); - log_error( "\t Actual: %s\n", GetDataVectorString( clP, get_explicit_type_size( vecType ), 1, scratch ) ); + char scratch[64]; + log_error( + "ERROR: Data sample %d from the CL output did not validate!\n", + (int)i); + log_error("\t Input: %s\n", + GetDataVectorString(inP, get_explicit_type_size(vecType), + 1, scratch)); + log_error("\tExpected: %s\n", + GetDataVectorString(&expectedCLValue, + get_explicit_type_size(vecType), 1, + scratch)); + log_error("\t Actual: %s\n", + GetDataVectorString(clP, get_explicit_type_size(vecType), + 1, scratch)); error = -1; } - if( memcmp( glP, &expectedGLValue, get_explicit_type_size( vecType ) ) != 0 ) + if (memcmp(glP, &expectedGLValue, get_explicit_type_size(vecType)) != 0) { - char scratch[ 64 ]; - log_error( "ERROR: Data sample %d from the GL output did not validate!\n", (int)i ); - log_error( "\t Input: %s\n", GetDataVectorString( inP, get_explicit_type_size( vecType ), 1, scratch ) ); - log_error( "\tExpected: %s\n", GetDataVectorString( &expectedGLValue, get_explicit_type_size( vecType ), 1, scratch ) ); - log_error( "\t Actual: %s\n", GetDataVectorString( glP, get_explicit_type_size( vecType ), 1, scratch ) ); + char scratch[64]; + log_error( + "ERROR: Data sample %d from the GL output did not validate!\n", + (int)i); + log_error("\t Input: %s\n", + GetDataVectorString(inP, get_explicit_type_size(vecType), + 1, scratch)); + log_error("\tExpected: %s\n", + 
GetDataVectorString(&expectedGLValue, + get_explicit_type_size(vecType), 1, + scratch)); + log_error("\t Actual: %s\n", + GetDataVectorString(glP, get_explicit_type_size(vecType), + 1, scratch)); error = -1; } - if( error ) - return error; + if (error) return error; - inP += get_explicit_type_size( vecType ); - glP += get_explicit_type_size( vecType ); - clP += get_explicit_type_size( vecType ); + inP += get_explicit_type_size(vecType); + glP += get_explicit_type_size(vecType); + clP += get_explicit_type_size(vecType); } - for(i=0;i<3;i++) + for (i = 0; i < 3; i++) { - clReleaseMemObject(streams[i]); - streams[i] = NULL; + streams[i].reset(); } - glDeleteBuffers(1, &inGLBuffer); inGLBuffer = 0; - glDeleteBuffers(1, &outGLBuffer); outGLBuffer = 0; + glDeleteBuffers(1, &inGLBuffer); + inGLBuffer = 0; + glDeleteBuffers(1, &outGLBuffer); + outGLBuffer = 0; return 0; } -int test_buffers( cl_device_id device, cl_context context, cl_command_queue queue, int numElements ) +int test_buffers(cl_device_id device, cl_context context, + cl_command_queue queue, int numElements) { - ExplicitType vecType[] = { kChar, kUChar, kShort, kUShort, kInt, kUInt, kLong, kULong, kFloat, kNumExplicitTypes }; + ExplicitType vecType[] = { + kChar, kUChar, kShort, kUShort, kInt, + kUInt, kLong, kULong, kFloat, kNumExplicitTypes + }; unsigned int vecSizes[] = { 1, 2, 4, 8, 16, 0 }; unsigned int index, typeIndex; int retVal = 0; RandomSeed seed(gRandomSeed); - for( typeIndex = 0; vecType[ typeIndex ] != kNumExplicitTypes; typeIndex++ ) + for (typeIndex = 0; vecType[typeIndex] != kNumExplicitTypes; typeIndex++) { - for( index = 0; vecSizes[ index ] != 0; index++ ) + for (index = 0; vecSizes[index] != 0; index++) { // Test! 
- if( test_buffer_kernel( context, queue, vecType[ typeIndex ], vecSizes[ index ], numElements, 0, seed) != 0 ) + if (test_buffer_kernel(context, queue, vecType[typeIndex], + vecSizes[index], numElements, 0, seed) + != 0) { - char sizeNames[][ 4 ] = { "", "", "2", "", "4", "", "", "", "8", "", "", "", "", "", "", "", "16" }; - log_error( " Buffer test %s%s FAILED\n", get_explicit_type_name( vecType[ typeIndex ] ), sizeNames[ vecSizes[ index ] ] ); + char sizeNames[][4] = { "", "", "2", "", "4", "", "", "", "8", + "", "", "", "", "", "", "", "16" }; + log_error(" Buffer test %s%s FAILED\n", + get_explicit_type_name(vecType[typeIndex]), + sizeNames[vecSizes[index]]); retVal++; } } } return retVal; - } -int test_buffers_getinfo( cl_device_id device, cl_context context, cl_command_queue queue, int numElements ) +int test_buffers_getinfo(cl_device_id device, cl_context context, + cl_command_queue queue, int numElements) { - ExplicitType vecType[] = { kChar, kUChar, kShort, kUShort, kInt, kUInt, kLong, kULong, kFloat, kNumExplicitTypes }; + ExplicitType vecType[] = { + kChar, kUChar, kShort, kUShort, kInt, + kUInt, kLong, kULong, kFloat, kNumExplicitTypes + }; unsigned int vecSizes[] = { 1, 2, 4, 8, 16, 0 }; unsigned int index, typeIndex; int retVal = 0; - RandomSeed seed( gRandomSeed ); + RandomSeed seed(gRandomSeed); - for( typeIndex = 0; vecType[ typeIndex ] != kNumExplicitTypes; typeIndex++ ) + for (typeIndex = 0; vecType[typeIndex] != kNumExplicitTypes; typeIndex++) { - for( index = 0; vecSizes[ index ] != 0; index++ ) + for (index = 0; vecSizes[index] != 0; index++) { // Test! 
- if( test_buffer_kernel( context, queue, vecType[ typeIndex ], vecSizes[ index ], numElements, 1, seed ) != 0 ) + if (test_buffer_kernel(context, queue, vecType[typeIndex], + vecSizes[index], numElements, 1, seed) + != 0) { - char sizeNames[][ 4 ] = { "", "", "2", "", "4", "", "", "", "8", "", "", "", "", "", "", "", "16" }; - log_error( " Buffer test %s%s FAILED\n", get_explicit_type_name( vecType[ typeIndex ] ), sizeNames[ vecSizes[ index ] ] ); + char sizeNames[][4] = { "", "", "2", "", "4", "", "", "", "8", + "", "", "", "", "", "", "", "16" }; + log_error(" Buffer test %s%s FAILED\n", + get_explicit_type_name(vecType[typeIndex]), + sizeNames[vecSizes[index]]); retVal++; } } } return retVal; - } - - - diff --git a/test_conformance/gl/test_fence_sync.cpp b/test_conformance/gl/test_fence_sync.cpp index 00bf2cc9..35cc62de 100644 --- a/test_conformance/gl/test_fence_sync.cpp +++ b/test_conformance/gl/test_fence_sync.cpp @@ -1,6 +1,6 @@ // // Copyright (c) 2017 The Khronos Group Inc. -// +// // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
// You may obtain a copy of the License at @@ -17,7 +17,7 @@ #include "gl/setup.h" #include "harness/genericThread.h" -#if defined( __APPLE__ ) +#if defined(__APPLE__) #include <OpenGL/glu.h> #else #include <GL/glu.h> @@ -40,112 +40,121 @@ typedef struct __GLsync *GLsync; #define APIENTRY #endif -typedef GLsync (APIENTRY *glFenceSyncPtr)(GLenum condition,GLbitfield flags); +typedef GLsync(APIENTRY *glFenceSyncPtr)(GLenum condition, GLbitfield flags); glFenceSyncPtr glFenceSyncFunc; -typedef bool (APIENTRY *glIsSyncPtr)(GLsync sync); +typedef bool(APIENTRY *glIsSyncPtr)(GLsync sync); glIsSyncPtr glIsSyncFunc; -typedef void (APIENTRY *glDeleteSyncPtr)(GLsync sync); +typedef void(APIENTRY *glDeleteSyncPtr)(GLsync sync); glDeleteSyncPtr glDeleteSyncFunc; -typedef GLenum (APIENTRY *glClientWaitSyncPtr)(GLsync sync,GLbitfield flags,GLuint64 timeout); +typedef GLenum(APIENTRY *glClientWaitSyncPtr)(GLsync sync, GLbitfield flags, + GLuint64 timeout); glClientWaitSyncPtr glClientWaitSyncFunc; -typedef void (APIENTRY *glWaitSyncPtr)(GLsync sync,GLbitfield flags,GLuint64 timeout); +typedef void(APIENTRY *glWaitSyncPtr)(GLsync sync, GLbitfield flags, + GLuint64 timeout); glWaitSyncPtr glWaitSyncFunc; -typedef void (APIENTRY *glGetInteger64vPtr)(GLenum pname, GLint64 *params); +typedef void(APIENTRY *glGetInteger64vPtr)(GLenum pname, GLint64 *params); glGetInteger64vPtr glGetInteger64vFunc; -typedef void (APIENTRY *glGetSyncivPtr)(GLsync sync,GLenum pname,GLsizei bufSize,GLsizei *length, - GLint *values); +typedef void(APIENTRY *glGetSyncivPtr)(GLsync sync, GLenum pname, + GLsizei bufSize, GLsizei *length, + GLint *values); glGetSyncivPtr glGetSyncivFunc; #define CHK_GL_ERR() printf("%s\n", gluErrorString(glGetError())) -static void InitSyncFns( void ) +static void InitSyncFns(void) { - glFenceSyncFunc = (glFenceSyncPtr)glutGetProcAddress( "glFenceSync" ); - glIsSyncFunc = (glIsSyncPtr)glutGetProcAddress( "glIsSync" ); - glDeleteSyncFunc = (glDeleteSyncPtr)glutGetProcAddress( 
"glDeleteSync" ); - glClientWaitSyncFunc = (glClientWaitSyncPtr)glutGetProcAddress( "glClientWaitSync" ); - glWaitSyncFunc = (glWaitSyncPtr)glutGetProcAddress( "glWaitSync" ); - glGetInteger64vFunc = (glGetInteger64vPtr)glutGetProcAddress( "glGetInteger64v" ); - glGetSyncivFunc = (glGetSyncivPtr)glutGetProcAddress( "glGetSynciv" ); + glFenceSyncFunc = (glFenceSyncPtr)glutGetProcAddress("glFenceSync"); + glIsSyncFunc = (glIsSyncPtr)glutGetProcAddress("glIsSync"); + glDeleteSyncFunc = (glDeleteSyncPtr)glutGetProcAddress("glDeleteSync"); + glClientWaitSyncFunc = + (glClientWaitSyncPtr)glutGetProcAddress("glClientWaitSync"); + glWaitSyncFunc = (glWaitSyncPtr)glutGetProcAddress("glWaitSync"); + glGetInteger64vFunc = + (glGetInteger64vPtr)glutGetProcAddress("glGetInteger64v"); + glGetSyncivFunc = (glGetSyncivPtr)glutGetProcAddress("glGetSynciv"); } #ifndef GL_ARB_sync -#define GL_MAX_SERVER_WAIT_TIMEOUT 0x9111 +#define GL_MAX_SERVER_WAIT_TIMEOUT 0x9111 -#define GL_OBJECT_TYPE 0x9112 -#define GL_SYNC_CONDITION 0x9113 -#define GL_SYNC_STATUS 0x9114 -#define GL_SYNC_FLAGS 0x9115 +#define GL_OBJECT_TYPE 0x9112 +#define GL_SYNC_CONDITION 0x9113 +#define GL_SYNC_STATUS 0x9114 +#define GL_SYNC_FLAGS 0x9115 -#define GL_SYNC_FENCE 0x9116 +#define GL_SYNC_FENCE 0x9116 -#define GL_SYNC_GPU_COMMANDS_COMPLETE 0x9117 +#define GL_SYNC_GPU_COMMANDS_COMPLETE 0x9117 -#define GL_UNSIGNALED 0x9118 -#define GL_SIGNALED 0x9119 +#define GL_UNSIGNALED 0x9118 +#define GL_SIGNALED 0x9119 -#define GL_SYNC_FLUSH_COMMANDS_BIT 0x00000001 +#define GL_SYNC_FLUSH_COMMANDS_BIT 0x00000001 -#define GL_TIMEOUT_IGNORED 0xFFFFFFFFFFFFFFFFull +#define GL_TIMEOUT_IGNORED 0xFFFFFFFFFFFFFFFFull -#define GL_ALREADY_SIGNALED 0x911A -#define GL_TIMEOUT_EXPIRED 0x911B -#define GL_CONDITION_SATISFIED 0x911C -#define GL_WAIT_FAILED 0x911D +#define GL_ALREADY_SIGNALED 0x911A +#define GL_TIMEOUT_EXPIRED 0x911B +#define GL_CONDITION_SATISFIED 0x911C +#define GL_WAIT_FAILED 0x911D #endif #define USING_ARB_sync 1 #endif 
-typedef cl_event (CL_API_CALL *clCreateEventFromGLsyncKHR_fn)( cl_context context, GLsync sync, cl_int *errCode_ret) ; +typedef cl_event(CL_API_CALL *clCreateEventFromGLsyncKHR_fn)( + cl_context context, GLsync sync, cl_int *errCode_ret); clCreateEventFromGLsyncKHR_fn clCreateEventFromGLsyncKHR_ptr; static const char *updateBuffersKernel[] = { - "__kernel void update( __global float4 * vertices, __global float4 *colors, int horizWrap, int rowIdx )\n" + "__kernel void update( __global float4 * vertices, __global float4 " + "*colors, int horizWrap, int rowIdx )\n" "{\n" " size_t tid = get_global_id(0);\n" "\n" " size_t xVal = ( tid & ( horizWrap - 1 ) );\n" " vertices[ tid * 2 + 0 ] = (float4)( xVal, rowIdx*16.f, 0.0f, 1.f );\n" - " vertices[ tid * 2 + 1 ] = (float4)( xVal, rowIdx*16.f + 4.0f, 0.0f, 1.f );\n" + " vertices[ tid * 2 + 1 ] = (float4)( xVal, rowIdx*16.f + 4.0f, 0.0f, " + "1.f );\n" "\n" " int rowV = rowIdx + 1;\n" - " colors[ tid * 2 + 0 ] = (float4)( ( rowV & 1 ) / 255.f, ( ( rowV & 2 ) >> 1 ) / 255.f, ( ( rowV & 4 ) >> 2 ) / 255.f, 1.f );\n" - " //colors[ tid * 2 + 0 ] = (float4)( (float)xVal/(float)horizWrap, 1.0f, 1.0f, 1.0f );\n" + " colors[ tid * 2 + 0 ] = (float4)( ( rowV & 1 ) / 255.f, ( ( rowV & 2 " + ") >> 1 ) / 255.f, ( ( rowV & 4 ) >> 2 ) / 255.f, 1.f );\n" + " //colors[ tid * 2 + 0 ] = (float4)( (float)xVal/(float)horizWrap, " + "1.0f, 1.0f, 1.0f );\n" " colors[ tid * 2 + 1 ] = colors[ tid * 2 + 0 ];\n" - "}\n" }; - -//Passthrough VertexShader -static const char *vertexshader = -"#version 150\n" -"uniform mat4 projMatrix;\n" -"in vec4 inPosition;\n" -"in vec4 inColor;\n" -"out vec4 vertColor;\n" -"void main (void) {\n" -" gl_Position = projMatrix*inPosition;\n" -" vertColor = inColor;\n" -"}\n"; - -//Passthrough FragmentShader -static const char *fragmentshader = -"#version 150\n" -"in vec4 vertColor;\n" -"out vec4 outColor;\n" -"void main (void) {\n" -" outColor = vertColor;\n" -"}\n"; + "}\n" +}; + +// Passthrough VertexShader +static 
const char *vertexshader = "#version 150\n" + "uniform mat4 projMatrix;\n" + "in vec4 inPosition;\n" + "in vec4 inColor;\n" + "out vec4 vertColor;\n" + "void main (void) {\n" + " gl_Position = projMatrix*inPosition;\n" + " vertColor = inColor;\n" + "}\n"; + +// Passthrough FragmentShader +static const char *fragmentshader = "#version 150\n" + "in vec4 vertColor;\n" + "out vec4 outColor;\n" + "void main (void) {\n" + " outColor = vertColor;\n" + "}\n"; GLuint createShaderProgram(GLint *posLoc, GLint *colLoc) { - GLint logLength, status; + GLint logLength, status; GLuint program = glCreateProgram(); GLuint vpShader; @@ -153,8 +162,9 @@ GLuint createShaderProgram(GLint *posLoc, GLint *colLoc) glShaderSource(vpShader, 1, (const GLchar **)&vertexshader, NULL); glCompileShader(vpShader); glGetShaderiv(vpShader, GL_INFO_LOG_LENGTH, &logLength); - if (logLength > 0) { - GLchar *log = (GLchar*) malloc(logLength); + if (logLength > 0) + { + GLchar *log = (GLchar *)malloc(logLength); glGetShaderInfoLog(vpShader, logLength, &logLength, log); log_info("Vtx Shader compile log:\n%s", log); free(log); @@ -175,8 +185,9 @@ GLuint createShaderProgram(GLint *posLoc, GLint *colLoc) glCompileShader(fpShader); glGetShaderiv(fpShader, GL_INFO_LOG_LENGTH, &logLength); - if (logLength > 0) { - GLchar *log = (GLchar*)malloc(logLength); + if (logLength > 0) + { + GLchar *log = (GLchar *)malloc(logLength); glGetShaderInfoLog(fpShader, logLength, &logLength, log); log_info("Frag Shader compile log:\n%s", log); free(log); @@ -192,8 +203,9 @@ GLuint createShaderProgram(GLint *posLoc, GLint *colLoc) glLinkProgram(program); glGetProgramiv(program, GL_INFO_LOG_LENGTH, &logLength); - if (logLength > 0) { - GLchar *log = (GLchar*)malloc(logLength); + if (logLength > 0) + { + GLchar *log = (GLchar *)malloc(logLength); glGetProgramInfoLog(program, logLength, &logLength, log); log_info("Program link log:\n%s", log); free(log); @@ -219,7 +231,7 @@ void destroyShaderProgram(GLuint program) glUseProgram(0); 
glGetAttachedShaders(program, 2, &count, shaders); int i; - for(i = 0; i < count; i++) + for (i = 0; i < count; i++) { glDetachShader(program, shaders[i]); glDeleteShader(shaders[i]); @@ -227,44 +239,49 @@ void destroyShaderProgram(GLuint program) glDeleteProgram(program); } -// This function queues up and runs the above CL kernel that writes the vertex data -cl_int run_cl_kernel( cl_kernel kernel, cl_command_queue queue, cl_mem stream0, cl_mem stream1, - cl_int rowIdx, cl_event fenceEvent, size_t numThreads ) +// This function queues up and runs the above CL kernel that writes the vertex +// data +cl_int run_cl_kernel(cl_kernel kernel, cl_command_queue queue, cl_mem stream0, + cl_mem stream1, cl_int rowIdx, cl_event fenceEvent, + size_t numThreads) { - cl_int error = clSetKernelArg( kernel, 3, sizeof( rowIdx ), &rowIdx ); - test_error( error, "Unable to set kernel arguments" ); + cl_int error = clSetKernelArg(kernel, 3, sizeof(rowIdx), &rowIdx); + test_error(error, "Unable to set kernel arguments"); clEventWrapper acqEvent1, acqEvent2, kernEvent, relEvent1, relEvent2; - int numEvents = ( fenceEvent != NULL ) ? 1 : 0; - cl_event *fence_evt = ( fenceEvent != NULL ) ? &fenceEvent : NULL; + int numEvents = (fenceEvent != NULL) ? 1 : 0; + cl_event *fence_evt = (fenceEvent != NULL) ? 
&fenceEvent : NULL; - error = (*clEnqueueAcquireGLObjects_ptr)( queue, 1, &stream0, numEvents, fence_evt, &acqEvent1 ); - test_error( error, "Unable to acquire GL obejcts"); - error = (*clEnqueueAcquireGLObjects_ptr)( queue, 1, &stream1, numEvents, fence_evt, &acqEvent2 ); - test_error( error, "Unable to acquire GL obejcts"); + error = (*clEnqueueAcquireGLObjects_ptr)(queue, 1, &stream0, numEvents, + fence_evt, &acqEvent1); + test_error(error, "Unable to acquire GL obejcts"); + error = (*clEnqueueAcquireGLObjects_ptr)(queue, 1, &stream1, numEvents, + fence_evt, &acqEvent2); + test_error(error, "Unable to acquire GL obejcts"); - cl_event evts[ 2 ] = { acqEvent1, acqEvent2 }; + cl_event evts[2] = { acqEvent1, acqEvent2 }; - error = clEnqueueNDRangeKernel( queue, kernel, 1, NULL, &numThreads, NULL, 2, evts, &kernEvent ); - test_error( error, "Unable to execute test kernel" ); + error = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &numThreads, NULL, 2, + evts, &kernEvent); + test_error(error, "Unable to execute test kernel"); - error = (*clEnqueueReleaseGLObjects_ptr)( queue, 1, &stream0, 1, &kernEvent, &relEvent1 ); + error = (*clEnqueueReleaseGLObjects_ptr)(queue, 1, &stream0, 1, &kernEvent, + &relEvent1); test_error(error, "clEnqueueReleaseGLObjects failed"); - error = (*clEnqueueReleaseGLObjects_ptr)( queue, 1, &stream1, 1, &kernEvent, &relEvent2 ); + error = (*clEnqueueReleaseGLObjects_ptr)(queue, 1, &stream1, 1, &kernEvent, + &relEvent2); test_error(error, "clEnqueueReleaseGLObjects failed"); - evts[ 0 ] = relEvent1; - evts[ 1 ] = relEvent2; - error = clWaitForEvents( 2, evts ); - test_error( error, "Unable to wait for release events" ); + evts[0] = relEvent1; + evts[1] = relEvent2; + error = clWaitForEvents(2, evts); + test_error(error, "Unable to wait for release events"); return 0; } -class RunThread : public genericThread -{ +class RunThread : public genericThread { public: - cl_kernel mKernel; cl_command_queue mQueue; cl_mem mStream0, mStream1; @@ -272,34 
+289,40 @@ public: cl_event mFenceEvent; size_t mNumThreads; - RunThread( cl_kernel kernel, cl_command_queue queue, cl_mem stream0, cl_mem stream1, size_t numThreads ) - : mKernel( kernel ), mQueue( queue ), mStream0( stream0 ), mStream1( stream1 ), mNumThreads( numThreads ) - { - } + RunThread(cl_kernel kernel, cl_command_queue queue, cl_mem stream0, + cl_mem stream1, size_t numThreads) + : mKernel(kernel), mQueue(queue), mStream0(stream0), mStream1(stream1), + mNumThreads(numThreads) + {} - void SetRunData( cl_int rowIdx, cl_event fenceEvent ) + void SetRunData(cl_int rowIdx, cl_event fenceEvent) { mRowIdx = rowIdx; mFenceEvent = fenceEvent; } - virtual void * IRun( void ) + virtual void *IRun(void) { - cl_int error = run_cl_kernel( mKernel, mQueue, mStream0, mStream1, mRowIdx, mFenceEvent, mNumThreads ); + cl_int error = run_cl_kernel(mKernel, mQueue, mStream0, mStream1, + mRowIdx, mFenceEvent, mNumThreads); return (void *)(uintptr_t)error; } }; -int test_fence_sync_single( cl_device_id device, cl_context context, cl_command_queue queue, bool separateThreads, GLint rend_vs, GLint read_vs, cl_device_id rend_device ) +int test_fence_sync_single(cl_device_id device, cl_context context, + cl_command_queue queue, bool separateThreads, + GLint rend_vs, GLint read_vs, + cl_device_id rend_device) { int error; const int framebufferSize = 512; - if( !is_extension_available( device, "cl_khr_gl_event" ) ) + if (!is_extension_available(device, "cl_khr_gl_event")) { - log_info( "NOTE: cl_khr_gl_event extension not present on this device; skipping fence sync test\n" ); + log_info("NOTE: cl_khr_gl_event extension not present on this device; " + "skipping fence sync test\n"); return 0; } @@ -312,10 +335,11 @@ int test_fence_sync_single( cl_device_id device, cl_context context, cl_command_ clGetPlatformIDs(0, NULL, &nplatforms); clGetPlatformIDs(1, &platform, NULL); - if (nplatforms > 1) { + if (nplatforms > 1) + { log_info("clGetPlatformIDs returned multiple values. 
This is not " - "an error, but might result in obtaining incorrect function " - "pointers if you do not want the first returned platform.\n"); + "an error, but might result in obtaining incorrect function " + "pointers if you do not want the first returned platform.\n"); // Show them the platform name, in case it is a problem. @@ -323,28 +347,35 @@ int test_fence_sync_single( cl_device_id device, cl_context context, cl_command_ char *name; clGetPlatformInfo(platform, CL_PLATFORM_NAME, 0, NULL, &size); - name = (char*)malloc(size); + name = (char *)malloc(size); clGetPlatformInfo(platform, CL_PLATFORM_NAME, size, name, NULL); log_info("Using platform with name: %s \n", name); free(name); } - clCreateEventFromGLsyncKHR_ptr = (clCreateEventFromGLsyncKHR_fn)clGetExtensionFunctionAddressForPlatform(platform, "clCreateEventFromGLsyncKHR"); - if( clCreateEventFromGLsyncKHR_ptr == NULL ) + clCreateEventFromGLsyncKHR_ptr = + (clCreateEventFromGLsyncKHR_fn)clGetExtensionFunctionAddressForPlatform( + platform, "clCreateEventFromGLsyncKHR"); + if (clCreateEventFromGLsyncKHR_ptr == NULL) { - log_error( "ERROR: Unable to run fence_sync test (clCreateEventFromGLsyncKHR function not discovered!)\n" ); - clCreateEventFromGLsyncKHR_ptr = (clCreateEventFromGLsyncKHR_fn)clGetExtensionFunctionAddressForPlatform(platform, "clCreateEventFromGLsyncAPPLE"); + log_error("ERROR: Unable to run fence_sync test " + "(clCreateEventFromGLsyncKHR function not discovered!)\n"); + clCreateEventFromGLsyncKHR_ptr = (clCreateEventFromGLsyncKHR_fn) + clGetExtensionFunctionAddressForPlatform( + platform, "clCreateEventFromGLsyncAPPLE"); return -1; } #ifdef USING_ARB_sync - char *gl_version_str = (char*)glGetString( GL_VERSION ); + char *gl_version_str = (char *)glGetString(GL_VERSION); float glCoreVersion; sscanf(gl_version_str, "%f", &glCoreVersion); - if( glCoreVersion < 3.0f ) + if (glCoreVersion < 3.0f) { - log_info( "OpenGL version %f does not support fence/sync! 
Skipping test.\n", glCoreVersion ); + log_info( + "OpenGL version %f does not support fence/sync! Skipping test.\n", + glCoreVersion); return 0; } @@ -354,10 +385,13 @@ int test_fence_sync_single( cl_device_id device, cl_context context, cl_command_ GLint val, screen; CGLGetVirtualScreen(currCtx, &screen); CGLDescribePixelFormat(pixFmt, screen, kCGLPFAOpenGLProfile, &val); - if(val != kCGLOGLPVersion_3_2_Core) + if (val != kCGLOGLPVersion_3_2_Core) { - log_error( "OpenGL context was not created with OpenGL version >= 3.0 profile even though platform supports it" - "OpenGL profile %f does not support fence/sync! Skipping test.\n", glCoreVersion ); + log_error( + "OpenGL context was not created with OpenGL version >= 3.0 profile " + "even though platform supports it" + "OpenGL profile %f does not support fence/sync! Skipping test.\n", + glCoreVersion); return -1; } #else @@ -365,7 +399,7 @@ int test_fence_sync_single( cl_device_id device, cl_context context, cl_command_ HDC hdc = wglGetCurrentDC(); HGLRC hglrc = wglGetCurrentContext(); #else - Display* dpy = glXGetCurrentDisplay(); + Display *dpy = glXGetCurrentDisplay(); GLXDrawable drawable = glXGetCurrentDrawable(); GLXContext ctx = glXGetCurrentContext(); #endif @@ -386,51 +420,66 @@ int test_fence_sync_single( cl_device_id device, cl_context context, cl_command_ GLint posLoc, colLoc; GLuint shaderprogram = createShaderProgram(&posLoc, &colLoc); - if(!shaderprogram) + if (!shaderprogram) { log_error("Failed to create shader program\n"); return -1; } - float l = 0.0f; float r = framebufferSize; - float b = 0.0f; float t = framebufferSize; - - float projMatrix[16] = { 2.0f/(r-l), 0.0f, 0.0f, 0.0f, - 0.0f, 2.0f/(t-b), 0.0f, 0.0f, - 0.0f, 0.0f, -1.0f, 0.0f, - -(r+l)/(r-l), -(t+b)/(t-b), 0.0f, 1.0f - }; + float l = 0.0f; + float r = framebufferSize; + float b = 0.0f; + float t = framebufferSize; + + float projMatrix[16] = { 2.0f / (r - l), + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 2.0f / (t - b), + 0.0f, + 0.0f, + 0.0f, + 
0.0f, + -1.0f, + 0.0f, + -(r + l) / (r - l), + -(t + b) / (t - b), + 0.0f, + 1.0f }; glUseProgram(shaderprogram); GLuint projMatLoc = glGetUniformLocation(shaderprogram, "projMatrix"); glUniformMatrix4fv(projMatLoc, 1, 0, projMatrix); glUseProgram(0); - // Note: the framebuffer is just the target to verify our results against, so we don't - // really care to go through all the possible formats in this case + // Note: the framebuffer is just the target to verify our results against, + // so we don't really care to go through all the possible formats in this + // case glFramebufferWrapper glFramebuffer; glRenderbufferWrapper glRenderbuffer; - error = CreateGLRenderbufferRaw( framebufferSize, 128, GL_COLOR_ATTACHMENT0_EXT, - GL_RGBA, GL_RGBA, GL_UNSIGNED_INT_8_8_8_8_REV, - &glFramebuffer, &glRenderbuffer ); - if( error != 0 ) - return error; + error = CreateGLRenderbufferRaw( + framebufferSize, 128, GL_COLOR_ATTACHMENT0_EXT, GL_RGBA, GL_RGBA, + GL_UNSIGNED_INT_8_8_8_8_REV, &glFramebuffer, &glRenderbuffer); + if (error != 0) return error; GLuint vao; glGenVertexArrays(1, &vao); glBindVertexArray(vao); glBufferWrapper vtxBuffer, colorBuffer; - glGenBuffers( 1, &vtxBuffer ); - glGenBuffers( 1, &colorBuffer ); + glGenBuffers(1, &vtxBuffer); + glGenBuffers(1, &colorBuffer); - const int numHorizVertices = ( framebufferSize * 64 ) + 1; + const int numHorizVertices = (framebufferSize * 64) + 1; - glBindBuffer( GL_ARRAY_BUFFER, vtxBuffer ); - glBufferData( GL_ARRAY_BUFFER, sizeof( GLfloat ) * numHorizVertices * 2 * 4, NULL, GL_STATIC_DRAW ); + glBindBuffer(GL_ARRAY_BUFFER, vtxBuffer); + glBufferData(GL_ARRAY_BUFFER, sizeof(GLfloat) * numHorizVertices * 2 * 4, + NULL, GL_STATIC_DRAW); - glBindBuffer( GL_ARRAY_BUFFER, colorBuffer ); - glBufferData( GL_ARRAY_BUFFER, sizeof( GLfloat ) * numHorizVertices * 2 * 4, NULL, GL_STATIC_DRAW ); + glBindBuffer(GL_ARRAY_BUFFER, colorBuffer); + glBufferData(GL_ARRAY_BUFFER, sizeof(GLfloat) * numHorizVertices * 2 * 4, + NULL, GL_STATIC_DRAW); 
// Now that the requisite objects are bound, we can attempt program // validation: @@ -439,8 +488,9 @@ int test_fence_sync_single( cl_device_id device, cl_context context, cl_command_ GLint logLength, status; glGetProgramiv(shaderprogram, GL_INFO_LOG_LENGTH, &logLength); - if (logLength > 0) { - GLchar *log = (GLchar*)malloc(logLength); + if (logLength > 0) + { + GLchar *log = (GLchar *)malloc(logLength); glGetProgramInfoLog(shaderprogram, logLength, &logLength, log); log_info("Program validate log:\n%s", log); free(log); @@ -455,125 +505,131 @@ int test_fence_sync_single( cl_device_id device, cl_context context, cl_command_ clProgramWrapper program; clKernelWrapper kernel; - clMemWrapper streams[ 2 ]; + clMemWrapper streams[2]; - if( create_single_kernel_helper( context, &program, &kernel, 1, updateBuffersKernel, "update" ) ) + if (create_single_kernel_helper(context, &program, &kernel, 1, + updateBuffersKernel, "update")) return -1; - streams[ 0 ] = (*clCreateFromGLBuffer_ptr)( context, CL_MEM_READ_WRITE, vtxBuffer, &error ); - test_error( error, "Unable to create CL buffer from GL vertex buffer" ); + streams[0] = (*clCreateFromGLBuffer_ptr)(context, CL_MEM_READ_WRITE, + vtxBuffer, &error); + test_error(error, "Unable to create CL buffer from GL vertex buffer"); - streams[ 1 ] = (*clCreateFromGLBuffer_ptr)( context, CL_MEM_READ_WRITE, colorBuffer, &error ); - test_error( error, "Unable to create CL buffer from GL color buffer" ); + streams[1] = (*clCreateFromGLBuffer_ptr)(context, CL_MEM_READ_WRITE, + colorBuffer, &error); + test_error(error, "Unable to create CL buffer from GL color buffer"); - error = clSetKernelArg( kernel, 0, sizeof( streams[ 0 ] ), &streams[ 0 ] ); - test_error( error, "Unable to set kernel arguments" ); + error = clSetKernelArg(kernel, 0, sizeof(streams[0]), &streams[0]); + test_error(error, "Unable to set kernel arguments"); - error = clSetKernelArg( kernel, 1, sizeof( streams[ 1 ] ), &streams[ 1 ] ); - test_error( error, "Unable to set 
kernel arguments" ); + error = clSetKernelArg(kernel, 1, sizeof(streams[1]), &streams[1]); + test_error(error, "Unable to set kernel arguments"); cl_int horizWrap = (cl_int)framebufferSize; - error = clSetKernelArg( kernel, 2, sizeof( horizWrap ), &horizWrap ); - test_error( error, "Unable to set kernel arguments" ); + error = clSetKernelArg(kernel, 2, sizeof(horizWrap), &horizWrap); + test_error(error, "Unable to set kernel arguments"); - glViewport( 0, 0, framebufferSize, framebufferSize ); - glClearColor( 0, 0, 0, 0 ); - glClear( GL_COLOR_BUFFER_BIT ); - glClear( GL_DEPTH_BUFFER_BIT ); - glDisable( GL_DEPTH_TEST ); - glEnable( GL_BLEND ); - glBlendFunc( GL_ONE, GL_ONE ); + glViewport(0, 0, framebufferSize, framebufferSize); + glClearColor(0, 0, 0, 0); + glClear(GL_COLOR_BUFFER_BIT); + glClear(GL_DEPTH_BUFFER_BIT); + glDisable(GL_DEPTH_TEST); + glEnable(GL_BLEND); + glBlendFunc(GL_ONE, GL_ONE); clEventWrapper fenceEvent; GLsync glFence = 0; // Do a loop through 8 different horizontal stripes against the framebuffer - RunThread thread( kernel, queue, streams[ 0 ], streams[ 1 ], (size_t)numHorizVertices ); + RunThread thread(kernel, queue, streams[0], streams[1], + (size_t)numHorizVertices); - for( int i = 0; i < 8; i++ ) + for (int i = 0; i < 8; i++) { // if current rendering device is not the compute device and // separateThreads == false which means compute is going on same // thread and we are using implicit synchronization (no GLSync obj used) - // then glFlush by clEnqueueAcquireGLObject is not sufficient ... we need - // to wait for rendering to finish on other device before CL can start - // writing to CL/GL shared mem objects. When separateThreads is true i.e. - // we are using GLSync obj to synchronize then we dont need to call glFinish - // here since CL should wait for rendering on other device before this - // GLSync object to finish before it starts writing to shared mem object. 
- // Also rend_device == compute_device no need to call glFinish - if(rend_device != device && !separateThreads) - glFinish(); - - if( separateThreads ) + // then glFlush by clEnqueueAcquireGLObject is not sufficient ... we + // need to wait for rendering to finish on other device before CL can + // start writing to CL/GL shared mem objects. When separateThreads is + // true i.e. we are using GLSync obj to synchronize then we dont need to + // call glFinish here since CL should wait for rendering on other device + // before this GLSync object to finish before it starts writing to + // shared mem object. Also rend_device == compute_device no need to call + // glFinish + if (rend_device != device && !separateThreads) glFinish(); + + if (separateThreads) { - if (fenceEvent != NULL) - { - clReleaseEvent(fenceEvent); - glDeleteSyncFunc(glFence); - } + glDeleteSyncFunc(glFence); glFence = glFenceSyncFunc(GL_SYNC_GPU_COMMANDS_COMPLETE, 0); - fenceEvent = clCreateEventFromGLsyncKHR_ptr(context, glFence, &error); + fenceEvent = + clCreateEventFromGLsyncKHR_ptr(context, glFence, &error); test_error(error, "Unable to create CL event from GL fence"); - // in case of explicit synchronization, we just wait for the sync object to complete - // in clEnqueueAcquireGLObject but we dont flush. Its application's responsibility - // to flush on the context on which glSync is created + // in case of explicit synchronization, we just wait for the sync + // object to complete in clEnqueueAcquireGLObject but we dont flush. 
+ // Its application's responsibility to flush on the context on which + // glSync is created glFlush(); - thread.SetRunData( (cl_int)i, fenceEvent ); + thread.SetRunData((cl_int)i, fenceEvent); thread.Start(); error = (cl_int)(size_t)thread.Join(); } else { - error = run_cl_kernel( kernel, queue, streams[ 0 ], streams[ 1 ], (cl_int)i, fenceEvent, (size_t)numHorizVertices ); + error = + run_cl_kernel(kernel, queue, streams[0], streams[1], (cl_int)i, + fenceEvent, (size_t)numHorizVertices); } - test_error( error, "Unable to run CL kernel" ); + test_error(error, "Unable to run CL kernel"); glUseProgram(shaderprogram); glEnableVertexAttribArray(posLoc); glEnableVertexAttribArray(colLoc); - glBindBuffer( GL_ARRAY_BUFFER, vtxBuffer ); - glVertexAttribPointer(posLoc, 4, GL_FLOAT, GL_FALSE, 4*sizeof(GLfloat), 0); - glBindBuffer( GL_ARRAY_BUFFER, colorBuffer ); - glVertexAttribPointer(colLoc, 4, GL_FLOAT, GL_FALSE, 4*sizeof(GLfloat), 0); - glBindBuffer( GL_ARRAY_BUFFER, 0 ); + glBindBuffer(GL_ARRAY_BUFFER, vtxBuffer); + glVertexAttribPointer(posLoc, 4, GL_FLOAT, GL_FALSE, + 4 * sizeof(GLfloat), 0); + glBindBuffer(GL_ARRAY_BUFFER, colorBuffer); + glVertexAttribPointer(colLoc, 4, GL_FLOAT, GL_FALSE, + 4 * sizeof(GLfloat), 0); + glBindBuffer(GL_ARRAY_BUFFER, 0); - glDrawArrays( GL_TRIANGLE_STRIP, 0, numHorizVertices * 2 ); + glDrawArrays(GL_TRIANGLE_STRIP, 0, numHorizVertices * 2); glDisableVertexAttribArray(posLoc); glDisableVertexAttribArray(colLoc); glUseProgram(0); - if( separateThreads ) + if (separateThreads) { - // If we're on the same thread, then we're testing implicit syncing, so we - // don't need the actual fence code - if( fenceEvent != NULL ) - { - clReleaseEvent( fenceEvent ); - glDeleteSyncFunc( glFence ); - } + // If we're on the same thread, then we're testing implicit syncing, + // so we don't need the actual fence code + glDeleteSyncFunc(glFence); + - glFence = glFenceSyncFunc( GL_SYNC_GPU_COMMANDS_COMPLETE, 0 ); - fenceEvent = 
clCreateEventFromGLsyncKHR_ptr( context, glFence, &error ); - test_error( error, "Unable to create CL event from GL fence" ); + glFence = glFenceSyncFunc(GL_SYNC_GPU_COMMANDS_COMPLETE, 0); + fenceEvent = + clCreateEventFromGLsyncKHR_ptr(context, glFence, &error); + test_error(error, "Unable to create CL event from GL fence"); - // in case of explicit synchronization, we just wait for the sync object to complete - // in clEnqueueAcquireGLObject but we dont flush. Its application's responsibility - // to flush on the context on which glSync is created + // in case of explicit synchronization, we just wait for the sync + // object to complete in clEnqueueAcquireGLObject but we dont flush. + // Its application's responsibility to flush on the context on which + // glSync is created glFlush(); } else glFinish(); } - if( glFence != 0 ) - // Don't need the final release for fenceEvent, because the wrapper will take care of that - glDeleteSyncFunc( glFence ); + if (glFence != 0) + // Don't need the final release for fenceEvent, because the wrapper will + // take care of that + glDeleteSyncFunc(glFence); #ifdef __APPLE__ CGLSetVirtualScreen(CGLGetCurrentContext(), read_vs); @@ -585,54 +641,62 @@ int test_fence_sync_single( cl_device_id device, cl_context context, cl_command_ #endif #endif // Grab the contents of the final framebuffer - BufferOwningPtr<char> resultData( ReadGLRenderbuffer( glFramebuffer, glRenderbuffer, - GL_COLOR_ATTACHMENT0_EXT, - GL_RGBA, GL_RGBA, GL_UNSIGNED_INT_8_8_8_8_REV, kUChar, - framebufferSize, 128 ) ); - - // Check the contents now. We should end up with solid color bands 32 pixels high and the - // full width of the framebuffer, at values (128,128,128) due to the additive blending - for( int i = 0; i < 8; i++ ) + BufferOwningPtr<char> resultData(ReadGLRenderbuffer( + glFramebuffer, glRenderbuffer, GL_COLOR_ATTACHMENT0_EXT, GL_RGBA, + GL_RGBA, GL_UNSIGNED_INT_8_8_8_8_REV, kUChar, framebufferSize, 128)); + + // Check the contents now. 
We should end up with solid color bands 32 pixels + // high and the full width of the framebuffer, at values (128,128,128) due + // to the additive blending + for (int i = 0; i < 8; i++) { - for( int y = 0; y < 4; y++ ) + for (int y = 0; y < 4; y++) { - // Note: coverage will be double because the 63-0 triangle overwrites again at the end of the pass - cl_uchar valA = ( ( ( i + 1 ) & 1 ) ) * numHorizVertices * 2 / framebufferSize; - cl_uchar valB = ( ( ( i + 1 ) & 2 ) >> 1 ) * numHorizVertices * 2 / framebufferSize; - cl_uchar valC = ( ( ( i + 1 ) & 4 ) >> 2 ) * numHorizVertices * 2 / framebufferSize; - - cl_uchar *row = (cl_uchar *)&resultData[ ( i * 16 + y ) * framebufferSize * 4 ]; - for( int x = 0; x < ( framebufferSize - 1 ) - 1; x++ ) + // Note: coverage will be double because the 63-0 triangle + // overwrites again at the end of the pass + cl_uchar valA = + (((i + 1) & 1)) * numHorizVertices * 2 / framebufferSize; + cl_uchar valB = + (((i + 1) & 2) >> 1) * numHorizVertices * 2 / framebufferSize; + cl_uchar valC = + (((i + 1) & 4) >> 2) * numHorizVertices * 2 / framebufferSize; + + cl_uchar *row = + (cl_uchar *)&resultData[(i * 16 + y) * framebufferSize * 4]; + for (int x = 0; x < (framebufferSize - 1) - 1; x++) { - if( ( row[ x * 4 ] != valA ) || ( row[ x * 4 + 1 ] != valB ) || - ( row[ x * 4 + 2 ] != valC ) ) + if ((row[x * 4] != valA) || (row[x * 4 + 1] != valB) + || (row[x * 4 + 2] != valC)) { - log_error( "ERROR: Output framebuffer did not validate!\n" ); - DumpGLBuffer( GL_UNSIGNED_BYTE, framebufferSize, 128, resultData ); - log_error( "RUNS:\n" ); + log_error("ERROR: Output framebuffer did not validate!\n"); + DumpGLBuffer(GL_UNSIGNED_BYTE, framebufferSize, 128, + resultData); + log_error("RUNS:\n"); uint32_t *p = (uint32_t *)(char *)resultData; size_t a = 0; - for( size_t t = 1; t < framebufferSize * framebufferSize; t++ ) + for (size_t t = 1; t < framebufferSize * framebufferSize; + t++) { - if( p[ a ] != 0 ) + if (p[a] != 0) { - if( p[ t ] == 0 ) + 
if (p[t] == 0) { - log_error( "RUN: %ld to %ld (%d,%d to %d,%d) 0x%08x\n", a, t - 1, - (int)( a % framebufferSize ), (int)( a / framebufferSize ), - (int)( ( t - 1 ) % framebufferSize ), (int)( ( t - 1 ) / framebufferSize ), - p[ a ] ); + log_error( + "RUN: %ld to %ld (%d,%d to %d,%d) 0x%08x\n", + a, t - 1, (int)(a % framebufferSize), + (int)(a / framebufferSize), + (int)((t - 1) % framebufferSize), + (int)((t - 1) / framebufferSize), p[a]); a = t; } } else { - if( p[ t ] != 0 ) + if (p[t] != 0) { a = t; } } - } return -1; } @@ -645,46 +709,56 @@ int test_fence_sync_single( cl_device_id device, cl_context context, cl_command_ return 0; } -int test_fence_sync( cl_device_id device, cl_context context, cl_command_queue queue, int numElements ) +int test_fence_sync(cl_device_id device, cl_context context, + cl_command_queue queue, int numElements) { GLint vs_count = 0; cl_device_id *device_list = NULL; - if( !is_extension_available( device, "cl_khr_gl_event" ) ) + if (!is_extension_available(device, "cl_khr_gl_event")) { - log_info( "NOTE: cl_khr_gl_event extension not present on this device; skipping fence sync test\n" ); + log_info("NOTE: cl_khr_gl_event extension not present on this device; " + "skipping fence sync test\n"); return 0; } #ifdef __APPLE__ CGLContextObj ctx = CGLGetCurrentContext(); CGLPixelFormatObj pix = CGLGetPixelFormat(ctx); - CGLError err = CGLDescribePixelFormat(pix, 0, kCGLPFAVirtualScreenCount, &vs_count); + CGLError err = + CGLDescribePixelFormat(pix, 0, kCGLPFAVirtualScreenCount, &vs_count); - device_list = (cl_device_id *) malloc(sizeof(cl_device_id)*vs_count); - clGetGLContextInfoAPPLE(context, ctx, CL_CGL_DEVICES_FOR_SUPPORTED_VIRTUAL_SCREENS_APPLE, sizeof(cl_device_id)*vs_count, device_list, NULL); + device_list = (cl_device_id *)malloc(sizeof(cl_device_id) * vs_count); + clGetGLContextInfoAPPLE(context, ctx, + CL_CGL_DEVICES_FOR_SUPPORTED_VIRTUAL_SCREENS_APPLE, + sizeof(cl_device_id) * vs_count, device_list, NULL); #else - // Need 
platform specific way of getting devices from CL context to which OpenGL can render - // If not available it can be replaced with clGetContextInfo with CL_CONTEXT_DEVICES + // Need platform specific way of getting devices from CL context to which + // OpenGL can render If not available it can be replaced with + // clGetContextInfo with CL_CONTEXT_DEVICES size_t device_cb; - cl_int err = clGetContextInfo( context, CL_CONTEXT_DEVICES, 0, NULL, &device_cb); - if( err != CL_SUCCESS ) + cl_int err = + clGetContextInfo(context, CL_CONTEXT_DEVICES, 0, NULL, &device_cb); + if (err != CL_SUCCESS) { - print_error( err, "Unable to get device count from context" ); - return -1; + print_error(err, "Unable to get device count from context"); + return -1; } vs_count = (GLint)device_cb / sizeof(cl_device_id); - if (vs_count < 1) { - log_error("No devices found.\n"); - return -1; + if (vs_count < 1) + { + log_error("No devices found.\n"); + return -1; } - device_list = (cl_device_id *) malloc(device_cb); - err = clGetContextInfo( context, CL_CONTEXT_DEVICES, device_cb, device_list, NULL); - if( err != CL_SUCCESS ) { - free(device_list); - print_error( err, "Unable to get device list from context" ); - return -1; + device_list = (cl_device_id *)malloc(device_cb); + err = clGetContextInfo(context, CL_CONTEXT_DEVICES, device_cb, device_list, + NULL); + if (err != CL_SUCCESS) + { + free(device_list); + print_error(err, "Unable to get device list from context"); + return -1; } #endif @@ -695,30 +769,38 @@ int test_fence_sync( cl_device_id device, cl_context context, cl_command_queue q // Loop through all the devices capable to OpenGL rendering // and set them as current rendering target - for(rend_vs = 0; rend_vs < vs_count; rend_vs++) + for (rend_vs = 0; rend_vs < vs_count; rend_vs++) { // Loop through all the devices and set them as current // compute target - for(read_vs = 0; read_vs < vs_count; read_vs++) + for (read_vs = 0; read_vs < vs_count; read_vs++) { - cl_device_id 
rend_device = device_list[rend_vs], read_device = device_list[read_vs]; + cl_device_id rend_device = device_list[rend_vs], + read_device = device_list[read_vs]; char rend_name[200], read_name[200]; - clGetDeviceInfo(rend_device, CL_DEVICE_NAME, sizeof(rend_name), rend_name, NULL); - clGetDeviceInfo(read_device, CL_DEVICE_NAME, sizeof(read_name), read_name, NULL); + clGetDeviceInfo(rend_device, CL_DEVICE_NAME, sizeof(rend_name), + rend_name, NULL); + clGetDeviceInfo(read_device, CL_DEVICE_NAME, sizeof(read_name), + read_name, NULL); - log_info("Rendering on: %s, read back on: %s\n", rend_name, read_name); - error = test_fence_sync_single( device, context, queue, false, rend_vs, read_vs, rend_device ); + log_info("Rendering on: %s, read back on: %s\n", rend_name, + read_name); + error = test_fence_sync_single(device, context, queue, false, + rend_vs, read_vs, rend_device); any_failed |= error; - if( error != 0 ) - log_error( "ERROR: Implicit syncing with GL sync events failed!\n\n" ); + if (error != 0) + log_error( + "ERROR: Implicit syncing with GL sync events failed!\n\n"); else log_info("Implicit syncing Passed\n"); - error = test_fence_sync_single( device, context, queue, true, rend_vs, read_vs, rend_device ); + error = test_fence_sync_single(device, context, queue, true, + rend_vs, read_vs, rend_device); any_failed |= error; - if( error != 0 ) - log_error( "ERROR: Explicit syncing with GL sync events failed!\n\n" ); + if (error != 0) + log_error( + "ERROR: Explicit syncing with GL sync events failed!\n\n"); else log_info("Explicit syncing Passed\n"); } diff --git a/test_conformance/gl/test_image_methods.cpp b/test_conformance/gl/test_image_methods.cpp index 07f5b65e..7d055fb2 100644 --- a/test_conformance/gl/test_image_methods.cpp +++ b/test_conformance/gl/test_image_methods.cpp @@ -337,7 +337,6 @@ int test_image_methods_depth( cl_device_id device, cl_context context, cl_comman return 0; } - size_t pixelSize; int result = 0; GLenum depth_targets[] = 
{GL_TEXTURE_2D, GL_TEXTURE_2D_ARRAY}; size_t ntargets = sizeof(depth_targets) / sizeof(depth_targets[0]); @@ -378,7 +377,6 @@ int test_image_methods_multisample( cl_device_id device, cl_context context, cl_ return 0; } - size_t pixelSize; int result = 0; GLenum targets[] = {GL_TEXTURE_2D_MULTISAMPLE, GL_TEXTURE_2D_MULTISAMPLE_ARRAY}; size_t ntargets = sizeof(targets) / sizeof(targets[0]); diff --git a/test_conformance/gl/test_images_getinfo_common.cpp b/test_conformance/gl/test_images_getinfo_common.cpp index 345b5950..2322c269 100644 --- a/test_conformance/gl/test_images_getinfo_common.cpp +++ b/test_conformance/gl/test_images_getinfo_common.cpp @@ -86,10 +86,11 @@ static int test_image_info( cl_context context, cl_command_queue queue, return CheckGLObjectInfo(streams[0], object_type, glTexture, glTarget, 0); } -static int test_image_format_get_info( - cl_context context, cl_command_queue queue, - size_t width, size_t height, size_t depth, - GLenum target, struct format* fmt, MTdata data) +static int test_image_format_get_info(cl_context context, + cl_command_queue queue, size_t width, + size_t height, size_t depth, + GLenum target, const format *fmt, + MTdata data) { int error = 0; @@ -197,9 +198,11 @@ static int test_image_format_get_info( &actualType, (void **)&outBuffer ); } -int test_images_get_info_common( cl_device_id device, cl_context context, - cl_command_queue queue, struct format* formats, size_t nformats, - GLenum *targets, size_t ntargets, sizevec_t *sizes, size_t nsizes ) +int test_images_get_info_common(cl_device_id device, cl_context context, + cl_command_queue queue, const format *formats, + size_t nformats, GLenum *targets, + size_t ntargets, sizevec_t *sizes, + size_t nsizes) { int error = 0; RandomSeed seed(gRandomSeed); diff --git a/test_conformance/gl/test_images_read_common.cpp b/test_conformance/gl/test_images_read_common.cpp index 112c7891..fe2a529b 100644 --- a/test_conformance/gl/test_images_read_common.cpp +++ 
b/test_conformance/gl/test_images_read_common.cpp @@ -386,10 +386,9 @@ static int test_image_read( cl_context context, cl_command_queue queue, width, height, depth, sampleNum, outFormat, outType, outResultBuffer ); } -static int test_image_format_read( - cl_context context, cl_command_queue queue, - size_t width, size_t height, size_t depth, - GLenum target, struct format* fmt, MTdata data) +static int test_image_format_read(cl_context context, cl_command_queue queue, + size_t width, size_t height, size_t depth, + GLenum target, const format *fmt, MTdata data) { int error = 0; @@ -645,9 +644,10 @@ static int test_image_format_read( } } -int test_images_read_common( cl_device_id device, cl_context context, - cl_command_queue queue, struct format* formats, size_t nformats, - GLenum *targets, size_t ntargets, sizevec_t *sizes, size_t nsizes ) +int test_images_read_common(cl_device_id device, cl_context context, + cl_command_queue queue, const format *formats, + size_t nformats, GLenum *targets, size_t ntargets, + sizevec_t *sizes, size_t nsizes) { int error = 0; RandomSeed seed(gRandomSeed); diff --git a/test_conformance/gl/test_images_write_common.cpp b/test_conformance/gl/test_images_write_common.cpp index 9bbb257b..0dba83bb 100644 --- a/test_conformance/gl/test_images_write_common.cpp +++ b/test_conformance/gl/test_images_write_common.cpp @@ -427,7 +427,6 @@ static int test_image_write( cl_context context, cl_command_queue queue, int supportsHalf(cl_context context, bool* supports_half) { int error; - size_t size; cl_uint numDev; error = clGetContextInfo(context, CL_CONTEXT_NUM_DEVICES, sizeof(cl_uint), &numDev, NULL); @@ -446,7 +445,6 @@ int supportsHalf(cl_context context, bool* supports_half) int supportsMsaa(cl_context context, bool* supports_msaa) { int error; - size_t size; cl_uint numDev; error = clGetContextInfo(context, CL_CONTEXT_NUM_DEVICES, sizeof(cl_uint), &numDev, NULL); @@ -465,7 +463,6 @@ int supportsMsaa(cl_context context, bool* supports_msaa) int 
supportsDepth(cl_context context, bool* supports_depth) { int error; - size_t size; cl_uint numDev; error = clGetContextInfo(context, CL_CONTEXT_NUM_DEVICES, sizeof(cl_uint), &numDev, NULL); @@ -486,7 +483,6 @@ static int test_image_format_write( cl_context context, cl_command_queue queue, GLenum internalFormat, GLenum glType, ExplicitType type, MTdata d ) { int error; - int samples = 8; // If we're testing a half float format, then we need to determine the // rounding mode of this machine. Punt if we fail to do so. @@ -664,8 +660,9 @@ static int test_image_format_write( cl_context context, cl_command_queue queue, // combination. int test_images_write_common(cl_device_id device, cl_context context, - cl_command_queue queue, struct format* formats, size_t nformats, - GLenum *targets, size_t ntargets, sizevec_t* sizes, size_t nsizes ) + cl_command_queue queue, const format *formats, + size_t nformats, GLenum *targets, size_t ntargets, + sizevec_t *sizes, size_t nsizes) { int err = 0; int error = 0; diff --git a/test_conformance/gles/CMakeLists.txt b/test_conformance/gles/CMakeLists.txt index c76fe512..4f4ba532 100644 --- a/test_conformance/gles/CMakeLists.txt +++ b/test_conformance/gles/CMakeLists.txt @@ -18,3 +18,11 @@ set (${MODULE_NAME}_SOURCES list(APPEND CLConform_LIBRARIES EGL GLESv2) include(../CMakeCommon.txt) + +if(DEFINED USE_GLES3) + target_compile_definitions(${${MODULE_NAME}_OUT} PRIVATE GLES3) +endif() +if(MSVC) + # Don't warn about using the portable "strdup" function. + target_compile_definitions(${${MODULE_NAME}_OUT} PRIVATE _CRT_NONSTDC_NO_DEPRECATE) +endif()
\ No newline at end of file diff --git a/test_conformance/gles/main.cpp b/test_conformance/gles/main.cpp index 644fa63c..60e020d8 100644 --- a/test_conformance/gles/main.cpp +++ b/test_conformance/gles/main.cpp @@ -320,8 +320,10 @@ int main(int argc, const char *argv[]) goto cleanup; } +#ifdef GLES3 int argc_ = (first_32_testname) ? 1 + (argc - first_32_testname) : argc; const char** argv_ = (first_32_testname) ? &argv[first_32_testname-1] : argv; +#endif // Execute the tests. for( size_t i = 0; i < numDevices; i++ ) { diff --git a/test_conformance/gles/setup_egl.cpp b/test_conformance/gles/setup_egl.cpp index fe0f8ca3..95a12a66 100644 --- a/test_conformance/gles/setup_egl.cpp +++ b/test_conformance/gles/setup_egl.cpp @@ -117,7 +117,8 @@ public: _platform, "clGetGLContextInfoKHR"); if (GetGLContextInfo == NULL) { - print_error(status, "clGetGLContextInfoKHR failed"); + log_error("ERROR: clGetGLContextInfoKHR failed! (%s:%d)\n", + __FILE__, __LINE__); return NULL; } @@ -128,7 +129,7 @@ public: return NULL; } dev_size /= sizeof(cl_device_id); - log_info("GL _context supports %d compute devices\n", dev_size); + log_info("GL _context supports %zu compute devices\n", dev_size); status = GetGLContextInfo(properties, CL_CURRENT_DEVICE_FOR_GL_CONTEXT_KHR, diff --git a/test_conformance/gles/test_buffers.cpp b/test_conformance/gles/test_buffers.cpp index a2d67322..73711261 100644 --- a/test_conformance/gles/test_buffers.cpp +++ b/test_conformance/gles/test_buffers.cpp @@ -205,10 +205,10 @@ int test_buffer_kernel(cl_context context, cl_command_queue queue, ExplicitType if (validate_only) { int result = (CheckGLObjectInfo(streams[0], CL_GL_OBJECT_BUFFER, (GLuint)inGLBuffer, (GLenum)0, 0) | CheckGLObjectInfo(streams[2], CL_GL_OBJECT_BUFFER, (GLuint)outGLBuffer, (GLenum)0, 0) ); - for(i=0;i<3;i++) + + for (i = 0; i < 3; i++) { - clReleaseMemObject(streams[i]); - streams[i] = NULL; + streams[i].reset(); } glDeleteBuffers(1, &inGLBuffer); inGLBuffer = 0; @@ -285,10 +285,9 @@ int 
test_buffer_kernel(cl_context context, cl_command_queue queue, ExplicitType clP += get_explicit_type_size( vecType ); } - for(i=0;i<3;i++) + for (i = 0; i < 3; i++) { - clReleaseMemObject(streams[i]); - streams[i] = NULL; + streams[i].reset(); } glDeleteBuffers(1, &inGLBuffer); inGLBuffer = 0; diff --git a/test_conformance/gles/test_fence_sync.cpp b/test_conformance/gles/test_fence_sync.cpp index 0af91a46..968d9695 100644 --- a/test_conformance/gles/test_fence_sync.cpp +++ b/test_conformance/gles/test_fence_sync.cpp @@ -570,10 +570,12 @@ int test_fence_sync_single( cl_device_id device, cl_context context, cl_command_ { if( p[ t ] == 0 ) { - log_error( "RUN: %ld to %ld (%d,%d to %d,%d) 0x%08x\n", a, t - 1, - (int)( a % framebufferSize ), (int)( a / framebufferSize ), - (int)( ( t - 1 ) % framebufferSize ), (int)( ( t - 1 ) / framebufferSize ), - p[ a ] ); + log_error( + "RUN: %zu to %zu (%d,%d to %d,%d) 0x%08x\n", + a, t - 1, (int)(a % framebufferSize), + (int)(a / framebufferSize), + (int)((t - 1) % framebufferSize), + (int)((t - 1) / framebufferSize), p[a]); a = t; } } diff --git a/test_conformance/gles/test_images_2D.cpp b/test_conformance/gles/test_images_2D.cpp index c1a17fc8..f6554023 100644 --- a/test_conformance/gles/test_images_2D.cpp +++ b/test_conformance/gles/test_images_2D.cpp @@ -369,7 +369,9 @@ int test_images_read_cube( cl_device_id device, cl_context context, cl_command_q } +#ifdef __APPLE__ #pragma mark -------------------- Write tests ------------------------- +#endif int test_cl_image_write( cl_context context, cl_command_queue queue, cl_mem clImage, diff --git a/test_conformance/gles/test_renderbuffer.cpp b/test_conformance/gles/test_renderbuffer.cpp index 20127aca..0f6d289b 100644 --- a/test_conformance/gles/test_renderbuffer.cpp +++ b/test_conformance/gles/test_renderbuffer.cpp @@ -197,7 +197,9 @@ int test_renderbuffer_read( cl_device_id device, cl_context context, cl_command_ } +#ifdef __APPLE__ #pragma mark -------------------- Write tests 
------------------------- +#endif int test_attach_renderbuffer_write_to_image( cl_context context, cl_command_queue queue, GLenum glTarget, GLuint glRenderbuffer, size_t imageWidth, size_t imageHeight, cl_image_format *outFormat, ExplicitType *outType, MTdata d, void **outSourceBuffer ) diff --git a/test_conformance/half/Test_roundTrip.cpp b/test_conformance/half/Test_roundTrip.cpp index 69fc7e41..1ab40937 100644 --- a/test_conformance/half/Test_roundTrip.cpp +++ b/test_conformance/half/Test_roundTrip.cpp @@ -14,6 +14,9 @@ // limitations under the License. // #include <string.h> + +#include <algorithm> + #include "cl_utils.h" #include "tests.h" #include "harness/testHarness.h" @@ -156,7 +159,7 @@ int test_roundTrip( cl_device_id device, cl_context context, cl_command_queue qu } // Figure out how many elements are in a work block - size_t elementSize = MAX( sizeof(cl_half), sizeof(cl_float)); + size_t elementSize = std::max(sizeof(cl_half), sizeof(cl_float)); size_t blockCount = (size_t)getBufferSize(device) / elementSize; //elementSize is a power of two uint64_t lastCase = 1ULL << (8*sizeof(cl_half)); // number of cl_half size_t stride = blockCount; @@ -168,7 +171,7 @@ int test_roundTrip( cl_device_id device, cl_context context, cl_command_queue qu for( i = 0; i < (uint64_t)lastCase; i += stride ) { - count = (uint32_t) MIN( blockCount, lastCase - i ); + count = (uint32_t)std::min((uint64_t)blockCount, lastCase - i); //Init the input stream uint16_t *p = (uint16_t *)gIn_half; diff --git a/test_conformance/half/Test_vLoadHalf.cpp b/test_conformance/half/Test_vLoadHalf.cpp index 52867c25..e9354019 100644 --- a/test_conformance/half/Test_vLoadHalf.cpp +++ b/test_conformance/half/Test_vLoadHalf.cpp @@ -17,6 +17,9 @@ #include "harness/testHarness.h" #include <string.h> + +#include <algorithm> + #include "cl_utils.h" #include "tests.h" @@ -37,14 +40,12 @@ int Test_vLoadHalf_private( cl_device_id device, bool aligned ) const char *vector_size_names[] = {"1", "2", "4", 
"8", "16", "3"}; int minVectorSize = kMinVectorSize; - // There is no aligned scalar vloada_half in CL 1.1 -#if ! defined( CL_VERSION_1_1 ) && ! defined(__APPLE__) - vlog("Note: testing vloada_half.\n"); - if (aligned && minVectorSize == 0) - minVectorSize = 1; -#endif - for( vectorSize = minVectorSize; vectorSize < kLastVectorSizeToTest; vectorSize++) + // There is no aligned scalar vloada_half + if (aligned && minVectorSize == 0) minVectorSize = 1; + + for (vectorSize = minVectorSize; vectorSize < kLastVectorSizeToTest; + vectorSize++) { int effectiveVectorSize = g_arrVecSizes[vectorSize]; @@ -81,7 +82,7 @@ int Test_vLoadHalf_private( cl_device_id device, bool aligned ) "{\n" " size_t i = get_global_id(0);\n" " f[i] = vloada_half3( i, p );\n" - " ((__global float *)f)[4*i+3] = vloada_half(4*i+3,p);\n" + " ((__global float *)f)[4*i+3] = vload_half(4*i+3,p);\n" "}\n" }; @@ -431,7 +432,7 @@ int Test_vLoadHalf_private( cl_device_id device, bool aligned ) } // Figure out how many elements are in a work block - size_t elementSize = MAX( sizeof(cl_half), sizeof(cl_float)); + size_t elementSize = std::max(sizeof(cl_half), sizeof(cl_float)); size_t blockCount = getBufferSize(device) / elementSize; // elementSize is power of 2 uint64_t lastCase = 1ULL << (8*sizeof(cl_half)); // number of things of size cl_half @@ -449,7 +450,7 @@ int Test_vLoadHalf_private( cl_device_id device, bool aligned ) for( i = 0; i < (uint64_t)lastCase; i += blockCount ) { - count = (uint32_t) MIN( blockCount, lastCase - i ); + count = (uint32_t)std::min((uint64_t)blockCount, lastCase - i); //Init the input stream uint16_t *p = (uint16_t *)gIn_half; diff --git a/test_conformance/half/Test_vStoreHalf.cpp b/test_conformance/half/Test_vStoreHalf.cpp index c3a328ad..591470f0 100644 --- a/test_conformance/half/Test_vStoreHalf.cpp +++ b/test_conformance/half/Test_vStoreHalf.cpp @@ -18,6 +18,9 @@ #include "harness/testHarness.h" #include <string.h> + +#include <algorithm> + #include "cl_utils.h" #include 
"tests.h" @@ -78,7 +81,7 @@ ReferenceF(cl_uint jid, cl_uint tid, void *userInfo) cl_ushort *r = cri->r + off; f2h f = cri->f; cl_ulong i = cri->i + off; - cl_uint j, rr; + cl_uint j; if (off + count > lim) count = lim - off; @@ -114,8 +117,7 @@ CheckF(cl_uint jid, cl_uint tid, void *userInfo) return 0; for (j = 0; j < count; j++) { - if (s[j] == r[j]) - continue; + if (s[j] == r[j]) continue; // Pass any NaNs if ((s[j] & 0x7fff) > 0x7c00 && (r[j] & 0x7fff) > 0x7c00 ) @@ -186,8 +188,7 @@ CheckD(cl_uint jid, cl_uint tid, void *userInfo) return 0; for (j = 0; j < count; j++) { - if (s[j] == r[j]) - continue; + if (s[j] == r[j]) continue; // Pass any NaNs if ((s[j] & 0x7fff) > 0x7c00 && (r[j] & 0x7fff) > 0x7c00) @@ -419,7 +420,9 @@ int Test_vStoreHalf_private( cl_device_id device, f2h referenceFunc, d2h doubleR "__kernel void test( __global float *p, __global half *f,\n" " uint extra_last_thread )\n" "{\n" - " __local ushort data[3*(", local_buf_size, "+1)];\n" + " __local ushort data[3*(", + local_buf_size, + "+1)];\n" " size_t i = get_global_id(0);\n" " size_t lid = get_local_id(0);\n" " size_t last_i = get_global_size(0)-1;\n" @@ -429,9 +432,18 @@ int Test_vStoreHalf_private( cl_device_id device, f2h referenceFunc, d2h doubleR " if(last_i == i && extra_last_thread != 0) {\n" " adjust = 3-extra_last_thread;\n" " } " - " vstore_half3",roundName,"( vload3(i,p-adjust), lid, (__local half *)(&data[0]) );\n" + " vstore_half3", + roundName, + "( vload3(i,p-adjust), lid, (__local half *)(&data[0]) );\n" " barrier( CLK_LOCAL_MEM_FENCE ); \n" - " async_event = async_work_group_copy((__global ushort *)(f+3*(i-lid)), (__local ushort *)(&data[adjust]), lsize*3-adjust, 0);\n" // investigate later + " if (get_group_id(0) == (get_num_groups(0) - 1) &&\n" + " extra_last_thread != 0) {\n" + " adjust = 3-extra_last_thread;\n" + " }\n" + " async_event = async_work_group_copy(\n" + " (__global ushort*)(f+3*(i-lid)),\n" + " (__local ushort *)(&data[adjust]),\n" + " lsize*3-adjust, 0);\n" 
// investigate later " wait_group_events(1, &async_event);\n" "}\n" }; @@ -521,7 +533,9 @@ int Test_vStoreHalf_private( cl_device_id device, f2h referenceFunc, d2h doubleR "__kernel void test( __global double *p, __global half *f,\n" " uint extra_last_thread )\n" "{\n" - " __local ushort data[3*(", local_buf_size, "+1)];\n" + " __local ushort data[3*(", + local_buf_size, + "+1)];\n" " size_t i = get_global_id(0);\n" " size_t lid = get_local_id(0);\n" " size_t last_i = get_global_size(0)-1;\n" @@ -531,15 +545,23 @@ int Test_vStoreHalf_private( cl_device_id device, f2h referenceFunc, d2h doubleR " if(last_i == i && extra_last_thread != 0) {\n" " adjust = 3-extra_last_thread;\n" " }\n " - " vstore_half3",roundName,"( vload3(i,p-adjust), lid, (__local half *)(&data[0]) );\n" + " vstore_half3", + roundName, + "( vload3(i,p-adjust), lid, (__local half *)(&data[0]) );\n" " barrier( CLK_LOCAL_MEM_FENCE ); \n" - " async_event = async_work_group_copy((__global ushort *)(f+3*(i-lid)), (__local ushort *)(&data[adjust]), lsize*3-adjust, 0);\n" // investigate later + " if (get_group_id(0) == (get_num_groups(0) - 1) &&\n" + " extra_last_thread != 0) {\n" + " adjust = 3-extra_last_thread;\n" + " }\n" + " async_event = async_work_group_copy(\n" + " (__global ushort *)(f+3*(i-lid)),\n" + " (__local ushort *)(&data[adjust]),\n" + " lsize*3-adjust, 0);\n" // investigate later " wait_group_events(1, &async_event);\n" "}\n" }; - if(g_arrVecSizes[vectorSize] == 3) { programs[vectorSize][0] = MakeProgram( device, source_v3, sizeof(source_v3) / sizeof( source_v3[0]) ); } else { @@ -674,7 +696,7 @@ int Test_vStoreHalf_private( cl_device_id device, f2h referenceFunc, d2h doubleR } // end for vector size // Figure out how many elements are in a work block - size_t elementSize = MAX( sizeof(cl_ushort), sizeof(float)); + size_t elementSize = std::max(sizeof(cl_ushort), sizeof(float)); size_t blockCount = BUFFER_SIZE / elementSize; // elementSize is power of 2 uint64_t lastCase = 1ULL << 
(8*sizeof(float)); // number of floats. size_t stride = blockCount; @@ -726,7 +748,7 @@ int Test_vStoreHalf_private( cl_device_id device, f2h referenceFunc, d2h doubleR for( i = 0; i < lastCase; i += stride ) { - count = (cl_uint) MIN( blockCount, lastCase - i ); + count = (cl_uint)std::min((uint64_t)blockCount, lastCase - i); fref.i = i; dref.i = i; @@ -1272,7 +1294,7 @@ int Test_vStoreaHalf_private( cl_device_id device, f2h referenceFunc, d2h double } // Figure out how many elements are in a work block - size_t elementSize = MAX( sizeof(cl_ushort), sizeof(float)); + size_t elementSize = std::max(sizeof(cl_ushort), sizeof(float)); size_t blockCount = BUFFER_SIZE / elementSize; uint64_t lastCase = 1ULL << (8*sizeof(float)); size_t stride = blockCount; @@ -1323,7 +1345,7 @@ int Test_vStoreaHalf_private( cl_device_id device, f2h referenceFunc, d2h double for( i = 0; i < (uint64_t)lastCase; i += stride ) { - count = (cl_uint) MIN( blockCount, lastCase - i ); + count = (cl_uint)std::min((uint64_t)blockCount, lastCase - i); fref.i = i; dref.i = i; diff --git a/test_conformance/half/main.cpp b/test_conformance/half/main.cpp index 6600cc58..6bc7db95 100644 --- a/test_conformance/half/main.cpp +++ b/test_conformance/half/main.cpp @@ -1,6 +1,6 @@ // // Copyright (c) 2017 The Khronos Group Inc. -// +// // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
// You may obtain a copy of the License at @@ -131,8 +131,7 @@ exit: static int ParseArgs( int argc, const char **argv ) { int i; - argList = (const char **)calloc( argc - 1, sizeof( char*) ); - + argList = (const char **)calloc(argc, sizeof(char *)); if( NULL == argList ) { vlog_error( "Failed to allocate memory for argList.\n" ); @@ -222,7 +221,6 @@ static int ParseArgs( int argc, const char **argv ) gWimpyMode = 1; } - vlog( "Test binary built %s %s\n", __DATE__, __TIME__ ); PrintArch(); if( gWimpyMode ) { @@ -248,4 +246,3 @@ static void PrintUsage( void ) vlog("\t\t%s\n", test_list[i].name ); } } - diff --git a/test_conformance/images/clCopyImage/test_copy_1D.cpp b/test_conformance/images/clCopyImage/test_copy_1D.cpp index 2c996c72..0f6f3ce4 100644 --- a/test_conformance/images/clCopyImage/test_copy_1D.cpp +++ b/test_conformance/images/clCopyImage/test_copy_1D.cpp @@ -113,6 +113,7 @@ int test_copy_image_set_1D( cl_device_id device, cl_context context, cl_command_ if (memSize > (cl_ulong)SIZE_MAX) { memSize = (cl_ulong)SIZE_MAX; + maxAllocSize = (cl_ulong)SIZE_MAX; } if( gTestSmallImages ) diff --git a/test_conformance/images/clCopyImage/test_copy_1D_array.cpp b/test_conformance/images/clCopyImage/test_copy_1D_array.cpp index 0b616934..f0b610bb 100644 --- a/test_conformance/images/clCopyImage/test_copy_1D_array.cpp +++ b/test_conformance/images/clCopyImage/test_copy_1D_array.cpp @@ -118,6 +118,7 @@ int test_copy_image_set_1D_array( cl_device_id device, cl_context context, cl_co if (memSize > (cl_ulong)SIZE_MAX) { memSize = (cl_ulong)SIZE_MAX; + maxAllocSize = (cl_ulong)SIZE_MAX; } if( gTestSmallImages ) diff --git a/test_conformance/images/clCopyImage/test_copy_2D.cpp b/test_conformance/images/clCopyImage/test_copy_2D.cpp index 1a69a1fe..448b47f0 100644 --- a/test_conformance/images/clCopyImage/test_copy_2D.cpp +++ b/test_conformance/images/clCopyImage/test_copy_2D.cpp @@ -125,6 +125,7 @@ int test_copy_image_set_2D( cl_device_id device, cl_context context, 
cl_command_ if (memSize > (cl_ulong)SIZE_MAX) { memSize = (cl_ulong)SIZE_MAX; + maxAllocSize = (cl_ulong)SIZE_MAX; } if( gTestSmallImages ) diff --git a/test_conformance/images/clCopyImage/test_copy_2D_2D_array.cpp b/test_conformance/images/clCopyImage/test_copy_2D_2D_array.cpp index eb6dd552..1819d87c 100644 --- a/test_conformance/images/clCopyImage/test_copy_2D_2D_array.cpp +++ b/test_conformance/images/clCopyImage/test_copy_2D_2D_array.cpp @@ -224,6 +224,7 @@ int test_copy_image_set_2D_2D_array( cl_device_id device, cl_context context, cl if (memSize > (cl_ulong)SIZE_MAX) { memSize = (cl_ulong)SIZE_MAX; + maxAllocSize = (cl_ulong)SIZE_MAX; } if( gTestSmallImages ) diff --git a/test_conformance/images/clCopyImage/test_copy_2D_3D.cpp b/test_conformance/images/clCopyImage/test_copy_2D_3D.cpp index 8a56c95f..4ab6b42a 100644 --- a/test_conformance/images/clCopyImage/test_copy_2D_3D.cpp +++ b/test_conformance/images/clCopyImage/test_copy_2D_3D.cpp @@ -230,6 +230,7 @@ int test_copy_image_set_2D_3D( cl_device_id device, cl_context context, cl_comma if (memSize > (cl_ulong)SIZE_MAX) { memSize = (cl_ulong)SIZE_MAX; + maxAllocSize = (cl_ulong)SIZE_MAX; } if( gTestSmallImages ) diff --git a/test_conformance/images/clCopyImage/test_copy_2D_array.cpp b/test_conformance/images/clCopyImage/test_copy_2D_array.cpp index 6327ba58..3376bf9a 100644 --- a/test_conformance/images/clCopyImage/test_copy_2D_array.cpp +++ b/test_conformance/images/clCopyImage/test_copy_2D_array.cpp @@ -71,6 +71,7 @@ int test_copy_image_set_2D_array( cl_device_id device, cl_context context, cl_co if (memSize > (cl_ulong)SIZE_MAX) { memSize = (cl_ulong)SIZE_MAX; + maxAllocSize = (cl_ulong)SIZE_MAX; } if( gTestSmallImages ) diff --git a/test_conformance/images/clCopyImage/test_copy_3D.cpp b/test_conformance/images/clCopyImage/test_copy_3D.cpp index da6731d7..cdfdccec 100644 --- a/test_conformance/images/clCopyImage/test_copy_3D.cpp +++ b/test_conformance/images/clCopyImage/test_copy_3D.cpp @@ -57,6 +57,7 @@ 
int test_copy_image_set_3D( cl_device_id device, cl_context context, cl_command_ if (memSize > (cl_ulong)SIZE_MAX) { memSize = (cl_ulong)SIZE_MAX; + maxAllocSize = (cl_ulong)SIZE_MAX; } if( gTestSmallImages ) diff --git a/test_conformance/images/clCopyImage/test_copy_3D_2D_array.cpp b/test_conformance/images/clCopyImage/test_copy_3D_2D_array.cpp index c098f645..1da1e477 100644 --- a/test_conformance/images/clCopyImage/test_copy_3D_2D_array.cpp +++ b/test_conformance/images/clCopyImage/test_copy_3D_2D_array.cpp @@ -251,6 +251,7 @@ int test_copy_image_set_3D_2D_array(cl_device_id device, cl_context context, cl_ if (memSize > (cl_ulong)SIZE_MAX) { memSize = (cl_ulong)SIZE_MAX; + maxAllocSize = (cl_ulong)SIZE_MAX; } if( gTestSmallImages ) diff --git a/test_conformance/images/clCopyImage/test_copy_generic.cpp b/test_conformance/images/clCopyImage/test_copy_generic.cpp index 026916e8..3bd1b6ef 100644 --- a/test_conformance/images/clCopyImage/test_copy_generic.cpp +++ b/test_conformance/images/clCopyImage/test_copy_generic.cpp @@ -228,6 +228,11 @@ cl_mem create_image( cl_context context, cl_command_queue queue, BufferOwningPtr } size_t mappedSlicePad = mappedSlice - (mappedRow * height); + // For 1Darray, the height variable actually contains the arraysize, + // so it can't be used for calculating the slice padding. + if (imageInfo->type == CL_MEM_OBJECT_IMAGE1D_ARRAY) + mappedSlicePad = mappedSlice - (mappedRow * 1); + // Copy the image. 
size_t scanlineSize = row_pitch_lod; size_t sliceSize = slice_pitch_lod - scanlineSize * height; @@ -547,18 +552,19 @@ int test_copy_image_generic( cl_context context, cl_command_queue queue, image_d { if( memcmp( sourcePtr, destPtr, scanlineSize ) != 0 ) { - // Find the first missing pixel + // Find the first differing pixel size_t pixel_size = get_pixel_size( dstImageInfo->format ); - size_t where = 0; - for( where = 0; where < dstImageInfo->width; where++ ) - if( memcmp( sourcePtr + pixel_size * where, destPtr + pixel_size * where, pixel_size) ) - break; - - print_first_pixel_difference_error( - where, sourcePtr + pixel_size * where, - destPtr + pixel_size * where, dstImageInfo, y, - dstImageInfo->depth); - return -1; + size_t where = + compare_scanlines(dstImageInfo, sourcePtr, destPtr); + + if (where < dstImageInfo->width) + { + print_first_pixel_difference_error( + where, sourcePtr + pixel_size * where, + destPtr + pixel_size * where, dstImageInfo, y, + dstImageInfo->depth); + return -1; + } } sourcePtr += rowPitch; if((dstImageInfo->type == CL_MEM_OBJECT_IMAGE1D_ARRAY || dstImageInfo->type == CL_MEM_OBJECT_IMAGE1D)) diff --git a/test_conformance/images/clFillImage/test_fill_1D.cpp b/test_conformance/images/clFillImage/test_fill_1D.cpp index c3f23185..b1550bf3 100644 --- a/test_conformance/images/clFillImage/test_fill_1D.cpp +++ b/test_conformance/images/clFillImage/test_fill_1D.cpp @@ -80,6 +80,7 @@ int test_fill_image_set_1D( cl_device_id device, cl_context context, cl_command_ if (memSize > (cl_ulong)SIZE_MAX) { memSize = (cl_ulong)SIZE_MAX; + maxAllocSize = (cl_ulong)SIZE_MAX; } if ( gTestSmallImages ) diff --git a/test_conformance/images/clFillImage/test_fill_1D_array.cpp b/test_conformance/images/clFillImage/test_fill_1D_array.cpp index b4347a47..be32ec6a 100644 --- a/test_conformance/images/clFillImage/test_fill_1D_array.cpp +++ b/test_conformance/images/clFillImage/test_fill_1D_array.cpp @@ -83,6 +83,7 @@ int test_fill_image_set_1D_array( cl_device_id 
device, cl_context context, cl_co if (memSize > (cl_ulong)SIZE_MAX) { memSize = (cl_ulong)SIZE_MAX; + maxAllocSize = (cl_ulong)SIZE_MAX; } if ( gTestSmallImages ) diff --git a/test_conformance/images/clFillImage/test_fill_2D.cpp b/test_conformance/images/clFillImage/test_fill_2D.cpp index bb66fc27..e941abcf 100644 --- a/test_conformance/images/clFillImage/test_fill_2D.cpp +++ b/test_conformance/images/clFillImage/test_fill_2D.cpp @@ -83,6 +83,7 @@ int test_fill_image_set_2D( cl_device_id device, cl_context context, cl_command_ if (memSize > (cl_ulong)SIZE_MAX) { memSize = (cl_ulong)SIZE_MAX; + maxAllocSize = (cl_ulong)SIZE_MAX; } if ( gTestSmallImages ) diff --git a/test_conformance/images/clFillImage/test_fill_2D_array.cpp b/test_conformance/images/clFillImage/test_fill_2D_array.cpp index 3265aab0..38196cfc 100644 --- a/test_conformance/images/clFillImage/test_fill_2D_array.cpp +++ b/test_conformance/images/clFillImage/test_fill_2D_array.cpp @@ -87,6 +87,7 @@ int test_fill_image_set_2D_array( cl_device_id device, cl_context context, cl_co if (memSize > (cl_ulong)SIZE_MAX) { memSize = (cl_ulong)SIZE_MAX; + maxAllocSize = (cl_ulong)SIZE_MAX; } if ( gTestSmallImages ) diff --git a/test_conformance/images/clFillImage/test_fill_3D.cpp b/test_conformance/images/clFillImage/test_fill_3D.cpp index 9db0ac7c..0b8e4e58 100644 --- a/test_conformance/images/clFillImage/test_fill_3D.cpp +++ b/test_conformance/images/clFillImage/test_fill_3D.cpp @@ -87,6 +87,7 @@ int test_fill_image_set_3D( cl_device_id device, cl_context context, cl_command_ if (memSize > (cl_ulong)SIZE_MAX) { memSize = (cl_ulong)SIZE_MAX; + maxAllocSize = (cl_ulong)SIZE_MAX; } if ( gTestSmallImages ) diff --git a/test_conformance/images/clFillImage/test_fill_generic.cpp b/test_conformance/images/clFillImage/test_fill_generic.cpp index 59bf24ad..6cd6beb0 100644 --- a/test_conformance/images/clFillImage/test_fill_generic.cpp +++ b/test_conformance/images/clFillImage/test_fill_generic.cpp @@ -468,27 +468,19 @@ 
int test_fill_image_generic( cl_context context, cl_command_queue queue, image_d { for ( size_t y = 0; y < secondDim; y++ ) { - // If the data type is 101010 ignore bits 31 and 32 when comparing the row - if (imageInfo->format->image_channel_data_type == CL_UNORM_INT_101010) { - for (size_t w=0;w!=scanlineSize/4;++w) { - ((cl_uint*)sourcePtr)[w] &= 0x3FFFFFFF; - ((cl_uint*)destPtr)[w] &= 0x3FFFFFFF; - } - } - if (memcmp( sourcePtr, destPtr, scanlineSize ) != 0) { - // Find the first missing pixel + // Find the first differing pixel size_t pixel_size = get_pixel_size( imageInfo->format ); - size_t where = 0; - for ( where = 0; where < imageInfo->width; where++ ) - if ( memcmp( sourcePtr + pixel_size * where, destPtr + pixel_size * where, pixel_size) ) - break; - - print_first_pixel_difference_error( - where, sourcePtr + pixel_size * where, - destPtr + pixel_size * where, imageInfo, y, thirdDim); - return -1; + size_t where = compare_scanlines(imageInfo, sourcePtr, destPtr); + + if (where < imageInfo->width) + { + print_first_pixel_difference_error( + where, sourcePtr + pixel_size * where, + destPtr + pixel_size * where, imageInfo, y, thirdDim); + return -1; + } } total_matched += scanlineSize; diff --git a/test_conformance/images/clGetInfo/test_1D.cpp b/test_conformance/images/clGetInfo/test_1D.cpp index 0d704b82..7e044856 100644 --- a/test_conformance/images/clGetInfo/test_1D.cpp +++ b/test_conformance/images/clGetInfo/test_1D.cpp @@ -46,6 +46,7 @@ int test_get_image_info_1D( cl_device_id device, cl_context context, cl_image_fo if (memSize > (cl_ulong)SIZE_MAX) { memSize = (cl_ulong)SIZE_MAX; + maxAllocSize = (cl_ulong)SIZE_MAX; } if( gTestSmallImages ) diff --git a/test_conformance/images/clGetInfo/test_1D_2D_array.cpp b/test_conformance/images/clGetInfo/test_1D_2D_array.cpp index 447fc7c2..c35bf22b 100644 --- a/test_conformance/images/clGetInfo/test_1D_2D_array.cpp +++ b/test_conformance/images/clGetInfo/test_1D_2D_array.cpp @@ -44,6 +44,7 @@ int 
test_get_image_info_1D_array( cl_device_id device, cl_context context, cl_im if (memSize > (cl_ulong)SIZE_MAX) { memSize = (cl_ulong)SIZE_MAX; + maxAllocSize = (cl_ulong)SIZE_MAX; } if( gTestSmallImages ) @@ -168,6 +169,7 @@ int test_get_image_info_2D_array( cl_device_id device, cl_context context, cl_im if (memSize > (cl_ulong)SIZE_MAX) { memSize = (cl_ulong)SIZE_MAX; + maxAllocSize = (cl_ulong)SIZE_MAX; } if( gTestSmallImages ) diff --git a/test_conformance/images/clGetInfo/test_2D.cpp b/test_conformance/images/clGetInfo/test_2D.cpp index 74a60123..764b186d 100644 --- a/test_conformance/images/clGetInfo/test_2D.cpp +++ b/test_conformance/images/clGetInfo/test_2D.cpp @@ -285,6 +285,7 @@ int test_get_image_info_2D( cl_device_id device, cl_context context, cl_image_fo if (memSize > (cl_ulong)SIZE_MAX) { memSize = (cl_ulong)SIZE_MAX; + maxAllocSize = (cl_ulong)SIZE_MAX; } if( gTestSmallImages ) diff --git a/test_conformance/images/clGetInfo/test_3D.cpp b/test_conformance/images/clGetInfo/test_3D.cpp index af5062e3..e1261863 100644 --- a/test_conformance/images/clGetInfo/test_3D.cpp +++ b/test_conformance/images/clGetInfo/test_3D.cpp @@ -47,6 +47,7 @@ int test_get_image_info_3D( cl_device_id device, cl_context context, cl_image_fo if (memSize > (cl_ulong)SIZE_MAX) { memSize = (cl_ulong)SIZE_MAX; + maxAllocSize = (cl_ulong)SIZE_MAX; } if( gTestSmallImages ) diff --git a/test_conformance/images/clReadWriteImage/test_read_1D.cpp b/test_conformance/images/clReadWriteImage/test_read_1D.cpp index eef5bf4e..2d94dc82 100644 --- a/test_conformance/images/clReadWriteImage/test_read_1D.cpp +++ b/test_conformance/images/clReadWriteImage/test_read_1D.cpp @@ -81,7 +81,6 @@ int test_read_image_1D(cl_context context, cl_command_queue queue, for( size_t lod = 0; (gTestMipmaps && lod < imageInfo->num_mip_levels) || (!gTestMipmaps && lod < 1); lod++) { - float lod_float = (float) lod; origin[1] = lod; size_t width_lod, row_pitch_lod; @@ -90,14 +89,17 @@ int test_read_image_1D(cl_context 
context, cl_command_queue queue, region[0] = width_lod; - if ( gDebugTrace ) - if ( gTestMipmaps) { - log_info(" - Working at mipLevel :%llu\n", (unsigned long long)lod); - } - error = clEnqueueWriteImage(queue, image, CL_FALSE, - origin, region, ( gEnablePitch ? row_pitch_lod : 0 ), 0, - (char*)imageValues + imgValMipLevelOffset, 0, NULL, NULL); - if (error != CL_SUCCESS) { + if (gDebugTrace) + if (gTestMipmaps) + { + log_info(" - Working at mipLevel :%llu\n", (unsigned long long)lod); + } + error = clEnqueueWriteImage(queue, image, CL_FALSE, origin, region, + (gEnablePitch ? row_pitch_lod : 0), 0, + (char *)imageValues + imgValMipLevelOffset, 0, + NULL, NULL); + if (error != CL_SUCCESS) + { log_error( "ERROR: Unable to write to 1D image of size %d \n", (int)width_lod ); return -1; } @@ -185,6 +187,7 @@ int test_read_image_set_1D(cl_device_id device, cl_context context, if (memSize > (cl_ulong)SIZE_MAX) { memSize = (cl_ulong)SIZE_MAX; + maxAllocSize = (cl_ulong)SIZE_MAX; } if( gTestSmallImages ) diff --git a/test_conformance/images/clReadWriteImage/test_read_1D_array.cpp b/test_conformance/images/clReadWriteImage/test_read_1D_array.cpp index 5d5c2883..cc902042 100644 --- a/test_conformance/images/clReadWriteImage/test_read_1D_array.cpp +++ b/test_conformance/images/clReadWriteImage/test_read_1D_array.cpp @@ -82,7 +82,6 @@ int test_read_image_1D_array(cl_context context, cl_command_queue queue, for( size_t lod = 0; (gTestMipmaps && lod < imageInfo->num_mip_levels) || (!gTestMipmaps && lod < 1); lod++) { - float lod_float = (float) lod; size_t width_lod, row_pitch_lod, slice_pitch_lod; if( gTestMipmaps ) origin[2] = lod; @@ -192,6 +191,7 @@ int test_read_image_set_1D_array(cl_device_id device, cl_context context, if (memSize > (cl_ulong)SIZE_MAX) { memSize = (cl_ulong)SIZE_MAX; + maxAllocSize = (cl_ulong)SIZE_MAX; } if( gTestSmallImages ) diff --git a/test_conformance/images/clReadWriteImage/test_read_2D.cpp 
b/test_conformance/images/clReadWriteImage/test_read_2D.cpp index fb2e7948..b6102874 100644 --- a/test_conformance/images/clReadWriteImage/test_read_2D.cpp +++ b/test_conformance/images/clReadWriteImage/test_read_2D.cpp @@ -81,7 +81,6 @@ int test_read_image_2D(cl_context context, cl_command_queue queue, for( size_t lod = 0; (gTestMipmaps && lod < imageInfo->num_mip_levels) || (!gTestMipmaps && lod < 1); lod++) { - float lod_float = (float) lod; origin[2] = lod; size_t width_lod, height_lod, row_pitch_lod; @@ -195,6 +194,7 @@ int test_read_image_set_2D(cl_device_id device, cl_context context, if (memSize > (cl_ulong)SIZE_MAX) { memSize = (cl_ulong)SIZE_MAX; + maxAllocSize = (cl_ulong)SIZE_MAX; } if( gTestSmallImages ) diff --git a/test_conformance/images/clReadWriteImage/test_read_2D_array.cpp b/test_conformance/images/clReadWriteImage/test_read_2D_array.cpp index d0113bb7..401b0e4d 100644 --- a/test_conformance/images/clReadWriteImage/test_read_2D_array.cpp +++ b/test_conformance/images/clReadWriteImage/test_read_2D_array.cpp @@ -83,9 +83,8 @@ int test_read_image_2D_array(cl_context context, cl_command_queue queue, for(size_t lod = 0; (gTestMipmaps && lod < imageInfo->num_mip_levels) || (!gTestMipmaps && lod < 1); lod++) { - float lod_float = (float) lod; origin[3] = lod; - size_t width_lod, height_lod, depth_lod, row_pitch_lod, slice_pitch_lod; + size_t width_lod, height_lod, row_pitch_lod, slice_pitch_lod; width_lod = (imageInfo->width >> lod) ? (imageInfo->width >> lod) : 1; height_lod = (imageInfo->height >> lod) ? 
(imageInfo->height >> lod) : 1; @@ -170,6 +169,7 @@ int test_read_image_set_2D_array(cl_device_id device, cl_context context, if (memSize > (cl_ulong)SIZE_MAX) { memSize = (cl_ulong)SIZE_MAX; + maxAllocSize = (cl_ulong)SIZE_MAX; } if( gTestSmallImages ) diff --git a/test_conformance/images/clReadWriteImage/test_read_3D.cpp b/test_conformance/images/clReadWriteImage/test_read_3D.cpp index 2dcd2433..ced04abf 100644 --- a/test_conformance/images/clReadWriteImage/test_read_3D.cpp +++ b/test_conformance/images/clReadWriteImage/test_read_3D.cpp @@ -83,7 +83,6 @@ int test_read_image_3D(cl_context context, cl_command_queue queue, for(size_t lod = 0; (gTestMipmaps && lod < imageInfo->num_mip_levels) || (!gTestMipmaps && lod < 1); lod++) { - float lod_float = (float) lod; origin[3] = lod; size_t width_lod, height_lod, depth_lod, row_pitch_lod, slice_pitch_lod; @@ -175,6 +174,7 @@ int test_read_image_set_3D(cl_device_id device, cl_context context, if (memSize > (cl_ulong)SIZE_MAX) { memSize = (cl_ulong)SIZE_MAX; + maxAllocSize = (cl_ulong)SIZE_MAX; } if( gTestSmallImages ) diff --git a/test_conformance/images/kernel_image_methods/test_1D.cpp b/test_conformance/images/kernel_image_methods/test_1D.cpp index 0059d4c2..934e78ba 100644 --- a/test_conformance/images/kernel_image_methods/test_1D.cpp +++ b/test_conformance/images/kernel_image_methods/test_1D.cpp @@ -171,6 +171,7 @@ int test_get_image_info_1D(cl_device_id device, cl_context context, if (memSize > (cl_ulong)SIZE_MAX) { memSize = (cl_ulong)SIZE_MAX; + maxAllocSize = (cl_ulong)SIZE_MAX; } if( gTestSmallImages ) diff --git a/test_conformance/images/kernel_image_methods/test_1D_array.cpp b/test_conformance/images/kernel_image_methods/test_1D_array.cpp index 797161c4..a824f088 100644 --- a/test_conformance/images/kernel_image_methods/test_1D_array.cpp +++ b/test_conformance/images/kernel_image_methods/test_1D_array.cpp @@ -181,6 +181,7 @@ int test_get_image_info_1D_array(cl_device_id device, cl_context context, if (memSize 
> (cl_ulong)SIZE_MAX) { memSize = (cl_ulong)SIZE_MAX; + maxAllocSize = (cl_ulong)SIZE_MAX; } if( gTestSmallImages ) diff --git a/test_conformance/images/kernel_image_methods/test_2D.cpp b/test_conformance/images/kernel_image_methods/test_2D.cpp index b0d4a708..07f8d929 100644 --- a/test_conformance/images/kernel_image_methods/test_2D.cpp +++ b/test_conformance/images/kernel_image_methods/test_2D.cpp @@ -232,6 +232,7 @@ int test_get_image_info_2D(cl_device_id device, cl_context context, if (memSize > (cl_ulong)SIZE_MAX) { memSize = (cl_ulong)SIZE_MAX; + maxAllocSize = (cl_ulong)SIZE_MAX; } if( gTestSmallImages ) diff --git a/test_conformance/images/kernel_read_write/CMakeLists.txt b/test_conformance/images/kernel_read_write/CMakeLists.txt index 595f024a..ccd678c1 100644 --- a/test_conformance/images/kernel_read_write/CMakeLists.txt +++ b/test_conformance/images/kernel_read_write/CMakeLists.txt @@ -14,8 +14,14 @@ set(${MODULE_NAME}_SOURCES test_write_1D_array.cpp test_write_2D_array.cpp test_write_3D.cpp + test_cl_ext_image_requirements_info.cpp + test_cl_ext_image_from_buffer.cpp ../common.cpp ) +# Make unused variables not fatal in this module; see +# https://github.com/KhronosGroup/OpenCL-CTS/issues/1484 +set_gnulike_module_compile_flags("-Wno-error=unused-variable") + include(../../CMakeCommon.txt) diff --git a/test_conformance/images/kernel_read_write/main.cpp b/test_conformance/images/kernel_read_write/main.cpp index 31dceb33..0a93a974 100644 --- a/test_conformance/images/kernel_read_write/main.cpp +++ b/test_conformance/images/kernel_read_write/main.cpp @@ -53,6 +53,43 @@ static void printUsage( const char *execName ); extern int test_image_set( cl_device_id device, cl_context context, cl_command_queue queue, test_format_set_fn formatTestFn, cl_mem_object_type imageType ); +extern int cl_image_requirements_size_ext_negative(cl_device_id device, + cl_context context, + cl_command_queue queue); +extern int cl_image_requirements_size_ext_consistency(cl_device_id 
device, + cl_context context, + cl_command_queue queue); +extern int clGetImageRequirementsInfoEXT_negative(cl_device_id device, + cl_context context, + cl_command_queue queue); +extern int cl_image_requirements_max_val_ext_negative(cl_device_id device, + cl_context context, + cl_command_queue queue); +extern int cl_image_requirements_max_val_ext_positive(cl_device_id device, + cl_context context, + cl_command_queue queue); + +extern int image2d_from_buffer_positive(cl_device_id device, cl_context context, + cl_command_queue queue); +extern int memInfo_image_from_buffer_positive(cl_device_id device, + cl_context context, + cl_command_queue queue); +extern int imageInfo_image_from_buffer_positive(cl_device_id device, + cl_context context, + cl_command_queue queue); +extern int image_from_buffer_alignment_negative(cl_device_id device, + cl_context context, + cl_command_queue queue); +extern int image_from_small_buffer_negative(cl_device_id device, + cl_context context, + cl_command_queue queue); +extern int image_from_buffer_fill_positive(cl_device_id device, + cl_context context, + cl_command_queue queue); +extern int image_from_buffer_read_positive(cl_device_id device, + cl_context context, + cl_command_queue queue); + /** read_write images only support sampler-less read buildt-ins which require special settings * for some global parameters. This pair of functions temporarily overwrite those global parameters * and then recover them after completing a read_write test. 
@@ -246,12 +283,108 @@ int test_2Darray(cl_device_id device, cl_context context, cl_command_queue queue return doTest( device, context, queue, CL_MEM_OBJECT_IMAGE2D_ARRAY ); } +int test_cl_image_requirements_size_ext_negative(cl_device_id device, + cl_context context, + cl_command_queue queue, + int num_elements) +{ + return cl_image_requirements_size_ext_negative(device, context, queue); +} +int test_cl_image_requirements_size_ext_consistency(cl_device_id device, + cl_context context, + cl_command_queue queue, + int num_elements) +{ + return cl_image_requirements_size_ext_consistency(device, context, queue); +} +int test_clGetImageRequirementsInfoEXT_negative(cl_device_id device, + cl_context context, + cl_command_queue queue, + int num_elements) +{ + return clGetImageRequirementsInfoEXT_negative(device, context, queue); +} +int test_cl_image_requirements_max_val_ext_negative(cl_device_id device, + cl_context context, + cl_command_queue queue, + int num_elements) +{ + return cl_image_requirements_max_val_ext_negative(device, context, queue); +} +int test_cl_image_requirements_max_val_ext_positive(cl_device_id device, + cl_context context, + cl_command_queue queue, + int num_elements) +{ + return cl_image_requirements_max_val_ext_positive(device, context, queue); +} + +int test_image2d_from_buffer_positive(cl_device_id device, cl_context context, + cl_command_queue queue, int num_elements) +{ + return image2d_from_buffer_positive(device, context, queue); +} +int test_memInfo_image_from_buffer_positive(cl_device_id device, + cl_context context, + cl_command_queue queue, + int num_elements) +{ + return memInfo_image_from_buffer_positive(device, context, queue); +} +int test_imageInfo_image_from_buffer_positive(cl_device_id device, + cl_context context, + cl_command_queue queue, + int num_elements) +{ + return imageInfo_image_from_buffer_positive(device, context, queue); +} +int test_image_from_buffer_alignment_negative(cl_device_id device, + cl_context context, + 
cl_command_queue queue, + int num_elements) +{ + return image_from_buffer_alignment_negative(device, context, queue); +} +int test_image_from_small_buffer_negative(cl_device_id device, + cl_context context, + cl_command_queue queue, + int num_elements) +{ + return image_from_small_buffer_negative(device, context, queue); +} +int test_image_from_buffer_fill_positive(cl_device_id device, + cl_context context, + cl_command_queue queue, + int num_elements) +{ + return image_from_buffer_fill_positive(device, context, queue); +} +int test_image_from_buffer_read_positive(cl_device_id device, + cl_context context, + cl_command_queue queue, + int num_elements) +{ + return image_from_buffer_read_positive(device, context, queue); +} + test_definition test_list[] = { - ADD_TEST( 1D ), - ADD_TEST( 2D ), - ADD_TEST( 3D ), - ADD_TEST( 1Darray ), - ADD_TEST( 2Darray ), + ADD_TEST(1D), + ADD_TEST(2D), + ADD_TEST(3D), + ADD_TEST(1Darray), + ADD_TEST(2Darray), + ADD_TEST_VERSION(cl_image_requirements_size_ext_negative, Version(3, 0)), + ADD_TEST_VERSION(cl_image_requirements_size_ext_consistency, Version(3, 0)), + ADD_TEST_VERSION(clGetImageRequirementsInfoEXT_negative, Version(3, 0)), + ADD_TEST_VERSION(cl_image_requirements_max_val_ext_negative, Version(3, 0)), + ADD_TEST_VERSION(cl_image_requirements_max_val_ext_positive, Version(3, 0)), + ADD_TEST_VERSION(image2d_from_buffer_positive, Version(3, 0)), + ADD_TEST_VERSION(memInfo_image_from_buffer_positive, Version(3, 0)), + ADD_TEST_VERSION(imageInfo_image_from_buffer_positive, Version(3, 0)), + ADD_TEST_VERSION(image_from_buffer_alignment_negative, Version(3, 0)), + ADD_TEST_VERSION(image_from_small_buffer_negative, Version(3, 0)), + ADD_TEST_VERSION(image_from_buffer_fill_positive, Version(3, 0)), + ADD_TEST_VERSION(image_from_buffer_read_positive, Version(3, 0)), }; const int test_num = ARRAY_SIZE( test_list ); diff --git a/test_conformance/images/kernel_read_write/test_cl_ext_image_buffer.hpp 
b/test_conformance/images/kernel_read_write/test_cl_ext_image_buffer.hpp new file mode 100644 index 00000000..c6646330 --- /dev/null +++ b/test_conformance/images/kernel_read_write/test_cl_ext_image_buffer.hpp @@ -0,0 +1,124 @@ +// +// Copyright (c) 2022 The Khronos Group Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// + +#ifndef _TEST_CL_EXT_IMAGE_BUFFER +#define _TEST_CL_EXT_IMAGE_BUFFER + +#define TEST_IMAGE_SIZE 20 + +#define GET_EXTENSION_FUNC(platform, function_name) \ + function_name##_fn function_name = reinterpret_cast<function_name##_fn>( \ + clGetExtensionFunctionAddressForPlatform(platform, #function_name)); \ + if (function_name == nullptr) \ + { \ + return TEST_FAIL; \ + } \ + do \ + { \ + } while (false) + +static inline size_t aligned_size(size_t size, size_t alignment) +{ + return (size + alignment - 1) & ~(alignment - 1); +} + +static inline void* aligned_ptr(void* ptr, size_t alignment) +{ + return (void*)(((uintptr_t)ptr + alignment - 1) & ~(alignment - 1)); +} + +static inline size_t get_format_size(cl_context context, + cl_image_format* format, + cl_mem_object_type imageType, + cl_mem_flags flags) +{ + cl_image_desc image_desc = { 0 }; + image_desc.image_type = imageType; + + /* Size 1 only to query element size */ + image_desc.image_width = 1; + if (CL_MEM_OBJECT_IMAGE1D_BUFFER != imageType + && CL_MEM_OBJECT_IMAGE1D != imageType) + { + image_desc.image_height = 1; + } + if (CL_MEM_OBJECT_IMAGE3D == imageType + || 
CL_MEM_OBJECT_IMAGE2D_ARRAY == imageType) + { + image_desc.image_depth = 1; + } + if (CL_MEM_OBJECT_IMAGE1D_ARRAY == imageType + || CL_MEM_OBJECT_IMAGE2D_ARRAY == imageType) + { + image_desc.image_array_size = 1; + } + + cl_int error = 0; + cl_mem buffer; + if (imageType == CL_MEM_OBJECT_IMAGE1D_BUFFER) + { + buffer = clCreateBuffer(context, flags, + get_pixel_size(format) * image_desc.image_width, + NULL, &error); + test_error(error, "Unable to create buffer"); + + image_desc.buffer = buffer; + } + + cl_mem image = + clCreateImage(context, flags, format, &image_desc, nullptr, &error); + test_error(error, "Unable to create image"); + + size_t element_size = 0; + error = clGetImageInfo(image, CL_IMAGE_ELEMENT_SIZE, sizeof(element_size), + &element_size, nullptr); + test_error(error, "Error clGetImageInfo"); + + error = clReleaseMemObject(image); + test_error(error, "Unable to release image"); + + if (imageType == CL_MEM_OBJECT_IMAGE1D_BUFFER) + { + error = clReleaseMemObject(buffer); + test_error(error, "Unable to release buffer"); + } + + return element_size; +} + +static inline void image_desc_init(cl_image_desc* desc, + cl_mem_object_type imageType) +{ + desc->image_type = imageType; + desc->image_width = TEST_IMAGE_SIZE; + if (CL_MEM_OBJECT_IMAGE1D_BUFFER != imageType + && CL_MEM_OBJECT_IMAGE1D != imageType) + { + desc->image_height = TEST_IMAGE_SIZE; + } + if (CL_MEM_OBJECT_IMAGE3D == imageType + || CL_MEM_OBJECT_IMAGE2D_ARRAY == imageType) + { + desc->image_depth = TEST_IMAGE_SIZE; + } + if (CL_MEM_OBJECT_IMAGE1D_ARRAY == imageType + || CL_MEM_OBJECT_IMAGE2D_ARRAY == imageType) + { + desc->image_array_size = TEST_IMAGE_SIZE; + } +} + +#endif /* _TEST_CL_EXT_IMAGE_BUFFER */
\ No newline at end of file diff --git a/test_conformance/images/kernel_read_write/test_cl_ext_image_from_buffer.cpp b/test_conformance/images/kernel_read_write/test_cl_ext_image_from_buffer.cpp new file mode 100644 index 00000000..2ce33a17 --- /dev/null +++ b/test_conformance/images/kernel_read_write/test_cl_ext_image_from_buffer.cpp @@ -0,0 +1,1013 @@ +// +// Copyright (c) 2022 The Khronos Group Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// + +#include "../testBase.h" +#include "../common.h" +#include "test_cl_ext_image_buffer.hpp" + +static int get_image_requirement_alignment( + cl_device_id device, cl_context context, cl_mem_flags flags, + const cl_image_format* image_format, const cl_image_desc* image_desc, + size_t* row_pitch_alignment, size_t* slice_pitch_alignment, + size_t* base_address_alignment) +{ + cl_platform_id platform = getPlatformFromDevice(device); + GET_EXTENSION_FUNC(platform, clGetImageRequirementsInfoEXT); + + cl_int err = CL_SUCCESS; + if (nullptr != row_pitch_alignment) + { + err = clGetImageRequirementsInfoEXT( + context, nullptr, flags, image_format, image_desc, + CL_IMAGE_REQUIREMENTS_ROW_PITCH_ALIGNMENT_EXT, + sizeof(*row_pitch_alignment), row_pitch_alignment, nullptr); + test_error(err, "Error getting alignment"); + } + + if (nullptr != slice_pitch_alignment && CL_SUCCESS == err) + { + err = clGetImageRequirementsInfoEXT( + context, nullptr, flags, image_format, image_desc, + 
CL_IMAGE_REQUIREMENTS_SLICE_PITCH_ALIGNMENT_EXT, + sizeof(*slice_pitch_alignment), slice_pitch_alignment, nullptr); + test_error(err, "Error getting alignment"); + } + + if (nullptr != base_address_alignment && CL_SUCCESS == err) + { + err = clGetImageRequirementsInfoEXT( + context, nullptr, flags, image_format, image_desc, + CL_IMAGE_REQUIREMENTS_BASE_ADDRESS_ALIGNMENT_EXT, + sizeof(*base_address_alignment), base_address_alignment, nullptr); + test_error(err, "Error getting alignment"); + } + + return TEST_PASS; +} + +/** + * Consistency with alignment requirements as returned by + * cl_khr_image2d_from_buffer Check that the returned values for + * CL_DEVICE_IMAGE_PITCH_ALIGNMENT and CL_DEVICE_IMAGE_BASE_ADDRESS_ALIGNMENT + * are correct. + */ +int image2d_from_buffer_positive(cl_device_id device, cl_context context, + cl_command_queue queue) +{ + if (!is_extension_available(device, "cl_khr_image2d_from_buffer")) + { + printf("Extension cl_khr_image2d_from_buffer not available"); + return TEST_SKIPPED_ITSELF; + } + + if (!is_extension_available(device, "cl_ext_image_requirements_info")) + { + printf("Extension cl_ext_image_requirements_info not available"); + return TEST_SKIPPED_ITSELF; + } + + std::vector<cl_mem_object_type> imageTypes{ + CL_MEM_OBJECT_IMAGE1D, CL_MEM_OBJECT_IMAGE2D, + CL_MEM_OBJECT_IMAGE3D, CL_MEM_OBJECT_IMAGE1D_BUFFER, + CL_MEM_OBJECT_IMAGE1D_ARRAY, CL_MEM_OBJECT_IMAGE2D_ARRAY + }; + + std::vector<cl_mem_flags> flagTypes{ CL_MEM_READ_ONLY, CL_MEM_WRITE_ONLY, + CL_MEM_READ_WRITE, + CL_MEM_KERNEL_READ_AND_WRITE }; + + for (auto flag : flagTypes) + { + for (auto imageType : imageTypes) + { + /* Get the list of supported image formats */ + std::vector<cl_image_format> formatList; + if (TEST_PASS + != get_format_list(context, imageType, formatList, flag) + || formatList.size() == 0) + { + test_fail("Failure to get supported formats list"); + } + + cl_uint row_pitch_alignment_2d = 0; + cl_int err = + clGetDeviceInfo(device, 
CL_DEVICE_IMAGE_PITCH_ALIGNMENT, + sizeof(row_pitch_alignment_2d), + &row_pitch_alignment_2d, nullptr); + test_error(err, "Error clGetDeviceInfo"); + + cl_uint base_address_alignment_2d = 0; + err = + clGetDeviceInfo(device, CL_DEVICE_IMAGE_BASE_ADDRESS_ALIGNMENT, + sizeof(base_address_alignment_2d), + &base_address_alignment_2d, nullptr); + test_error(err, "Error clGetDeviceInfo"); + + for (auto format : formatList) + { + cl_image_desc image_desc = { 0 }; + image_desc_init(&image_desc, imageType); + + flag = (flag == CL_MEM_KERNEL_READ_AND_WRITE) + ? CL_MEM_READ_WRITE + : flag; + + size_t row_pitch_alignment = 0; + size_t base_address_alignment = 0; + + int get_error = get_image_requirement_alignment( + device, context, 0, &format, &image_desc, + &row_pitch_alignment, nullptr, &base_address_alignment); + if (TEST_PASS != get_error) + { + return get_error; + } + + const size_t element_size = + get_format_size(context, &format, imageType, flag); + + /* Alignements in pixels vs bytes */ + if (base_address_alignment + > base_address_alignment_2d * element_size) + { + test_fail("Unexpected base_address_alignment"); + } + + if (row_pitch_alignment > row_pitch_alignment_2d * element_size) + { + test_fail("Unexpected row_pitch_alignment"); + } + } + } + } + + return TEST_PASS; +} + +/** + * Test clGetMemObjectInfo + * Check that CL_MEM_ASSOCIATED_MEMOBJECT correctly returns the buffer that was + * used. 
+ */ +int memInfo_image_from_buffer_positive(cl_device_id device, cl_context context, + cl_command_queue queue) +{ + if (!is_extension_available(device, "cl_ext_image_requirements_info")) + { + printf("Extension cl_ext_image_requirements_info not available"); + return TEST_SKIPPED_ITSELF; + } + + if (!is_extension_available(device, "cl_ext_image_from_buffer")) + { + printf("Extension cl_ext_image_from_buffer not available"); + return TEST_SKIPPED_ITSELF; + } + + std::vector<cl_mem_object_type> imageTypes{ + CL_MEM_OBJECT_IMAGE1D, CL_MEM_OBJECT_IMAGE2D, + CL_MEM_OBJECT_IMAGE3D, CL_MEM_OBJECT_IMAGE1D_BUFFER, + CL_MEM_OBJECT_IMAGE1D_ARRAY, CL_MEM_OBJECT_IMAGE2D_ARRAY + }; + + std::vector<cl_mem_flags> flagTypes{ CL_MEM_READ_ONLY, CL_MEM_WRITE_ONLY, + CL_MEM_READ_WRITE, + CL_MEM_KERNEL_READ_AND_WRITE }; + + for (auto flag : flagTypes) + { + for (auto imageType : imageTypes) + { + /* Get the list of supported image formats */ + std::vector<cl_image_format> formatList; + if (TEST_PASS + != get_format_list(context, imageType, formatList, flag) + || formatList.size() == 0) + { + test_fail("Failure to get supported formats list"); + } + + for (auto format : formatList) + { + cl_image_desc image_desc = { 0 }; + image_desc_init(&image_desc, imageType); + + flag = (flag == CL_MEM_KERNEL_READ_AND_WRITE) + ? 
CL_MEM_READ_WRITE + : flag; + + size_t row_pitch_alignment = 0; + size_t slice_pitch_alignment = 0; + + int get_error = get_image_requirement_alignment( + device, context, 0, &format, &image_desc, + &row_pitch_alignment, &slice_pitch_alignment, nullptr); + if (TEST_PASS != get_error) + { + return get_error; + } + + const size_t element_size = + get_format_size(context, &format, imageType, flag); + + const size_t row_pitch = aligned_size( + TEST_IMAGE_SIZE * element_size, row_pitch_alignment); + const size_t slice_pitch = aligned_size( + row_pitch * TEST_IMAGE_SIZE, slice_pitch_alignment); + + const size_t buffer_size = slice_pitch * TEST_IMAGE_SIZE; + + cl_int err = CL_SUCCESS; + cl_mem buffer = + clCreateBuffer(context, flag, buffer_size, nullptr, &err); + test_error(err, "Unable to create buffer"); + + image_desc.buffer = buffer; + + cl_mem image_buffer = clCreateImage(context, flag, &format, + &image_desc, nullptr, &err); + test_error(err, "Unable to create image"); + + cl_mem returned_buffer; + err = clGetMemObjectInfo( + image_buffer, CL_MEM_ASSOCIATED_MEMOBJECT, + sizeof(returned_buffer), &returned_buffer, nullptr); + test_error(err, "Error clGetMemObjectInfo"); + + if (returned_buffer != buffer) + { + test_fail("Unexpected CL_MEM_ASSOCIATED_MEMOBJECT buffer"); + } + + err = clReleaseMemObject(buffer); + test_error(err, "Unable to release buffer"); + + err = clReleaseMemObject(image_buffer); + test_error(err, "Unable to release image"); + } + } + } + + return TEST_PASS; +} + +/** + * Test clGetImageInfo + * Check that the returned values for CL_IMAGE_ROW_PITCH and + * CL_IMAGE_SLICE_PITCH are correct. 
+ */ +int imageInfo_image_from_buffer_positive(cl_device_id device, + cl_context context, + cl_command_queue queue) +{ + if (!is_extension_available(device, "cl_ext_image_requirements_info")) + { + printf("Extension cl_ext_image_requirements_info not available"); + return TEST_SKIPPED_ITSELF; + } + + if (!is_extension_available(device, "cl_ext_image_from_buffer")) + { + printf("Extension cl_ext_image_from_buffer not available"); + return TEST_SKIPPED_ITSELF; + } + + std::vector<cl_mem_object_type> imageTypes{ + CL_MEM_OBJECT_IMAGE1D, CL_MEM_OBJECT_IMAGE2D, + CL_MEM_OBJECT_IMAGE3D, CL_MEM_OBJECT_IMAGE1D_BUFFER, + CL_MEM_OBJECT_IMAGE1D_ARRAY, CL_MEM_OBJECT_IMAGE2D_ARRAY + }; + + std::vector<cl_mem_flags> flagTypes{ CL_MEM_READ_ONLY, CL_MEM_WRITE_ONLY, + CL_MEM_READ_WRITE, + CL_MEM_KERNEL_READ_AND_WRITE }; + + for (auto flag : flagTypes) + { + for (auto imageType : imageTypes) + { + /* Get the list of supported image formats */ + std::vector<cl_image_format> formatList; + if (TEST_PASS + != get_format_list(context, imageType, formatList, flag) + || formatList.size() == 0) + { + test_fail("Failure to get supported formats list"); + } + + for (auto format : formatList) + { + cl_image_desc image_desc = { 0 }; + image_desc_init(&image_desc, imageType); + + flag = (flag == CL_MEM_KERNEL_READ_AND_WRITE) + ? 
CL_MEM_READ_WRITE + : flag; + + size_t row_pitch_alignment = 0; + size_t slice_pitch_alignment = 0; + + int get_error = get_image_requirement_alignment( + device, context, 0, &format, &image_desc, + &row_pitch_alignment, &slice_pitch_alignment, nullptr); + if (TEST_PASS != get_error) + { + return get_error; + } + + const size_t element_size = + get_format_size(context, &format, imageType, flag); + + const size_t row_pitch = aligned_size( + TEST_IMAGE_SIZE * element_size, row_pitch_alignment); + const size_t slice_pitch = aligned_size( + row_pitch * TEST_IMAGE_SIZE, slice_pitch_alignment); + + const size_t buffer_size = slice_pitch * TEST_IMAGE_SIZE; + + cl_int err = CL_SUCCESS; + cl_mem buffer = + clCreateBuffer(context, flag, buffer_size, nullptr, &err); + test_error(err, "Unable to create buffer"); + + image_desc.buffer = buffer; + + if (imageType == CL_MEM_OBJECT_IMAGE2D + || imageType == CL_MEM_OBJECT_IMAGE1D_ARRAY) + { + image_desc.image_row_pitch = row_pitch; + } + else if (imageType == CL_MEM_OBJECT_IMAGE3D + || imageType == CL_MEM_OBJECT_IMAGE2D_ARRAY) + { + image_desc.image_row_pitch = row_pitch; + image_desc.image_slice_pitch = slice_pitch; + } + + cl_mem image_buffer = clCreateImage(context, flag, &format, + &image_desc, nullptr, &err); + test_error(err, "Unable to create image"); + + if (imageType == CL_MEM_OBJECT_IMAGE3D + || imageType == CL_MEM_OBJECT_IMAGE2D_ARRAY + || imageType == CL_MEM_OBJECT_IMAGE2D + || imageType == CL_MEM_OBJECT_IMAGE1D_ARRAY) + { + size_t returned_row_pitch = 0; + err = clGetImageInfo(image_buffer, CL_IMAGE_ROW_PITCH, + sizeof(returned_row_pitch), + &returned_row_pitch, nullptr); + test_error(err, "Error clGetImageInfo"); + + if (returned_row_pitch != row_pitch) + { + test_fail( + "Unexpected row pitch " + "CL_IMAGE_REQUIREMENTS_ROW_PITCH_ALIGNMENT_EXT"); + } + } + + if (imageType == CL_MEM_OBJECT_IMAGE3D + || imageType == CL_MEM_OBJECT_IMAGE2D_ARRAY) + { + size_t returned_slice_pitch = 0; + err = clGetImageInfo(image_buffer, 
CL_IMAGE_SLICE_PITCH, + sizeof(returned_slice_pitch), + &returned_slice_pitch, nullptr); + test_error(err, "Error clGetImageInfo"); + + if (returned_slice_pitch != slice_pitch) + { + test_fail( + "Unexpected row pitch " + "CL_IMAGE_REQUIREMENTS_SLICE_PITCH_ALIGNMENT_EXT"); + } + } + + err = clReleaseMemObject(buffer); + test_error(err, "Unable to release buffer"); + + err = clReleaseMemObject(image_buffer); + test_error(err, "Unable to release image"); + } + } + } + + return TEST_PASS; +} + +/** + * Negative testing for clCreateImage and wrong alignment + * - Create an image from a buffer with invalid row pitch (not a multiple of + * required alignment) and check that CL_INVALID_IMAGE_DESCRIPTOR is returned. + * - Create an image from a buffer with invalid slice pitch (not a multiple of + * required alignment) and check that CL_INVALID_IMAGE_DESCRIPTOR is returned. + * - Create an image from a buffer with invalid base address alignment (not a + * multiple of required alignment) and check that CL_INVALID_IMAGE_DESCRIPTOR is + * returned + */ +int image_from_buffer_alignment_negative(cl_device_id device, + cl_context context, + cl_command_queue queue) +{ + if (!is_extension_available(device, "cl_ext_image_requirements_info")) + { + printf("Extension cl_ext_image_requirements_info not available"); + return TEST_SKIPPED_ITSELF; + } + + if (!is_extension_available(device, "cl_ext_image_from_buffer")) + { + printf("Extension cl_ext_image_from_buffer not available"); + return TEST_SKIPPED_ITSELF; + } + + std::vector<cl_mem_object_type> imageTypes{ + CL_MEM_OBJECT_IMAGE1D, CL_MEM_OBJECT_IMAGE2D, + CL_MEM_OBJECT_IMAGE3D, CL_MEM_OBJECT_IMAGE1D_BUFFER, + CL_MEM_OBJECT_IMAGE1D_ARRAY, CL_MEM_OBJECT_IMAGE2D_ARRAY + }; + + std::vector<cl_mem_flags> flagTypes{ CL_MEM_READ_ONLY, CL_MEM_WRITE_ONLY, + CL_MEM_READ_WRITE, + CL_MEM_KERNEL_READ_AND_WRITE }; + + for (auto flag : flagTypes) + { + for (auto imageType : imageTypes) + { + /* Get the list of supported image formats */ + 
std::vector<cl_image_format> formatList; + if (TEST_PASS + != get_format_list(context, imageType, formatList, flag) + || formatList.size() == 0) + { + test_fail("Failure to get supported formats list"); + } + + for (auto format : formatList) + { + cl_image_desc image_desc = { 0 }; + image_desc_init(&image_desc, imageType); + + flag = (flag == CL_MEM_KERNEL_READ_AND_WRITE) + ? CL_MEM_READ_WRITE + : flag; + + size_t row_pitch_alignment = 0; + size_t slice_pitch_alignment = 0; + size_t base_address_alignment = 0; + + int get_error = get_image_requirement_alignment( + device, context, 0, &format, &image_desc, + &row_pitch_alignment, &slice_pitch_alignment, + &base_address_alignment); + if (TEST_PASS != get_error) + { + return get_error; + } + + const size_t element_size = + get_format_size(context, &format, imageType, flag); + + const size_t row_pitch = aligned_size( + TEST_IMAGE_SIZE * element_size, row_pitch_alignment); + const size_t slice_pitch = aligned_size( + row_pitch * TEST_IMAGE_SIZE, slice_pitch_alignment); + + const size_t buffer_size = (slice_pitch + 1) + * TEST_IMAGE_SIZE; /* For bigger row/slice pitch */ + + cl_int err = CL_SUCCESS; + cl_mem buffer = + clCreateBuffer(context, flag, buffer_size, nullptr, &err); + test_error(err, "Unable to create buffer"); + + /* Test Row pitch images */ + if (imageType == CL_MEM_OBJECT_IMAGE2D + || imageType == CL_MEM_OBJECT_IMAGE3D + || imageType == CL_MEM_OBJECT_IMAGE1D_ARRAY + || imageType == CL_MEM_OBJECT_IMAGE2D_ARRAY) + { + image_desc.buffer = buffer; + image_desc.image_row_pitch = + row_pitch + 1; /* wrong row pitch */ + + clCreateImage(context, flag, &format, &image_desc, nullptr, + &err); + test_failure_error(err, CL_INVALID_IMAGE_FORMAT_DESCRIPTOR, + "Unexpected clCreateImage return"); + } + + /* Test Slice pitch images */ + if (imageType == CL_MEM_OBJECT_IMAGE3D + || imageType == CL_MEM_OBJECT_IMAGE2D_ARRAY) + { + image_desc.buffer = buffer; + image_desc.image_row_pitch = row_pitch; + 
image_desc.image_slice_pitch = + slice_pitch + 1; /* wrong slice pitch */ + + clCreateImage(context, flag, &format, &image_desc, nullptr, + &err); + test_failure_error(err, CL_INVALID_IMAGE_FORMAT_DESCRIPTOR, + "Unexpected clCreateImage return"); + } + + /* Test buffer from host ptr to test base address alignment */ + const size_t aligned_buffer_size = + aligned_size(buffer_size, base_address_alignment); + /* Create buffer with host ptr and additional size for the wrong + * alignment */ + void* const host_ptr = + malloc(aligned_buffer_size + base_address_alignment); + void* non_aligned_host_ptr = + (void*)((char*)(aligned_ptr(host_ptr, + base_address_alignment)) + + 1); /* wrong alignment */ + + cl_mem buffer_host = clCreateBuffer( + context, CL_MEM_USE_HOST_PTR | CL_MEM_READ_WRITE, + buffer_size, non_aligned_host_ptr, &err); + test_error(err, "Unable to create buffer"); + + image_desc.buffer = buffer_host; + + clCreateImage(context, flag, &format, &image_desc, nullptr, + &err); + test_failure_error(err, CL_INVALID_IMAGE_FORMAT_DESCRIPTOR, + "Unexpected clCreateImage return"); + + free(host_ptr); + + err = clReleaseMemObject(buffer); + test_error(err, "Unable to release buffer"); + + err = clReleaseMemObject(buffer_host); + test_error(err, "Unable to release buffer"); + } + } + } + + return TEST_PASS; +} + +/** + * Negative testing for clCreateImage (buffer size). 
+ * Create a buffer too small and check that image creation from that buffer is + * rejected + */ +int image_from_small_buffer_negative(cl_device_id device, cl_context context, + cl_command_queue queue) +{ + if (!is_extension_available(device, "cl_ext_image_requirements_info")) + { + printf("Extension cl_ext_image_requirements_info not available"); + return TEST_SKIPPED_ITSELF; + } + + if (!is_extension_available(device, "cl_ext_image_from_buffer")) + { + printf("Extension cl_ext_image_from_buffer not available"); + return TEST_SKIPPED_ITSELF; + } + + std::vector<cl_mem_object_type> imageTypes{ + CL_MEM_OBJECT_IMAGE1D, CL_MEM_OBJECT_IMAGE2D, + CL_MEM_OBJECT_IMAGE1D_BUFFER, CL_MEM_OBJECT_IMAGE3D, + CL_MEM_OBJECT_IMAGE1D_ARRAY, CL_MEM_OBJECT_IMAGE2D_ARRAY + }; + + std::vector<cl_mem_flags> flagTypes{ CL_MEM_READ_ONLY, CL_MEM_WRITE_ONLY, + CL_MEM_READ_WRITE, + CL_MEM_KERNEL_READ_AND_WRITE }; + + for (auto flag : flagTypes) + { + for (auto imageType : imageTypes) + { + /* Get the list of supported image formats */ + std::vector<cl_image_format> formatList; + if (TEST_PASS + != get_format_list(context, imageType, formatList, flag) + || formatList.size() == 0) + { + test_fail("Failure to get supported formats list"); + } + + for (auto format : formatList) + { + cl_image_desc image_desc = { 0 }; + image_desc_init(&image_desc, imageType); + + flag = (flag == CL_MEM_KERNEL_READ_AND_WRITE) + ? 
CL_MEM_READ_WRITE + : flag; + + /* Invalid buffer size */ + cl_int err; + cl_mem buffer = clCreateBuffer( + context, flag, TEST_IMAGE_SIZE / 2, nullptr, &err); + test_error(err, "Unable to create buffer"); + + image_desc.buffer = buffer; + + clCreateImage(context, flag, &format, &image_desc, nullptr, + &err); + test_failure_error(err, CL_INVALID_MEM_OBJECT, + "Unexpected clCreateImage return"); + + err = clReleaseMemObject(buffer); + test_error(err, "Unable to release buffer"); + } + } + } + + return TEST_PASS; +} + +static int image_from_buffer_fill_check(cl_command_queue queue, cl_mem image, + size_t* region, size_t element_size, + char pattern) +{ + /* read the image from buffer and check the pattern */ + const size_t image_size = region[0] * region[1] * region[2] * element_size; + size_t origin[3] = { 0, 0, 0 }; + std::vector<char> read_buffer(image_size); + + cl_int error = + clEnqueueReadImage(queue, image, CL_BLOCKING, origin, region, 0, 0, + read_buffer.data(), 0, nullptr, nullptr); + test_error(error, "Error clEnqueueReadImage"); + + for (size_t line = 0; line < region[0]; line++) + { + for (size_t row = 0; row < region[1]; row++) + { + for (size_t depth = 0; depth < region[2]; depth++) + { + for (size_t elmt = 0; elmt < element_size; elmt++) + { + size_t index = line * row * depth * elmt; + + if (read_buffer[index] != pattern) + { + test_fail("Image pattern check failed"); + } + } + } + } + } + + return TEST_PASS; +} + +/** + * Use fill buffer to fill the image from buffer + */ +int image_from_buffer_fill_positive(cl_device_id device, cl_context context, + cl_command_queue queue) +{ + if (!is_extension_available(device, "cl_ext_image_requirements_info")) + { + printf("Extension cl_ext_image_requirements_info not available"); + return TEST_SKIPPED_ITSELF; + } + + if (!is_extension_available(device, "cl_ext_image_from_buffer")) + { + printf("Extension cl_ext_image_from_buffer not available"); + return TEST_SKIPPED_ITSELF; + } + + 
std::vector<cl_mem_object_type> imageTypes{ + CL_MEM_OBJECT_IMAGE1D, CL_MEM_OBJECT_IMAGE2D, + CL_MEM_OBJECT_IMAGE3D, CL_MEM_OBJECT_IMAGE1D_BUFFER, + CL_MEM_OBJECT_IMAGE1D_ARRAY, CL_MEM_OBJECT_IMAGE2D_ARRAY + }; + + std::vector<cl_mem_flags> flagTypes{ CL_MEM_READ_ONLY, CL_MEM_WRITE_ONLY, + CL_MEM_READ_WRITE, + CL_MEM_KERNEL_READ_AND_WRITE }; + + for (auto flag : flagTypes) + { + for (auto imageType : imageTypes) + { + /* Get the list of supported image formats */ + std::vector<cl_image_format> formatList; + if (TEST_PASS + != get_format_list(context, imageType, formatList, flag) + || formatList.size() == 0) + { + test_fail("Failure to get supported formats list"); + } + + for (auto format : formatList) + { + cl_image_desc image_desc = { 0 }; + image_desc_init(&image_desc, imageType); + + flag = (flag == CL_MEM_KERNEL_READ_AND_WRITE) + ? CL_MEM_READ_WRITE + : flag; + + size_t row_pitch_alignment = 0; + size_t slice_pitch_alignment = 0; + + int get_error = get_image_requirement_alignment( + device, context, 0, &format, &image_desc, + &row_pitch_alignment, &slice_pitch_alignment, nullptr); + if (TEST_PASS != get_error) + { + return get_error; + } + + const size_t element_size = + get_format_size(context, &format, imageType, flag); + + const size_t row_pitch = aligned_size( + TEST_IMAGE_SIZE * element_size, row_pitch_alignment); + const size_t slice_pitch = aligned_size( + row_pitch * TEST_IMAGE_SIZE, slice_pitch_alignment); + + const size_t buffer_size = slice_pitch * TEST_IMAGE_SIZE; + + cl_int err = CL_SUCCESS; + cl_mem buffer = + clCreateBuffer(context, flag, buffer_size, nullptr, &err); + test_error(err, "Unable to create buffer"); + + /* fill the buffer with a pattern */ + const char pattern = 0x55; + err = clEnqueueFillBuffer(queue, buffer, &pattern, + sizeof(pattern), 0, buffer_size, 0, + nullptr, nullptr); + test_error(err, "Error clEnqueueFillBuffer"); + + err = clFinish(queue); + test_error(err, "Error clFinish"); + + cl_mem image1d_buffer; + if (imageType 
== CL_MEM_OBJECT_IMAGE1D_BUFFER) + { + image1d_buffer = clCreateBuffer(context, flag, buffer_size, + nullptr, &err); + test_error(err, "Unable to create buffer"); + + image_desc.buffer = image1d_buffer; + } + + cl_mem image = clCreateImage(context, flag, &format, + &image_desc, nullptr, &err); + test_error(err, "Unable to create image"); + + /* Check the image from buffer */ + image_desc.buffer = buffer; + + if (imageType == CL_MEM_OBJECT_IMAGE2D + || imageType == CL_MEM_OBJECT_IMAGE1D_ARRAY) + { + image_desc.image_row_pitch = row_pitch; + } + else if (imageType == CL_MEM_OBJECT_IMAGE3D + || imageType == CL_MEM_OBJECT_IMAGE2D_ARRAY) + { + image_desc.image_row_pitch = row_pitch; + image_desc.image_slice_pitch = slice_pitch; + } + + cl_mem image_from_buffer = clCreateImage( + context, flag, &format, &image_desc, nullptr, &err); + test_error(err, "Unable to create image"); + + size_t origin[3] = { 0, 0, 0 }; + size_t region[3] = { 1, 1, 1 }; + + region[0] = TEST_IMAGE_SIZE; + if (CL_MEM_OBJECT_IMAGE1D_BUFFER != imageType + && CL_MEM_OBJECT_IMAGE1D != imageType) + { + region[1] = TEST_IMAGE_SIZE; + } + if (CL_MEM_OBJECT_IMAGE3D == imageType + || CL_MEM_OBJECT_IMAGE2D_ARRAY == imageType) + { + region[2] = TEST_IMAGE_SIZE; + } + + /* Check the copy of the image from buffer */ + err = + clEnqueueCopyImage(queue, image_from_buffer, image, origin, + origin, region, 0, nullptr, nullptr); + test_error(err, "Error clEnqueueCopyImage"); + + err = clFinish(queue); + test_error(err, "Error clFinish"); + + int fill_error = image_from_buffer_fill_check( + queue, image_from_buffer, region, element_size, pattern); + if (TEST_PASS != fill_error) + { + return fill_error; + } + + fill_error = image_from_buffer_fill_check( + queue, image, region, element_size, pattern); + if (TEST_PASS != fill_error) + { + return fill_error; + } + + err = clReleaseMemObject(buffer); + test_error(err, "Unable to release buffer"); + + err = clReleaseMemObject(image); + test_error(err, "Unable to release 
image"); + + err = clReleaseMemObject(image_from_buffer); + test_error(err, "Unable to release image"); + + if (imageType == CL_MEM_OBJECT_IMAGE1D_BUFFER) + { + err = clReleaseMemObject(image1d_buffer); + test_error(err, "Unable to release image"); + } + } + } + } + + return TEST_PASS; +} + +static int image_from_buffer_read_check(cl_command_queue queue, cl_mem buffer, + const size_t buffer_size, + size_t* region, size_t element_size, + char pattern, size_t row_pitch, + size_t slice_pitch) +{ + /* read the buffer and check the pattern */ + std::vector<char> host_buffer(buffer_size); + char* host_ptr = host_buffer.data(); + char* host_ptr_slice = host_ptr; + + cl_int error = + clEnqueueReadBuffer(queue, buffer, CL_BLOCKING, 0, buffer_size, + host_buffer.data(), 0, nullptr, nullptr); + test_error(error, "Error clEnqueueReadBuffer"); + + for (size_t k = 0; k < region[2]; k++) + { + for (size_t i = 0; i < region[1]; i++) + { + for (size_t j = 0; j < region[0] * element_size; j++) + { + if (host_ptr[j] != pattern) + { + test_fail("Image pattern check failed"); + } + } + host_ptr = host_ptr + row_pitch; + } + host_ptr_slice = host_ptr_slice + slice_pitch; + host_ptr = host_ptr_slice; + } + + return TEST_PASS; +} + +/** + * Use fill image to fill the buffer that was used to create the image + */ +int image_from_buffer_read_positive(cl_device_id device, cl_context context, + cl_command_queue queue) +{ + if (!is_extension_available(device, "cl_ext_image_requirements_info")) + { + printf("Extension cl_ext_image_requirements_info not available"); + return TEST_SKIPPED_ITSELF; + } + + if (!is_extension_available(device, "cl_ext_image_from_buffer")) + { + printf("Extension cl_ext_image_from_buffer not available"); + return TEST_SKIPPED_ITSELF; + } + + std::vector<cl_mem_object_type> imageTypes{ + CL_MEM_OBJECT_IMAGE1D, CL_MEM_OBJECT_IMAGE2D, + CL_MEM_OBJECT_IMAGE3D, CL_MEM_OBJECT_IMAGE1D_BUFFER, + CL_MEM_OBJECT_IMAGE1D_ARRAY, CL_MEM_OBJECT_IMAGE2D_ARRAY + }; + + for (auto 
imageType : imageTypes) + { + cl_image_desc image_desc = { 0 }; + image_desc_init(&image_desc, imageType); + + /* Non normalized format so we can read it back directly from + * clEnqueueFillImage */ + cl_image_format format = { CL_RGBA, CL_UNSIGNED_INT8 }; + const char pattern = 0x55; + + const size_t element_size = + get_format_size(context, &format, imageType, CL_MEM_READ_WRITE); + + size_t row_pitch_alignment = 0; + size_t slice_pitch_alignment = 0; + + int get_error = get_image_requirement_alignment( + device, context, CL_MEM_READ_WRITE, &format, &image_desc, + &row_pitch_alignment, &slice_pitch_alignment, nullptr); + if (TEST_PASS != get_error) + { + return get_error; + } + + const size_t row_pitch = + aligned_size(TEST_IMAGE_SIZE * element_size, row_pitch_alignment); + const size_t slice_pitch = + aligned_size(row_pitch * TEST_IMAGE_SIZE, slice_pitch_alignment); + + const size_t buffer_size = slice_pitch * TEST_IMAGE_SIZE; + + cl_int err = CL_SUCCESS; + cl_mem buffer = clCreateBuffer(context, CL_MEM_READ_WRITE, buffer_size, + nullptr, &err); + test_error(err, "Unable to create buffer"); + + /* Check the image from buffer */ + image_desc.buffer = buffer; + + if (imageType == CL_MEM_OBJECT_IMAGE2D + || imageType == CL_MEM_OBJECT_IMAGE1D_ARRAY) + { + image_desc.image_row_pitch = row_pitch; + } + else if (imageType == CL_MEM_OBJECT_IMAGE3D + || imageType == CL_MEM_OBJECT_IMAGE2D_ARRAY) + { + image_desc.image_row_pitch = row_pitch; + image_desc.image_slice_pitch = slice_pitch; + } + + cl_mem image = clCreateImage(context, CL_MEM_READ_WRITE, &format, + &image_desc, nullptr, &err); + test_error(err, "Unable to create image"); + + size_t origin[3] = { 0, 0, 0 }; + size_t region[3] = { 1, 1, 1 }; + + region[0] = TEST_IMAGE_SIZE; + if (CL_MEM_OBJECT_IMAGE1D_BUFFER != imageType + && CL_MEM_OBJECT_IMAGE1D != imageType) + { + region[1] = TEST_IMAGE_SIZE; + } + if (CL_MEM_OBJECT_IMAGE3D == imageType + || CL_MEM_OBJECT_IMAGE2D_ARRAY == imageType) + { + region[2] = 
TEST_IMAGE_SIZE; + } + + /* fill the image with a pattern */ + cl_uint fill_color[4] = { pattern, pattern, pattern, pattern }; + err = clEnqueueFillImage(queue, image, fill_color, origin, region, 0, + nullptr, nullptr); + test_error(err, "Error clEnqueueFillImage"); + + err = clFinish(queue); + test_error(err, "Error clFinish"); + + int read_error = image_from_buffer_read_check( + queue, buffer, buffer_size, region, element_size, pattern, + (imageType == CL_MEM_OBJECT_IMAGE1D_ARRAY) ? slice_pitch + : row_pitch, + slice_pitch); + if (TEST_PASS != read_error) + { + return read_error; + } + + err = clReleaseMemObject(buffer); + test_error(err, "Unable to release buffer"); + + err = clReleaseMemObject(image); + test_error(err, "Unable to release image"); + } + + return TEST_PASS; +}
\ No newline at end of file diff --git a/test_conformance/images/kernel_read_write/test_cl_ext_image_requirements_info.cpp b/test_conformance/images/kernel_read_write/test_cl_ext_image_requirements_info.cpp new file mode 100644 index 00000000..9212fcbc --- /dev/null +++ b/test_conformance/images/kernel_read_write/test_cl_ext_image_requirements_info.cpp @@ -0,0 +1,482 @@ +// +// Copyright (c) 2022 The Khronos Group Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +#include "../testBase.h" +#include "../common.h" +#include "test_cl_ext_image_buffer.hpp" + +/** + * Negative tests for {CL_IMAGE_REQUIREMENTS_SIZE_EXT} + * Check that attempting to perform the {CL_IMAGE_REQUIREMENTS_SIZE_EXT} query + * without specifying the _image_format_ results in {CL_INVALID_VALUE} being + * returned. Check that attempting to perform the + * {CL_IMAGE_REQUIREMENTS_SIZE_EXT} query without specifying the _image_desc_ + * results in {CL_INVALID_VALUE} being returned. 
+ */ +int cl_image_requirements_size_ext_negative(cl_device_id device, + cl_context context, + cl_command_queue queue) +{ + if (!is_extension_available(device, "cl_ext_image_requirements_info")) + { + printf("Extension cl_ext_image_requirements_info not available"); + return TEST_SKIPPED_ITSELF; + } + + cl_platform_id platform = getPlatformFromDevice(device); + GET_EXTENSION_FUNC(platform, clGetImageRequirementsInfoEXT); + + size_t max_size = 0; + size_t param_val_size = 0; + + cl_image_desc image_desc = { 0 }; + image_desc_init(&image_desc, CL_MEM_OBJECT_IMAGE2D); + + cl_image_format format = { CL_RGBA, CL_UNSIGNED_INT16 }; + + /* Check image_format null results in CL_INVALID_VALUE */ + cl_int err = clGetImageRequirementsInfoEXT( + context, nullptr, CL_MEM_READ_WRITE, nullptr, &image_desc, + CL_IMAGE_REQUIREMENTS_SIZE_EXT, sizeof(max_size), &max_size, + ¶m_val_size); + test_failure_error(err, CL_INVALID_VALUE, + "Unexpected clGetImageRequirementsInfoEXT return"); + + /* Check image_desc null results in CL_INVALID_VALUE */ + err = clGetImageRequirementsInfoEXT( + context, nullptr, CL_MEM_READ_WRITE, &format, nullptr, + CL_IMAGE_REQUIREMENTS_SIZE_EXT, sizeof(max_size), &max_size, + ¶m_val_size); + test_failure_error(err, CL_INVALID_VALUE, + "Unexpected clGetImageRequirementsInfoEXT return"); + + return TEST_PASS; +} + +/** + * Consistency checks for CL_IMAGE_REQUIREMENTS_SIZE_EXT + * When creating 2D images from a buffer is supported + * Check that the CL_IMAGE_REQUIREMENTS_SIZE_EXT query can be performed + * successfully. Create a buffer with the size returned and check that an image + * can successfully be created from the buffer. Check that the value returned + * for CL_MEM_SIZE for the image is the same as the value returned for + * CL_IMAGE_REQUIREMENTS_SIZE_EXT. 
+ */ +int cl_image_requirements_size_ext_consistency(cl_device_id device, + cl_context context, + cl_command_queue queue) +{ + if (!is_extension_available(device, "cl_ext_image_requirements_info")) + { + printf("Extension cl_ext_image_requirements_info not available"); + return TEST_SKIPPED_ITSELF; + } + + if (!is_extension_available(device, "cl_ext_image_from_buffer")) + { + printf("Extension cl_ext_image_from_buffer not available"); + return TEST_SKIPPED_ITSELF; + } + + cl_platform_id platform = getPlatformFromDevice(device); + GET_EXTENSION_FUNC(platform, clGetImageRequirementsInfoEXT); + + size_t max_size = 0; + size_t param_val_size = 0; + + std::vector<cl_mem_object_type> imageTypes{ + CL_MEM_OBJECT_IMAGE1D, CL_MEM_OBJECT_IMAGE2D, + CL_MEM_OBJECT_IMAGE3D, CL_MEM_OBJECT_IMAGE1D_BUFFER, + CL_MEM_OBJECT_IMAGE1D_ARRAY, CL_MEM_OBJECT_IMAGE2D_ARRAY + }; + + std::vector<cl_mem_flags> flagTypes{ CL_MEM_READ_ONLY, CL_MEM_WRITE_ONLY, + CL_MEM_READ_WRITE, + CL_MEM_KERNEL_READ_AND_WRITE }; + + for (auto flag : flagTypes) + { + for (auto imageType : imageTypes) + { + /* Get the list of supported image formats */ + std::vector<cl_image_format> formatList; + if (TEST_PASS + != get_format_list(context, imageType, formatList, flag) + || formatList.size() == 0) + { + test_fail("Failure to get supported formats list"); + } + + for (auto format : formatList) + { + cl_image_desc image_desc = { 0 }; + image_desc_init(&image_desc, imageType); + + flag = (flag == CL_MEM_KERNEL_READ_AND_WRITE) + ? 
CL_MEM_READ_WRITE + : flag; + + cl_int err = clGetImageRequirementsInfoEXT( + context, nullptr, flag, &format, &image_desc, + CL_IMAGE_REQUIREMENTS_SIZE_EXT, sizeof(max_size), &max_size, + ¶m_val_size); + test_error(err, "Error clGetImageRequirementsInfoEXT"); + + /* Create buffer */ + cl_mem buffer = + clCreateBuffer(context, flag, max_size, nullptr, &err); + test_error(err, "Unable to create buffer"); + + image_desc.buffer = buffer; + + /* 2D Image from buffer */ + cl_mem image_buffer = clCreateImage(context, flag, &format, + &image_desc, nullptr, &err); + test_error(err, "Unable to create image"); + + size_t size = 0; + err = clGetMemObjectInfo(image_buffer, CL_MEM_SIZE, + sizeof(size_t), &size, NULL); + test_error(err, "Error clGetMemObjectInfo"); + + if (max_size != size) + { + test_fail("CL_IMAGE_REQUIREMENTS_SIZE_EXT different from " + "CL_MEM_SIZE"); + } + + err = clReleaseMemObject(image_buffer); + test_error(err, "Error clReleaseMemObject"); + + err = clReleaseMemObject(buffer); + test_error(err, "Error clReleaseMemObject"); + } + } + } + + return TEST_PASS; +} + +/** + * Negative testing for all testable error codes returned by + * clGetImageFormatInfoKHR + */ +int clGetImageRequirementsInfoEXT_negative(cl_device_id device, + cl_context context, + cl_command_queue queue) +{ + if (!is_extension_available(device, "cl_ext_image_requirements_info")) + { + printf("Extension cl_ext_image_requirements_info not available"); + return TEST_SKIPPED_ITSELF; + } + + cl_platform_id platform = getPlatformFromDevice(device); + GET_EXTENSION_FUNC(platform, clGetImageRequirementsInfoEXT); + + cl_image_desc image_desc = { 0 }; + image_desc_init(&image_desc, CL_MEM_OBJECT_IMAGE3D); + + cl_image_format format = { CL_RGBA, CL_UNSIGNED_INT16 }; + + /* Check that CL_INVALID_CONTEXT is returned when passing nullptr as context + */ + size_t row_pitch_alignment = 0; + cl_int err = clGetImageRequirementsInfoEXT( + nullptr, nullptr, CL_MEM_READ_WRITE, &format, &image_desc, + 
CL_IMAGE_REQUIREMENTS_ROW_PITCH_ALIGNMENT_EXT, + sizeof(row_pitch_alignment), &row_pitch_alignment, nullptr); + test_failure_error(err, CL_INVALID_CONTEXT, + "Unexpected clGetImageRequirementsInfoEXT return"); + + /* Check that CL_INVALID_VALUE is returned when passing an invalid + * image_type */ + cl_image_desc invalid_desc = { CL_MEM_OBJECT_BUFFER, TEST_IMAGE_SIZE }; + err = clGetImageRequirementsInfoEXT( + context, nullptr, CL_MEM_READ_WRITE, &format, &invalid_desc, + CL_IMAGE_REQUIREMENTS_ROW_PITCH_ALIGNMENT_EXT, + sizeof(row_pitch_alignment), &row_pitch_alignment, nullptr); + test_failure_error(err, CL_INVALID_IMAGE_DESCRIPTOR, + "Unexpected clGetImageRequirementsInfoEXT return"); + + /* Check that CL_INVALID_VALUE is returned when passing invalid flags */ + err = clGetImageRequirementsInfoEXT( + context, nullptr, -1, &format, &image_desc, + CL_IMAGE_REQUIREMENTS_ROW_PITCH_ALIGNMENT_EXT, + sizeof(row_pitch_alignment), &row_pitch_alignment, nullptr); + test_failure_error(err, CL_INVALID_VALUE, + "Unexpected clGetImageRequirementsInfoEXT return"); + + /* Check that CL_INVALID_IMAGE_FORMAT_DESCRIPTOR is returned when passing a + * nullptr image_format */ + cl_image_format invalid_format = { CL_INTENSITY, CL_UNORM_SHORT_555 }; + err = clGetImageRequirementsInfoEXT( + context, nullptr, CL_MEM_READ_WRITE, &invalid_format, &image_desc, + CL_IMAGE_REQUIREMENTS_ROW_PITCH_ALIGNMENT_EXT, + sizeof(row_pitch_alignment), &row_pitch_alignment, nullptr); + test_failure_error(err, CL_INVALID_IMAGE_FORMAT_DESCRIPTOR, + "Unexpected clGetImageRequirementsInfoEXT return"); + + /* Check that CL_INVALID_IMAGE_DESCRIPTOR is returned when passing an + * image_desc with invalid values */ + cl_image_desc invalid_desc_size = { CL_MEM_OBJECT_IMAGE1D, 0 }; + err = clGetImageRequirementsInfoEXT( + context, nullptr, CL_MEM_READ_WRITE, &format, &invalid_desc_size, + CL_IMAGE_REQUIREMENTS_ROW_PITCH_ALIGNMENT_EXT, + sizeof(row_pitch_alignment), &row_pitch_alignment, nullptr); + 
test_failure_error(err, CL_INVALID_IMAGE_DESCRIPTOR, + "Unexpected clGetImageRequirementsInfoEXT return"); + + /* Check that CL_INVALID_VALUE is returned when passing an invalid + * param_name */ + cl_image_requirements_info_ext invalid_info = CL_IMAGE_FORMAT; + err = clGetImageRequirementsInfoEXT( + context, nullptr, CL_MEM_READ_WRITE, &format, &image_desc, invalid_info, + sizeof(row_pitch_alignment), &row_pitch_alignment, nullptr); + test_failure_error(err, CL_INVALID_VALUE, + "Unexpected clGetImageRequirementsInfoEXT return"); + + /* Check that CL_INVALID_VALUE is returned when passing a param_value_size + * value smaller than the size of the return type */ + err = clGetImageRequirementsInfoEXT( + context, nullptr, CL_MEM_READ_WRITE, &format, &image_desc, + CL_IMAGE_REQUIREMENTS_ROW_PITCH_ALIGNMENT_EXT, + sizeof(row_pitch_alignment) - 1, &row_pitch_alignment, nullptr); + test_failure_error(err, CL_INVALID_VALUE, + "Unexpected clGetImageRequirementsInfoEXT return"); + + /* Check that CL_INVALID_VALUE is returned when passing a param_value_size + * value smaller than the size of the return type */ + uint32_t max_height = 0; + err = clGetImageRequirementsInfoEXT( + context, nullptr, CL_MEM_READ_WRITE, &format, &image_desc, + CL_IMAGE_REQUIREMENTS_MAX_HEIGHT_EXT, sizeof(max_height) - 1, + &max_height, nullptr); + test_failure_error(err, CL_INVALID_VALUE, + "Unexpected clGetImageRequirementsInfoEXT return"); + + return TEST_PASS; +} + +/** + * Negative tests for {CL_IMAGE_REQUIREMENTS_MAX_HEIGHT_EXT} + * Attempt to perform the {CL_IMAGE_REQUIREMENTS_MAX_HEIGHT_EXT} query on all + * image types for which it is not valid Check that + * {CL_INVALID_IMAGE_DESCRIPTOR} is returned in all cases. + * + * Negative testing for {CL_IMAGE_REQUIREMENTS_MAX_DEPTH_EXT} + * Attempt to perform the {CL_IMAGE_REQUIREMENTS_MAX_DEPTH_EXT} query on all + * image types for which it is not valid Check that + * {CL_INVALID_IMAGE_DESCRIPTOR} is returned in all cases. 
+ * + * Negative testing for {CL_IMAGE_REQUIREMENTS_MAX_ARRAY_SIZE_EXT} + * Attempt to perform the {CL_IMAGE_REQUIREMENTS_MAX_ARRAY_SIZE_EXT} query on + * all image types for which it is not valid Check that + * {CL_INVALID_IMAGE_DESCRIPTOR} is returned in all cases. + */ +int cl_image_requirements_max_val_ext_negative(cl_device_id device, + cl_context context, + cl_command_queue queue) +{ + if (!is_extension_available(device, "cl_ext_image_requirements_info")) + { + printf("Extension cl_ext_image_requirements_info not available"); + return TEST_SKIPPED_ITSELF; + } + + cl_platform_id platform = getPlatformFromDevice(device); + GET_EXTENSION_FUNC(platform, clGetImageRequirementsInfoEXT); + + size_t value = 0; + + std::vector<cl_mem_object_type> imageTypes_height{ + CL_MEM_OBJECT_IMAGE1D_ARRAY, CL_MEM_OBJECT_IMAGE1D_BUFFER, + CL_MEM_OBJECT_IMAGE1D + }; + + cl_image_format format = { CL_RGBA, CL_UNSIGNED_INT16 }; + + for (auto imageType : imageTypes_height) + { + cl_image_desc image_desc = { 0 }; + image_desc_init(&image_desc, imageType); + + /* Check image_format null results in CL_INVALID_VALUE */ + cl_int err = clGetImageRequirementsInfoEXT( + context, nullptr, CL_MEM_READ_WRITE, &format, &image_desc, + CL_IMAGE_REQUIREMENTS_MAX_HEIGHT_EXT, sizeof(value), &value, + nullptr); + test_failure_error(err, CL_INVALID_IMAGE_DESCRIPTOR, + "Unexpected clGetImageRequirementsInfoEXT return"); + } + + std::vector<cl_mem_object_type> imageTypes_depth{ + CL_MEM_OBJECT_IMAGE2D, CL_MEM_OBJECT_IMAGE2D_ARRAY, + CL_MEM_OBJECT_IMAGE1D_ARRAY, CL_MEM_OBJECT_IMAGE1D_BUFFER, + CL_MEM_OBJECT_IMAGE1D + }; + + for (auto imageType : imageTypes_depth) + { + cl_image_desc image_desc = { 0 }; + image_desc_init(&image_desc, imageType); + + /* Check image_format null results in CL_INVALID_VALUE */ + cl_int err = clGetImageRequirementsInfoEXT( + context, nullptr, CL_MEM_READ_WRITE, &format, &image_desc, + CL_IMAGE_REQUIREMENTS_MAX_DEPTH_EXT, sizeof(value), &value, + nullptr); + 
test_failure_error(err, CL_INVALID_IMAGE_DESCRIPTOR, + "Unexpected clGetImageRequirementsInfoEXT return"); + } + + std::vector<cl_mem_object_type> imageTypes_array_size{ + CL_MEM_OBJECT_IMAGE3D, CL_MEM_OBJECT_IMAGE2D, + CL_MEM_OBJECT_IMAGE1D_BUFFER, CL_MEM_OBJECT_IMAGE1D + }; + + for (auto imageType : imageTypes_array_size) + { + cl_image_desc image_desc = { 0 }; + image_desc_init(&image_desc, imageType); + + /* Check image_format null results in CL_INVALID_VALUE */ + cl_int err = clGetImageRequirementsInfoEXT( + context, nullptr, CL_MEM_READ_WRITE, &format, &image_desc, + CL_IMAGE_REQUIREMENTS_MAX_ARRAY_SIZE_EXT, sizeof(value), &value, + nullptr); + test_failure_error(err, CL_INVALID_IMAGE_DESCRIPTOR, + "Unexpected clGetImageRequirementsInfoEXT return"); + } + + return TEST_PASS; +} + +/** + * Consistency checks for {CL_IMAGE_REQUIREMENTS_MAX_WIDTH_EXT} + ** Check that the {CL_IMAGE_REQUIREMENTS_MAX_WIDTH_EXT} query can be performed + *successfully + * + * Consistency checks for {CL_IMAGE_REQUIREMENTS_MAX_HEIGHT_EXT} + ** Check that the {CL_IMAGE_REQUIREMENTS_MAX_HEIGHT_EXT} query can be performed + *successfully + * + * Consistency checks for {CL_IMAGE_REQUIREMENTS_MAX_DEPTH_EXT} + ** Check that the {CL_IMAGE_REQUIREMENTS_MAX_DEPTH_EXT} query can be performed + *successfully + * + * Consistency checks for {CL_IMAGE_REQUIREMENTS_MAX_ARRAY_SIZE_EXT} + ** Check that the {CL_IMAGE_REQUIREMENTS_MAX_ARRAY_SIZE_EXT} query can be + *performed successfully + */ +int cl_image_requirements_max_val_ext_positive(cl_device_id device, + cl_context context, + cl_command_queue queue) +{ + if (!is_extension_available(device, "cl_ext_image_requirements_info")) + { + printf("Extension cl_ext_image_requirements_info not available"); + return TEST_SKIPPED_ITSELF; + } + + cl_platform_id platform = getPlatformFromDevice(device); + GET_EXTENSION_FUNC(platform, clGetImageRequirementsInfoEXT); + + /* CL_IMAGE_REQUIREMENTS_MAX_WIDTH_EXT */ + cl_image_desc image_desc_1d = { 0 }; + 
image_desc_init(&image_desc_1d, CL_MEM_OBJECT_IMAGE1D); + + uint32_t max_width = 0; + cl_int err = clGetImageRequirementsInfoEXT( + context, nullptr, CL_MEM_READ_WRITE, nullptr, &image_desc_1d, + CL_IMAGE_REQUIREMENTS_MAX_WIDTH_EXT, sizeof(max_width), &max_width, + nullptr); + test_error(err, "Error clGetImageRequirementsInfoEXT"); + + size_t width_1d = 0; + err = clGetDeviceInfo(device, CL_DEVICE_IMAGE_MAX_BUFFER_SIZE, + sizeof(width_1d), &width_1d, NULL); + test_error(err, "Error clGetDeviceInfo"); + + if (!(max_width <= width_1d && max_width > 0)) + { + test_fail("Unexpected CL_IMAGE_REQUIREMENTS_MAX_WIDTH_EXT value"); + } + + /* CL_IMAGE_REQUIREMENTS_MAX_HEIGHT_EXT */ + cl_image_desc image_desc_2d = { 0 }; + image_desc_init(&image_desc_2d, CL_MEM_OBJECT_IMAGE2D); + + uint32_t max_height = 0; + err = clGetImageRequirementsInfoEXT( + context, nullptr, CL_MEM_READ_WRITE, nullptr, &image_desc_2d, + CL_IMAGE_REQUIREMENTS_MAX_HEIGHT_EXT, sizeof(max_height), &max_height, + nullptr); + test_error(err, "Error clGetImageRequirementsInfoEXT"); + + size_t height_2d = 0; + err = clGetDeviceInfo(device, CL_DEVICE_IMAGE2D_MAX_HEIGHT, + sizeof(height_2d), &height_2d, NULL); + test_error(err, "Error clGetDeviceInfo"); + + if (!(max_height <= height_2d && max_height > 0)) + { + test_fail("Unexpected CL_IMAGE_REQUIREMENTS_MAX_HEIGHT_EXT value"); + } + + /* CL_IMAGE_REQUIREMENTS_MAX_DEPTH_EXT */ + cl_image_desc image_desc_3d = { 0 }; + image_desc_init(&image_desc_3d, CL_MEM_OBJECT_IMAGE3D); + + uint32_t max_depth = 0; + err = clGetImageRequirementsInfoEXT(context, nullptr, CL_MEM_READ_WRITE, + nullptr, &image_desc_3d, + CL_IMAGE_REQUIREMENTS_MAX_DEPTH_EXT, + sizeof(max_depth), &max_depth, nullptr); + test_error(err, "Error clGetImageRequirementsInfoEXT"); + + size_t depth_3d = 0; + err = clGetDeviceInfo(device, CL_DEVICE_IMAGE3D_MAX_DEPTH, sizeof(depth_3d), + &depth_3d, NULL); + test_error(err, "Error clGetDeviceInfo"); + + if (!(max_depth <= depth_3d && max_depth > 0)) + { + 
test_fail("Unexpected CL_IMAGE_REQUIREMENTS_MAX_DEPTH_EXT value"); + } + + /* CL_IMAGE_REQUIREMENTS_MAX_ARRAY_SIZE_EXT */ + cl_image_desc image_desc_array = { 0 }; + image_desc_init(&image_desc_array, CL_MEM_OBJECT_IMAGE2D_ARRAY); + + uint32_t max_array_size = 0; + err = clGetImageRequirementsInfoEXT( + context, nullptr, CL_MEM_READ_WRITE, nullptr, &image_desc_array, + CL_IMAGE_REQUIREMENTS_MAX_ARRAY_SIZE_EXT, sizeof(max_array_size), + &max_array_size, nullptr); + test_error(err, "Error clGetImageRequirementsInfoEXT"); + + size_t array_size = 0; + err = clGetDeviceInfo(device, CL_DEVICE_IMAGE_MAX_ARRAY_SIZE, + sizeof(array_size), &array_size, NULL); + test_error(err, "Error clGetDeviceInfo"); + + if (!(max_array_size <= array_size && max_array_size > 0)) + { + test_fail("Unexpected CL_IMAGE_REQUIREMENTS_MAX_ARRAY_SIZE_EXT value"); + } + + return TEST_PASS; +}
\ No newline at end of file diff --git a/test_conformance/images/kernel_read_write/test_common.cpp b/test_conformance/images/kernel_read_write/test_common.cpp index e76710b5..a22db195 100644 --- a/test_conformance/images/kernel_read_write/test_common.cpp +++ b/test_conformance/images/kernel_read_write/test_common.cpp @@ -16,6 +16,7 @@ #include "test_common.h" +#include <algorithm> cl_sampler create_sampler(cl_context context, image_sampler_data *sdata, bool test_mipmaps, cl_int *error) { cl_sampler sampler = nullptr; @@ -33,122 +34,210 @@ cl_sampler create_sampler(cl_context context, image_sampler_data *sdata, bool te return sampler; } -void InitFloatCoordsCommon(image_descriptor *imageInfo, - image_sampler_data *imageSampler, float *xOffsets, - float *yOffsets, float *zOffsets, float xfract, - float yfract, float zfract, int normalized_coords, - MTdata d, int lod) +bool get_image_dimensions(image_descriptor *imageInfo, size_t &width, + size_t &height, size_t &depth) +{ + width = imageInfo->width; + height = 1; + depth = 1; + switch (imageInfo->type) + { + case CL_MEM_OBJECT_IMAGE1D: break; + case CL_MEM_OBJECT_IMAGE1D_ARRAY: height = imageInfo->arraySize; break; + case CL_MEM_OBJECT_IMAGE2D: height = imageInfo->height; break; + case CL_MEM_OBJECT_IMAGE2D_ARRAY: + height = imageInfo->height; + depth = imageInfo->arraySize; + break; + case CL_MEM_OBJECT_IMAGE3D: + height = imageInfo->height; + depth = imageInfo->depth; + break; + default: + log_error("ERROR: Test does not support image type"); + return TEST_FAIL; + } + return 0; +} + +static bool InitFloatCoordsCommon(image_descriptor *imageInfo, + image_sampler_data *imageSampler, + float *xOffsets, float *yOffsets, + float *zOffsets, float xfract, float yfract, + float zfract, int normalized_coords, MTdata d, + int lod) { size_t i = 0; - if (gDisableOffsets) + size_t width_loop, height_loop, depth_loop; + bool error = + get_image_dimensions(imageInfo, width_loop, height_loop, depth_loop); + if (!error) { - for 
(size_t z = 0; z < imageInfo->depth; z++) + if (gDisableOffsets) { - for (size_t y = 0; y < imageInfo->height; y++) + for (size_t z = 0; z < depth_loop; z++) { - for (size_t x = 0; x < imageInfo->width; x++, i++) + for (size_t y = 0; y < height_loop; y++) { - xOffsets[i] = (float)(xfract + (double)x); - yOffsets[i] = (float)(yfract + (double)y); - zOffsets[i] = (float)(zfract + (double)z); + for (size_t x = 0; x < width_loop; x++, i++) + { + xOffsets[i] = (float)(xfract + (double)x); + yOffsets[i] = (float)(yfract + (double)y); + zOffsets[i] = (float)(zfract + (double)z); + } } } } - } - else - { - for (size_t z = 0; z < imageInfo->depth; z++) + else { - for (size_t y = 0; y < imageInfo->height; y++) + for (size_t z = 0; z < depth_loop; z++) { - for (size_t x = 0; x < imageInfo->width; x++, i++) + for (size_t y = 0; y < height_loop; y++) { - xOffsets[i] = - (float)(xfract - + (double)((int)x - + random_in_range(-10, 10, d))); - yOffsets[i] = - (float)(yfract - + (double)((int)y - + random_in_range(-10, 10, d))); - zOffsets[i] = - (float)(zfract - + (double)((int)z - + random_in_range(-10, 10, d))); + for (size_t x = 0; x < width_loop; x++, i++) + { + xOffsets[i] = + (float)(xfract + + (double)((int)x + + random_in_range(-10, 10, d))); + yOffsets[i] = + (float)(yfract + + (double)((int)y + + random_in_range(-10, 10, d))); + zOffsets[i] = + (float)(zfract + + (double)((int)z + + random_in_range(-10, 10, d))); + } } } } - } - if (imageSampler->addressing_mode == CL_ADDRESS_NONE) - { - i = 0; - for (size_t z = 0; z < imageInfo->depth; z++) + if (imageSampler->addressing_mode == CL_ADDRESS_NONE) { - for (size_t y = 0; y < imageInfo->height; y++) + i = 0; + for (size_t z = 0; z < depth_loop; z++) { - for (size_t x = 0; x < imageInfo->width; x++, i++) + for (size_t y = 0; y < height_loop; y++) { - xOffsets[i] = (float)CLAMP((double)xOffsets[i], 0.0, - (double)imageInfo->width - 1.0); - yOffsets[i] = (float)CLAMP((double)yOffsets[i], 0.0, - (double)imageInfo->height - 
1.0); - zOffsets[i] = (float)CLAMP((double)zOffsets[i], 0.0, - (double)imageInfo->depth - 1.0); + for (size_t x = 0; x < width_loop; x++, i++) + { + xOffsets[i] = (float)CLAMP((double)xOffsets[i], 0.0, + (double)width_loop - 1.0); + yOffsets[i] = (float)CLAMP((double)yOffsets[i], 0.0, + (double)height_loop - 1.0); + zOffsets[i] = (float)CLAMP((double)zOffsets[i], 0.0, + (double)depth_loop - 1.0); + } } } } - } - if (normalized_coords || gTestMipmaps) - { - i = 0; - if (lod == 0) + if (normalized_coords || gTestMipmaps) { - for (size_t z = 0; z < imageInfo->depth; z++) + i = 0; + if (lod == 0) { - for (size_t y = 0; y < imageInfo->height; y++) + for (size_t z = 0; z < depth_loop; z++) { - for (size_t x = 0; x < imageInfo->width; x++, i++) + for (size_t y = 0; y < height_loop; y++) { - xOffsets[i] = (float)((double)xOffsets[i] - / (double)imageInfo->width); - yOffsets[i] = (float)((double)yOffsets[i] - / (double)imageInfo->height); - zOffsets[i] = (float)((double)zOffsets[i] - / (double)imageInfo->depth); + for (size_t x = 0; x < width_loop; x++, i++) + { + xOffsets[i] = (float)((double)xOffsets[i] + / (double)width_loop); + if (imageInfo->type != CL_MEM_OBJECT_IMAGE1D_ARRAY) + { + yOffsets[i] = (float)((double)yOffsets[i] + / (double)height_loop); + } + if (imageInfo->type != CL_MEM_OBJECT_IMAGE2D_ARRAY) + { + zOffsets[i] = (float)((double)zOffsets[i] + / (double)depth_loop); + } + } } } } - } - else if (gTestMipmaps) - { - size_t width_lod, height_lod, depth_lod; - - width_lod = - (imageInfo->width >> lod) ? (imageInfo->width >> lod) : 1; - height_lod = - (imageInfo->height >> lod) ? (imageInfo->height >> lod) : 1; - depth_lod = - (imageInfo->depth >> lod) ? (imageInfo->depth >> lod) : 1; - - for (size_t z = 0; z < depth_lod; z++) + else if (gTestMipmaps) { - for (size_t y = 0; y < height_lod; y++) + size_t width_lod = + (width_loop >> lod) ? 
(width_loop >> lod) : 1; + size_t height_lod = height_loop; + size_t depth_lod = depth_loop; + if (imageInfo->type != CL_MEM_OBJECT_IMAGE1D_ARRAY) + { + height_lod = + (height_loop >> lod) ? (height_loop >> lod) : 1; + } + if (imageInfo->type != CL_MEM_OBJECT_IMAGE2D_ARRAY) { - for (size_t x = 0; x < width_lod; x++, i++) + depth_lod = (depth_loop >> lod) ? (depth_loop >> lod) : 1; + } + + for (size_t z = 0; z < depth_lod; z++) + { + for (size_t y = 0; y < height_lod; y++) { - xOffsets[i] = - (float)((double)xOffsets[i] / (double)width_lod); - yOffsets[i] = - (float)((double)yOffsets[i] / (double)height_lod); - zOffsets[i] = - (float)((double)zOffsets[i] / (double)depth_lod); + for (size_t x = 0; x < width_lod; x++, i++) + { + xOffsets[i] = (float)((double)xOffsets[i] + / (double)width_lod); + if (imageInfo->type != CL_MEM_OBJECT_IMAGE1D_ARRAY) + { + yOffsets[i] = (float)((double)yOffsets[i] + / (double)height_lod); + } + if (imageInfo->type != CL_MEM_OBJECT_IMAGE2D_ARRAY) + { + zOffsets[i] = (float)((double)zOffsets[i] + / (double)depth_lod); + } + } } } } } } + return error; +} + +cl_mem create_image_of_type(cl_context context, cl_mem_flags mem_flags, + image_descriptor *imageInfo, size_t row_pitch, + size_t slice_pitch, void *host_ptr, cl_int *error) +{ + cl_mem image; + switch (imageInfo->type) + { + case CL_MEM_OBJECT_IMAGE3D: + image = create_image_3d(context, mem_flags, imageInfo->format, + imageInfo->width, imageInfo->height, + imageInfo->depth, row_pitch, slice_pitch, + host_ptr, error); + break; + default: + log_error("Implementation is incomplete, only 3D images are " + "supported so far"); + return nullptr; + } + return image; +} + +static size_t get_image_num_pixels(image_descriptor *imageInfo, size_t width, + size_t height, size_t depth, + size_t array_size) +{ + size_t image_size; + switch (imageInfo->type) + { + case CL_MEM_OBJECT_IMAGE3D: image_size = width * height * depth; break; + default: + log_error("Implementation is incomplete, only 3D images 
are " + "supported so far"); + return 0; + } + return image_size; } int test_read_image(cl_context context, cl_command_queue queue, @@ -160,6 +249,17 @@ int test_read_image(cl_context context, cl_command_queue queue, size_t threads[3]; static int initHalf = 0; + size_t image_size = + get_image_num_pixels(imageInfo, imageInfo->width, imageInfo->height, + imageInfo->depth, imageInfo->arraySize); + test_assert_error(0 != image_size, "Invalid image size"); + size_t width_size, height_size, depth_size; + if (get_image_dimensions(imageInfo, width_size, height_size, depth_size)) + { + log_error("ERROR: invalid image dimensions"); + return CL_INVALID_VALUE; + } + cl_mem_flags image_read_write_flags = CL_MEM_READ_ONLY; clMemWrapper xOffsets, yOffsets, zOffsets, results; @@ -168,14 +268,11 @@ int test_read_image(cl_context context, cl_command_queue queue, // Create offset data BufferOwningPtr<cl_float> xOffsetValues( - malloc(sizeof(cl_float) * imageInfo->width * imageInfo->height - * imageInfo->depth)); + malloc(sizeof(cl_float) * image_size)); BufferOwningPtr<cl_float> yOffsetValues( - malloc(sizeof(cl_float) * imageInfo->width * imageInfo->height - * imageInfo->depth)); + malloc(sizeof(cl_float) * image_size)); BufferOwningPtr<cl_float> zOffsetValues( - malloc(sizeof(cl_float) * imageInfo->width * imageInfo->height - * imageInfo->depth)); + malloc(sizeof(cl_float) * image_size)); if (imageInfo->format->image_channel_data_type == CL_HALF_FLOAT) if (DetectFloatToHalfRoundingMode(queue)) return 1; @@ -206,26 +303,27 @@ int test_read_image(cl_context context, cl_command_queue queue, { generate_random_image_data(imageInfo, maxImageUseHostPtrBackingStore, d); - unprotImage = create_image_3d( + unprotImage = create_image_of_type( context, image_read_write_flags | CL_MEM_USE_HOST_PTR, - imageInfo->format, imageInfo->width, imageInfo->height, - imageInfo->depth, (gEnablePitch ? imageInfo->rowPitch : 0), + imageInfo, (gEnablePitch ? imageInfo->rowPitch : 0), (gEnablePitch ? 
imageInfo->slicePitch : 0), maxImageUseHostPtrBackingStore, &error); } else { - error = protImage.Create(context, image_read_write_flags, - imageInfo->format, imageInfo->width, - imageInfo->height, imageInfo->depth); + error = protImage.Create(context, imageInfo->type, + image_read_write_flags, imageInfo->format, + imageInfo->width, imageInfo->height, + imageInfo->depth, imageInfo->arraySize); } if (error != CL_SUCCESS) { - log_error("ERROR: Unable to create 3D image of size %d x %d x %d " + log_error("ERROR: Unable to create image of size %d x %d x %d x %d " "(pitch %d, %d ) (%s)", (int)imageInfo->width, (int)imageInfo->height, - (int)imageInfo->depth, (int)imageInfo->rowPitch, - (int)imageInfo->slicePitch, IGetErrorString(error)); + (int)imageInfo->depth, (int)imageInfo->arraySize, + (int)imageInfo->rowPitch, (int)imageInfo->slicePitch, + IGetErrorString(error)); return error; } if (gTestMaxImages) @@ -237,18 +335,18 @@ int test_read_image(cl_context context, cl_command_queue queue, { // Don't use clEnqueueWriteImage; just use copy host ptr to get the data // in - unprotImage = create_image_3d( - context, image_read_write_flags | CL_MEM_COPY_HOST_PTR, - imageInfo->format, imageInfo->width, imageInfo->height, - imageInfo->depth, (gEnablePitch ? imageInfo->rowPitch : 0), + unprotImage = create_image_of_type( + context, image_read_write_flags | CL_MEM_COPY_HOST_PTR, imageInfo, + (gEnablePitch ? imageInfo->rowPitch : 0), (gEnablePitch ? 
imageInfo->slicePitch : 0), imageValues, &error); if (error != CL_SUCCESS) { - log_error("ERROR: Unable to create 3D image of size %d x %d x %d " + log_error("ERROR: Unable to create image of size %d x %d x %d x %d " "(pitch %d, %d ) (%s)", (int)imageInfo->width, (int)imageInfo->height, - (int)imageInfo->depth, (int)imageInfo->rowPitch, - (int)imageInfo->slicePitch, IGetErrorString(error)); + (int)imageInfo->depth, (int)imageInfo->arraySize, + (int)imageInfo->rowPitch, (int)imageInfo->slicePitch, + IGetErrorString(error)); return error; } image = unprotImage; @@ -260,19 +358,19 @@ int test_read_image(cl_context context, cl_command_queue queue, // specified, so we just do the same thing either way if (!gTestMipmaps) { - unprotImage = create_image_3d( - context, image_read_write_flags | gMemFlagsToUse, - imageInfo->format, imageInfo->width, imageInfo->height, - imageInfo->depth, (gEnablePitch ? imageInfo->rowPitch : 0), + unprotImage = create_image_of_type( + context, image_read_write_flags | gMemFlagsToUse, imageInfo, + (gEnablePitch ? imageInfo->rowPitch : 0), (gEnablePitch ? 
imageInfo->slicePitch : 0), imageValues, &error); if (error != CL_SUCCESS) { - log_error("ERROR: Unable to create 3D image of size %d x %d x " - "%d (pitch %d, %d ) (%s)", + log_error("ERROR: Unable to create image of size %d x %d x " + "%d x %d (pitch %d, %d ) (%s)", (int)imageInfo->width, (int)imageInfo->height, - (int)imageInfo->depth, (int)imageInfo->rowPitch, - (int)imageInfo->slicePitch, IGetErrorString(error)); + (int)imageInfo->depth, (int)imageInfo->arraySize, + (int)imageInfo->rowPitch, (int)imageInfo->slicePitch, + IGetErrorString(error)); return error; } image = unprotImage; @@ -280,10 +378,11 @@ int test_read_image(cl_context context, cl_command_queue queue, else { cl_image_desc image_desc = { 0 }; - image_desc.image_type = CL_MEM_OBJECT_IMAGE3D; + image_desc.image_type = imageInfo->type; image_desc.image_width = imageInfo->width; image_desc.image_height = imageInfo->height; image_desc.image_depth = imageInfo->depth; + image_desc.image_array_size = imageInfo->arraySize; image_desc.num_mip_levels = imageInfo->num_mip_levels; @@ -292,23 +391,24 @@ int test_read_image(cl_context context, cl_command_queue queue, imageInfo->format, &image_desc, NULL, &error); if (error != CL_SUCCESS) { - log_error("ERROR: Unable to create %d level mipmapped 3D image " - "of size %d x %d x %d (pitch %d, %d ) (%s)", + log_error("ERROR: Unable to create %d level mipmapped image " + "of size %d x %d x %d x %d (pitch %d, %d ) (%s)", (int)imageInfo->num_mip_levels, (int)imageInfo->width, (int)imageInfo->height, (int)imageInfo->depth, - (int)imageInfo->rowPitch, (int)imageInfo->slicePitch, - IGetErrorString(error)); + (int)imageInfo->arraySize, (int)imageInfo->rowPitch, + (int)imageInfo->slicePitch, IGetErrorString(error)); return error; } image = unprotImage; } } + test_assert_error(nullptr != image, "Image creation failed"); + if (gMemFlagsToUse != CL_MEM_COPY_HOST_PTR) { size_t origin[4] = { 0, 0, 0, 0 }; - size_t region[3] = { imageInfo->width, imageInfo->height, - 
imageInfo->depth }; + size_t region[3] = { width_size, height_size, depth_size }; if (gDebugTrace) log_info(" - Writing image...\n"); @@ -323,10 +423,10 @@ int test_read_image(cl_context context, cl_command_queue queue, if (error != CL_SUCCESS) { - log_error("ERROR: Unable to write to 3D image of size %d x %d " - "x %d \n", + log_error("ERROR: Unable to write to image of size %d x %d " + "x %d x %d\n", (int)imageInfo->width, (int)imageInfo->height, - (int)imageInfo->depth); + (int)imageInfo->depth, (int)imageInfo->arraySize); return error; } } @@ -338,17 +438,15 @@ int test_read_image(cl_context context, cl_command_queue queue, { origin[3] = i; error = clEnqueueWriteImage( - queue, image, CL_TRUE, origin, region, - /*gEnablePitch ? imageInfo->rowPitch :*/ 0, - /*gEnablePitch ? imageInfo->slicePitch :*/ 0, + queue, image, CL_TRUE, origin, region, 0, 0, ((char *)imageValues + nextLevelOffset), 0, NULL, NULL); if (error != CL_SUCCESS) { - log_error("ERROR: Unable to write to %d level mipmapped 3D " - "image of size %d x %d x %d\n", + log_error("ERROR: Unable to write to %d level mipmapped " + "image of size %d x %d x %d x %d\n", (int)imageInfo->num_mip_levels, (int)imageInfo->width, (int)imageInfo->height, - (int)imageInfo->depth); + (int)imageInfo->arraySize, (int)imageInfo->depth); return error; } nextLevelOffset += region[0] * region[1] * region[2] @@ -361,26 +459,21 @@ int test_read_image(cl_context context, cl_command_queue queue, } } - xOffsets = clCreateBuffer(context, CL_MEM_COPY_HOST_PTR, - sizeof(cl_float) * imageInfo->width - * imageInfo->height * imageInfo->depth, - xOffsetValues, &error); + xOffsets = + clCreateBuffer(context, CL_MEM_COPY_HOST_PTR, + sizeof(cl_float) * image_size, xOffsetValues, &error); test_error(error, "Unable to create x offset buffer"); - yOffsets = clCreateBuffer(context, CL_MEM_COPY_HOST_PTR, - sizeof(cl_float) * imageInfo->width - * imageInfo->height * imageInfo->depth, - yOffsetValues, &error); + yOffsets = + 
clCreateBuffer(context, CL_MEM_COPY_HOST_PTR, + sizeof(cl_float) * image_size, yOffsetValues, &error); test_error(error, "Unable to create y offset buffer"); - zOffsets = clCreateBuffer(context, CL_MEM_COPY_HOST_PTR, - sizeof(cl_float) * imageInfo->width - * imageInfo->height * imageInfo->depth, - zOffsetValues, &error); + zOffsets = + clCreateBuffer(context, CL_MEM_COPY_HOST_PTR, + sizeof(cl_float) * image_size, zOffsetValues, &error); test_error(error, "Unable to create y offset buffer"); - results = - clCreateBuffer(context, CL_MEM_READ_WRITE, - get_explicit_type_size(outputType) * 4 * imageInfo->width - * imageInfo->height * imageInfo->depth, - NULL, &error); + results = clCreateBuffer( + context, CL_MEM_READ_WRITE, + get_explicit_type_size(outputType) * 4 * image_size, NULL, &error); test_error(error, "Unable to create result buffer"); // Create sampler to use @@ -443,16 +536,19 @@ int test_read_image(cl_context context, cl_command_queue queue, } int nextLevelOffset = 0; - size_t width_lod = imageInfo->width, height_lod = imageInfo->height, - depth_lod = imageInfo->depth; + size_t width_lod = width_size, height_lod = height_size, + depth_lod = depth_size; // Loop over all mipmap levels, if we are testing mipmapped images. 
for (int lod = 0; (gTestMipmaps && lod < imageInfo->num_mip_levels) || (!gTestMipmaps && lod < 1); lod++) { - size_t resultValuesSize = width_lod * height_lod * depth_lod - * get_explicit_type_size(outputType) * 4; + size_t image_lod_size = get_image_num_pixels( + imageInfo, width_lod, height_lod, depth_lod, imageInfo->arraySize); + test_assert_error(0 != image_lod_size, "Invalid image size"); + size_t resultValuesSize = + image_lod_size * get_explicit_type_size(outputType) * 4; BufferOwningPtr<char> resultValues(malloc(resultValuesSize)); float lod_float = (float)lod; if (gTestMipmaps) @@ -468,30 +564,25 @@ int test_read_image(cl_context context, cl_command_queue queue, float offset = float_offsets[q % float_offset_count]; // Init the coordinates - InitFloatCoordsCommon(imageInfo, imageSampler, xOffsetValues, - yOffsetValues, zOffsetValues, - q >= float_offset_count ? -offset : offset, - q >= float_offset_count ? offset : -offset, - q >= float_offset_count ? -offset : offset, - imageSampler->normalized_coords, d, lod); - - error = - clEnqueueWriteBuffer(queue, xOffsets, CL_TRUE, 0, - sizeof(cl_float) * imageInfo->height - * imageInfo->width * imageInfo->depth, - xOffsetValues, 0, NULL, NULL); + error = InitFloatCoordsCommon( + imageInfo, imageSampler, xOffsetValues, yOffsetValues, + zOffsetValues, q >= float_offset_count ? -offset : offset, + q >= float_offset_count ? offset : -offset, + q >= float_offset_count ? 
-offset : offset, + imageSampler->normalized_coords, d, lod); + test_error(error, "Unable to initialise coordinates"); + + error = clEnqueueWriteBuffer(queue, xOffsets, CL_TRUE, 0, + sizeof(cl_float) * image_size, + xOffsetValues, 0, NULL, NULL); test_error(error, "Unable to write x offsets"); - error = - clEnqueueWriteBuffer(queue, yOffsets, CL_TRUE, 0, - sizeof(cl_float) * imageInfo->height - * imageInfo->width * imageInfo->depth, - yOffsetValues, 0, NULL, NULL); + error = clEnqueueWriteBuffer(queue, yOffsets, CL_TRUE, 0, + sizeof(cl_float) * image_size, + yOffsetValues, 0, NULL, NULL); test_error(error, "Unable to write y offsets"); - error = - clEnqueueWriteBuffer(queue, zOffsets, CL_TRUE, 0, - sizeof(cl_float) * imageInfo->height - * imageInfo->width * imageInfo->depth, - zOffsetValues, 0, NULL, NULL); + error = clEnqueueWriteBuffer(queue, zOffsets, CL_TRUE, 0, + sizeof(cl_float) * image_size, + zOffsetValues, 0, NULL, NULL); test_error(error, "Unable to write z offsets"); @@ -510,11 +601,10 @@ int test_read_image(cl_context context, cl_command_queue queue, test_error(error, "Unable to run kernel"); // Get results - error = clEnqueueReadBuffer(queue, results, CL_TRUE, 0, - width_lod * height_lod * depth_lod - * get_explicit_type_size(outputType) - * 4, - resultValues, 0, NULL, NULL); + error = clEnqueueReadBuffer( + queue, results, CL_TRUE, 0, + image_lod_size * get_explicit_type_size(outputType) * 4, + resultValues, 0, NULL, NULL); test_error(error, "Unable to read results from kernel"); if (gDebugTrace) log_info(" results read\n"); @@ -556,7 +646,7 @@ int test_read_image(cl_context context, cl_command_queue queue, // Apple requires its CPU implementation to do // correctly rounded address arithmetic in all // modes - || gDeviceType != CL_DEVICE_TYPE_GPU + || !(gDeviceType & CL_DEVICE_TYPE_GPU) #endif ) offset = 0.0f; // Loop only once @@ -874,7 +964,7 @@ int test_read_image(cl_context context, cl_command_queue queue, // Apple requires its CPU implementation 
to do // correctly rounded address arithmetic in all // modes - || gDeviceType != CL_DEVICE_TYPE_GPU + || !(gDeviceType & CL_DEVICE_TYPE_GPU) #endif ) offset = 0.0f; // Loop only once @@ -934,13 +1024,13 @@ int test_read_image(cl_context context, cl_command_queue queue, { err4 = 0.0f; } - float maxErr1 = MAX( + float maxErr1 = std::max( maxErr * maxPixel.p[0], FLT_MIN); - float maxErr2 = MAX( + float maxErr2 = std::max( maxErr * maxPixel.p[1], FLT_MIN); - float maxErr3 = MAX( + float maxErr3 = std::max( maxErr * maxPixel.p[2], FLT_MIN); - float maxErr4 = MAX( + float maxErr4 = std::max( maxErr * maxPixel.p[3], FLT_MIN); if (!(err1 <= maxErr1) @@ -1039,17 +1129,17 @@ int test_read_image(cl_context context, cl_command_queue queue, float err4 = ABS_ERROR(resultPtr[3], expected[3]); float maxErr1 = - MAX(maxErr * maxPixel.p[0], - FLT_MIN); + std::max(maxErr * maxPixel.p[0], + FLT_MIN); float maxErr2 = - MAX(maxErr * maxPixel.p[1], - FLT_MIN); + std::max(maxErr * maxPixel.p[1], + FLT_MIN); float maxErr3 = - MAX(maxErr * maxPixel.p[2], - FLT_MIN); + std::max(maxErr * maxPixel.p[2], + FLT_MIN); float maxErr4 = - MAX(maxErr * maxPixel.p[3], - FLT_MIN); + std::max(maxErr * maxPixel.p[3], + FLT_MIN); if (!(err1 <= maxErr1) @@ -1213,7 +1303,8 @@ int test_read_image(cl_context context, cl_command_queue queue, // offsets (0.0, 0.0) E.g., test one // pixel. if (!imageSampler->normalized_coords - || gDeviceType != CL_DEVICE_TYPE_GPU + || !(gDeviceType + & CL_DEVICE_TYPE_GPU) || NORM_OFFSET == 0) { norm_offset_x = 0.0f; @@ -1395,7 +1486,8 @@ int test_read_image(cl_context context, cl_command_queue queue, // offsets (0.0, 0.0) E.g., test one // pixel. 
if (!imageSampler->normalized_coords - || gDeviceType != CL_DEVICE_TYPE_GPU + || !(gDeviceType + & CL_DEVICE_TYPE_GPU) || NORM_OFFSET == 0) { norm_offset_x = 0.0f; @@ -1537,10 +1629,51 @@ int test_read_image(cl_context context, cl_command_queue queue, nextLevelOffset += width_lod * height_lod * depth_lod * get_pixel_size(imageInfo->format); width_lod = (width_lod >> 1) ? (width_lod >> 1) : 1; - height_lod = (height_lod >> 1) ? (height_lod >> 1) : 1; - depth_lod = (depth_lod >> 1) ? (depth_lod >> 1) : 1; + if (imageInfo->type != CL_MEM_OBJECT_IMAGE1D_ARRAY) + { + height_lod = (height_lod >> 1) ? (height_lod >> 1) : 1; + } + if (imageInfo->type != CL_MEM_OBJECT_IMAGE2D_ARRAY) + { + depth_lod = (depth_lod >> 1) ? (depth_lod >> 1) : 1; + } } } return numTries != MAX_TRIES || numClamped != MAX_CLAMPED; -}
\ No newline at end of file +} + +void filter_undefined_bits(image_descriptor *imageInfo, char *resultPtr) +{ + // mask off the top bit (bit 15) if the image format is (CL_UNORM_SHORT_555, + // CL_RGB). (Note: OpenCL says: the top bit is undefined meaning it can be + // either 0 or 1.) + if (imageInfo->format->image_channel_data_type == CL_UNORM_SHORT_555) + { + cl_ushort *temp = (cl_ushort *)resultPtr; + temp[0] &= 0x7fff; + } +} + +int filter_rounding_errors(int forceCorrectlyRoundedWrites, + image_descriptor *imageInfo, float *errors) +{ + // We are allowed 0.6 absolute error vs. infinitely precise for some + // normalized formats + if (0 == forceCorrectlyRoundedWrites + && (imageInfo->format->image_channel_data_type == CL_UNORM_INT8 + || imageInfo->format->image_channel_data_type == CL_UNORM_INT_101010 + || imageInfo->format->image_channel_data_type == CL_UNORM_INT16 + || imageInfo->format->image_channel_data_type == CL_SNORM_INT8 + || imageInfo->format->image_channel_data_type == CL_SNORM_INT16 + || imageInfo->format->image_channel_data_type == CL_UNORM_SHORT_555 + || imageInfo->format->image_channel_data_type + == CL_UNORM_SHORT_565)) + { + if (!(fabsf(errors[0]) > 0.6f) && !(fabsf(errors[1]) > 0.6f) + && !(fabsf(errors[2]) > 0.6f) && !(fabsf(errors[3]) > 0.6f)) + return 0; + } + + return 1; +} diff --git a/test_conformance/images/kernel_read_write/test_common.h b/test_conformance/images/kernel_read_write/test_common.h index e7ecbe0b..fc95bee2 100644 --- a/test_conformance/images/kernel_read_write/test_common.h +++ b/test_conformance/images/kernel_read_write/test_common.h @@ -42,12 +42,8 @@ extern int test_read_image(cl_context context, cl_command_queue queue, bool useFloatCoords, ExplicitType outputType, MTdata d); -extern void InitFloatCoordsCommon(image_descriptor *imageInfo, - image_sampler_data *imageSampler, - float *xOffsets, float *yOffsets, - float *zOffsets, float xfract, float yfract, - float zfract, int normalized_coords, MTdata d, - int lod); 
+extern bool get_image_dimensions(image_descriptor *imageInfo, size_t &width, + size_t &height, size_t &depth); template <class T> int determine_validation_error_offset( @@ -63,8 +59,12 @@ int determine_validation_error_offset( bool clampingErr = false, clamped = false, otherClampingBug = false; int clampedX, clampedY, clampedZ; - size_t imageWidth = imageInfo->width, imageHeight = imageInfo->height, - imageDepth = imageInfo->depth; + size_t imageWidth, imageHeight, imageDepth; + if (get_image_dimensions(imageInfo, imageWidth, imageHeight, imageDepth)) + { + log_error("ERROR: invalid image dimensions"); + return TEST_FAIL; + } clamped = get_integer_coords_offset(x, y, z, xAddressOffset, yAddressOffset, zAddressOffset, imageWidth, imageHeight, @@ -147,85 +147,75 @@ int determine_validation_error_offset( } if (!clampingErr) { - /* if( clamped && ( (int)x + (int)xOffsetValues[ j ] < 0 || - (int)y + (int)yOffsetValues[ j ] < 0 ) ) - { - log_error( "NEGATIVE COORDINATE ERROR\n" ); - return -1; - } - */ - if (true) // gExtraValidateInfo ) + if (printAsFloat) { - if (printAsFloat) - { - log_error("Sample %ld: coord {%f(%a),%f(%a),%f(%a)} did not " - "validate!\n\tExpected (%g,%g,%g,%g),\n\t got " - "(%g,%g,%g,%g), error of %g\n", - j, x, x, y, y, z, z, (float)expected[0], - (float)expected[1], (float)expected[2], - (float)expected[3], (float)resultPtr[0], - (float)resultPtr[1], (float)resultPtr[2], - (float)resultPtr[3], error); - } - else - { - log_error("Sample %ld: coord {%f(%a),%f(%a),%f(%a)} did not " - "validate!\n\tExpected (%x,%x,%x,%x),\n\t got " - "(%x,%x,%x,%x)\n", - j, x, x, y, y, z, z, (int)expected[0], - (int)expected[1], (int)expected[2], (int)expected[3], - (int)resultPtr[0], (int)resultPtr[1], - (int)resultPtr[2], (int)resultPtr[3]); - } - log_error( - "Integer coords resolve to %d,%d,%d with img size %d,%d,%d\n", - clampedX, clampedY, clampedZ, (int)imageWidth, (int)imageHeight, - (int)imageDepth); + log_error("Sample %ld: coord {%f(%a),%f(%a),%f(%a)} 
did not " + "validate!\n\tExpected (%g,%g,%g,%g),\n\t got " + "(%g,%g,%g,%g), error of %g\n", + j, x, x, y, y, z, z, (float)expected[0], + (float)expected[1], (float)expected[2], + (float)expected[3], (float)resultPtr[0], + (float)resultPtr[1], (float)resultPtr[2], + (float)resultPtr[3], error); + } + else + { + log_error("Sample %ld: coord {%f(%a),%f(%a),%f(%a)} did not " + "validate!\n\tExpected (%x,%x,%x,%x),\n\t got " + "(%x,%x,%x,%x)\n", + j, x, x, y, y, z, z, (int)expected[0], (int)expected[1], + (int)expected[2], (int)expected[3], (int)resultPtr[0], + (int)resultPtr[1], (int)resultPtr[2], (int)resultPtr[3]); + } + log_error( + "Integer coords resolve to %d,%d,%d with img size %d,%d,%d\n", + clampedX, clampedY, clampedZ, (int)imageWidth, (int)imageHeight, + (int)imageDepth); - if (printAsFloat && gExtraValidateInfo) + if (printAsFloat && gExtraValidateInfo) + { + log_error("\nNearby values:\n"); + for (int zOff = -1; zOff <= 1; zOff++) { - log_error("\nNearby values:\n"); - for (int zOff = -1; zOff <= 1; zOff++) + for (int yOff = -1; yOff <= 1; yOff++) { - for (int yOff = -1; yOff <= 1; yOff++) - { - float top[4], real[4], bot[4]; - read_image_pixel_float(imagePtr, imageInfo, - clampedX - 1, clampedY + yOff, - clampedZ + zOff, top); - read_image_pixel_float(imagePtr, imageInfo, clampedX, - clampedY + yOff, clampedZ + zOff, - real); - read_image_pixel_float(imagePtr, imageInfo, - clampedX + 1, clampedY + yOff, - clampedZ + zOff, bot); - log_error("\t(%g,%g,%g,%g)", top[0], top[1], top[2], - top[3]); - log_error(" (%g,%g,%g,%g)", real[0], real[1], real[2], - real[3]); - log_error(" (%g,%g,%g,%g)\n", bot[0], bot[1], bot[2], - bot[3]); - } + float top[4], real[4], bot[4]; + read_image_pixel_float(imagePtr, imageInfo, clampedX - 1, + clampedY + yOff, clampedZ + zOff, + top); + read_image_pixel_float(imagePtr, imageInfo, clampedX, + clampedY + yOff, clampedZ + zOff, + real); + read_image_pixel_float(imagePtr, imageInfo, clampedX + 1, + clampedY + yOff, clampedZ + 
zOff, + bot); + log_error("\t(%g,%g,%g,%g)", top[0], top[1], top[2], + top[3]); + log_error(" (%g,%g,%g,%g)", real[0], real[1], real[2], + real[3]); + log_error(" (%g,%g,%g,%g)\n", bot[0], bot[1], bot[2], + bot[3]); } } - // } - // else - // log_error( "\n" ); - if (imageSampler->filter_mode != CL_FILTER_LINEAR) - { - if (found) - log_error( - "\tValue really found in image at %d,%d,%d (%s)\n", - actualX, actualY, actualZ, - (found > 1) ? "NOT unique!!" : "unique"); - else - log_error("\tValue not actually found in image\n"); - } - log_error("\n"); } + if (imageSampler->filter_mode != CL_FILTER_LINEAR) + { + if (found) + log_error("\tValue really found in image at %d,%d,%d (%s)\n", + actualX, actualY, actualZ, + (found > 1) ? "NOT unique!!" : "unique"); + else + log_error("\tValue not actually found in image\n"); + } + log_error("\n"); numClamped = -1; // We force the clamped counter to never work if ((--numTries) == 0) return -1; } return 0; } + + +extern int filter_rounding_errors(int forceCorrectlyRoundedWrites, + image_descriptor *imageInfo, float *errors); +extern void filter_undefined_bits(image_descriptor *imageInfo, char *resultPtr); diff --git a/test_conformance/images/kernel_read_write/test_iterations.cpp b/test_conformance/images/kernel_read_write/test_iterations.cpp index 03ca9595..05aed02c 100644 --- a/test_conformance/images/kernel_read_write/test_iterations.cpp +++ b/test_conformance/images/kernel_read_write/test_iterations.cpp @@ -16,6 +16,8 @@ #include "test_common.h" #include <float.h> +#include <algorithm> + #if defined( __APPLE__ ) #include <signal.h> #include <sys/signal.h> @@ -37,24 +39,28 @@ static size_t reduceImageSizeRange(size_t maxDimSize) { } const char *read2DKernelSourcePattern = -"__kernel void sample_kernel( read_only %s input,%s __global float *xOffsets, __global float *yOffsets, __global %s%s *results %s)\n" -"{\n" -"%s" -" int tidX = get_global_id(0), tidY = get_global_id(1);\n" -"%s" -"%s" -" results[offset] = read_image%s( 
input, imageSampler, coords %s);\n" -"}"; + "%s\n" + "__kernel void sample_kernel( read_only %s input,%s __global float " + "*xOffsets, __global float *yOffsets, __global %s%s *results %s)\n" + "{\n" + "%s" + " int tidX = get_global_id(0), tidY = get_global_id(1);\n" + "%s" + "%s" + " results[offset] = read_image%s( input, imageSampler, coords %s);\n" + "}"; const char *read_write2DKernelSourcePattern = -"__kernel void sample_kernel( read_write %s input,%s __global float *xOffsets, __global float *yOffsets, __global %s%s *results %s)\n" -"{\n" -"%s" -" int tidX = get_global_id(0), tidY = get_global_id(1);\n" -"%s" -"%s" -" results[offset] = read_image%s( input, coords %s);\n" -"}"; + "%s\n" + "__kernel void sample_kernel( read_write %s input,%s __global float " + "*xOffsets, __global float *yOffsets, __global %s%s *results %s)\n" + "{\n" + "%s" + " int tidX = get_global_id(0), tidY = get_global_id(1);\n" + "%s" + "%s" + " results[offset] = read_image%s( input, coords %s);\n" + "}"; const char *intCoordKernelSource = " int2 coords = (int2)( xOffsets[offset], yOffsets[offset]);\n"; @@ -413,12 +419,15 @@ int validate_image_2D_depth_results(void *imageValues, void *resultValues, doubl int checkOnlyOnePixel = 0; int found_pixel = 0; float offset = NORM_OFFSET; - if (!imageSampler->normalized_coords || imageSampler->filter_mode != CL_FILTER_NEAREST || NORM_OFFSET == 0 + if (!imageSampler->normalized_coords + || imageSampler->filter_mode != CL_FILTER_NEAREST + || NORM_OFFSET == 0 #if defined( __APPLE__ ) - // Apple requires its CPU implementation to do correctly rounded address arithmetic in all modes - || gDeviceType != CL_DEVICE_TYPE_GPU + // Apple requires its CPU implementation to do correctly + // rounded address arithmetic in all modes + || !(gDeviceType & CL_DEVICE_TYPE_GPU) #endif - ) + ) offset = 0.0f; // Loop only once for (float norm_offset_x = -offset; norm_offset_x <= offset && !found_pixel; norm_offset_x += NORM_OFFSET) { @@ -434,7 +443,8 @@ int 
validate_image_2D_depth_results(void *imageValues, void *resultValues, doubl float err1 = ABS_ERROR(resultPtr[0], expected[0]); // Clamp to the minimum absolute error for the format if (err1 > 0 && err1 < formatAbsoluteError) { err1 = 0.0f; } - float maxErr1 = MAX( maxErr * maxPixel.p[0], FLT_MIN ); + float maxErr1 = + std::max(maxErr * maxPixel.p[0], FLT_MIN); // Check if the result matches. if( ! (err1 <= maxErr1) ) @@ -471,7 +481,10 @@ int validate_image_2D_depth_results(void *imageValues, void *resultValues, doubl // If we are not on a GPU, or we are not normalized, then only test with offsets (0.0, 0.0) // E.g., test one pixel. - if (!imageSampler->normalized_coords || gDeviceType != CL_DEVICE_TYPE_GPU || NORM_OFFSET == 0) { + if (!imageSampler->normalized_coords + || !(gDeviceType & CL_DEVICE_TYPE_GPU) + || NORM_OFFSET == 0) + { norm_offset_x = 0.0f; norm_offset_y = 0.0f; checkOnlyOnePixel = 1; @@ -484,7 +497,8 @@ int validate_image_2D_depth_results(void *imageValues, void *resultValues, doubl imageSampler, expected, 0, &containsDenormals ); float err1 = ABS_ERROR(resultPtr[0], expected[0]); - float maxErr1 = MAX( maxErr * maxPixel.p[0], FLT_MIN ); + float maxErr1 = + std::max(maxErr * maxPixel.p[0], FLT_MIN); if( ! 
(err1 <= maxErr1) ) @@ -565,12 +579,15 @@ int validate_image_2D_results(void *imageValues, void *resultValues, double form int checkOnlyOnePixel = 0; int found_pixel = 0; float offset = NORM_OFFSET; - if (!imageSampler->normalized_coords || imageSampler->filter_mode != CL_FILTER_NEAREST || NORM_OFFSET == 0 + if (!imageSampler->normalized_coords + || imageSampler->filter_mode != CL_FILTER_NEAREST + || NORM_OFFSET == 0 #if defined( __APPLE__ ) - // Apple requires its CPU implementation to do correctly rounded address arithmetic in all modes - || gDeviceType != CL_DEVICE_TYPE_GPU + // Apple requires its CPU implementation to do correctly + // rounded address arithmetic in all modes + || !(gDeviceType & CL_DEVICE_TYPE_GPU) #endif - ) + ) offset = 0.0f; // Loop only once for (float norm_offset_x = -offset; norm_offset_x <= offset && !found_pixel; norm_offset_x += NORM_OFFSET) { @@ -598,10 +615,14 @@ int validate_image_2D_results(void *imageValues, void *resultValues, double form if (err2 > 0 && err2 < formatAbsoluteError) { err2 = 0.0f; } if (err3 > 0 && err3 < formatAbsoluteError) { err3 = 0.0f; } if (err4 > 0 && err4 < formatAbsoluteError) { err4 = 0.0f; } - float maxErr1 = MAX( maxErr * maxPixel.p[0], FLT_MIN ); - float maxErr2 = MAX( maxErr * maxPixel.p[1], FLT_MIN ); - float maxErr3 = MAX( maxErr * maxPixel.p[2], FLT_MIN ); - float maxErr4 = MAX( maxErr * maxPixel.p[3], FLT_MIN ); + float maxErr1 = + std::max(maxErr * maxPixel.p[0], FLT_MIN); + float maxErr2 = + std::max(maxErr * maxPixel.p[1], FLT_MIN); + float maxErr3 = + std::max(maxErr * maxPixel.p[2], FLT_MIN); + float maxErr4 = + std::max(maxErr * maxPixel.p[3], FLT_MIN); // Check if the result matches. if( ! (err1 <= maxErr1) || ! (err2 <= maxErr2) || @@ -650,7 +671,10 @@ int validate_image_2D_results(void *imageValues, void *resultValues, double form // If we are not on a GPU, or we are not normalized, then only test with offsets (0.0, 0.0) // E.g., test one pixel. 
- if (!imageSampler->normalized_coords || gDeviceType != CL_DEVICE_TYPE_GPU || NORM_OFFSET == 0) { + if (!imageSampler->normalized_coords + || !(gDeviceType & CL_DEVICE_TYPE_GPU) + || NORM_OFFSET == 0) + { norm_offset_x = 0.0f; norm_offset_y = 0.0f; checkOnlyOnePixel = 1; @@ -671,10 +695,14 @@ int validate_image_2D_results(void *imageValues, void *resultValues, double form float err2 = ABS_ERROR(resultPtr[1], expected[1]); float err3 = ABS_ERROR(resultPtr[2], expected[2]); float err4 = ABS_ERROR(resultPtr[3], expected[3]); - float maxErr1 = MAX( maxErr * maxPixel.p[0], FLT_MIN ); - float maxErr2 = MAX( maxErr * maxPixel.p[1], FLT_MIN ); - float maxErr3 = MAX( maxErr * maxPixel.p[2], FLT_MIN ); - float maxErr4 = MAX( maxErr * maxPixel.p[3], FLT_MIN ); + float maxErr1 = + std::max(maxErr * maxPixel.p[0], FLT_MIN); + float maxErr2 = + std::max(maxErr * maxPixel.p[1], FLT_MIN); + float maxErr3 = + std::max(maxErr * maxPixel.p[2], FLT_MIN); + float maxErr4 = + std::max(maxErr * maxPixel.p[3], FLT_MIN); if( ! (err1 <= maxErr1) || ! (err2 <= maxErr2) || @@ -766,7 +794,10 @@ int validate_image_2D_results(void *imageValues, void *resultValues, double form // If we are not on a GPU, or we are not normalized, then only test with offsets (0.0, 0.0) // E.g., test one pixel. - if (!imageSampler->normalized_coords || gDeviceType != CL_DEVICE_TYPE_GPU || NORM_OFFSET == 0) { + if (!imageSampler->normalized_coords + || !(gDeviceType & CL_DEVICE_TYPE_GPU) + || NORM_OFFSET == 0) + { norm_offset_x = 0.0f; norm_offset_y = 0.0f; checkOnlyOnePixel = 1; @@ -801,7 +832,10 @@ int validate_image_2D_results(void *imageValues, void *resultValues, double form // If we are not on a GPU, or we are not normalized, then only test with offsets (0.0, 0.0) // E.g., test one pixel. 
- if (!imageSampler->normalized_coords || gDeviceType != CL_DEVICE_TYPE_GPU || NORM_OFFSET == 0) { + if (!imageSampler->normalized_coords + || !(gDeviceType & CL_DEVICE_TYPE_GPU) + || NORM_OFFSET == 0) + { norm_offset_x = 0.0f; norm_offset_y = 0.0f; checkOnlyOnePixel = 1; @@ -862,7 +896,10 @@ int validate_image_2D_results(void *imageValues, void *resultValues, double form // If we are not on a GPU, or we are not normalized, then only test with offsets (0.0, 0.0) // E.g., test one pixel. - if (!imageSampler->normalized_coords || gDeviceType != CL_DEVICE_TYPE_GPU || NORM_OFFSET == 0) { + if (!imageSampler->normalized_coords + || !(gDeviceType & CL_DEVICE_TYPE_GPU) + || NORM_OFFSET == 0) + { norm_offset_x = 0.0f; norm_offset_y = 0.0f; checkOnlyOnePixel = 1; @@ -897,7 +934,10 @@ int validate_image_2D_results(void *imageValues, void *resultValues, double form // If we are not on a GPU, or we are not normalized, then only test with offsets (0.0, 0.0) // E.g., test one pixel. - if (!imageSampler->normalized_coords || gDeviceType != CL_DEVICE_TYPE_GPU || NORM_OFFSET == 0) { + if (!imageSampler->normalized_coords + || !(gDeviceType & CL_DEVICE_TYPE_GPU) + || NORM_OFFSET == 0) + { norm_offset_x = 0.0f; norm_offset_y = 0.0f; checkOnlyOnePixel = 1; @@ -963,12 +1003,15 @@ int validate_image_2D_sRGB_results(void *imageValues, void *resultValues, double int checkOnlyOnePixel = 0; int found_pixel = 0; float offset = NORM_OFFSET; - if (!imageSampler->normalized_coords || imageSampler->filter_mode != CL_FILTER_NEAREST || NORM_OFFSET == 0 + if (!imageSampler->normalized_coords + || imageSampler->filter_mode != CL_FILTER_NEAREST + || NORM_OFFSET == 0 #if defined( __APPLE__ ) - // Apple requires its CPU implementation to do correctly rounded address arithmetic in all modes - || gDeviceType != CL_DEVICE_TYPE_GPU + // Apple requires its CPU implementation to do correctly + // rounded address arithmetic in all modes + || !(gDeviceType & CL_DEVICE_TYPE_GPU) #endif - ) + ) offset = 0.0f; // 
Loop only once for (float norm_offset_x = -offset; norm_offset_x <= offset && !found_pixel; norm_offset_x += NORM_OFFSET) { @@ -1042,7 +1085,10 @@ int validate_image_2D_sRGB_results(void *imageValues, void *resultValues, double // If we are not on a GPU, or we are not normalized, then only test with offsets (0.0, 0.0) // E.g., test one pixel. - if (!imageSampler->normalized_coords || gDeviceType != CL_DEVICE_TYPE_GPU || NORM_OFFSET == 0) { + if (!imageSampler->normalized_coords + || !(gDeviceType & CL_DEVICE_TYPE_GPU) + || NORM_OFFSET == 0) + { norm_offset_x = 0.0f; norm_offset_y = 0.0f; checkOnlyOnePixel = 1; @@ -1649,16 +1695,18 @@ int test_read_image_set_2D(cl_device_id device, cl_context context, } - sprintf( programSrc, KernelSourcePattern, - (format->image_channel_order == CL_DEPTH) ? "image2d_depth_t" : "image2d_t", - samplerArg, get_explicit_type_name( outputType ), + sprintf(programSrc, KernelSourcePattern, + gTestMipmaps + ? "#pragma OPENCL EXTENSION cl_khr_mipmap_image: enable" + : "", + (format->image_channel_order == CL_DEPTH) ? "image2d_depth_t" + : "image2d_t", + samplerArg, get_explicit_type_name(outputType), (format->image_channel_order == CL_DEPTH) ? "" : "4", - gTestMipmaps?", float lod":" ", - samplerVar, - gTestMipmaps? lodOffsetSource : offsetSource, - floatCoords ? floatKernelSource : intCoordKernelSource, - readFormat, - gTestMipmaps?", lod":" "); + gTestMipmaps ? ", float lod" : " ", samplerVar, + gTestMipmaps ? lodOffsetSource : offsetSource, + floatCoords ? floatKernelSource : intCoordKernelSource, readFormat, + gTestMipmaps ? 
", lod" : " "); ptr = programSrc; error = create_single_kernel_helper(context, &program, &kernel, 1, &ptr, diff --git a/test_conformance/images/kernel_read_write/test_loops.cpp b/test_conformance/images/kernel_read_write/test_loops.cpp index 795a9eda..ea1e1c7c 100644 --- a/test_conformance/images/kernel_read_write/test_loops.cpp +++ b/test_conformance/images/kernel_read_write/test_loops.cpp @@ -84,7 +84,7 @@ int test_read_image_type(cl_device_id device, cl_context context, // of operations for linear filtering on the GPU. We do not test linear // filtering for the CL_RGB CL_UNORM_INT_101010 image format; however, we // test it internally for a set of other image formats. - if ((gDeviceType == CL_DEVICE_TYPE_GPU) + if ((gDeviceType & CL_DEVICE_TYPE_GPU) && (imageSampler->filter_mode == CL_FILTER_LINEAR) && (format->image_channel_order == CL_RGB) && (format->image_channel_data_type == CL_UNORM_INT_101010)) diff --git a/test_conformance/images/kernel_read_write/test_read_1D.cpp b/test_conformance/images/kernel_read_write/test_read_1D.cpp index c9ba4e84..2a722088 100644 --- a/test_conformance/images/kernel_read_write/test_read_1D.cpp +++ b/test_conformance/images/kernel_read_write/test_read_1D.cpp @@ -17,6 +17,8 @@ #include "test_common.h" #include <float.h> +#include <algorithm> + #if defined( __APPLE__ ) #include <signal.h> #include <sys/signal.h> @@ -24,24 +26,28 @@ #endif const char *read1DKernelSourcePattern = -"__kernel void sample_kernel( read_only image1d_t input,%s __global float *xOffsets, __global %s4 *results %s)\n" -"{\n" -"%s" -" int tidX = get_global_id(0);\n" -" int offset = tidX;\n" -"%s" -" results[offset] = read_image%s( input, imageSampler, coord %s);\n" -"}"; + "%s\n" + "__kernel void sample_kernel( read_only image1d_t input,%s __global float " + "*xOffsets, __global %s4 *results %s)\n" + "{\n" + "%s" + " int tidX = get_global_id(0);\n" + " int offset = tidX;\n" + "%s" + " results[offset] = read_image%s( input, imageSampler, coord %s);\n" + "}"; 
const char *read_write1DKernelSourcePattern = -"__kernel void sample_kernel( read_write image1d_t input,%s __global float *xOffsets, __global %s4 *results %s)\n" -"{\n" -"%s" -" int tidX = get_global_id(0);\n" -" int offset = tidX;\n" -"%s" -" results[offset] = read_image%s( input, coord %s);\n" -"}"; + "%s\n" + "__kernel void sample_kernel( read_write image1d_t input,%s __global float " + "*xOffsets, __global %s4 *results %s)\n" + "{\n" + "%s" + " int tidX = get_global_id(0);\n" + " int offset = tidX;\n" + "%s" + " results[offset] = read_image%s( input, coord %s);\n" + "}"; const char *int1DCoordKernelSource = " int coord = xOffsets[offset];\n"; @@ -485,10 +491,13 @@ int test_read_image_1D( cl_context context, cl_command_queue queue, cl_kernel ke int checkOnlyOnePixel = 0; int found_pixel = 0; float offset = NORM_OFFSET; - if (!imageSampler->normalized_coords || imageSampler->filter_mode != CL_FILTER_NEAREST || NORM_OFFSET == 0 + if (!imageSampler->normalized_coords + || imageSampler->filter_mode != CL_FILTER_NEAREST + || NORM_OFFSET == 0 #if defined( __APPLE__ ) - // Apple requires its CPU implementation to do correctly rounded address arithmetic in all modes - || gDeviceType != CL_DEVICE_TYPE_GPU + // Apple requires its CPU implementation to do correctly + // rounded address arithmetic in all modes + || !(gDeviceType & CL_DEVICE_TYPE_GPU) #endif ) offset = 0.0f; // Loop only once @@ -551,7 +560,10 @@ int test_read_image_1D( cl_context context, cl_command_queue queue, cl_kernel ke // If we are not on a GPU, or we are not normalized, then only test with offsets (0.0, 0.0) // E.g., test one pixel. 
- if (!imageSampler->normalized_coords || gDeviceType != CL_DEVICE_TYPE_GPU || NORM_OFFSET == 0) { + if (!imageSampler->normalized_coords + || !(gDeviceType & CL_DEVICE_TYPE_GPU) + || NORM_OFFSET == 0) + { norm_offset_x = 0.0f; checkOnlyOnePixel = 1; } @@ -644,10 +656,13 @@ int test_read_image_1D( cl_context context, cl_command_queue queue, cl_kernel ke int checkOnlyOnePixel = 0; int found_pixel = 0; float offset = NORM_OFFSET; - if (!imageSampler->normalized_coords || imageSampler->filter_mode != CL_FILTER_NEAREST || NORM_OFFSET == 0 + if (!imageSampler->normalized_coords + || imageSampler->filter_mode != CL_FILTER_NEAREST + || NORM_OFFSET == 0 #if defined( __APPLE__ ) - // Apple requires its CPU implementation to do correctly rounded address arithmetic in all modes - || gDeviceType != CL_DEVICE_TYPE_GPU + // Apple requires its CPU implementation to do correctly + // rounded address arithmetic in all modes + || !(gDeviceType & CL_DEVICE_TYPE_GPU) #endif ) offset = 0.0f; // Loop only once @@ -669,10 +684,14 @@ int test_read_image_1D( cl_context context, cl_command_queue queue, cl_kernel ke if (err2 > 0 && err2 < formatAbsoluteError) { err2 = 0.0f; } if (err3 > 0 && err3 < formatAbsoluteError) { err3 = 0.0f; } if (err4 > 0 && err4 < formatAbsoluteError) { err4 = 0.0f; } - float maxErr1 = MAX( maxErr * maxPixel.p[0], FLT_MIN ); - float maxErr2 = MAX( maxErr * maxPixel.p[1], FLT_MIN ); - float maxErr3 = MAX( maxErr * maxPixel.p[2], FLT_MIN ); - float maxErr4 = MAX( maxErr * maxPixel.p[3], FLT_MIN ); + float maxErr1 = + std::max(maxErr * maxPixel.p[0], FLT_MIN); + float maxErr2 = + std::max(maxErr * maxPixel.p[1], FLT_MIN); + float maxErr3 = + std::max(maxErr * maxPixel.p[2], FLT_MIN); + float maxErr4 = + std::max(maxErr * maxPixel.p[3], FLT_MIN); // Check if the result matches. if( ! (err1 <= maxErr1) || ! 
(err2 <= maxErr2) || @@ -714,7 +733,10 @@ int test_read_image_1D( cl_context context, cl_command_queue queue, cl_kernel ke // If we are not on a GPU, or we are not normalized, then only test with offsets (0.0, 0.0) // E.g., test one pixel. - if (!imageSampler->normalized_coords || gDeviceType != CL_DEVICE_TYPE_GPU || NORM_OFFSET == 0) { + if (!imageSampler->normalized_coords + || !(gDeviceType & CL_DEVICE_TYPE_GPU) + || NORM_OFFSET == 0) + { norm_offset_x = 0.0f; checkOnlyOnePixel = 1; } @@ -732,10 +754,14 @@ int test_read_image_1D( cl_context context, cl_command_queue queue, cl_kernel ke ABS_ERROR(resultPtr[2], expected[2]); float err4 = ABS_ERROR(resultPtr[3], expected[3]); - float maxErr1 = MAX( maxErr * maxPixel.p[0], FLT_MIN ); - float maxErr2 = MAX( maxErr * maxPixel.p[1], FLT_MIN ); - float maxErr3 = MAX( maxErr * maxPixel.p[2], FLT_MIN ); - float maxErr4 = MAX( maxErr * maxPixel.p[3], FLT_MIN ); + float maxErr1 = + std::max(maxErr * maxPixel.p[0], FLT_MIN); + float maxErr2 = + std::max(maxErr * maxPixel.p[1], FLT_MIN); + float maxErr3 = + std::max(maxErr * maxPixel.p[2], FLT_MIN); + float maxErr4 = + std::max(maxErr * maxPixel.p[3], FLT_MIN); if( ! (err1 <= maxErr1) || ! (err2 <= maxErr2) || @@ -816,7 +842,10 @@ int test_read_image_1D( cl_context context, cl_command_queue queue, cl_kernel ke // If we are not on a GPU, or we are not normalized, then only test with offsets (0.0, 0.0) // E.g., test one pixel. - if (!imageSampler->normalized_coords || gDeviceType != CL_DEVICE_TYPE_GPU || NORM_OFFSET == 0) { + if (!imageSampler->normalized_coords + || !(gDeviceType & CL_DEVICE_TYPE_GPU) + || NORM_OFFSET == 0) + { norm_offset_x = 0.0f; checkOnlyOnePixel = 1; } @@ -847,7 +876,10 @@ int test_read_image_1D( cl_context context, cl_command_queue queue, cl_kernel ke // If we are not on a GPU, or we are not normalized, then only test with offsets (0.0, 0.0) // E.g., test one pixel. 
- if (!imageSampler->normalized_coords || gDeviceType != CL_DEVICE_TYPE_GPU || NORM_OFFSET == 0) { + if (!imageSampler->normalized_coords + || !(gDeviceType & CL_DEVICE_TYPE_GPU) + || NORM_OFFSET == 0) + { norm_offset_x = 0.0f; checkOnlyOnePixel = 1; } @@ -903,7 +935,10 @@ int test_read_image_1D( cl_context context, cl_command_queue queue, cl_kernel ke // If we are not on a GPU, or we are not normalized, then only test with offsets (0.0, 0.0) // E.g., test one pixel. - if (!imageSampler->normalized_coords || gDeviceType != CL_DEVICE_TYPE_GPU || NORM_OFFSET == 0) { + if (!imageSampler->normalized_coords + || !(gDeviceType & CL_DEVICE_TYPE_GPU) + || NORM_OFFSET == 0) + { norm_offset_x = 0.0f; checkOnlyOnePixel = 1; } @@ -934,7 +969,10 @@ int test_read_image_1D( cl_context context, cl_command_queue queue, cl_kernel ke // If we are not on a GPU, or we are not normalized, then only test with offsets (0.0, 0.0) // E.g., test one pixel. - if (!imageSampler->normalized_coords || gDeviceType != CL_DEVICE_TYPE_GPU || NORM_OFFSET == 0) { + if (!imageSampler->normalized_coords + || !(gDeviceType & CL_DEVICE_TYPE_GPU) + || NORM_OFFSET == 0) + { norm_offset_x = 0.0f; checkOnlyOnePixel = 1; } @@ -1041,14 +1079,14 @@ int test_read_image_set_1D(cl_device_id device, cl_context context, { KernelSourcePattern = read1DKernelSourcePattern; } - sprintf( programSrc, - KernelSourcePattern, - samplerArg, get_explicit_type_name( outputType ), - gTestMipmaps ? ", float lod" : "", - samplerVar, + sprintf(programSrc, KernelSourcePattern, + gTestMipmaps + ? "#pragma OPENCL EXTENSION cl_khr_mipmap_image: enable" + : "", + samplerArg, get_explicit_type_name(outputType), + gTestMipmaps ? ", float lod" : "", samplerVar, floatCoords ? float1DKernelSource : int1DCoordKernelSource, - readFormat, - gTestMipmaps ? ", lod" : "" ); + readFormat, gTestMipmaps ? 
", lod" : ""); ptr = programSrc; diff --git a/test_conformance/images/kernel_read_write/test_read_1D_array.cpp b/test_conformance/images/kernel_read_write/test_read_1D_array.cpp index b3287ded..a8009420 100644 --- a/test_conformance/images/kernel_read_write/test_read_1D_array.cpp +++ b/test_conformance/images/kernel_read_write/test_read_1D_array.cpp @@ -16,32 +16,37 @@ #include "test_common.h" #include <float.h> +#include <algorithm> + #if defined( __APPLE__ ) #include <signal.h> #include <sys/signal.h> #include <setjmp.h> #endif - const char *read1DArrayKernelSourcePattern = -"__kernel void sample_kernel( read_only image1d_array_t input,%s __global float *xOffsets, __global float *yOffsets, __global %s4 *results %s)\n" -"{\n" -"%s" -" int tidX = get_global_id(0), tidY = get_global_id(1);\n" -"%s" -"%s" -" results[offset] = read_image%s( input, imageSampler, coords %s);\n" -"}"; + "%s\n" + "__kernel void sample_kernel( read_only image1d_array_t input,%s __global " + "float *xOffsets, __global float *yOffsets, __global %s4 *results %s)\n" + "{\n" + "%s" + " int tidX = get_global_id(0), tidY = get_global_id(1);\n" + "%s" + "%s" + " results[offset] = read_image%s( input, imageSampler, coords %s);\n" + "}"; const char *read_write1DArrayKernelSourcePattern = -"__kernel void sample_kernel( read_write image1d_array_t input,%s __global float *xOffsets, __global float *yOffsets, __global %s4 *results %s )\n" -"{\n" -"%s" -" int tidX = get_global_id(0), tidY = get_global_id(1);\n" -"%s" -"%s" -" results[offset] = read_image%s( input, coords %s);\n" -"}"; + "%s\n" + "__kernel void sample_kernel( read_write image1d_array_t input,%s __global " + "float *xOffsets, __global float *yOffsets, __global %s4 *results %s )\n" + "{\n" + "%s" + " int tidX = get_global_id(0), tidY = get_global_id(1);\n" + "%s" + "%s" + " results[offset] = read_image%s( input, coords %s);\n" + "}"; const char *offset1DArrayKernelSource = " int offset = tidY*get_image_width(input) + tidX;\n"; @@ -577,12 
+582,15 @@ int test_read_image_1D_array( cl_context context, cl_command_queue queue, cl_ker int checkOnlyOnePixel = 0; int found_pixel = 0; float offset = NORM_OFFSET; - if (!imageSampler->normalized_coords || imageSampler->filter_mode != CL_FILTER_NEAREST || NORM_OFFSET == 0 + if (!imageSampler->normalized_coords + || imageSampler->filter_mode != CL_FILTER_NEAREST + || NORM_OFFSET == 0 #if defined( __APPLE__ ) - // Apple requires its CPU implementation to do correctly rounded address arithmetic in all modes - || gDeviceType != CL_DEVICE_TYPE_GPU + // Apple requires its CPU implementation to do correctly + // rounded address arithmetic in all modes + || !(gDeviceType & CL_DEVICE_TYPE_GPU) #endif - ) + ) offset = 0.0f; // Loop only once for (float norm_offset_x = -offset; norm_offset_x <= offset && !found_pixel; norm_offset_x += NORM_OFFSET) { @@ -646,7 +654,10 @@ int test_read_image_1D_array( cl_context context, cl_command_queue queue, cl_ker // If we are not on a GPU, or we are not normalized, then only test with offsets (0.0, 0.0) // E.g., test one pixel. 
- if (!imageSampler->normalized_coords || gDeviceType != CL_DEVICE_TYPE_GPU || NORM_OFFSET == 0) { + if (!imageSampler->normalized_coords + || !(gDeviceType & CL_DEVICE_TYPE_GPU) + || NORM_OFFSET == 0) + { norm_offset_x = 0.0f; norm_offset_y = 0.0f; checkOnlyOnePixel = 1; @@ -745,12 +756,15 @@ int test_read_image_1D_array( cl_context context, cl_command_queue queue, cl_ker int checkOnlyOnePixel = 0; int found_pixel = 0; float offset = NORM_OFFSET; - if (!imageSampler->normalized_coords || imageSampler->filter_mode != CL_FILTER_NEAREST || NORM_OFFSET == 0 + if (!imageSampler->normalized_coords + || imageSampler->filter_mode != CL_FILTER_NEAREST + || NORM_OFFSET == 0 #if defined( __APPLE__ ) - // Apple requires its CPU implementation to do correctly rounded address arithmetic in all modes - || gDeviceType != CL_DEVICE_TYPE_GPU + // Apple requires its CPU implementation to do correctly + // rounded address arithmetic in all modes + || !(gDeviceType & CL_DEVICE_TYPE_GPU) #endif - ) + ) offset = 0.0f; // Loop only once for (float norm_offset_x = -offset; norm_offset_x <= offset && !found_pixel; norm_offset_x += NORM_OFFSET) { @@ -772,10 +786,14 @@ int test_read_image_1D_array( cl_context context, cl_command_queue queue, cl_ker if (err2 > 0 && err2 < formatAbsoluteError) { err2 = 0.0f; } if (err3 > 0 && err3 < formatAbsoluteError) { err3 = 0.0f; } if (err4 > 0 && err4 < formatAbsoluteError) { err4 = 0.0f; } - float maxErr1 = MAX( maxErr * maxPixel.p[0], FLT_MIN ); - float maxErr2 = MAX( maxErr * maxPixel.p[1], FLT_MIN ); - float maxErr3 = MAX( maxErr * maxPixel.p[2], FLT_MIN ); - float maxErr4 = MAX( maxErr * maxPixel.p[3], FLT_MIN ); + float maxErr1 = + std::max(maxErr * maxPixel.p[0], FLT_MIN); + float maxErr2 = + std::max(maxErr * maxPixel.p[1], FLT_MIN); + float maxErr3 = + std::max(maxErr * maxPixel.p[2], FLT_MIN); + float maxErr4 = + std::max(maxErr * maxPixel.p[3], FLT_MIN); // Check if the result matches. if( ! (err1 <= maxErr1) || ! 
(err2 <= maxErr2) || @@ -819,7 +837,10 @@ int test_read_image_1D_array( cl_context context, cl_command_queue queue, cl_ker // If we are not on a GPU, or we are not normalized, then only test with offsets (0.0, 0.0) // E.g., test one pixel. - if (!imageSampler->normalized_coords || gDeviceType != CL_DEVICE_TYPE_GPU || NORM_OFFSET == 0) { + if (!imageSampler->normalized_coords + || !(gDeviceType & CL_DEVICE_TYPE_GPU) + || NORM_OFFSET == 0) + { norm_offset_x = 0.0f; norm_offset_y = 0.0f; checkOnlyOnePixel = 1; @@ -838,10 +859,14 @@ int test_read_image_1D_array( cl_context context, cl_command_queue queue, cl_ker ABS_ERROR(resultPtr[2], expected[2]); float err4 = ABS_ERROR(resultPtr[3], expected[3]); - float maxErr1 = MAX( maxErr * maxPixel.p[0], FLT_MIN ); - float maxErr2 = MAX( maxErr * maxPixel.p[1], FLT_MIN ); - float maxErr3 = MAX( maxErr * maxPixel.p[2], FLT_MIN ); - float maxErr4 = MAX( maxErr * maxPixel.p[3], FLT_MIN ); + float maxErr1 = + std::max(maxErr * maxPixel.p[0], FLT_MIN); + float maxErr2 = + std::max(maxErr * maxPixel.p[1], FLT_MIN); + float maxErr3 = + std::max(maxErr * maxPixel.p[2], FLT_MIN); + float maxErr4 = + std::max(maxErr * maxPixel.p[3], FLT_MIN); if( ! (err1 <= maxErr1) || ! (err2 <= maxErr2) || @@ -926,7 +951,10 @@ int test_read_image_1D_array( cl_context context, cl_command_queue queue, cl_ker // If we are not on a GPU, or we are not normalized, then only test with offsets (0.0, 0.0) // E.g., test one pixel. - if (!imageSampler->normalized_coords || gDeviceType != CL_DEVICE_TYPE_GPU || NORM_OFFSET == 0) { + if (!imageSampler->normalized_coords + || !(gDeviceType & CL_DEVICE_TYPE_GPU) + || NORM_OFFSET == 0) + { norm_offset_x = 0.0f; norm_offset_y = 0.0f; checkOnlyOnePixel = 1; @@ -956,7 +984,10 @@ int test_read_image_1D_array( cl_context context, cl_command_queue queue, cl_ker // If we are not on a GPU, or we are not normalized, then only test with offsets (0.0, 0.0) // E.g., test one pixel. 
- if (!imageSampler->normalized_coords || gDeviceType != CL_DEVICE_TYPE_GPU || NORM_OFFSET == 0) { + if (!imageSampler->normalized_coords + || !(gDeviceType & CL_DEVICE_TYPE_GPU) + || NORM_OFFSET == 0) + { norm_offset_x = 0.0f; norm_offset_y = 0.0f; checkOnlyOnePixel = 1; @@ -1012,7 +1043,10 @@ int test_read_image_1D_array( cl_context context, cl_command_queue queue, cl_ker // If we are not on a GPU, or we are not normalized, then only test with offsets (0.0, 0.0) // E.g., test one pixel. - if (!imageSampler->normalized_coords || gDeviceType != CL_DEVICE_TYPE_GPU || NORM_OFFSET == 0) { + if (!imageSampler->normalized_coords + || !(gDeviceType & CL_DEVICE_TYPE_GPU) + || NORM_OFFSET == 0) + { norm_offset_x = 0.0f; norm_offset_y = 0.0f; checkOnlyOnePixel = 1; @@ -1042,7 +1076,10 @@ int test_read_image_1D_array( cl_context context, cl_command_queue queue, cl_ker // If we are not on a GPU, or we are not normalized, then only test with offsets (0.0, 0.0) // E.g., test one pixel. - if (!imageSampler->normalized_coords || gDeviceType != CL_DEVICE_TYPE_GPU || NORM_OFFSET == 0) { + if (!imageSampler->normalized_coords + || !(gDeviceType & CL_DEVICE_TYPE_GPU) + || NORM_OFFSET == 0) + { norm_offset_x = 0.0f; norm_offset_y = 0.0f; checkOnlyOnePixel = 1; @@ -1147,15 +1184,15 @@ int test_read_image_set_1D_array(cl_device_id device, cl_context context, KernelSourcePattern = read_write1DArrayKernelSourcePattern; } - sprintf( programSrc, - KernelSourcePattern, - samplerArg, get_explicit_type_name( outputType ), - gTestMipmaps ? ", float lod" : "", - samplerVar, - gTestMipmaps ? offset1DArrayLodKernelSource : offset1DArrayKernelSource, - floatCoords ? floatKernelSource1DArray : intCoordKernelSource1DArray, - readFormat, - gTestMipmaps ? ", lod" : "" ); + sprintf( + programSrc, KernelSourcePattern, + gTestMipmaps ? "#pragma OPENCL EXTENSION cl_khr_mipmap_image: enable" + : "", + samplerArg, get_explicit_type_name(outputType), + gTestMipmaps ? 
", float lod" : "", samplerVar, + gTestMipmaps ? offset1DArrayLodKernelSource : offset1DArrayKernelSource, + floatCoords ? floatKernelSource1DArray : intCoordKernelSource1DArray, + readFormat, gTestMipmaps ? ", lod" : ""); ptr = programSrc; error = create_single_kernel_helper(context, &program, &kernel, 1, &ptr, diff --git a/test_conformance/images/kernel_read_write/test_read_2D_array.cpp b/test_conformance/images/kernel_read_write/test_read_2D_array.cpp index 7cb334b2..533a0fe8 100644 --- a/test_conformance/images/kernel_read_write/test_read_2D_array.cpp +++ b/test_conformance/images/kernel_read_write/test_read_2D_array.cpp @@ -16,6 +16,8 @@ #include "test_common.h" #include <float.h> +#include <algorithm> + // Utility function to clamp down image sizes for certain tests to avoid // using too much memory. static size_t reduceImageSizeRange(size_t maxDimSize) { @@ -39,24 +41,32 @@ static size_t reduceImageDepth(size_t maxDepth) { } const char *read2DArrayKernelSourcePattern = -"__kernel void sample_kernel( read_only %s input,%s __global float *xOffsets, __global float *yOffsets, __global float *zOffsets, __global %s%s *results %s )\n" -"{\n" -"%s" -" int tidX = get_global_id(0), tidY = get_global_id(1), tidZ = get_global_id(2);\n" -"%s" -"%s" -" results[offset] = read_image%s( input, imageSampler, coords %s);\n" -"}"; + "%s\n" + "__kernel void sample_kernel( read_only %s input,%s __global float " + "*xOffsets, __global float *yOffsets, __global float *zOffsets, __global " + "%s%s *results %s )\n" + "{\n" + "%s" + " int tidX = get_global_id(0), tidY = get_global_id(1), tidZ = " + "get_global_id(2);\n" + "%s" + "%s" + " results[offset] = read_image%s( input, imageSampler, coords %s);\n" + "}"; const char *read_write2DArrayKernelSourcePattern = -"__kernel void sample_kernel( read_write %s input,%s __global float *xOffsets, __global float *yOffsets, __global float *zOffsets, __global %s%s *results %s)\n" -"{\n" -"%s" -" int tidX = get_global_id(0), tidY = 
get_global_id(1), tidZ = get_global_id(2);\n" -"%s" -"%s" -" results[offset] = read_image%s( input, coords %s);\n" -"}"; + "%s\n" + "__kernel void sample_kernel( read_write %s input,%s __global float " + "*xOffsets, __global float *yOffsets, __global float *zOffsets, __global " + "%s%s *results %s)\n" + "{\n" + "%s" + " int tidX = get_global_id(0), tidY = get_global_id(1), tidZ = " + "get_global_id(2);\n" + "%s" + "%s" + " results[offset] = read_image%s( input, coords %s);\n" + "}"; const char* offset2DarraySource =" int offset = tidZ*get_image_width(input)*get_image_height(input) + tidY*get_image_width(input) + tidX;\n"; const char* offset2DarraySourceLod = @@ -595,12 +605,15 @@ int test_read_image_2D_array( cl_context context, cl_command_queue queue, cl_ker int checkOnlyOnePixel = 0; int found_pixel = 0; float offset = NORM_OFFSET; - if (!imageSampler->normalized_coords || imageSampler->filter_mode != CL_FILTER_NEAREST || NORM_OFFSET == 0 + if (!imageSampler->normalized_coords + || imageSampler->filter_mode != CL_FILTER_NEAREST + || NORM_OFFSET == 0 #if defined( __APPLE__ ) - // Apple requires its CPU implementation to do correctly rounded address arithmetic in all modes - || gDeviceType != CL_DEVICE_TYPE_GPU + // Apple requires its CPU implementation to do + // correctly rounded address arithmetic in all modes + || !(gDeviceType & CL_DEVICE_TYPE_GPU) #endif - ) + ) offset = 0.0f; // Loop only once for (float norm_offset_x = -offset; norm_offset_x <= offset && !found_pixel ; norm_offset_x += NORM_OFFSET) { @@ -617,7 +630,8 @@ int test_read_image_2D_array( cl_context context, cl_command_queue queue, cl_ker ABS_ERROR(resultPtr[0], expected[0]); // Clamp to the minimum absolute error for the format if (err1 > 0 && err1 < formatAbsoluteError) { err1 = 0.0f; } - float maxErr1 = MAX( maxErr * maxPixel.p[0], FLT_MIN ); + float maxErr1 = std::max( + maxErr * maxPixel.p[0], FLT_MIN); if( ! 
(err1 <= maxErr1) ) { @@ -661,7 +675,8 @@ int test_read_image_2D_array( cl_context context, cl_command_queue queue, cl_ker float err1 = ABS_ERROR(resultPtr[0], expected[0]); - float maxErr1 = MAX( maxErr * maxPixel.p[0], FLT_MIN ); + float maxErr1 = std::max( + maxErr * maxPixel.p[0], FLT_MIN); if( ! (err1 <= maxErr1) ) @@ -734,12 +749,15 @@ int test_read_image_2D_array( cl_context context, cl_command_queue queue, cl_ker int checkOnlyOnePixel = 0; int found_pixel = 0; float offset = NORM_OFFSET; - if (!imageSampler->normalized_coords || imageSampler->filter_mode != CL_FILTER_NEAREST || NORM_OFFSET == 0 + if (!imageSampler->normalized_coords + || imageSampler->filter_mode != CL_FILTER_NEAREST + || NORM_OFFSET == 0 #if defined( __APPLE__ ) - // Apple requires its CPU implementation to do correctly rounded address arithmetic in all modes - || gDeviceType != CL_DEVICE_TYPE_GPU + // Apple requires its CPU implementation to do + // correctly rounded address arithmetic in all modes + || !(gDeviceType & CL_DEVICE_TYPE_GPU) #endif - ) + ) offset = 0.0f; // Loop only once for (float norm_offset_x = -offset; norm_offset_x <= offset && !found_pixel ; norm_offset_x += NORM_OFFSET) { @@ -911,12 +929,15 @@ int test_read_image_2D_array( cl_context context, cl_command_queue queue, cl_ker int checkOnlyOnePixel = 0; int found_pixel = 0; float offset = NORM_OFFSET; - if (!imageSampler->normalized_coords || imageSampler->filter_mode != CL_FILTER_NEAREST || NORM_OFFSET == 0 + if (!imageSampler->normalized_coords + || imageSampler->filter_mode != CL_FILTER_NEAREST + || NORM_OFFSET == 0 #if defined( __APPLE__ ) - // Apple requires its CPU implementation to do correctly rounded address arithmetic in all modes - || gDeviceType != CL_DEVICE_TYPE_GPU + // Apple requires its CPU implementation to do + // correctly rounded address arithmetic in all modes + || !(gDeviceType & CL_DEVICE_TYPE_GPU) #endif - ) + ) offset = 0.0f; // Loop only once for (float norm_offset_x = -offset; norm_offset_x <= 
offset && !found_pixel ; norm_offset_x += NORM_OFFSET) { @@ -942,10 +963,14 @@ int test_read_image_2D_array( cl_context context, cl_command_queue queue, cl_ker if (err2 > 0 && err2 < formatAbsoluteError) { err2 = 0.0f; } if (err3 > 0 && err3 < formatAbsoluteError) { err3 = 0.0f; } if (err4 > 0 && err4 < formatAbsoluteError) { err4 = 0.0f; } - float maxErr1 = MAX( maxErr * maxPixel.p[0], FLT_MIN ); - float maxErr2 = MAX( maxErr * maxPixel.p[1], FLT_MIN ); - float maxErr3 = MAX( maxErr * maxPixel.p[2], FLT_MIN ); - float maxErr4 = MAX( maxErr * maxPixel.p[3], FLT_MIN ); + float maxErr1 = std::max( + maxErr * maxPixel.p[0], FLT_MIN); + float maxErr2 = std::max( + maxErr * maxPixel.p[1], FLT_MIN); + float maxErr3 = std::max( + maxErr * maxPixel.p[2], FLT_MIN); + float maxErr4 = std::max( + maxErr * maxPixel.p[3], FLT_MIN); if( ! (err1 <= maxErr1) || ! (err2 <= maxErr2) || ! (err3 <= maxErr3) || ! (err4 <= maxErr4) ) { @@ -1004,10 +1029,14 @@ int test_read_image_2D_array( cl_context context, cl_command_queue queue, cl_ker expected[2]); float err4 = ABS_ERROR(resultPtr[3], expected[3]); - float maxErr1 = MAX( maxErr * maxPixel.p[0], FLT_MIN ); - float maxErr2 = MAX( maxErr * maxPixel.p[1], FLT_MIN ); - float maxErr3 = MAX( maxErr * maxPixel.p[2], FLT_MIN ); - float maxErr4 = MAX( maxErr * maxPixel.p[3], FLT_MIN ); + float maxErr1 = std::max( + maxErr * maxPixel.p[0], FLT_MIN); + float maxErr2 = std::max( + maxErr * maxPixel.p[1], FLT_MIN); + float maxErr3 = std::max( + maxErr * maxPixel.p[2], FLT_MIN); + float maxErr4 = std::max( + maxErr * maxPixel.p[3], FLT_MIN); if( ! (err1 <= maxErr1) || ! (err2 <= maxErr2) || ! (err3 <= maxErr3) || ! (err4 <= maxErr4) ) @@ -1096,7 +1125,10 @@ int test_read_image_2D_array( cl_context context, cl_command_queue queue, cl_ker // If we are not on a GPU, or we are not normalized, then only test with offsets (0.0, 0.0) // E.g., test one pixel. 
- if (!imageSampler->normalized_coords || gDeviceType != CL_DEVICE_TYPE_GPU || NORM_OFFSET == 0) { + if (!imageSampler->normalized_coords + || !(gDeviceType & CL_DEVICE_TYPE_GPU) + || NORM_OFFSET == 0) + { norm_offset_x = 0.0f; norm_offset_y = 0.0f; norm_offset_z = 0.0f; @@ -1135,7 +1167,11 @@ int test_read_image_2D_array( cl_context context, cl_command_queue queue, cl_ker // If we are not on a GPU, or we are not normalized, then only test with offsets (0.0, 0.0) // E.g., test one pixel. - if (!imageSampler->normalized_coords || gDeviceType != CL_DEVICE_TYPE_GPU || NORM_OFFSET == 0) { + if (!imageSampler->normalized_coords + || !(gDeviceType + & CL_DEVICE_TYPE_GPU) + || NORM_OFFSET == 0) + { norm_offset_x = 0.0f; norm_offset_y = 0.0f; norm_offset_z = 0.0f; @@ -1204,7 +1240,10 @@ int test_read_image_2D_array( cl_context context, cl_command_queue queue, cl_ker // If we are not on a GPU, or we are not normalized, then only test with offsets (0.0, 0.0) // E.g., test one pixel. - if (!imageSampler->normalized_coords || gDeviceType != CL_DEVICE_TYPE_GPU || NORM_OFFSET == 0) { + if (!imageSampler->normalized_coords + || !(gDeviceType & CL_DEVICE_TYPE_GPU) + || NORM_OFFSET == 0) + { norm_offset_x = 0.0f; norm_offset_y = 0.0f; norm_offset_z = 0.0f; @@ -1243,7 +1282,11 @@ int test_read_image_2D_array( cl_context context, cl_command_queue queue, cl_ker // If we are not on a GPU, or we are not normalized, then only test with offsets (0.0, 0.0) // E.g., test one pixel. 
- if (!imageSampler->normalized_coords || gDeviceType != CL_DEVICE_TYPE_GPU || NORM_OFFSET == 0 || NORM_OFFSET == 0 || NORM_OFFSET == 0) { + if (!imageSampler->normalized_coords + || !(gDeviceType + & CL_DEVICE_TYPE_GPU) + || NORM_OFFSET == 0) + { norm_offset_x = 0.0f; norm_offset_y = 0.0f; norm_offset_z = 0.0f; @@ -1377,17 +1420,16 @@ int test_read_image_set_2D_array(cl_device_id device, cl_context context, } // Construct the source - sprintf( programSrc, - KernelSourcePattern, - imageType, - samplerArg, get_explicit_type_name( outputType ), - imageElement, - gTestMipmaps ? ", float lod" : " ", - samplerVar, + sprintf(programSrc, KernelSourcePattern, + gTestMipmaps + ? "#pragma OPENCL EXTENSION cl_khr_mipmap_image: enable" + : "", + imageType, samplerArg, get_explicit_type_name(outputType), + imageElement, gTestMipmaps ? ", float lod" : " ", samplerVar, gTestMipmaps ? offset2DarraySourceLod : offset2DarraySource, - floatCoords ? float2DArrayUnnormalizedCoordKernelSource : int2DArrayCoordKernelSource, - readFormat, - gTestMipmaps ? ", lod" : " " ); + floatCoords ? float2DArrayUnnormalizedCoordKernelSource + : int2DArrayCoordKernelSource, + readFormat, gTestMipmaps ? 
", lod" : " "); ptr = programSrc; error = create_single_kernel_helper(context, &program, &kernel, 1, &ptr, diff --git a/test_conformance/images/kernel_read_write/test_read_3D.cpp b/test_conformance/images/kernel_read_write/test_read_3D.cpp index 860114fb..cec77bf0 100644 --- a/test_conformance/images/kernel_read_write/test_read_3D.cpp +++ b/test_conformance/images/kernel_read_write/test_read_3D.cpp @@ -36,24 +36,32 @@ static size_t reduceImageDepth(size_t maxDimSize, RandomSeed& seed) { const char *read3DKernelSourcePattern = -"__kernel void sample_kernel( read_only image3d_t input,%s __global float *xOffsets, __global float *yOffsets, __global float *zOffsets, __global %s4 *results %s)\n" -"{\n" -"%s" -" int tidX = get_global_id(0), tidY = get_global_id(1), tidZ = get_global_id(2);\n" -"%s" -"%s" -" results[offset] = read_image%s( input, imageSampler, coords %s);\n" -"}"; + "%s\n" + "__kernel void sample_kernel( read_only image3d_t input,%s __global float " + "*xOffsets, __global float *yOffsets, __global float *zOffsets, __global " + "%s4 *results %s)\n" + "{\n" + "%s" + " int tidX = get_global_id(0), tidY = get_global_id(1), tidZ = " + "get_global_id(2);\n" + "%s" + "%s" + " results[offset] = read_image%s( input, imageSampler, coords %s);\n" + "}"; const char *read_write3DKernelSourcePattern = -"__kernel void sample_kernel( read_write image3d_t input,%s __global float *xOffsets, __global float *yOffsets, __global float *zOffsets, __global %s4 *results %s)\n" -"{\n" -"%s" -" int tidX = get_global_id(0), tidY = get_global_id(1), tidZ = get_global_id(2);\n" -"%s" -"%s" -" results[offset] = read_image%s( input, coords %s);\n" -"}"; + "%s\n" + "__kernel void sample_kernel( read_write image3d_t input,%s __global float " + "*xOffsets, __global float *yOffsets, __global float *zOffsets, __global " + "%s4 *results %s)\n" + "{\n" + "%s" + " int tidX = get_global_id(0), tidY = get_global_id(1), tidZ = " + "get_global_id(2);\n" + "%s" + "%s" + " results[offset] = 
read_image%s( input, coords %s);\n" + "}"; const char *offset3DKernelSource = " int offset = tidZ*get_image_width(input)*get_image_height(input) + tidY*get_image_width(input) + tidX;\n"; @@ -137,15 +145,16 @@ int test_read_image_set_3D(cl_device_id device, cl_context context, KernelSourcePattern = read_write3DKernelSourcePattern; } - sprintf( programSrc, - KernelSourcePattern, - samplerArg, get_explicit_type_name( outputType ), - gTestMipmaps? ", float lod": " ", - samplerVar, - gTestMipmaps? offset3DLodKernelSource: offset3DKernelSource, - floatCoords ? float3DUnnormalizedCoordKernelSource : int3DCoordKernelSource, - readFormat, - gTestMipmaps? ",lod":" "); + sprintf(programSrc, KernelSourcePattern, + gTestMipmaps + ? "#pragma OPENCL EXTENSION cl_khr_mipmap_image: enable" + : "", + samplerArg, get_explicit_type_name(outputType), + gTestMipmaps ? ", float lod" : " ", samplerVar, + gTestMipmaps ? offset3DLodKernelSource : offset3DKernelSource, + floatCoords ? float3DUnnormalizedCoordKernelSource + : int3DCoordKernelSource, + readFormat, gTestMipmaps ? ",lod" : " "); ptr = programSrc; error = create_single_kernel_helper(context, &program, &kernel, 1, &ptr, diff --git a/test_conformance/images/kernel_read_write/test_write_1D.cpp b/test_conformance/images/kernel_read_write/test_write_1D.cpp index 41983edf..5f726796 100644 --- a/test_conformance/images/kernel_read_write/test_write_1D.cpp +++ b/test_conformance/images/kernel_read_write/test_write_1D.cpp @@ -14,6 +14,7 @@ // limitations under the License. 
// #include "../testBase.h" +#include "test_common.h" #if !defined(_WIN32) #include <sys/mman.h> @@ -26,20 +27,24 @@ extern bool validate_float_write_results( float *expected, float *actual, image_ extern bool validate_half_write_results( cl_half *expected, cl_half *actual, image_descriptor* imageInfo ); const char *readwrite1DKernelSourcePattern = -"__kernel void sample_kernel( __global %s4 *input, read_write image1d_t output %s)\n" -"{\n" -" int tidX = get_global_id(0);\n" -" int offset = tidX;\n" -" write_image%s( output, tidX %s, input[ offset ]);\n" -"}"; + "%s\n" + "__kernel void sample_kernel( __global %s4 *input, read_write image1d_t " + "output %s)\n" + "{\n" + " int tidX = get_global_id(0);\n" + " int offset = tidX;\n" + " write_image%s( output, tidX %s, input[ offset ]);\n" + "}"; const char *write1DKernelSourcePattern = -"__kernel void sample_kernel( __global %s4 *input, write_only image1d_t output %s)\n" -"{\n" -" int tidX = get_global_id(0);\n" -" int offset = tidX;\n" -" write_image%s( output, tidX %s, input[ offset ]);\n" -"}"; + "%s\n" + "__kernel void sample_kernel( __global %s4 *input, write_only image1d_t " + "output %s)\n" + "{\n" + " int tidX = get_global_id(0);\n" + " int offset = tidX;\n" + " write_image%s( output, tidX %s, input[ offset ]);\n" + "}"; int test_write_image_1D( cl_device_id device, cl_context context, cl_command_queue queue, cl_kernel kernel, image_descriptor *imageInfo, ExplicitType inputType, MTdata d ) @@ -395,6 +400,8 @@ int test_write_image_1D( cl_device_id device, cl_context context, cl_command_que } else { + filter_undefined_bits(imageInfo, resultPtr); + // Exact result passes every time if( memcmp( resultBuffer, resultPtr, get_pixel_size( imageInfo->format ) ) != 0 ) { @@ -403,21 +410,8 @@ int test_write_image_1D( cl_device_id device, cl_context context, cl_command_que float errors[4] = {NAN, NAN, NAN, NAN}; pack_image_pixel_error( (float *)imagePtr, imageInfo->format, resultBuffer, errors ); - // We are allowed 0.6 
absolute error vs. infinitely precise for some normalized formats - if( 0 == forceCorrectlyRoundedWrites && - ( - imageInfo->format->image_channel_data_type == CL_UNORM_INT8 || - imageInfo->format->image_channel_data_type == CL_UNORM_INT_101010 || - imageInfo->format->image_channel_data_type == CL_UNORM_INT16 || - imageInfo->format->image_channel_data_type == CL_SNORM_INT8 || - imageInfo->format->image_channel_data_type == CL_SNORM_INT16 - )) - { - if( ! (fabsf( errors[0] ) > 0.6f) && ! (fabsf( errors[1] ) > 0.6f) && - ! (fabsf( errors[2] ) > 0.6f) && ! (fabsf( errors[3] ) > 0.6f) ) - failure = 0; - } - + failure = filter_rounding_errors( + forceCorrectlyRoundedWrites, imageInfo, errors); if( failure ) { @@ -458,6 +452,56 @@ int test_write_image_1D( cl_device_id device, cl_context context, cl_command_que log_error( " Actual: 0x%2.2x 0x%2.2x 0x%2.2x 0x%2.2x\n", ((cl_uchar*)resultPtr)[0], ((cl_uchar*)resultPtr)[1], ((cl_uchar*)resultPtr)[2], ((cl_uchar*)resultPtr)[3] ); log_error( " Error: %f %f %f %f\n", errors[0], errors[1], errors[2], errors[3] ); break; + case CL_UNORM_SHORT_565: { + cl_uint *ref_value = + (cl_uint *)resultBuffer; + cl_uint *test_value = + (cl_uint *)resultPtr; + + log_error(" Expected: 0x%2.2x Actual: " + "0x%2.2x \n", + ref_value[0], test_value[0]); + + log_error(" Expected: 0x%2.2x " + "0x%2.2x 0x%2.2x \n", + ref_value[0] & 0x1F, + (ref_value[0] >> 5) & 0x3F, + (ref_value[0] >> 11) & 0x1F); + log_error(" Actual: 0x%2.2x " + "0x%2.2x 0x%2.2x \n", + test_value[0] & 0x1F, + (test_value[0] >> 5) & 0x3F, + (test_value[0] >> 11) & 0x1F); + log_error(" Error: %f %f %f %f\n", + errors[0], errors[1], + errors[2]); + break; + } + case CL_UNORM_SHORT_555: { + cl_uint *ref_value = + (cl_uint *)resultBuffer; + cl_uint *test_value = + (cl_uint *)resultPtr; + + log_error(" Expected: 0x%2.2x Actual: " + "0x%2.2x \n", + ref_value[0], test_value[0]); + + log_error(" Expected: 0x%2.2x " + "0x%2.2x 0x%2.2x \n", + ref_value[0] & 0x1F, + (ref_value[0] >> 5) & 0x1F, 
+ (ref_value[0] >> 10) & 0x1F); + log_error(" Actual: 0x%2.2x " + "0x%2.2x 0x%2.2x \n", + test_value[0] & 0x1F, + (test_value[0] >> 5) & 0x1F, + (test_value[0] >> 10) & 0x1F); + log_error(" Error: %f %f %f %f\n", + errors[0], errors[1], + errors[2]); + break; + } case CL_UNORM_INT16: case CL_SNORM_INT16: case CL_UNSIGNED_INT16: @@ -574,12 +618,14 @@ int test_write_image_1D_set(cl_device_id device, cl_context context, KernelSourcePattern = readwrite1DKernelSourcePattern; } - sprintf( programSrc, - KernelSourcePattern, - get_explicit_type_name( inputType ), - gTestMipmaps ? ", int lod" : "", - readFormat, - gTestMipmaps ? ", lod" :"" ); + sprintf( + programSrc, KernelSourcePattern, + gTestMipmaps + ? "#pragma OPENCL EXTENSION cl_khr_mipmap_image: enable\n#pragma " + "OPENCL EXTENSION cl_khr_mipmap_image_writes: enable" + : "", + get_explicit_type_name(inputType), gTestMipmaps ? ", int lod" : "", + readFormat, gTestMipmaps ? ", lod" : ""); ptr = programSrc; error = create_single_kernel_helper(context, &program, &kernel, 1, &ptr, diff --git a/test_conformance/images/kernel_read_write/test_write_1D_array.cpp b/test_conformance/images/kernel_read_write/test_write_1D_array.cpp index c771704c..f9024405 100644 --- a/test_conformance/images/kernel_read_write/test_write_1D_array.cpp +++ b/test_conformance/images/kernel_read_write/test_write_1D_array.cpp @@ -14,6 +14,7 @@ // limitations under the License. 
// #include "../testBase.h" +#include "test_common.h" #if !defined(_WIN32) #include <sys/mman.h> @@ -26,20 +27,24 @@ extern bool validate_float_write_results( float *expected, float *actual, image_ extern bool validate_half_write_results( cl_half *expected, cl_half *actual, image_descriptor *imageInfo ); const char *readwrite1DArrayKernelSourcePattern = -"__kernel void sample_kernel( __global %s4 *input, read_write image1d_array_t output %s)\n" -"{\n" -" int tidX = get_global_id(0), tidY = get_global_id(1);\n" -"%s" -" write_image%s( output, (int2)( tidX, tidY )%s, input[ offset ]);\n" -"}"; + "%s\n" + "__kernel void sample_kernel( __global %s4 *input, read_write " + "image1d_array_t output %s)\n" + "{\n" + " int tidX = get_global_id(0), tidY = get_global_id(1);\n" + "%s" + " write_image%s( output, (int2)( tidX, tidY )%s, input[ offset ]);\n" + "}"; const char *write1DArrayKernelSourcePattern = -"__kernel void sample_kernel( __global %s4 *input, write_only image1d_array_t output %s)\n" -"{\n" -" int tidX = get_global_id(0), tidY = get_global_id(1);\n" -"%s" -" write_image%s( output, (int2)( tidX, tidY ) %s, input[ offset ]);\n" -"}"; + "%s\n" + "__kernel void sample_kernel( __global %s4 *input, write_only " + "image1d_array_t output %s)\n" + "{\n" + " int tidX = get_global_id(0), tidY = get_global_id(1);\n" + "%s" + " write_image%s( output, (int2)( tidX, tidY ) %s, input[ offset ]);\n" + "}"; const char *offset1DArraySource = " int offset = tidY*get_image_width(output) + tidX;\n"; @@ -415,6 +420,9 @@ int test_write_image_1D_array( cl_device_id device, cl_context context, cl_comma } else { + + filter_undefined_bits(imageInfo, resultPtr); + // Exact result passes every time if( memcmp( resultBuffer, resultPtr, pixelSize ) != 0 ) { @@ -423,21 +431,8 @@ int test_write_image_1D_array( cl_device_id device, cl_context context, cl_comma float errors[4] = {NAN, NAN, NAN, NAN}; pack_image_pixel_error( (float *)imagePtr, imageInfo->format, resultBuffer, errors ); - // We are 
allowed 0.6 absolute error vs. infinitely precise for some normalized formats - if( 0 == forceCorrectlyRoundedWrites && - ( - imageInfo->format->image_channel_data_type == CL_UNORM_INT8 || - imageInfo->format->image_channel_data_type == CL_UNORM_INT_101010 || - imageInfo->format->image_channel_data_type == CL_UNORM_INT16 || - imageInfo->format->image_channel_data_type == CL_SNORM_INT8 || - imageInfo->format->image_channel_data_type == CL_SNORM_INT16 - )) - { - if( ! (fabsf( errors[0] ) > 0.6f) && ! (fabsf( errors[1] ) > 0.6f) && - ! (fabsf( errors[2] ) > 0.6f) && ! (fabsf( errors[3] ) > 0.6f) ) - failure = 0; - } - + failure = filter_rounding_errors( + forceCorrectlyRoundedWrites, imageInfo, errors); if( failure ) { @@ -478,6 +473,56 @@ int test_write_image_1D_array( cl_device_id device, cl_context context, cl_comma log_error( " Actual: 0x%2.2x 0x%2.2x 0x%2.2x 0x%2.2x\n", ((cl_uchar*)resultPtr)[0], ((cl_uchar*)resultPtr)[1], ((cl_uchar*)resultPtr)[2], ((cl_uchar*)resultPtr)[3] ); log_error( " Error: %f %f %f %f\n", errors[0], errors[1], errors[2], errors[3] ); break; + case CL_UNORM_SHORT_565: { + cl_uint *ref_value = + (cl_uint *)resultBuffer; + cl_uint *test_value = + (cl_uint *)resultPtr; + + log_error(" Expected: 0x%2.2x Actual: " + "0x%2.2x \n", + ref_value[0], test_value[0]); + + log_error(" Expected: 0x%2.2x " + "0x%2.2x 0x%2.2x \n", + ref_value[0] & 0x1F, + (ref_value[0] >> 5) & 0x3F, + (ref_value[0] >> 11) & 0x1F); + log_error(" Actual: 0x%2.2x " + "0x%2.2x 0x%2.2x \n", + test_value[0] & 0x1F, + (test_value[0] >> 5) & 0x3F, + (test_value[0] >> 11) & 0x1F); + log_error(" Error: %f %f %f %f\n", + errors[0], errors[1], + errors[2]); + break; + } + case CL_UNORM_SHORT_555: { + cl_uint *ref_value = + (cl_uint *)resultBuffer; + cl_uint *test_value = + (cl_uint *)resultPtr; + + log_error(" Expected: 0x%2.2x Actual: " + "0x%2.2x \n", + ref_value[0], test_value[0]); + + log_error(" Expected: 0x%2.2x " + "0x%2.2x 0x%2.2x \n", + ref_value[0] & 0x1F, + (ref_value[0] 
>> 5) & 0x1F, + (ref_value[0] >> 10) & 0x1F); + log_error(" Actual: 0x%2.2x " + "0x%2.2x 0x%2.2x \n", + test_value[0] & 0x1F, + (test_value[0] >> 5) & 0x1F, + (test_value[0] >> 10) & 0x1F); + log_error(" Error: %f %f %f %f\n", + errors[0], errors[1], + errors[2]); + break; + } case CL_UNORM_INT16: case CL_SNORM_INT16: case CL_UNSIGNED_INT16: @@ -596,13 +641,15 @@ int test_write_image_1D_array_set(cl_device_id device, cl_context context, } // Construct the source // Construct the source - sprintf( programSrc, - KernelSourcePattern, - get_explicit_type_name( inputType ), - gTestMipmaps ? ", int lod" : "", - gTestMipmaps ? offset1DArrayLodSource : offset1DArraySource, - readFormat, - gTestMipmaps ? ", lod" :"" ); + sprintf( + programSrc, KernelSourcePattern, + gTestMipmaps + ? "#pragma OPENCL EXTENSION cl_khr_mipmap_image: enable\n#pragma " + "OPENCL EXTENSION cl_khr_mipmap_image_writes: enable" + : "", + get_explicit_type_name(inputType), gTestMipmaps ? ", int lod" : "", + gTestMipmaps ? offset1DArrayLodSource : offset1DArraySource, readFormat, + gTestMipmaps ? ", lod" : ""); ptr = programSrc; error = create_single_kernel_helper(context, &program, &kernel, 1, &ptr, diff --git a/test_conformance/images/kernel_read_write/test_write_2D_array.cpp b/test_conformance/images/kernel_read_write/test_write_2D_array.cpp index 08a7a803..c1c56994 100644 --- a/test_conformance/images/kernel_read_write/test_write_2D_array.cpp +++ b/test_conformance/images/kernel_read_write/test_write_2D_array.cpp @@ -14,6 +14,7 @@ // limitations under the License. 
// #include "../testBase.h" +#include "test_common.h" #if !defined(_WIN32) #include <sys/mman.h> @@ -48,20 +49,28 @@ static size_t reduceImageDepth(size_t maxDepth) { } const char *write2DArrayKernelSourcePattern = -"__kernel void sample_kernel( __global %s%s *input, write_only %s output %s)\n" -"{\n" -" int tidX = get_global_id(0), tidY = get_global_id(1), tidZ = get_global_id(2);\n" -"%s" -" write_image%s( output, (int4)( tidX, tidY, tidZ, 0 ) %s, input[ offset ]);\n" -"}"; + "%s\n" + "__kernel void sample_kernel( __global %s%s *input, write_only %s output " + "%s)\n" + "{\n" + " int tidX = get_global_id(0), tidY = get_global_id(1), tidZ = " + "get_global_id(2);\n" + "%s" + " write_image%s( output, (int4)( tidX, tidY, tidZ, 0 ) %s, input[ offset " + "]);\n" + "}"; const char *readwrite2DArrayKernelSourcePattern = -"__kernel void sample_kernel( __global %s%s *input, read_write %s output %s)\n" -"{\n" -" int tidX = get_global_id(0), tidY = get_global_id(1), tidZ = get_global_id(2);\n" -"%s" -" write_image%s( output, (int4)( tidX, tidY, tidZ, 0 ) %s, input[ offset ] );\n" -"}"; + "%s\n" + "__kernel void sample_kernel( __global %s%s *input, read_write %s output " + "%s)\n" + "{\n" + " int tidX = get_global_id(0), tidY = get_global_id(1), tidZ = " + "get_global_id(2);\n" + "%s" + " write_image%s( output, (int4)( tidX, tidY, tidZ, 0 ) %s, input[ offset " + "] );\n" + "}"; const char *offset2DArrayKernelSource = " int offset = tidZ*get_image_width(output)*get_image_height(output) + tidY*get_image_width(output) + tidX;\n"; @@ -438,6 +447,9 @@ int test_write_image_2D_array( cl_device_id device, cl_context context, cl_comma } else { + + filter_undefined_bits(imageInfo, resultPtr); + // Exact result passes every time if( memcmp( resultBuffer, resultPtr, get_pixel_size( imageInfo->format ) ) != 0 ) { @@ -446,21 +458,9 @@ int test_write_image_2D_array( cl_device_id device, cl_context context, cl_comma float errors[4] = {NAN, NAN, NAN, NAN}; pack_image_pixel_error( (float 
*)imagePtr, imageInfo->format, resultBuffer, errors ); - // We are allowed 0.6 absolute error vs. infinitely precise for some normalized formats - if( 0 == forceCorrectlyRoundedWrites && - ( - imageInfo->format->image_channel_data_type == CL_UNORM_INT8 || - imageInfo->format->image_channel_data_type == CL_UNORM_INT_101010 || - imageInfo->format->image_channel_data_type == CL_UNORM_INT16 || - imageInfo->format->image_channel_data_type == CL_SNORM_INT8 || - imageInfo->format->image_channel_data_type == CL_SNORM_INT16 - )) - { - if( ! (fabsf( errors[0] ) > 0.6f) && ! (fabsf( errors[1] ) > 0.6f) && - ! (fabsf( errors[2] ) > 0.6f) && ! (fabsf( errors[3] ) > 0.6f) ) - failure = 0; - } - + failure = filter_rounding_errors( + forceCorrectlyRoundedWrites, imageInfo, + errors); if( failure ) { @@ -501,6 +501,64 @@ int test_write_image_2D_array( cl_device_id device, cl_context context, cl_comma log_error( " Actual: 0x%2.2x 0x%2.2x 0x%2.2x 0x%2.2x\n", ((cl_uchar*)resultPtr)[0], ((cl_uchar*)resultPtr)[1], ((cl_uchar*)resultPtr)[2], ((cl_uchar*)resultPtr)[3] ); log_error( " Error: %f %f %f %f\n", errors[0], errors[1], errors[2], errors[3] ); break; + case CL_UNORM_SHORT_565: { + cl_uint *ref_value = + (cl_uint *)resultBuffer; + cl_uint *test_value = + (cl_uint *)resultPtr; + + log_error(" Expected: 0x%2.2x " + "Actual: 0x%2.2x \n", + ref_value[0], + test_value[0]); + + log_error( + " Expected: 0x%2.2x 0x%2.2x " + "0x%2.2x \n", + ref_value[0] & 0x1F, + (ref_value[0] >> 5) & 0x3F, + (ref_value[0] >> 11) & 0x1F); + log_error( + " Actual: 0x%2.2x 0x%2.2x " + "0x%2.2x \n", + test_value[0] & 0x1F, + (test_value[0] >> 5) & 0x3F, + (test_value[0] >> 11) & 0x1F); + log_error( + " Error: %f %f %f %f\n", + errors[0], errors[1], + errors[2]); + break; + } + case CL_UNORM_SHORT_555: { + cl_uint *ref_value = + (cl_uint *)resultBuffer; + cl_uint *test_value = + (cl_uint *)resultPtr; + + log_error(" Expected: 0x%2.2x " + "Actual: 0x%2.2x \n", + ref_value[0], + test_value[0]); + + log_error( + " 
Expected: 0x%2.2x 0x%2.2x " + "0x%2.2x \n", + ref_value[0] & 0x1F, + (ref_value[0] >> 5) & 0x1F, + (ref_value[0] >> 10) & 0x1F); + log_error( + " Actual: 0x%2.2x 0x%2.2x " + "0x%2.2x \n", + test_value[0] & 0x1F, + (test_value[0] >> 5) & 0x1F, + (test_value[0] >> 10) & 0x1F); + log_error( + " Error: %f %f %f %f\n", + errors[0], errors[1], + errors[2]); + break; + } case CL_UNORM_INT16: case CL_SNORM_INT16: case CL_UNSIGNED_INT16: @@ -621,15 +679,19 @@ int test_write_image_2D_array_set(cl_device_id device, cl_context context, } // Construct the source // Construct the source - sprintf( programSrc, - KernelSourcePattern, - get_explicit_type_name( inputType ), - (format->image_channel_order == CL_DEPTH) ? "" : "4", - (format->image_channel_order == CL_DEPTH) ? "image2d_array_depth_t" : "image2d_array_t", - gTestMipmaps ? " , int lod" : "", - gTestMipmaps ? offset2DArrayLodKernelSource : offset2DArrayKernelSource, - readFormat, - gTestMipmaps ? ", lod" : "" ); + sprintf( + programSrc, KernelSourcePattern, + gTestMipmaps + ? "#pragma OPENCL EXTENSION cl_khr_mipmap_image: enable\n#pragma " + "OPENCL EXTENSION cl_khr_mipmap_image_writes: enable" + : "", + get_explicit_type_name(inputType), + (format->image_channel_order == CL_DEPTH) ? "" : "4", + (format->image_channel_order == CL_DEPTH) ? "image2d_array_depth_t" + : "image2d_array_t", + gTestMipmaps ? " , int lod" : "", + gTestMipmaps ? offset2DArrayLodKernelSource : offset2DArrayKernelSource, + readFormat, gTestMipmaps ? ", lod" : ""); ptr = programSrc; error = create_single_kernel_helper(context, &program, &kernel, 1, &ptr, diff --git a/test_conformance/images/kernel_read_write/test_write_3D.cpp b/test_conformance/images/kernel_read_write/test_write_3D.cpp index 5cc96bb4..9da93695 100644 --- a/test_conformance/images/kernel_read_write/test_write_3D.cpp +++ b/test_conformance/images/kernel_read_write/test_write_3D.cpp @@ -14,6 +14,7 @@ // limitations under the License. 
// #include "../testBase.h" +#include "test_common.h" #if !defined(_WIN32) #include <sys/mman.h> @@ -45,22 +46,30 @@ static size_t reduceImageDepth(size_t maxDimSize, MTdata& seed) { const char *write3DKernelSourcePattern = -"%s" -"__kernel void sample_kernel( __global %s4 *input, write_only image3d_t output %s )\n" -"{\n" -" int tidX = get_global_id(0), tidY = get_global_id(1), tidZ = get_global_id(2);\n" -"%s" -" write_image%s( output, (int4)( tidX, tidY, tidZ, 0 ) %s, input[ offset ]);\n" -"}"; + "%s" + "%s\n" + "__kernel void sample_kernel( __global %s4 *input, write_only image3d_t " + "output %s )\n" + "{\n" + " int tidX = get_global_id(0), tidY = get_global_id(1), tidZ = " + "get_global_id(2);\n" + "%s" + " write_image%s( output, (int4)( tidX, tidY, tidZ, 0 ) %s, input[ offset " + "]);\n" + "}"; const char *readwrite3DKernelSourcePattern = -"%s" -"__kernel void sample_kernel( __global %s4 *input, read_write image3d_t output %s )\n" -"{\n" -" int tidX = get_global_id(0), tidY = get_global_id(1), tidZ = get_global_id(2);\n" -"%s" -" write_image%s( output, (int4)( tidX, tidY, tidZ, 0 ) %s, input[ offset ]);\n" -"}"; + "%s" + "%s\n" + "__kernel void sample_kernel( __global %s4 *input, read_write image3d_t " + "output %s )\n" + "{\n" + " int tidX = get_global_id(0), tidY = get_global_id(1), tidZ = " + "get_global_id(2);\n" + "%s" + " write_image%s( output, (int4)( tidX, tidY, tidZ, 0 ) %s, input[ offset " + "]);\n" + "}"; const char *khr3DWritesPragma = "#pragma OPENCL EXTENSION cl_khr_3d_image_writes : enable\n"; @@ -445,6 +454,9 @@ int test_write_image_3D( cl_device_id device, cl_context context, cl_command_que } else { + + filter_undefined_bits(imageInfo, resultPtr); + // Exact result passes every time if( memcmp( resultBuffer, resultPtr, get_pixel_size( imageInfo->format ) ) != 0 ) { @@ -453,21 +465,9 @@ int test_write_image_3D( cl_device_id device, cl_context context, cl_command_que float errors[4] = {NAN, NAN, NAN, NAN}; pack_image_pixel_error( (float 
*)imagePtr, imageInfo->format, resultBuffer, errors ); - // We are allowed 0.6 absolute error vs. infinitely precise for some normalized formats - if( 0 == forceCorrectlyRoundedWrites && - ( - imageInfo->format->image_channel_data_type == CL_UNORM_INT8 || - imageInfo->format->image_channel_data_type == CL_UNORM_INT_101010 || - imageInfo->format->image_channel_data_type == CL_UNORM_INT16 || - imageInfo->format->image_channel_data_type == CL_SNORM_INT8 || - imageInfo->format->image_channel_data_type == CL_SNORM_INT16 - )) - { - if( ! (fabsf( errors[0] ) > 0.6f) && ! (fabsf( errors[1] ) > 0.6f) && - ! (fabsf( errors[2] ) > 0.6f) && ! (fabsf( errors[3] ) > 0.6f) ) - failure = 0; - } - + failure = filter_rounding_errors( + forceCorrectlyRoundedWrites, imageInfo, + errors); if( failure ) { @@ -508,6 +508,64 @@ int test_write_image_3D( cl_device_id device, cl_context context, cl_command_que log_error( " Actual: 0x%2.2x 0x%2.2x 0x%2.2x 0x%2.2x\n", ((cl_uchar*)resultPtr)[0], ((cl_uchar*)resultPtr)[1], ((cl_uchar*)resultPtr)[2], ((cl_uchar*)resultPtr)[3] ); log_error( " Error: %f %f %f %f\n", errors[0], errors[1], errors[2], errors[3] ); break; + case CL_UNORM_SHORT_565: { + cl_uint *ref_value = + (cl_uint *)resultBuffer; + cl_uint *test_value = + (cl_uint *)resultPtr; + + log_error(" Expected: 0x%2.2x " + "Actual: 0x%2.2x \n", + ref_value[0], + test_value[0]); + + log_error( + " Expected: 0x%2.2x 0x%2.2x " + "0x%2.2x \n", + ref_value[0] & 0x1F, + (ref_value[0] >> 5) & 0x3F, + (ref_value[0] >> 11) & 0x1F); + log_error( + " Actual: 0x%2.2x 0x%2.2x " + "0x%2.2x \n", + test_value[0] & 0x1F, + (test_value[0] >> 5) & 0x3F, + (test_value[0] >> 11) & 0x1F); + log_error( + " Error: %f %f %f %f\n", + errors[0], errors[1], + errors[2]); + break; + } + case CL_UNORM_SHORT_555: { + cl_uint *ref_value = + (cl_uint *)resultBuffer; + cl_uint *test_value = + (cl_uint *)resultPtr; + + log_error(" Expected: 0x%2.2x " + "Actual: 0x%2.2x \n", + ref_value[0], + test_value[0]); + + log_error( + " 
Expected: 0x%2.2x 0x%2.2x " + "0x%2.2x \n", + ref_value[0] & 0x1F, + (ref_value[0] >> 5) & 0x1F, + (ref_value[0] >> 10) & 0x1F); + log_error( + " Actual: 0x%2.2x 0x%2.2x " + "0x%2.2x \n", + test_value[0] & 0x1F, + (test_value[0] >> 5) & 0x1F, + (test_value[0] >> 10) & 0x1F); + log_error( + " Error: %f %f %f %f\n", + errors[0], errors[1], + errors[2]); + break; + } case CL_UNORM_INT16: case CL_SNORM_INT16: case CL_UNSIGNED_INT16: @@ -628,14 +686,15 @@ int test_write_image_3D_set(cl_device_id device, cl_context context, } // Construct the source - sprintf( programSrc, - KernelSourcePattern, - gTestMipmaps ? "" : khr3DWritesPragma, - get_explicit_type_name( inputType ), - gTestMipmaps ? ", int lod" : "", - gTestMipmaps ? offset3DLodSource : offset3DSource, - readFormat, - gTestMipmaps ? ", lod" : "" ); + sprintf( + programSrc, KernelSourcePattern, khr3DWritesPragma, + gTestMipmaps + ? "#pragma OPENCL EXTENSION cl_khr_mipmap_image: enable\n#pragma " + "OPENCL EXTENSION cl_khr_mipmap_image_writes: enable" + : "", + get_explicit_type_name(inputType), gTestMipmaps ? ", int lod" : "", + gTestMipmaps ? offset3DLodSource : offset3DSource, readFormat, + gTestMipmaps ? ", lod" : ""); ptr = programSrc; error = create_single_kernel_helper(context, &program, &kernel, 1, &ptr, diff --git a/test_conformance/images/kernel_read_write/test_write_image.cpp b/test_conformance/images/kernel_read_write/test_write_image.cpp index e40e80d6..29626971 100644 --- a/test_conformance/images/kernel_read_write/test_write_image.cpp +++ b/test_conformance/images/kernel_read_write/test_write_image.cpp @@ -14,6 +14,7 @@ // limitations under the License. 
// #include "../testBase.h" +#include "test_common.h" #if !defined(_WIN32) #include <sys/mman.h> @@ -46,20 +47,24 @@ extern bool validate_float_write_results( float *expected, float *actual, image_ extern bool validate_half_write_results( cl_half *expected, cl_half *actual, image_descriptor *imageInfo ); const char *writeKernelSourcePattern = -"__kernel void sample_kernel( __global %s%s *input, write_only %s output %s)\n" -"{\n" -" int tidX = get_global_id(0), tidY = get_global_id(1);\n" -"%s" -" write_image%s( output, (int2)( tidX, tidY ) %s, input[ offset ]);\n" -"}"; + "%s\n" + "__kernel void sample_kernel( __global %s%s *input, write_only %s output " + "%s)\n" + "{\n" + " int tidX = get_global_id(0), tidY = get_global_id(1);\n" + "%s" + " write_image%s( output, (int2)( tidX, tidY ) %s, input[ offset ]);\n" + "}"; const char *read_writeKernelSourcePattern = -"__kernel void sample_kernel( __global %s%s *input, read_write %s output %s)\n" -"{\n" -" int tidX = get_global_id(0), tidY = get_global_id(1);\n" -"%s" -" write_image%s( output, (int2)( tidX, tidY )%s, input[ offset ] );\n" -"}"; + "%s\n" + "__kernel void sample_kernel( __global %s%s *input, read_write %s output " + "%s)\n" + "{\n" + " int tidX = get_global_id(0), tidY = get_global_id(1);\n" + "%s" + " write_image%s( output, (int2)( tidX, tidY )%s, input[ offset ] );\n" + "}"; const char *offset2DKernelSource = " int offset = tidY*get_image_width(output) + tidX;\n"; @@ -477,6 +482,9 @@ int test_write_image( cl_device_id device, cl_context context, cl_command_queue } else { + + filter_undefined_bits(imageInfo, resultPtr); + // Exact result passes every time if( memcmp( resultBuffer, resultPtr, get_pixel_size( imageInfo->format ) ) != 0 ) { @@ -485,21 +493,8 @@ int test_write_image( cl_device_id device, cl_context context, cl_command_queue float errors[4] = {NAN, NAN, NAN, NAN}; pack_image_pixel_error( (float *)imagePtr, imageInfo->format, resultBuffer, errors ); - // We are allowed 0.6 absolute error vs. 
infinitely precise for some normalized formats - if( 0 == forceCorrectlyRoundedWrites && - ( - imageInfo->format->image_channel_data_type == CL_UNORM_INT8 || - imageInfo->format->image_channel_data_type == CL_UNORM_INT_101010 || - imageInfo->format->image_channel_data_type == CL_UNORM_INT16 || - imageInfo->format->image_channel_data_type == CL_SNORM_INT8 || - imageInfo->format->image_channel_data_type == CL_SNORM_INT16 - )) - { - if( ! (fabsf( errors[0] ) > 0.6f) && ! (fabsf( errors[1] ) > 0.6f) && - ! (fabsf( errors[2] ) > 0.6f) && ! (fabsf( errors[3] ) > 0.6f) ) - failure = 0; - } - + failure = filter_rounding_errors( + forceCorrectlyRoundedWrites, imageInfo, errors); if( failure ) { @@ -577,6 +572,57 @@ int test_write_image( cl_device_id device, cl_context context, cl_command_queue log_error( " Actual: %a %a %a %a\n", ((cl_float*)resultPtr)[0], ((cl_float*)resultPtr)[1], ((cl_float*)resultPtr)[2], ((cl_float*)resultPtr)[3] ); log_error( " Ulps: %f %f %f %f\n", errors[0], errors[1], errors[2], errors[3] ); break; + case CL_UNORM_SHORT_565: { + cl_uint *ref_value = + (cl_uint *)resultBuffer; + cl_uint *test_value = + (cl_uint *)resultPtr; + + log_error(" Expected: 0x%2.2x Actual: " + "0x%2.2x \n", + ref_value[0], test_value[0]); + + log_error(" Expected: 0x%2.2x " + "0x%2.2x 0x%2.2x \n", + ref_value[0] & 0x1F, + (ref_value[0] >> 5) & 0x3F, + (ref_value[0] >> 11) & 0x1F); + log_error(" Actual: 0x%2.2x " + "0x%2.2x 0x%2.2x \n", + test_value[0] & 0x1F, + (test_value[0] >> 5) & 0x3F, + (test_value[0] >> 11) & 0x1F); + log_error(" Error: %f %f %f %f\n", + errors[0], errors[1], + errors[2]); + break; + } + + case CL_UNORM_SHORT_555: { + cl_uint *ref_value = + (cl_uint *)resultBuffer; + cl_uint *test_value = + (cl_uint *)resultPtr; + + log_error(" Expected: 0x%2.2x Actual: " + "0x%2.2x \n", + ref_value[0], test_value[0]); + + log_error(" Expected: 0x%2.2x " + "0x%2.2x 0x%2.2x \n", + ref_value[0] & 0x1F, + (ref_value[0] >> 5) & 0x1F, + (ref_value[0] >> 10) & 0x1F); + 
log_error(" Actual: 0x%2.2x " + "0x%2.2x 0x%2.2x \n", + test_value[0] & 0x1F, + (test_value[0] >> 5) & 0x1F, + (test_value[0] >> 10) & 0x1F); + log_error(" Error: %f %f %f %f\n", + errors[0], errors[1], + errors[2]); + break; + } } float *v = (float *)(char *)imagePtr; @@ -686,15 +732,19 @@ int test_write_image_set(cl_device_id device, cl_context context, } // Construct the source - sprintf( programSrc, - KernelSourcePattern, - get_explicit_type_name( inputType ), - (format->image_channel_order == CL_DEPTH) ? "" : "4", - (format->image_channel_order == CL_DEPTH) ? "image2d_depth_t" : "image2d_t", - gTestMipmaps ? ", int lod" : "", - gTestMipmaps ? offset2DLodKernelSource : offset2DKernelSource, - readFormat, - gTestMipmaps ? ", lod" : "" ); + sprintf( + programSrc, KernelSourcePattern, + gTestMipmaps + ? "#pragma OPENCL EXTENSION cl_khr_mipmap_image: enable\n#pragma " + "OPENCL EXTENSION cl_khr_mipmap_image_writes: enable" + : "", + get_explicit_type_name(inputType), + (format->image_channel_order == CL_DEPTH) ? "" : "4", + (format->image_channel_order == CL_DEPTH) ? "image2d_depth_t" + : "image2d_t", + gTestMipmaps ? ", int lod" : "", + gTestMipmaps ? offset2DLodKernelSource : offset2DKernelSource, + readFormat, gTestMipmaps ? 
", lod" : ""); ptr = programSrc; error = create_single_kernel_helper(context, &program, &kernel, 1, &ptr, diff --git a/test_conformance/images/samplerlessReads/test_iterations.cpp b/test_conformance/images/samplerlessReads/test_iterations.cpp index 55eaaf48..e2f89aad 100644 --- a/test_conformance/images/samplerlessReads/test_iterations.cpp +++ b/test_conformance/images/samplerlessReads/test_iterations.cpp @@ -215,6 +215,7 @@ int test_read_image_set_2D(cl_device_id device, cl_context context, if (memSize > (cl_ulong)SIZE_MAX) { memSize = (cl_ulong)SIZE_MAX; + maxAllocSize = (cl_ulong)SIZE_MAX; } // Determine types diff --git a/test_conformance/images/samplerlessReads/test_read_1D.cpp b/test_conformance/images/samplerlessReads/test_read_1D.cpp index aa261b7e..6ed9910a 100644 --- a/test_conformance/images/samplerlessReads/test_read_1D.cpp +++ b/test_conformance/images/samplerlessReads/test_read_1D.cpp @@ -215,6 +215,7 @@ int test_read_image_set_1D(cl_device_id device, cl_context context, if (memSize > (cl_ulong)SIZE_MAX) { memSize = (cl_ulong)SIZE_MAX; + maxAllocSize = (cl_ulong)SIZE_MAX; } // Determine types diff --git a/test_conformance/images/samplerlessReads/test_read_1D_array.cpp b/test_conformance/images/samplerlessReads/test_read_1D_array.cpp index fb0c2632..677eb9f1 100644 --- a/test_conformance/images/samplerlessReads/test_read_1D_array.cpp +++ b/test_conformance/images/samplerlessReads/test_read_1D_array.cpp @@ -214,6 +214,7 @@ int test_read_image_set_1D_array(cl_device_id device, cl_context context, if (memSize > (cl_ulong)SIZE_MAX) { memSize = (cl_ulong)SIZE_MAX; + maxAllocSize = (cl_ulong)SIZE_MAX; } // Determine types diff --git a/test_conformance/images/samplerlessReads/test_read_1D_buffer.cpp b/test_conformance/images/samplerlessReads/test_read_1D_buffer.cpp index 7a3084d3..c3a991a7 100644 --- a/test_conformance/images/samplerlessReads/test_read_1D_buffer.cpp +++ b/test_conformance/images/samplerlessReads/test_read_1D_buffer.cpp @@ -219,6 +219,7 @@ int 
test_read_image_set_1D_buffer(cl_device_id device, cl_context context, if (memSize > (cl_ulong)SIZE_MAX) { memSize = (cl_ulong)SIZE_MAX; + maxAllocSize = (cl_ulong)SIZE_MAX; } // note: image_buffer test uses image1D for results validation. diff --git a/test_conformance/images/samplerlessReads/test_read_2D_array.cpp b/test_conformance/images/samplerlessReads/test_read_2D_array.cpp index 99f24266..8273f538 100644 --- a/test_conformance/images/samplerlessReads/test_read_2D_array.cpp +++ b/test_conformance/images/samplerlessReads/test_read_2D_array.cpp @@ -202,6 +202,7 @@ int test_read_image_set_2D_array(cl_device_id device, cl_context context, if (memSize > (cl_ulong)SIZE_MAX) { memSize = (cl_ulong)SIZE_MAX; + maxAllocSize = (cl_ulong)SIZE_MAX; } // Determine types diff --git a/test_conformance/images/samplerlessReads/test_read_3D.cpp b/test_conformance/images/samplerlessReads/test_read_3D.cpp index cf411407..0df46c86 100644 --- a/test_conformance/images/samplerlessReads/test_read_3D.cpp +++ b/test_conformance/images/samplerlessReads/test_read_3D.cpp @@ -206,6 +206,7 @@ int test_read_image_set_3D(cl_device_id device, cl_context context, if (memSize > (cl_ulong)SIZE_MAX) { memSize = (cl_ulong)SIZE_MAX; + maxAllocSize = (cl_ulong)SIZE_MAX; } // Determine types diff --git a/test_conformance/integer_ops/CMakeLists.txt b/test_conformance/integer_ops/CMakeLists.txt index a045ef81..5344eabc 100644 --- a/test_conformance/integer_ops/CMakeLists.txt +++ b/test_conformance/integer_ops/CMakeLists.txt @@ -11,6 +11,7 @@ set(${MODULE_NAME}_SOURCES test_unary_ops.cpp verification_and_generation_functions.cpp test_popcount.cpp + test_integer_dot_product.cpp ) include(../CMakeCommon.txt) diff --git a/test_conformance/integer_ops/main.cpp b/test_conformance/integer_ops/main.cpp index 00e91661..e57cffd9 100644 --- a/test_conformance/integer_ops/main.cpp +++ b/test_conformance/integer_ops/main.cpp @@ -25,127 +25,129 @@ #endif test_definition test_list[] = { - ADD_TEST( integer_clz ), - 
ADD_TEST_VERSION( integer_ctz, Version(2, 0)), - ADD_TEST( integer_hadd ), - ADD_TEST( integer_rhadd ), - ADD_TEST( integer_mul_hi ), - ADD_TEST( integer_rotate ), - ADD_TEST( integer_clamp ), - ADD_TEST( integer_mad_sat ), - ADD_TEST( integer_mad_hi ), - ADD_TEST( integer_min ), - ADD_TEST( integer_max ), - ADD_TEST( integer_upsample ), - - ADD_TEST( integer_abs ), - ADD_TEST( integer_abs_diff ), - ADD_TEST( integer_add_sat ), - ADD_TEST( integer_sub_sat ), - - ADD_TEST( integer_addAssign ), - ADD_TEST( integer_subtractAssign ), - ADD_TEST( integer_multiplyAssign ), - ADD_TEST( integer_divideAssign ), - ADD_TEST( integer_moduloAssign ), - ADD_TEST( integer_andAssign ), - ADD_TEST( integer_orAssign ), - ADD_TEST( integer_exclusiveOrAssign ), - - ADD_TEST( unary_ops_increment ), - ADD_TEST( unary_ops_decrement ), - ADD_TEST( unary_ops_full ), - - ADD_TEST( integer_mul24 ), - ADD_TEST( integer_mad24 ), - - ADD_TEST( long_math ), - ADD_TEST( long_logic ), - ADD_TEST( long_shift ), - ADD_TEST( long_compare ), - - ADD_TEST( ulong_math ), - ADD_TEST( ulong_logic ), - ADD_TEST( ulong_shift ), - ADD_TEST( ulong_compare ), - - ADD_TEST( int_math ), - ADD_TEST( int_logic ), - ADD_TEST( int_shift ), - ADD_TEST( int_compare ), - - ADD_TEST( uint_math ), - ADD_TEST( uint_logic ), - ADD_TEST( uint_shift ), - ADD_TEST( uint_compare ), - - ADD_TEST( short_math ), - ADD_TEST( short_logic ), - ADD_TEST( short_shift ), - ADD_TEST( short_compare ), - - ADD_TEST( ushort_math ), - ADD_TEST( ushort_logic ), - ADD_TEST( ushort_shift ), - ADD_TEST( ushort_compare ), - - ADD_TEST( char_math ), - ADD_TEST( char_logic ), - ADD_TEST( char_shift ), - ADD_TEST( char_compare ), - - ADD_TEST( uchar_math ), - ADD_TEST( uchar_logic ), - ADD_TEST( uchar_shift ), - ADD_TEST( uchar_compare ), - - ADD_TEST( popcount ), + ADD_TEST(integer_clz), + ADD_TEST_VERSION(integer_ctz, Version(2, 0)), + ADD_TEST(integer_hadd), + ADD_TEST(integer_rhadd), + ADD_TEST(integer_mul_hi), + ADD_TEST(integer_rotate), + 
ADD_TEST(integer_clamp), + ADD_TEST(integer_mad_sat), + ADD_TEST(integer_mad_hi), + ADD_TEST(integer_min), + ADD_TEST(integer_max), + ADD_TEST(integer_upsample), + + ADD_TEST(integer_abs), + ADD_TEST(integer_abs_diff), + ADD_TEST(integer_add_sat), + ADD_TEST(integer_sub_sat), + + ADD_TEST(integer_addAssign), + ADD_TEST(integer_subtractAssign), + ADD_TEST(integer_multiplyAssign), + ADD_TEST(integer_divideAssign), + ADD_TEST(integer_moduloAssign), + ADD_TEST(integer_andAssign), + ADD_TEST(integer_orAssign), + ADD_TEST(integer_exclusiveOrAssign), + + ADD_TEST(unary_ops_increment), + ADD_TEST(unary_ops_decrement), + ADD_TEST(unary_ops_full), + + ADD_TEST(integer_mul24), + ADD_TEST(integer_mad24), + + ADD_TEST(long_math), + ADD_TEST(long_logic), + ADD_TEST(long_shift), + ADD_TEST(long_compare), + + ADD_TEST(ulong_math), + ADD_TEST(ulong_logic), + ADD_TEST(ulong_shift), + ADD_TEST(ulong_compare), + + ADD_TEST(int_math), + ADD_TEST(int_logic), + ADD_TEST(int_shift), + ADD_TEST(int_compare), + + ADD_TEST(uint_math), + ADD_TEST(uint_logic), + ADD_TEST(uint_shift), + ADD_TEST(uint_compare), + + ADD_TEST(short_math), + ADD_TEST(short_logic), + ADD_TEST(short_shift), + ADD_TEST(short_compare), + + ADD_TEST(ushort_math), + ADD_TEST(ushort_logic), + ADD_TEST(ushort_shift), + ADD_TEST(ushort_compare), + + ADD_TEST(char_math), + ADD_TEST(char_logic), + ADD_TEST(char_shift), + ADD_TEST(char_compare), + + ADD_TEST(uchar_math), + ADD_TEST(uchar_logic), + ADD_TEST(uchar_shift), + ADD_TEST(uchar_compare), + + ADD_TEST(popcount), // Quick - ADD_TEST( quick_long_math ), - ADD_TEST( quick_long_logic ), - ADD_TEST( quick_long_shift ), - ADD_TEST( quick_long_compare ), - - ADD_TEST( quick_ulong_math ), - ADD_TEST( quick_ulong_logic ), - ADD_TEST( quick_ulong_shift ), - ADD_TEST( quick_ulong_compare ), - - ADD_TEST( quick_int_math ), - ADD_TEST( quick_int_logic ), - ADD_TEST( quick_int_shift ), - ADD_TEST( quick_int_compare ), - - ADD_TEST( quick_uint_math ), - ADD_TEST( quick_uint_logic ), 
- ADD_TEST( quick_uint_shift ), - ADD_TEST( quick_uint_compare ), - - ADD_TEST( quick_short_math ), - ADD_TEST( quick_short_logic ), - ADD_TEST( quick_short_shift ), - ADD_TEST( quick_short_compare ), - - ADD_TEST( quick_ushort_math ), - ADD_TEST( quick_ushort_logic ), - ADD_TEST( quick_ushort_shift ), - ADD_TEST( quick_ushort_compare ), - - ADD_TEST( quick_char_math ), - ADD_TEST( quick_char_logic ), - ADD_TEST( quick_char_shift ), - ADD_TEST( quick_char_compare ), - - ADD_TEST( quick_uchar_math ), - ADD_TEST( quick_uchar_logic ), - ADD_TEST( quick_uchar_shift ), - ADD_TEST( quick_uchar_compare ), - - ADD_TEST( vector_scalar ), + ADD_TEST(quick_long_math), + ADD_TEST(quick_long_logic), + ADD_TEST(quick_long_shift), + ADD_TEST(quick_long_compare), + + ADD_TEST(quick_ulong_math), + ADD_TEST(quick_ulong_logic), + ADD_TEST(quick_ulong_shift), + ADD_TEST(quick_ulong_compare), + + ADD_TEST(quick_int_math), + ADD_TEST(quick_int_logic), + ADD_TEST(quick_int_shift), + ADD_TEST(quick_int_compare), + + ADD_TEST(quick_uint_math), + ADD_TEST(quick_uint_logic), + ADD_TEST(quick_uint_shift), + ADD_TEST(quick_uint_compare), + + ADD_TEST(quick_short_math), + ADD_TEST(quick_short_logic), + ADD_TEST(quick_short_shift), + ADD_TEST(quick_short_compare), + + ADD_TEST(quick_ushort_math), + ADD_TEST(quick_ushort_logic), + ADD_TEST(quick_ushort_shift), + ADD_TEST(quick_ushort_compare), + + ADD_TEST(quick_char_math), + ADD_TEST(quick_char_logic), + ADD_TEST(quick_char_shift), + ADD_TEST(quick_char_compare), + + ADD_TEST(quick_uchar_math), + ADD_TEST(quick_uchar_logic), + ADD_TEST(quick_uchar_shift), + ADD_TEST(quick_uchar_compare), + + ADD_TEST(vector_scalar), + + ADD_TEST(integer_dot_product), }; -const int test_num = ARRAY_SIZE( test_list ); +const int test_num = ARRAY_SIZE(test_list); void fill_test_values( cl_long *outBufferA, cl_long *outBufferB, size_t numElements, MTdata d ) { diff --git a/test_conformance/integer_ops/procs.h b/test_conformance/integer_ops/procs.h index 
d5b77e70..82311fb9 100644 --- a/test_conformance/integer_ops/procs.h +++ b/test_conformance/integer_ops/procs.h @@ -141,3 +141,5 @@ extern int test_unary_ops_decrement(cl_device_id deviceID, cl_context context, c extern int test_vector_scalar(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements); +extern int test_integer_dot_product(cl_device_id deviceID, cl_context context, + cl_command_queue queue, int num_elements); diff --git a/test_conformance/integer_ops/test_add_sat.cpp b/test_conformance/integer_ops/test_add_sat.cpp index c0e45d11..e33f5c67 100644 --- a/test_conformance/integer_ops/test_add_sat.cpp +++ b/test_conformance/integer_ops/test_add_sat.cpp @@ -21,18 +21,9 @@ #include <sys/types.h> #include <sys/stat.h> -#include "procs.h" - -#define UCHAR_MIN 0 -#define USHRT_MIN 0 -#define UINT_MIN 0 +#include <algorithm> -#ifndef MAX -#define MAX( _a, _b ) ( (_a) > (_b) ? (_a) : (_b) ) -#endif -#ifndef MIN -#define MIN( _a, _b ) ( (_a) < (_b) ? (_a) : (_b) ) -#endif +#include "procs.h" static int verify_addsat_char( const cl_char *inA, const cl_char *inB, const cl_char *outptr, int n, const char *sizeName, int vecSize ) { @@ -40,8 +31,8 @@ static int verify_addsat_char( const cl_char *inA, const cl_char *inB, const cl_ for( i = 0; i < n; i++ ) { cl_int r = (cl_int) inA[i] + (cl_int) inB[i]; - r = MAX( r, CL_CHAR_MIN ); - r = MIN( r, CL_CHAR_MAX ); + r = std::max(r, CL_CHAR_MIN); + r = std::min(r, CL_CHAR_MAX); if( r != outptr[i] ) { log_info( "\n%d) Failure for add_sat( (char%s) 0x%2.2x, (char%s) 0x%2.2x) = *0x%2.2x vs 0x%2.2x\n", i, sizeName, inA[i], sizeName, inB[i], r, outptr[i] ); return -1; } @@ -55,9 +46,9 @@ static int verify_addsat_uchar( const cl_uchar *inA, const cl_uchar *inB, const for( i = 0; i < n; i++ ) { cl_int r = (int) inA[i] + (int) inB[i]; - r = MAX( r, 0 ); - r = MIN( r, CL_UCHAR_MAX ); - if( r != outptr[i] ) + r = std::max(r, 0); + r = std::min(r, CL_UCHAR_MAX); + if (r != outptr[i]) { log_info( "\n%d) Failure 
for add_sat( (uchar%s) 0x%2.2x, (uchar%s) 0x%2.2x) = *0x%2.2x vs 0x%2.2x\n", i, sizeName, inA[i], sizeName, inB[i], r, outptr[i] ); return -1; } } return 0; @@ -69,8 +60,8 @@ static int verify_addsat_short( const cl_short *inA, const cl_short *inB, const for( i = 0; i < n; i++ ) { cl_int r = (cl_int) inA[i] + (cl_int) inB[i]; - r = MAX( r, CL_SHRT_MIN ); - r = MIN( r, CL_SHRT_MAX ); + r = std::max(r, CL_SHRT_MIN); + r = std::min(r, CL_SHRT_MAX); if( r != outptr[i] ) { log_info( "\n%d) Failure for add_sat( (short%s) 0x%4.4x, (short%s) 0x%4.4x) = *0x%4.4x vs 0x%4.4x\n", i, sizeName, inA[i], sizeName, inB[i], r, outptr[i] ); return -1; } @@ -84,8 +75,8 @@ static int verify_addsat_ushort( const cl_ushort *inA, const cl_ushort *inB, con for( i = 0; i < n; i++ ) { cl_int r = (cl_int) inA[i] + (cl_int) inB[i]; - r = MAX( r, 0 ); - r = MIN( r, CL_USHRT_MAX ); + r = std::max(r, 0); + r = std::min(r, CL_USHRT_MAX); if( r != outptr[i] ) { log_info( "\n%d) Failure for add_sat( (ushort%s) 0x%4.4x, (ushort%s) 0x%4.4x) = *0x%4.4x vs 0x%4.4x\n", i, sizeName, inA[i], sizeName, inB[i], r, outptr[i] ); return -1; } diff --git a/test_conformance/integer_ops/test_integer_dot_product.cpp b/test_conformance/integer_ops/test_integer_dot_product.cpp new file mode 100644 index 00000000..602d59b6 --- /dev/null +++ b/test_conformance/integer_ops/test_integer_dot_product.cpp @@ -0,0 +1,442 @@ +// +// Copyright (c) 2021 The Khronos Group Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. +// + +#include <algorithm> +#include <limits> +#include <numeric> +#include <string> +#include <vector> + +#include "procs.h" +#include "harness/integer_ops_test_info.h" +#include "harness/testHarness.h" + +template <size_t N, typename DstType, typename SrcTypeA, typename SrcTypeB> +static void +calculate_reference(std::vector<DstType>& ref, const std::vector<SrcTypeA>& a, + const std::vector<SrcTypeB>& b, const bool AccSat = false, + const std::vector<DstType>& acc = {}) +{ + assert(a.size() == b.size()); + assert(AccSat == false || acc.size() == a.size() / N); + + ref.resize(a.size() / N); + for (size_t r = 0; r < ref.size(); r++) + { + cl_long result = AccSat ? acc[r] : 0; + for (size_t c = 0; c < N; c++) + { + // OK to assume no overflow? + result += a[r * N + c] * b[r * N + c]; + } + if (AccSat && result > std::numeric_limits<DstType>::max()) + { + result = std::numeric_limits<DstType>::max(); + } + ref[r] = static_cast<DstType>(result); + } +} + +template <typename SrcTypeA, typename SrcTypeB> +void generate_inputs_with_special_values(std::vector<SrcTypeA>& a, + std::vector<SrcTypeB>& b) +{ + const std::vector<SrcTypeA> specialValuesA( + { static_cast<SrcTypeA>(std::numeric_limits<SrcTypeA>::min()), + static_cast<SrcTypeA>(std::numeric_limits<SrcTypeA>::min() + 1), + static_cast<SrcTypeA>(std::numeric_limits<SrcTypeA>::min() / 2), 0, + static_cast<SrcTypeA>(std::numeric_limits<SrcTypeA>::max() / 2), + static_cast<SrcTypeA>(std::numeric_limits<SrcTypeA>::max() - 1), + static_cast<SrcTypeA>(std::numeric_limits<SrcTypeA>::max()) }); + const std::vector<SrcTypeB> specialValuesB( + { static_cast<SrcTypeB>(std::numeric_limits<SrcTypeB>::min()), + static_cast<SrcTypeB>(std::numeric_limits<SrcTypeB>::min() + 1), + static_cast<SrcTypeB>(std::numeric_limits<SrcTypeB>::min() / 2), 0, + static_cast<SrcTypeB>(std::numeric_limits<SrcTypeB>::max() / 2), + 
static_cast<SrcTypeB>(std::numeric_limits<SrcTypeB>::max() - 1), + static_cast<SrcTypeB>(std::numeric_limits<SrcTypeB>::max()) }); + + size_t count = 0; + for (auto svA : specialValuesA) + { + for (auto svB : specialValuesB) + { + a[count] = svA; + b[count] = svB; + ++count; + } + } + + // Generate random data for the rest of the inputs: + MTdataHolder d(gRandomSeed); + generate_random_data(TestInfo<SrcTypeA>::explicitType, a.size() - count, d, + a.data() + count); + generate_random_data(TestInfo<SrcTypeB>::explicitType, b.size() - count, d, + b.data() + count); +} + +template <typename SrcType> +void generate_acc_sat_inputs(std::vector<SrcType>& acc) +{ + // First generate random data: + fill_vector_with_random_data(acc); + + // Now go through the generated data, and make every other element large. + // This ensures we have some elements that need saturation. + for (size_t i = 0; i < acc.size(); i += 2) + { + acc[i] = std::numeric_limits<SrcType>::max() - acc[i]; + } +} + +template <typename T> struct PackedTestInfo +{ + static constexpr const char* deviceTypeName = "UNSUPPORTED"; +}; +template <> struct PackedTestInfo<cl_char> +{ + static constexpr const char* deviceTypeName = "int"; +}; +template <> struct PackedTestInfo<cl_uchar> +{ + static constexpr const char* deviceTypeName = "uint"; +}; + +static constexpr const char* kernel_source_dot = R"CLC( +__kernel void test_dot(__global DSTTYPE* dst, __global SRCTYPEA* a, __global SRCTYPEB* b) +{ + int index = get_global_id(0); + dst[index] = DOT(a[index], b[index]); +} +)CLC"; + +static constexpr const char* kernel_source_dot_acc_sat = R"CLC( +__kernel void test_dot_acc_sat( + __global DSTTYPE* dst, + __global SRCTYPEA* a, __global SRCTYPEB* b, __global DSTTYPE* acc) +{ + int index = get_global_id(0); + dst[index] = DOT_ACC_SAT(a[index], b[index], acc[index]); +} +)CLC"; + +template <typename DstType, typename SrcTypeA, typename SrcTypeB, size_t N> +static int test_case_dot(cl_device_id deviceID, cl_context 
context, + cl_command_queue queue, int num_elements, bool packed, + bool sat) +{ + log_info(" testing %s = dot%s%s(%s, %s)\n", + std::numeric_limits<DstType>::is_signed ? "signed" : "unsigned", + sat ? "_acc_sat" : "", packed ? "_packed" : "", + std::numeric_limits<SrcTypeA>::is_signed ? "signed" : "unsigned", + std::numeric_limits<SrcTypeB>::is_signed ? "signed" : "unsigned"); + + cl_int error = CL_SUCCESS; + + clProgramWrapper program; + clKernelWrapper kernel; + + std::string buildOptions; + buildOptions += " -DDSTTYPE="; + buildOptions += TestInfo<DstType>::deviceTypeName; + buildOptions += " -DSRCTYPEA="; + buildOptions += packed + ? PackedTestInfo<SrcTypeA>::deviceTypeName + : TestInfo<SrcTypeA>::deviceTypeName + std::to_string(N); + buildOptions += " -DSRCTYPEB="; + buildOptions += packed + ? PackedTestInfo<SrcTypeB>::deviceTypeName + : TestInfo<SrcTypeB>::deviceTypeName + std::to_string(N); + std::string packedSuffix; + packedSuffix += std::numeric_limits<SrcTypeA>::is_signed ? "s" : "u"; + packedSuffix += std::numeric_limits<SrcTypeB>::is_signed ? "s" : "u"; + packedSuffix += std::numeric_limits<DstType>::is_signed ? "_int" : "_uint"; + if (sat) + { + buildOptions += packed + ? " -DDOT_ACC_SAT=dot_acc_sat_4x8packed_" + packedSuffix + : " -DDOT_ACC_SAT=dot_acc_sat"; + } + else + { + buildOptions += + packed ? " -DDOT=dot_4x8packed_" + packedSuffix : " -DDOT=dot"; + } + + std::vector<SrcTypeA> a(N * num_elements); + std::vector<SrcTypeB> b(N * num_elements); + generate_inputs_with_special_values(a, b); + + std::vector<DstType> acc; + if (sat) + { + acc.resize(num_elements); + generate_acc_sat_inputs(acc); + } + + std::vector<DstType> reference(num_elements); + calculate_reference<N>(reference, a, b, sat, acc); + + const char* source = sat ? kernel_source_dot_acc_sat : kernel_source_dot; + const char* name = sat ? 
"test_dot_acc_sat" : "test_dot"; + error = create_single_kernel_helper(context, &program, &kernel, 1, &source, + name, buildOptions.c_str()); + test_error(error, "Unable to create test kernel"); + + clMemWrapper dst = clCreateBuffer( + context, 0, reference.size() * sizeof(DstType), NULL, &error); + test_error(error, "Unable to create output buffer"); + + clMemWrapper srcA = + clCreateBuffer(context, CL_MEM_COPY_HOST_PTR, + a.size() * sizeof(SrcTypeA), a.data(), &error); + test_error(error, "Unable to create srcA buffer"); + + clMemWrapper srcB = + clCreateBuffer(context, CL_MEM_COPY_HOST_PTR, + b.size() * sizeof(SrcTypeB), b.data(), &error); + test_error(error, "Unable to create srcB buffer"); + + clMemWrapper srcAcc; + if (sat) + { + srcAcc = + clCreateBuffer(context, CL_MEM_COPY_HOST_PTR, + acc.size() * sizeof(DstType), acc.data(), &error); + test_error(error, "Unable to create acc buffer"); + } + + error = clSetKernelArg(kernel, 0, sizeof(dst), &dst); + test_error(error, "Unable to set output buffer kernel arg"); + + error = clSetKernelArg(kernel, 1, sizeof(srcA), &srcA); + test_error(error, "Unable to set srcA buffer kernel arg"); + + error = clSetKernelArg(kernel, 2, sizeof(srcB), &srcB); + test_error(error, "Unable to set srcB buffer kernel arg"); + + if (sat) + { + error = clSetKernelArg(kernel, 3, sizeof(srcAcc), &srcAcc); + test_error(error, "Unable to set acc buffer kernel arg"); + } + + size_t global_work_size[] = { reference.size() }; + error = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, global_work_size, + NULL, 0, NULL, NULL); + test_error(error, "Unable to enqueue test kernel"); + + error = clFinish(queue); + test_error(error, "clFinish failed after test kernel"); + + std::vector<DstType> results(reference.size(), 99); + error = clEnqueueReadBuffer(queue, dst, CL_TRUE, 0, + results.size() * sizeof(DstType), + results.data(), 0, NULL, NULL); + test_error(error, "Unable to read data after test kernel"); + + if (results != reference) + { + 
log_error("Result buffer did not match reference buffer!\n"); + return TEST_FAIL; + } + + return TEST_PASS; +} + +template <typename SrcType, typename DstType, size_t N> +static int test_vectype(cl_device_id deviceID, cl_context context, + cl_command_queue queue, int num_elements) +{ + int result = TEST_PASS; + + typedef typename std::make_signed<SrcType>::type SSrcType; + typedef typename std::make_signed<DstType>::type SDstType; + + typedef typename std::make_unsigned<SrcType>::type USrcType; + typedef typename std::make_unsigned<DstType>::type UDstType; + + // dot testing: + result |= test_case_dot<UDstType, USrcType, USrcType, N>( + deviceID, context, queue, num_elements, false, false); + result |= test_case_dot<SDstType, SSrcType, SSrcType, N>( + deviceID, context, queue, num_elements, false, false); + result |= test_case_dot<SDstType, USrcType, SSrcType, N>( + deviceID, context, queue, num_elements, false, false); + result |= test_case_dot<SDstType, SSrcType, USrcType, N>( + deviceID, context, queue, num_elements, false, false); + + // dot_acc_sat testing: + result |= test_case_dot<UDstType, USrcType, USrcType, N>( + deviceID, context, queue, num_elements, false, true); + result |= test_case_dot<SDstType, SSrcType, SSrcType, N>( + deviceID, context, queue, num_elements, false, true); + result |= test_case_dot<SDstType, USrcType, SSrcType, N>( + deviceID, context, queue, num_elements, false, true); + result |= test_case_dot<SDstType, SSrcType, USrcType, N>( + deviceID, context, queue, num_elements, false, true); + + return result; +} + +template <typename SrcType, typename DstType, size_t N> +static int test_vectype_packed(cl_device_id deviceID, cl_context context, + cl_command_queue queue, int num_elements) +{ + int result = TEST_PASS; + + typedef typename std::make_signed<SrcType>::type SSrcType; + typedef typename std::make_signed<DstType>::type SDstType; + + typedef typename std::make_unsigned<SrcType>::type USrcType; + typedef typename 
std::make_unsigned<DstType>::type UDstType; + + // packed dot testing: + result |= test_case_dot<UDstType, USrcType, USrcType, N>( + deviceID, context, queue, num_elements, true, false); + result |= test_case_dot<SDstType, SSrcType, SSrcType, N>( + deviceID, context, queue, num_elements, true, false); + result |= test_case_dot<SDstType, USrcType, SSrcType, N>( + deviceID, context, queue, num_elements, true, false); + result |= test_case_dot<SDstType, SSrcType, USrcType, N>( + deviceID, context, queue, num_elements, true, false); + + // packed dot_acc_sat testing: + result |= test_case_dot<UDstType, USrcType, USrcType, N>( + deviceID, context, queue, num_elements, true, true); + result |= test_case_dot<SDstType, SSrcType, SSrcType, N>( + deviceID, context, queue, num_elements, true, true); + result |= test_case_dot<SDstType, USrcType, SSrcType, N>( + deviceID, context, queue, num_elements, true, true); + result |= test_case_dot<SDstType, SSrcType, USrcType, N>( + deviceID, context, queue, num_elements, true, true); + + return result; +} + +int test_integer_dot_product(cl_device_id deviceID, cl_context context, + cl_command_queue queue, int num_elements) +{ + if (!is_extension_available(deviceID, "cl_khr_integer_dot_product")) + { + log_info("cl_khr_integer_dot_product is not supported\n"); + return TEST_SKIPPED_ITSELF; + } + + Version deviceVersion = get_device_cl_version(deviceID); + cl_version extensionVersion; + + if ((deviceVersion >= Version(3, 0)) + || is_extension_available(deviceID, "cl_khr_extended_versioning")) + { + extensionVersion = + get_extension_version(deviceID, "cl_khr_integer_dot_product"); + } + else + { + // Assume 1.0.0 is supported if the version can't be queried + extensionVersion = CL_MAKE_VERSION(1, 0, 0); + } + + cl_int error = CL_SUCCESS; + int result = TEST_PASS; + + cl_device_integer_dot_product_capabilities_khr dotCaps = 0; + error = clGetDeviceInfo(deviceID, + CL_DEVICE_INTEGER_DOT_PRODUCT_CAPABILITIES_KHR, + sizeof(dotCaps), 
&dotCaps, NULL); + test_error( + error, + "Unable to query CL_DEVICE_INTEGER_DOT_PRODUCT_CAPABILITIES_KHR"); + + // Check that the required capabilities are reported + test_assert_error( + dotCaps & CL_DEVICE_INTEGER_DOT_PRODUCT_INPUT_4x8BIT_PACKED_KHR, + "When cl_khr_integer_dot_product is supported " + "CL_DEVICE_INTEGER_DOT_PRODUCT_INPUT_4x8BIT_PACKED_KHR must be " + "supported"); + + if (extensionVersion >= CL_MAKE_VERSION(2, 0, 0)) + { + test_assert_error( + dotCaps & CL_DEVICE_INTEGER_DOT_PRODUCT_INPUT_4x8BIT_KHR, + "When cl_khr_integer_dot_product is supported with version >= 2.0.0" + "CL_DEVICE_INTEGER_DOT_PRODUCT_INPUT_4x8BIT_KHR must be " + "supported"); + } + + // Check that acceleration properties can be queried + if (extensionVersion >= CL_MAKE_VERSION(2, 0, 0)) + { + size_t size_ret; + error = clGetDeviceInfo( + deviceID, + CL_DEVICE_INTEGER_DOT_PRODUCT_ACCELERATION_PROPERTIES_8BIT_KHR, 0, + nullptr, &size_ret); + test_error( + error, + "Unable to query size of data returned by " + "CL_DEVICE_INTEGER_DOT_PRODUCT_ACCELERATION_PROPERTIES_8BIT_KHR"); + + cl_device_integer_dot_product_acceleration_properties_khr + accelerationProperties; + error = clGetDeviceInfo( + deviceID, + CL_DEVICE_INTEGER_DOT_PRODUCT_ACCELERATION_PROPERTIES_8BIT_KHR, + sizeof(accelerationProperties), &accelerationProperties, nullptr); + test_error(error, "Unable to query 8-bit acceleration properties"); + + error = clGetDeviceInfo( + deviceID, + CL_DEVICE_INTEGER_DOT_PRODUCT_ACCELERATION_PROPERTIES_4x8BIT_PACKED_KHR, + 0, nullptr, &size_ret); + test_error( + error, + "Unable to query size of data returned by " + "CL_DEVICE_INTEGER_DOT_PRODUCT_ACCELERATION_PROPERTIES_4x8BIT_" + "PACKED_KHR"); + + error = clGetDeviceInfo( + deviceID, + CL_DEVICE_INTEGER_DOT_PRODUCT_ACCELERATION_PROPERTIES_4x8BIT_PACKED_KHR, + sizeof(accelerationProperties), &accelerationProperties, nullptr); + test_error(error, + "Unable to query 4x8-bit packed acceleration properties"); + } + + // Report when 
unknown capabilities are found + if (dotCaps + & ~(CL_DEVICE_INTEGER_DOT_PRODUCT_INPUT_4x8BIT_PACKED_KHR + | CL_DEVICE_INTEGER_DOT_PRODUCT_INPUT_4x8BIT_KHR)) + { + log_info("NOTE: found an unknown / untested capability!\n"); + } + + // Test built-in functions + if (dotCaps & CL_DEVICE_INTEGER_DOT_PRODUCT_INPUT_4x8BIT_KHR) + { + result |= test_vectype<cl_uchar, cl_uint, 4>(deviceID, context, queue, + num_elements); + } + + if (dotCaps & CL_DEVICE_INTEGER_DOT_PRODUCT_INPUT_4x8BIT_PACKED_KHR) + { + result |= test_vectype_packed<cl_uchar, cl_uint, 4>( + deviceID, context, queue, num_elements); + } + + return result; +} diff --git a/test_conformance/integer_ops/test_integers.cpp b/test_conformance/integer_ops/test_integers.cpp index 8d77b24b..6fa18e1e 100644 --- a/test_conformance/integer_ops/test_integers.cpp +++ b/test_conformance/integer_ops/test_integers.cpp @@ -16,14 +16,9 @@ #include "testBase.h" #include "harness/conversions.h" -#define TEST_SIZE 512 +#include <algorithm> -#ifndef MIN - #define MIN( _a, _b ) ((_a) < (_b) ? (_a) : (_b)) -#endif -#ifndef MAX - #define MAX( _a, _b ) ((_a) > (_b) ? 
(_a) : (_b)) -#endif +#define TEST_SIZE 512 const char *singleParamIntegerKernelSourcePattern = "__kernel void sample_test(__global %s *sourceA, __global %s *destValues)\n" @@ -1512,19 +1507,20 @@ bool verify_integer_clamp( void *sourceA, void *sourceB, void *sourceC, void *de switch( vecAType ) { case kULong: - ((cl_ulong*) destination)[0] = MAX(MIN(valueA, valueC), valueB); + ((cl_ulong *)destination)[0] = + std::max(std::min(valueA, valueC), valueB); break; case kUInt: - ((cl_uint*) destination)[0] = (cl_uint) - (MAX(MIN(valueA, valueC), valueB)); + ((cl_uint *)destination)[0] = + (cl_uint)(std::max(std::min(valueA, valueC), valueB)); break; case kUShort: - ((cl_ushort*) destination)[0] = (cl_ushort) - (MAX(MIN(valueA, valueC), valueB)); + ((cl_ushort *)destination)[0] = + (cl_ushort)(std::max(std::min(valueA, valueC), valueB)); break; case kUChar: - ((cl_uchar*) destination)[0] = (cl_uchar) - (MAX(MIN(valueA, valueC), valueB)); + ((cl_uchar *)destination)[0] = + (cl_uchar)(std::max(std::min(valueA, valueC), valueB)); break; default: //error -- should never get here @@ -1576,19 +1572,20 @@ bool verify_integer_clamp( void *sourceA, void *sourceB, void *sourceC, void *de switch( vecAType ) { case kLong: - ((cl_long*) destination)[0] = MAX(MIN(valueA, valueC), valueB); + ((cl_long *)destination)[0] = + std::max(std::min(valueA, valueC), valueB); break; case kInt: - ((cl_int*) destination)[0] = (cl_int) - (MAX(MIN(valueA, valueC), valueB)); + ((cl_int *)destination)[0] = + (cl_int)(std::max(std::min(valueA, valueC), valueB)); break; case kShort: - ((cl_short*) destination)[0] = (cl_short) - (MAX(MIN(valueA, valueC), valueB)); + ((cl_short *)destination)[0] = + (cl_short)(std::max(std::min(valueA, valueC), valueB)); break; case kChar: - ((cl_char*) destination)[0] = (cl_char) - (MAX(MIN(valueA, valueC), valueB)); + ((cl_char *)destination)[0] = + (cl_char)(std::max(std::min(valueA, valueC), valueB)); break; default: //error -- should never get here @@ -1654,13 
+1651,16 @@ bool verify_integer_mad_sat( void *sourceA, void *sourceB, void *sourceC, void * ((cl_ulong*) destination)[0] = multLo; break; case kUInt: - ((cl_uint*) destination)[0] = (cl_uint) MIN( multLo, (cl_ulong) CL_UINT_MAX ); + ((cl_uint *)destination)[0] = + (cl_uint)std::min(multLo, (cl_ulong)CL_UINT_MAX); break; case kUShort: - ((cl_ushort*) destination)[0] = (cl_ushort) MIN( multLo, (cl_ulong) CL_USHRT_MAX ); + ((cl_ushort *)destination)[0] = + (cl_ushort)std::min(multLo, (cl_ulong)CL_USHRT_MAX); break; case kUChar: - ((cl_uchar*) destination)[0] = (cl_uchar) MIN( multLo, (cl_ulong) CL_UCHAR_MAX ); + ((cl_uchar *)destination)[0] = + (cl_uchar)std::min(multLo, (cl_ulong)CL_UCHAR_MAX); break; default: //error -- should never get here @@ -1744,18 +1744,18 @@ bool verify_integer_mad_sat( void *sourceA, void *sourceB, void *sourceC, void * ((cl_long*) destination)[0] = result; break; case kInt: - result = MIN( result, (cl_long) CL_INT_MAX ); - result = MAX( result, (cl_long) CL_INT_MIN ); + result = std::min(result, (cl_long)CL_INT_MAX); + result = std::max(result, (cl_long)CL_INT_MIN); ((cl_int*) destination)[0] = (cl_int) result; break; case kShort: - result = MIN( result, (cl_long) CL_SHRT_MAX ); - result = MAX( result, (cl_long) CL_SHRT_MIN ); + result = std::min(result, (cl_long)CL_SHRT_MAX); + result = std::max(result, (cl_long)CL_SHRT_MIN); ((cl_short*) destination)[0] = (cl_short) result; break; case kChar: - result = MIN( result, (cl_long) CL_CHAR_MAX ); - result = MAX( result, (cl_long) CL_CHAR_MIN ); + result = std::min(result, (cl_long)CL_CHAR_MAX); + result = std::max(result, (cl_long)CL_CHAR_MIN); ((cl_char*) destination)[0] = (cl_char) result; break; default: diff --git a/test_conformance/integer_ops/test_sub_sat.cpp b/test_conformance/integer_ops/test_sub_sat.cpp index 845d1064..2a88ee0d 100644 --- a/test_conformance/integer_ops/test_sub_sat.cpp +++ b/test_conformance/integer_ops/test_sub_sat.cpp @@ -21,19 +21,9 @@ #include <sys/types.h> 
#include <sys/stat.h> -#include "procs.h" - -#define UCHAR_MIN 0 -#define USHRT_MIN 0 -#define UINT_MIN 0 - -#ifndef MAX -#define MAX( _a, _b ) ( (_a) > (_b) ? (_a) : (_b) ) -#endif -#ifndef MIN -#define MIN( _a, _b ) ( (_a) < (_b) ? (_a) : (_b) ) -#endif +#include <algorithm> +#include "procs.h" static int verify_subsat_char( const cl_char *inA, const cl_char *inB, const cl_char *outptr, int n, const char *sizeName, int vecSize ) { @@ -41,8 +31,8 @@ static int verify_subsat_char( const cl_char *inA, const cl_char *inB, const cl_ for( i = 0; i < n; i++ ) { cl_int r = (cl_int) inA[i] - (cl_int) inB[i]; - r = MAX( r, CL_CHAR_MIN ); - r = MIN( r, CL_CHAR_MAX ); + r = std::max(r, CL_CHAR_MIN); + r = std::min(r, CL_CHAR_MAX); if( r != outptr[i] ) { log_info( "\n%d) Failure for sub_sat( (char%s) 0x%2.2x, (char%s) 0x%2.2x) = *0x%2.2x vs 0x%2.2x\n", i, sizeName, inA[i], sizeName, inB[i], r, outptr[i] ); return -1; } @@ -56,9 +46,9 @@ static int verify_subsat_uchar( const cl_uchar *inA, const cl_uchar *inB, const for( i = 0; i < n; i++ ) { cl_int r = (cl_int) inA[i] - (cl_int) inB[i]; - r = MAX( r, 0 ); - r = MIN( r, CL_UCHAR_MAX ); - if( r != outptr[i] ) + r = std::max(r, 0); + r = std::min(r, CL_UCHAR_MAX); + if (r != outptr[i]) { log_info( "\n%d) Failure for sub_sat( (uchar%s) 0x%2.2x, (uchar%s) 0x%2.2x) = *0x%2.2x vs 0x%2.2x\n", i, sizeName, inA[i], sizeName, inB[i], r, outptr[i] ); return -1; } } return 0; @@ -70,8 +60,8 @@ static int verify_subsat_short( const cl_short *inA, const cl_short *inB, const for( i = 0; i < n; i++ ) { cl_int r = (cl_int) inA[i] - (cl_int) inB[i]; - r = MAX( r, CL_SHRT_MIN ); - r = MIN( r, CL_SHRT_MAX ); + r = std::max(r, CL_SHRT_MIN); + r = std::min(r, CL_SHRT_MAX); if( r != outptr[i] ) { log_info( "\n%d) Failure for sub_sat( (short%s) 0x%4.4x, (short%s) 0x%4.4x) = *0x%4.4x vs 0x%4.4x\n", i, sizeName, inA[i], sizeName, inB[i], r, outptr[i] ); return -1; } @@ -85,8 +75,8 @@ static int verify_subsat_ushort( const cl_ushort *inA, const 
cl_ushort *inB, con for( i = 0; i < n; i++ ) { cl_int r = (cl_int) inA[i] - (cl_int) inB[i]; - r = MAX( r, 0 ); - r = MIN( r, CL_USHRT_MAX ); + r = std::max(r, 0); + r = std::min(r, CL_USHRT_MAX); if( r != outptr[i] ) { log_info( "\n%d) Failure for sub_sat( (ushort%s) 0x%4.4x, (ushort%s) 0x%4.4x) = *0x%4.4x vs 0x%4.4x\n", i, sizeName, inA[i], sizeName, inB[i], r, outptr[i] ); return -1; } diff --git a/test_conformance/integer_ops/test_unary_ops.cpp b/test_conformance/integer_ops/test_unary_ops.cpp index 72940eaa..c91c85ae 100644 --- a/test_conformance/integer_ops/test_unary_ops.cpp +++ b/test_conformance/integer_ops/test_unary_ops.cpp @@ -107,7 +107,7 @@ int test_unary_op( cl_command_queue queue, cl_context context, OpKonstants which // For sub ops, the min control value is 2. Otherwise, it's 0 controlData[ i ] |= 0x02; else if( whichOp == kIncrement ) - // For addition ops, the MAX control value is 1. Otherwise, it's 3 + // For addition ops, the max control value is 1. Otherwise, it's 3 controlData[ i ] &= ~0x02; } streams[1] = clCreateBuffer(context, CL_MEM_COPY_HOST_PTR, diff --git a/test_conformance/math_brute_force/CMakeLists.txt b/test_conformance/math_brute_force/CMakeLists.txt index d8dfc403..32814026 100644 --- a/test_conformance/math_brute_force/CMakeLists.txt +++ b/test_conformance/math_brute_force/CMakeLists.txt @@ -9,7 +9,10 @@ set(${MODULE_NAME}_SOURCES binary_operator_float.cpp binary_two_results_i_double.cpp binary_two_results_i_float.cpp + common.cpp + common.h function_list.cpp + function_list.h i_unary_double.cpp i_unary_float.cpp macro_binary_double.cpp @@ -20,9 +23,12 @@ set(${MODULE_NAME}_SOURCES mad_float.cpp main.cpp reference_math.cpp + reference_math.h sleep.cpp + sleep.h ternary_double.cpp ternary_float.cpp + test_functions.h unary_double.cpp unary_float.cpp unary_two_results_double.cpp @@ -32,6 +38,11 @@ set(${MODULE_NAME}_SOURCES unary_u_double.cpp unary_u_float.cpp utility.cpp + utility.h ) +# math_brute_force compiles cleanly with 
-Wall (except for a few remaining +# warnings), but other tests not (yet); so enable -Wall locally. +set_gnulike_module_compile_flags("-Wall -Wno-strict-aliasing -Wno-unknown-pragmas") + include(../CMakeCommon.txt) diff --git a/test_conformance/math_brute_force/binary_double.cpp b/test_conformance/math_brute_force/binary_double.cpp index 4baa4991..f18d0b97 100644 --- a/test_conformance/math_brute_force/binary_double.cpp +++ b/test_conformance/math_brute_force/binary_double.cpp @@ -14,16 +14,19 @@ // limitations under the License. // +#include "common.h" #include "function_list.h" #include "test_functions.h" #include "utility.h" #include <cstring> +namespace { + const double twoToMinus1022 = MAKE_HEX_DOUBLE(0x1p-1022, 1, -1022); -static int BuildKernel(const char *name, int vectorSize, cl_uint kernel_count, - cl_kernel *k, cl_program *p, bool relaxedMode) +int BuildKernel(const char *name, int vectorSize, cl_uint kernel_count, + cl_kernel *k, cl_program *p, bool relaxedMode) { const char *c[] = { "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n", "__kernel void math_kernel", @@ -109,49 +112,49 @@ static int BuildKernel(const char *name, int vectorSize, cl_uint kernel_count, relaxedMode); } -typedef struct BuildKernelInfo -{ - cl_uint offset; // the first vector size to build - cl_uint kernel_count; - cl_kernel **kernels; - cl_program *programs; - const char *nameInCode; - bool relaxedMode; // Whether to build with -cl-fast-relaxed-math. 
-} BuildKernelInfo; - -static cl_int BuildKernelFn(cl_uint job_id, cl_uint thread_id UNUSED, void *p) +cl_int BuildKernelFn(cl_uint job_id, cl_uint thread_id UNUSED, void *p) { BuildKernelInfo *info = (BuildKernelInfo *)p; - cl_uint i = info->offset + job_id; - return BuildKernel(info->nameInCode, i, info->kernel_count, - info->kernels[i], info->programs + i, info->relaxedMode); + cl_uint vectorSize = gMinVectorSizeIndex + job_id; + return BuildKernel(info->nameInCode, vectorSize, info->threadCount, + info->kernels[vectorSize].data(), + &(info->programs[vectorSize]), info->relaxedMode); } // Thread specific data for a worker thread -typedef struct ThreadInfo +struct ThreadInfo { - cl_mem inBuf; // input buffer for the thread - cl_mem inBuf2; // input buffer for the thread - cl_mem outBuf[VECTOR_SIZE_COUNT]; // output buffers for the thread + // Input and output buffers for the thread + clMemWrapper inBuf; + clMemWrapper inBuf2; + Buffers outBuf; + float maxError; // max error value. Init to 0. double maxErrorValue; // position of the max error value (param 1). Init to 0. double maxErrorValue2; // position of the max error value (param 2). Init // to 0. - MTdata d; - cl_command_queue tQueue; // per thread command queue to improve performance -} ThreadInfo; + MTdataHolder d; + + // Per thread command queue to improve performance + clCommandQueueWrapper tQueue; +}; -typedef struct TestInfo +struct TestInfo { size_t subBufferSize; // Size of the sub-buffer in elements const Func *f; // A pointer to the function info - cl_program programs[VECTOR_SIZE_COUNT]; // programs for various vector sizes - cl_kernel - *k[VECTOR_SIZE_COUNT]; // arrays of thread-specific kernels for each - // worker thread: k[vector_size][thread_id] - ThreadInfo * - tinfo; // An array of thread specific information for each worker thread + + // Programs for various vector sizes. 
+ Programs programs; + + // Thread-specific kernels for each vector size: + // k[vector_size][thread_id] + KernelMatrix k; + + // Array of thread specific information + std::vector<ThreadInfo> tinfo; + cl_uint threadCount; // Number of worker threads cl_uint jobCount; // Number of jobs cl_uint step; // step between each chunk and the next. @@ -164,10 +167,10 @@ typedef struct TestInfo int isNextafter; bool relaxedMode; // True if test is running in relaxed mode, false // otherwise. -} TestInfo; +}; // A table of more difficult cases to get right -static const double specialValues[] = { +const double specialValues[] = { -NAN, -INFINITY, -DBL_MAX, @@ -277,204 +280,20 @@ static const double specialValues[] = { +0.0, }; -static size_t specialValuesCount = +constexpr size_t specialValuesCount = sizeof(specialValues) / sizeof(specialValues[0]); -static cl_int Test(cl_uint job_id, cl_uint thread_id, void *data); - -int TestFunc_Double_Double_Double(const Func *f, MTdata d, bool relaxedMode) -{ - TestInfo test_info; - cl_int error; - float maxError = 0.0f; - double maxErrorVal = 0.0; - double maxErrorVal2 = 0.0; - - logFunctionInfo(f->name, sizeof(cl_double), relaxedMode); - - // Init test_info - memset(&test_info, 0, sizeof(test_info)); - test_info.threadCount = GetThreadCount(); - test_info.subBufferSize = BUFFER_SIZE - / (sizeof(cl_double) * RoundUpToNextPowerOfTwo(test_info.threadCount)); - test_info.scale = getTestScale(sizeof(cl_double)); - - test_info.step = (cl_uint)test_info.subBufferSize * test_info.scale; - if (test_info.step / test_info.subBufferSize != test_info.scale) - { - // there was overflow - test_info.jobCount = 1; - } - else - { - test_info.jobCount = (cl_uint)((1ULL << 32) / test_info.step); - } - - test_info.f = f; - test_info.ulps = f->double_ulps; - test_info.ftz = f->ftz || gForceFTZ; - - test_info.isFDim = 0 == strcmp("fdim", f->nameInCode); - test_info.skipNanInf = 0; - test_info.isNextafter = 0 == strcmp("nextafter", f->nameInCode); - - // 
cl_kernels aren't thread safe, so we make one for each vector size for - // every thread - for (auto i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++) - { - size_t array_size = test_info.threadCount * sizeof(cl_kernel); - test_info.k[i] = (cl_kernel *)malloc(array_size); - if (NULL == test_info.k[i]) - { - vlog_error("Error: Unable to allocate storage for kernels!\n"); - error = CL_OUT_OF_HOST_MEMORY; - goto exit; - } - memset(test_info.k[i], 0, array_size); - } - test_info.tinfo = - (ThreadInfo *)malloc(test_info.threadCount * sizeof(*test_info.tinfo)); - if (NULL == test_info.tinfo) - { - vlog_error( - "Error: Unable to allocate storage for thread specific data.\n"); - error = CL_OUT_OF_HOST_MEMORY; - goto exit; - } - memset(test_info.tinfo, 0, - test_info.threadCount * sizeof(*test_info.tinfo)); - for (cl_uint i = 0; i < test_info.threadCount; i++) - { - cl_buffer_region region = { - i * test_info.subBufferSize * sizeof(cl_double), - test_info.subBufferSize * sizeof(cl_double) - }; - test_info.tinfo[i].inBuf = - clCreateSubBuffer(gInBuffer, CL_MEM_READ_ONLY, - CL_BUFFER_CREATE_TYPE_REGION, ®ion, &error); - if (error || NULL == test_info.tinfo[i].inBuf) - { - vlog_error("Error: Unable to create sub-buffer of gInBuffer for " - "region {%zd, %zd}\n", - region.origin, region.size); - goto exit; - } - test_info.tinfo[i].inBuf2 = - clCreateSubBuffer(gInBuffer2, CL_MEM_READ_ONLY, - CL_BUFFER_CREATE_TYPE_REGION, ®ion, &error); - if (error || NULL == test_info.tinfo[i].inBuf2) - { - vlog_error("Error: Unable to create sub-buffer of gInBuffer2 for " - "region {%zd, %zd}\n", - region.origin, region.size); - goto exit; - } - - for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) - { - test_info.tinfo[i].outBuf[j] = clCreateSubBuffer( - gOutBuffer[j], CL_MEM_WRITE_ONLY, CL_BUFFER_CREATE_TYPE_REGION, - ®ion, &error); - if (error || NULL == test_info.tinfo[i].outBuf[j]) - { - vlog_error("Error: Unable to create sub-buffer of " - "gOutBuffer[%d] for region 
{%zd, %zd}\n", - (int)j, region.origin, region.size); - goto exit; - } - } - test_info.tinfo[i].tQueue = - clCreateCommandQueue(gContext, gDevice, 0, &error); - if (NULL == test_info.tinfo[i].tQueue || error) - { - vlog_error("clCreateCommandQueue failed. (%d)\n", error); - goto exit; - } - - test_info.tinfo[i].d = init_genrand(genrand_int32(d)); - } - - // Init the kernels - { - BuildKernelInfo build_info = { - gMinVectorSizeIndex, test_info.threadCount, test_info.k, - test_info.programs, f->nameInCode, relaxedMode - }; - if ((error = ThreadPool_Do(BuildKernelFn, - gMaxVectorSizeIndex - gMinVectorSizeIndex, - &build_info))) - goto exit; - } - - // Run the kernels - if (!gSkipCorrectnessTesting) - { - error = ThreadPool_Do(Test, test_info.jobCount, &test_info); - - // Accumulate the arithmetic errors - for (cl_uint i = 0; i < test_info.threadCount; i++) - { - if (test_info.tinfo[i].maxError > maxError) - { - maxError = test_info.tinfo[i].maxError; - maxErrorVal = test_info.tinfo[i].maxErrorValue; - maxErrorVal2 = test_info.tinfo[i].maxErrorValue2; - } - } - - if (error) goto exit; - - if (gWimpyMode) - vlog("Wimp pass"); - else - vlog("passed"); - - vlog("\t%8.2f @ {%a, %a}", maxError, maxErrorVal, maxErrorVal2); - } - - vlog("\n"); - -exit: - // Release - for (auto i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++) - { - clReleaseProgram(test_info.programs[i]); - if (test_info.k[i]) - { - for (cl_uint j = 0; j < test_info.threadCount; j++) - clReleaseKernel(test_info.k[i][j]); - - free(test_info.k[i]); - } - } - if (test_info.tinfo) - { - for (cl_uint i = 0; i < test_info.threadCount; i++) - { - free_mtdata(test_info.tinfo[i].d); - clReleaseMemObject(test_info.tinfo[i].inBuf); - clReleaseMemObject(test_info.tinfo[i].inBuf2); - for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) - clReleaseMemObject(test_info.tinfo[i].outBuf[j]); - clReleaseCommandQueue(test_info.tinfo[i].tQueue); - } - - free(test_info.tinfo); - } - - return error; -} - -static 
cl_int Test(cl_uint job_id, cl_uint thread_id, void *data) +cl_int Test(cl_uint job_id, cl_uint thread_id, void *data) { - const TestInfo *job = (const TestInfo *)data; + TestInfo *job = (TestInfo *)data; size_t buffer_elements = job->subBufferSize; size_t buffer_size = buffer_elements * sizeof(cl_double); cl_uint base = job_id * (cl_uint)job->step; - ThreadInfo *tinfo = job->tinfo + thread_id; + ThreadInfo *tinfo = &(job->tinfo[thread_id]); float ulps = job->ulps; dptr func = job->f->dfunc; int ftz = job->ftz; + bool relaxedMode = job->relaxedMode; MTdata d = tinfo->d; cl_int error; const char *name = job->f->name; @@ -577,7 +396,8 @@ static cl_int Test(cl_uint job_id, cl_uint thread_id, void *data) if ((error = clEnqueueUnmapMemObject(tinfo->tQueue, tinfo->outBuf[j], out[j], 0, NULL, NULL))) { - vlog_error("Error: clEnqueueMapBuffer failed! err: %d\n", error); + vlog_error("Error: clEnqueueUnmapMemObject failed! err: %d\n", + error); goto exit; } @@ -659,7 +479,7 @@ static cl_int Test(cl_uint job_id, cl_uint thread_id, void *data) float err = Bruteforce_Ulp_Error_Double(test, correct); int fail = !(fabsf(err) <= ulps); - if (fail && ftz) + if (fail && (ftz || relaxedMode)) { // retry per section 6.5.3.2 if (IsDoubleResultSubnormal(correct, ulps)) @@ -810,7 +630,7 @@ static cl_int Test(cl_uint job_id, cl_uint thread_id, void *data) { if (gVerboseBruteForce) { - vlog("base:%14u step:%10u scale:%10zu buf_elements:%10u ulps:%5.3f " + vlog("base:%14u step:%10u scale:%10u buf_elements:%10zu ulps:%5.3f " "ThreadCount:%2u\n", base, job->step, job->scale, buffer_elements, job->ulps, job->threadCount); @@ -825,3 +645,152 @@ static cl_int Test(cl_uint job_id, cl_uint thread_id, void *data) exit: return error; } + +} // anonymous namespace + +int TestFunc_Double_Double_Double(const Func *f, MTdata d, bool relaxedMode) +{ + TestInfo test_info{}; + cl_int error; + float maxError = 0.0f; + double maxErrorVal = 0.0; + double maxErrorVal2 = 0.0; + + logFunctionInfo(f->name, 
sizeof(cl_double), relaxedMode); + + // Init test_info + test_info.threadCount = GetThreadCount(); + test_info.subBufferSize = BUFFER_SIZE + / (sizeof(cl_double) * RoundUpToNextPowerOfTwo(test_info.threadCount)); + test_info.scale = getTestScale(sizeof(cl_double)); + + test_info.step = (cl_uint)test_info.subBufferSize * test_info.scale; + if (test_info.step / test_info.subBufferSize != test_info.scale) + { + // there was overflow + test_info.jobCount = 1; + } + else + { + test_info.jobCount = (cl_uint)((1ULL << 32) / test_info.step); + } + + test_info.f = f; + test_info.ulps = f->double_ulps; + test_info.ftz = f->ftz || gForceFTZ; + test_info.relaxedMode = relaxedMode; + + test_info.isFDim = 0 == strcmp("fdim", f->nameInCode); + test_info.skipNanInf = 0; + test_info.isNextafter = 0 == strcmp("nextafter", f->nameInCode); + + // cl_kernels aren't thread safe, so we make one for each vector size for + // every thread + for (auto i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++) + { + test_info.k[i].resize(test_info.threadCount, nullptr); + } + + test_info.tinfo.resize(test_info.threadCount); + for (cl_uint i = 0; i < test_info.threadCount; i++) + { + cl_buffer_region region = { + i * test_info.subBufferSize * sizeof(cl_double), + test_info.subBufferSize * sizeof(cl_double) + }; + test_info.tinfo[i].inBuf = + clCreateSubBuffer(gInBuffer, CL_MEM_READ_ONLY, + CL_BUFFER_CREATE_TYPE_REGION, ®ion, &error); + if (error || NULL == test_info.tinfo[i].inBuf) + { + vlog_error("Error: Unable to create sub-buffer of gInBuffer for " + "region {%zd, %zd}\n", + region.origin, region.size); + goto exit; + } + test_info.tinfo[i].inBuf2 = + clCreateSubBuffer(gInBuffer2, CL_MEM_READ_ONLY, + CL_BUFFER_CREATE_TYPE_REGION, ®ion, &error); + if (error || NULL == test_info.tinfo[i].inBuf2) + { + vlog_error("Error: Unable to create sub-buffer of gInBuffer2 for " + "region {%zd, %zd}\n", + region.origin, region.size); + goto exit; + } + + for (auto j = gMinVectorSizeIndex; j < 
gMaxVectorSizeIndex; j++) + { + test_info.tinfo[i].outBuf[j] = clCreateSubBuffer( + gOutBuffer[j], CL_MEM_WRITE_ONLY, CL_BUFFER_CREATE_TYPE_REGION, + ®ion, &error); + if (error || NULL == test_info.tinfo[i].outBuf[j]) + { + vlog_error("Error: Unable to create sub-buffer of " + "gOutBuffer[%d] for region {%zd, %zd}\n", + (int)j, region.origin, region.size); + goto exit; + } + } + test_info.tinfo[i].tQueue = + clCreateCommandQueue(gContext, gDevice, 0, &error); + if (NULL == test_info.tinfo[i].tQueue || error) + { + vlog_error("clCreateCommandQueue failed. (%d)\n", error); + goto exit; + } + + test_info.tinfo[i].d = MTdataHolder(genrand_int32(d)); + } + + // Init the kernels + { + BuildKernelInfo build_info{ test_info.threadCount, test_info.k, + test_info.programs, f->nameInCode, + relaxedMode }; + if ((error = ThreadPool_Do(BuildKernelFn, + gMaxVectorSizeIndex - gMinVectorSizeIndex, + &build_info))) + goto exit; + } + + // Run the kernels + if (!gSkipCorrectnessTesting) + { + error = ThreadPool_Do(Test, test_info.jobCount, &test_info); + + // Accumulate the arithmetic errors + for (cl_uint i = 0; i < test_info.threadCount; i++) + { + if (test_info.tinfo[i].maxError > maxError) + { + maxError = test_info.tinfo[i].maxError; + maxErrorVal = test_info.tinfo[i].maxErrorValue; + maxErrorVal2 = test_info.tinfo[i].maxErrorValue2; + } + } + + if (error) goto exit; + + if (gWimpyMode) + vlog("Wimp pass"); + else + vlog("passed"); + + vlog("\t%8.2f @ {%a, %a}", maxError, maxErrorVal, maxErrorVal2); + } + + vlog("\n"); + +exit: + // Release + for (auto i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++) + { + for (auto &kernel : test_info.k[i]) + { + clReleaseKernel(kernel); + } + } + + return error; +} diff --git a/test_conformance/math_brute_force/binary_float.cpp b/test_conformance/math_brute_force/binary_float.cpp index 32caafa3..fe1491d7 100644 --- a/test_conformance/math_brute_force/binary_float.cpp +++ b/test_conformance/math_brute_force/binary_float.cpp @@ -14,16 
+14,19 @@ // limitations under the License. // +#include "common.h" #include "function_list.h" #include "test_functions.h" #include "utility.h" #include <cstring> +namespace { + const float twoToMinus126 = MAKE_HEX_FLOAT(0x1p-126f, 1, -126); -static int BuildKernel(const char *name, int vectorSize, cl_uint kernel_count, - cl_kernel *k, cl_program *p, bool relaxedMode) +int BuildKernel(const char *name, int vectorSize, cl_uint kernel_count, + cl_kernel *k, cl_program *p, bool relaxedMode) { const char *c[] = { "__kernel void math_kernel", sizeNames[vectorSize], @@ -107,49 +110,49 @@ static int BuildKernel(const char *name, int vectorSize, cl_uint kernel_count, relaxedMode); } -typedef struct BuildKernelInfo -{ - cl_uint offset; // the first vector size to build - cl_uint kernel_count; - cl_kernel **kernels; - cl_program *programs; - const char *nameInCode; - bool relaxedMode; // Whether to build with -cl-fast-relaxed-math. -} BuildKernelInfo; - -static cl_int BuildKernelFn(cl_uint job_id, cl_uint thread_id UNUSED, void *p) +cl_int BuildKernelFn(cl_uint job_id, cl_uint thread_id UNUSED, void *p) { BuildKernelInfo *info = (BuildKernelInfo *)p; - cl_uint i = info->offset + job_id; - return BuildKernel(info->nameInCode, i, info->kernel_count, - info->kernels[i], info->programs + i, info->relaxedMode); + cl_uint vectorSize = gMinVectorSizeIndex + job_id; + return BuildKernel(info->nameInCode, vectorSize, info->threadCount, + info->kernels[vectorSize].data(), + &(info->programs[vectorSize]), info->relaxedMode); } // Thread specific data for a worker thread -typedef struct ThreadInfo +struct ThreadInfo { - cl_mem inBuf; // input buffer for the thread - cl_mem inBuf2; // input buffer for the thread - cl_mem outBuf[VECTOR_SIZE_COUNT]; // output buffers for the thread + // Input and output buffers for the thread + clMemWrapper inBuf; + clMemWrapper inBuf2; + Buffers outBuf; + float maxError; // max error value. Init to 0. 
double maxErrorValue; // position of the max error value (param 1). Init to 0. double maxErrorValue2; // position of the max error value (param 2). Init // to 0. - MTdata d; - cl_command_queue tQueue; // per thread command queue to improve performance -} ThreadInfo; + MTdataHolder d; + + // Per thread command queue to improve performance + clCommandQueueWrapper tQueue; +}; -typedef struct TestInfo +struct TestInfo { size_t subBufferSize; // Size of the sub-buffer in elements const Func *f; // A pointer to the function info - cl_program programs[VECTOR_SIZE_COUNT]; // programs for various vector sizes - cl_kernel - *k[VECTOR_SIZE_COUNT]; // arrays of thread-specific kernels for each - // worker thread: k[vector_size][thread_id] - ThreadInfo * - tinfo; // An array of thread specific information for each worker thread + + // Programs for various vector sizes. + Programs programs; + + // Thread-specific kernels for each vector size: + // k[vector_size][thread_id] + KernelMatrix k; + + // Array of thread specific information + std::vector<ThreadInfo> tinfo; + cl_uint threadCount; // Number of worker threads cl_uint jobCount; // Number of jobs cl_uint step; // step between each chunk and the next. @@ -162,10 +165,10 @@ typedef struct TestInfo int isNextafter; bool relaxedMode; // True if test is running in relaxed mode, false // otherwise. 
-} TestInfo; +}; // A table of more difficult cases to get right -static const float specialValues[] = { +const float specialValues[] = { -NAN, -INFINITY, -FLT_MAX, @@ -267,209 +270,23 @@ static const float specialValues[] = { +0.0f, }; -static const size_t specialValuesCount = +constexpr size_t specialValuesCount = sizeof(specialValues) / sizeof(specialValues[0]); -static cl_int Test(cl_uint job_id, cl_uint thread_id, void *data); - -int TestFunc_Float_Float_Float(const Func *f, MTdata d, bool relaxedMode) -{ - TestInfo test_info; - cl_int error; - float maxError = 0.0f; - double maxErrorVal = 0.0; - double maxErrorVal2 = 0.0; - - logFunctionInfo(f->name, sizeof(cl_float), relaxedMode); - - // Init test_info - memset(&test_info, 0, sizeof(test_info)); - test_info.threadCount = GetThreadCount(); - test_info.subBufferSize = BUFFER_SIZE - / (sizeof(cl_float) * RoundUpToNextPowerOfTwo(test_info.threadCount)); - test_info.scale = getTestScale(sizeof(cl_float)); - - test_info.step = (cl_uint)test_info.subBufferSize * test_info.scale; - if (test_info.step / test_info.subBufferSize != test_info.scale) - { - // there was overflow - test_info.jobCount = 1; - } - else - { - test_info.jobCount = (cl_uint)((1ULL << 32) / test_info.step); - } - - test_info.f = f; - test_info.ulps = gIsEmbedded ? 
f->float_embedded_ulps : f->float_ulps; - test_info.ftz = - f->ftz || gForceFTZ || 0 == (CL_FP_DENORM & gFloatCapabilities); - test_info.relaxedMode = relaxedMode; - test_info.isFDim = 0 == strcmp("fdim", f->nameInCode); - test_info.skipNanInf = test_info.isFDim && !gInfNanSupport; - test_info.isNextafter = 0 == strcmp("nextafter", f->nameInCode); - - // cl_kernels aren't thread safe, so we make one for each vector size for - // every thread - for (auto i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++) - { - size_t array_size = test_info.threadCount * sizeof(cl_kernel); - test_info.k[i] = (cl_kernel *)malloc(array_size); - if (NULL == test_info.k[i]) - { - vlog_error("Error: Unable to allocate storage for kernels!\n"); - error = CL_OUT_OF_HOST_MEMORY; - goto exit; - } - memset(test_info.k[i], 0, array_size); - } - test_info.tinfo = - (ThreadInfo *)malloc(test_info.threadCount * sizeof(*test_info.tinfo)); - if (NULL == test_info.tinfo) - { - vlog_error( - "Error: Unable to allocate storage for thread specific data.\n"); - error = CL_OUT_OF_HOST_MEMORY; - goto exit; - } - memset(test_info.tinfo, 0, - test_info.threadCount * sizeof(*test_info.tinfo)); - for (cl_uint i = 0; i < test_info.threadCount; i++) - { - cl_buffer_region region = { - i * test_info.subBufferSize * sizeof(cl_float), - test_info.subBufferSize * sizeof(cl_float) - }; - test_info.tinfo[i].inBuf = - clCreateSubBuffer(gInBuffer, CL_MEM_READ_ONLY, - CL_BUFFER_CREATE_TYPE_REGION, ®ion, &error); - if (error || NULL == test_info.tinfo[i].inBuf) - { - vlog_error("Error: Unable to create sub-buffer of gInBuffer for " - "region {%zd, %zd}\n", - region.origin, region.size); - goto exit; - } - test_info.tinfo[i].inBuf2 = - clCreateSubBuffer(gInBuffer2, CL_MEM_READ_ONLY, - CL_BUFFER_CREATE_TYPE_REGION, ®ion, &error); - if (error || NULL == test_info.tinfo[i].inBuf2) - { - vlog_error("Error: Unable to create sub-buffer of gInBuffer2 for " - "region {%zd, %zd}\n", - region.origin, region.size); - goto exit; - 
} - - for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) - { - test_info.tinfo[i].outBuf[j] = clCreateSubBuffer( - gOutBuffer[j], CL_MEM_WRITE_ONLY, CL_BUFFER_CREATE_TYPE_REGION, - ®ion, &error); - if (error || NULL == test_info.tinfo[i].outBuf[j]) - { - vlog_error("Error: Unable to create sub-buffer of " - "gOutBuffer[%d] for region {%zd, %zd}\n", - (int)j, region.origin, region.size); - goto exit; - } - } - test_info.tinfo[i].tQueue = - clCreateCommandQueue(gContext, gDevice, 0, &error); - if (NULL == test_info.tinfo[i].tQueue || error) - { - vlog_error("clCreateCommandQueue failed. (%d)\n", error); - goto exit; - } - - test_info.tinfo[i].d = init_genrand(genrand_int32(d)); - } - - // Init the kernels - { - BuildKernelInfo build_info = { - gMinVectorSizeIndex, test_info.threadCount, test_info.k, - test_info.programs, f->nameInCode, relaxedMode - }; - if ((error = ThreadPool_Do(BuildKernelFn, - gMaxVectorSizeIndex - gMinVectorSizeIndex, - &build_info))) - goto exit; - } - - // Run the kernels - if (!gSkipCorrectnessTesting) - { - error = ThreadPool_Do(Test, test_info.jobCount, &test_info); - - // Accumulate the arithmetic errors - for (cl_uint i = 0; i < test_info.threadCount; i++) - { - if (test_info.tinfo[i].maxError > maxError) - { - maxError = test_info.tinfo[i].maxError; - maxErrorVal = test_info.tinfo[i].maxErrorValue; - maxErrorVal2 = test_info.tinfo[i].maxErrorValue2; - } - } - - if (error) goto exit; - - if (gWimpyMode) - vlog("Wimp pass"); - else - vlog("passed"); - - vlog("\t%8.2f @ {%a, %a}", maxError, maxErrorVal, maxErrorVal2); - } - - vlog("\n"); - -exit: - // Release - for (auto i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++) - { - clReleaseProgram(test_info.programs[i]); - if (test_info.k[i]) - { - for (cl_uint j = 0; j < test_info.threadCount; j++) - clReleaseKernel(test_info.k[i][j]); - - free(test_info.k[i]); - } - } - if (test_info.tinfo) - { - for (cl_uint i = 0; i < test_info.threadCount; i++) - { - 
free_mtdata(test_info.tinfo[i].d); - clReleaseMemObject(test_info.tinfo[i].inBuf); - clReleaseMemObject(test_info.tinfo[i].inBuf2); - for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) - clReleaseMemObject(test_info.tinfo[i].outBuf[j]); - clReleaseCommandQueue(test_info.tinfo[i].tQueue); - } - - free(test_info.tinfo); - } - - return error; -} - -static cl_int Test(cl_uint job_id, cl_uint thread_id, void *data) +cl_int Test(cl_uint job_id, cl_uint thread_id, void *data) { - const TestInfo *job = (const TestInfo *)data; + TestInfo *job = (TestInfo *)data; size_t buffer_elements = job->subBufferSize; size_t buffer_size = buffer_elements * sizeof(cl_float); cl_uint base = job_id * (cl_uint)job->step; - ThreadInfo *tinfo = job->tinfo + thread_id; + ThreadInfo *tinfo = &(job->tinfo[thread_id]); fptr func = job->f->func; int ftz = job->ftz; bool relaxedMode = job->relaxedMode; float ulps = getAllowedUlpError(job->f, relaxedMode); MTdata d = tinfo->d; cl_int error; - cl_uchar *overflow = (cl_uchar *)malloc(buffer_size); + std::vector<bool> overflow(buffer_elements, false); const char *name = job->f->name; int isFDim = job->isFDim; int skipNanInf = job->skipNanInf; @@ -583,7 +400,8 @@ static cl_int Test(cl_uint job_id, cl_uint thread_id, void *data) if ((error = clEnqueueUnmapMemObject(tinfo->tQueue, tinfo->outBuf[j], out[j], 0, NULL, NULL))) { - vlog_error("Error: clEnqueueMapBuffer failed! err: %d\n", error); + vlog_error("Error: clEnqueueUnmapMemObject failed! err: %d\n", + error); goto exit; } @@ -631,7 +449,6 @@ static cl_int Test(cl_uint job_id, cl_uint thread_id, void *data) vlog_error("Error: clFinish failed! 
err: %d\n", error); goto exit; } - free(overflow); return CL_SUCCESS; } @@ -641,7 +458,7 @@ static cl_int Test(cl_uint job_id, cl_uint thread_id, void *data) { // Calculate the correctly rounded reference result memset(&oldMode, 0, sizeof(oldMode)); - if (ftz) ForceFTZ(&oldMode); + if (ftz || relaxedMode) ForceFTZ(&oldMode); // Set the rounding mode to match the device if (gIsInRTZMode) oldRoundMode = set_round(kRoundTowardZero, kfloat); @@ -726,7 +543,7 @@ static cl_int Test(cl_uint job_id, cl_uint thread_id, void *data) float err = Ulp_Error(test, correct); int fail = !(fabsf(err) <= ulps); - if (fail && ftz) + if (fail && (ftz || relaxedMode)) { // retry per section 6.5.3.2 if (IsFloatResultSubnormal(correct, ulps)) @@ -938,7 +755,7 @@ static cl_int Test(cl_uint job_id, cl_uint thread_id, void *data) { vlog_error( "\nERROR: %s%s: %f ulp error at {%a (0x%x), %a " - "(0x%x)}: *%a vs. %a (0x%8.8x) at index: %d\n", + "(0x%x)}: *%a vs. %a (0x%8.8x) at index: %zu\n", name, sizeNames[k], err, s[j], ((cl_uint *)s)[j], s2[j], ((cl_uint *)s2)[j], r[j], test, ((cl_uint *)&test)[0], j); @@ -970,7 +787,7 @@ static cl_int Test(cl_uint job_id, cl_uint thread_id, void *data) { if (gVerboseBruteForce) { - vlog("base:%14u step:%10u scale:%10zu buf_elements:%10u ulps:%5.3f " + vlog("base:%14u step:%10u scale:%10u buf_elements:%10zu ulps:%5.3f " "ThreadCount:%2u\n", base, job->step, job->scale, buffer_elements, job->ulps, job->threadCount); @@ -983,6 +800,154 @@ static cl_int Test(cl_uint job_id, cl_uint thread_id, void *data) } exit: - if (overflow) free(overflow); + return error; +} + +} // anonymous namespace + +int TestFunc_Float_Float_Float(const Func *f, MTdata d, bool relaxedMode) +{ + TestInfo test_info{}; + cl_int error; + float maxError = 0.0f; + double maxErrorVal = 0.0; + double maxErrorVal2 = 0.0; + + logFunctionInfo(f->name, sizeof(cl_float), relaxedMode); + + // Init test_info + test_info.threadCount = GetThreadCount(); + test_info.subBufferSize = BUFFER_SIZE + / 
(sizeof(cl_float) * RoundUpToNextPowerOfTwo(test_info.threadCount)); + test_info.scale = getTestScale(sizeof(cl_float)); + + test_info.step = (cl_uint)test_info.subBufferSize * test_info.scale; + if (test_info.step / test_info.subBufferSize != test_info.scale) + { + // there was overflow + test_info.jobCount = 1; + } + else + { + test_info.jobCount = (cl_uint)((1ULL << 32) / test_info.step); + } + + test_info.f = f; + test_info.ulps = gIsEmbedded ? f->float_embedded_ulps : f->float_ulps; + test_info.ftz = + f->ftz || gForceFTZ || 0 == (CL_FP_DENORM & gFloatCapabilities); + test_info.relaxedMode = relaxedMode; + test_info.isFDim = 0 == strcmp("fdim", f->nameInCode); + test_info.skipNanInf = test_info.isFDim && !gInfNanSupport; + test_info.isNextafter = 0 == strcmp("nextafter", f->nameInCode); + + // cl_kernels aren't thread safe, so we make one for each vector size for + // every thread + for (auto i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++) + { + test_info.k[i].resize(test_info.threadCount, nullptr); + } + + test_info.tinfo.resize(test_info.threadCount); + for (cl_uint i = 0; i < test_info.threadCount; i++) + { + cl_buffer_region region = { + i * test_info.subBufferSize * sizeof(cl_float), + test_info.subBufferSize * sizeof(cl_float) + }; + test_info.tinfo[i].inBuf = + clCreateSubBuffer(gInBuffer, CL_MEM_READ_ONLY, + CL_BUFFER_CREATE_TYPE_REGION, ®ion, &error); + if (error || NULL == test_info.tinfo[i].inBuf) + { + vlog_error("Error: Unable to create sub-buffer of gInBuffer for " + "region {%zd, %zd}\n", + region.origin, region.size); + goto exit; + } + test_info.tinfo[i].inBuf2 = + clCreateSubBuffer(gInBuffer2, CL_MEM_READ_ONLY, + CL_BUFFER_CREATE_TYPE_REGION, ®ion, &error); + if (error || NULL == test_info.tinfo[i].inBuf2) + { + vlog_error("Error: Unable to create sub-buffer of gInBuffer2 for " + "region {%zd, %zd}\n", + region.origin, region.size); + goto exit; + } + + for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) + { + 
test_info.tinfo[i].outBuf[j] = clCreateSubBuffer( + gOutBuffer[j], CL_MEM_WRITE_ONLY, CL_BUFFER_CREATE_TYPE_REGION, + ®ion, &error); + if (error || NULL == test_info.tinfo[i].outBuf[j]) + { + vlog_error("Error: Unable to create sub-buffer of " + "gOutBuffer[%d] for region {%zd, %zd}\n", + (int)j, region.origin, region.size); + goto exit; + } + } + test_info.tinfo[i].tQueue = + clCreateCommandQueue(gContext, gDevice, 0, &error); + if (NULL == test_info.tinfo[i].tQueue || error) + { + vlog_error("clCreateCommandQueue failed. (%d)\n", error); + goto exit; + } + + test_info.tinfo[i].d = MTdataHolder(genrand_int32(d)); + } + + // Init the kernels + { + BuildKernelInfo build_info{ test_info.threadCount, test_info.k, + test_info.programs, f->nameInCode, + relaxedMode }; + if ((error = ThreadPool_Do(BuildKernelFn, + gMaxVectorSizeIndex - gMinVectorSizeIndex, + &build_info))) + goto exit; + } + + // Run the kernels + if (!gSkipCorrectnessTesting) + { + error = ThreadPool_Do(Test, test_info.jobCount, &test_info); + + // Accumulate the arithmetic errors + for (cl_uint i = 0; i < test_info.threadCount; i++) + { + if (test_info.tinfo[i].maxError > maxError) + { + maxError = test_info.tinfo[i].maxError; + maxErrorVal = test_info.tinfo[i].maxErrorValue; + maxErrorVal2 = test_info.tinfo[i].maxErrorValue2; + } + } + + if (error) goto exit; + + if (gWimpyMode) + vlog("Wimp pass"); + else + vlog("passed"); + + vlog("\t%8.2f @ {%a, %a}", maxError, maxErrorVal, maxErrorVal2); + } + + vlog("\n"); + +exit: + // Release + for (auto i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++) + { + for (auto &kernel : test_info.k[i]) + { + clReleaseKernel(kernel); + } + } + return error; } diff --git a/test_conformance/math_brute_force/binary_i_double.cpp b/test_conformance/math_brute_force/binary_i_double.cpp index 69e620aa..f8786e68 100644 --- a/test_conformance/math_brute_force/binary_i_double.cpp +++ b/test_conformance/math_brute_force/binary_i_double.cpp @@ -14,6 +14,7 @@ // limitations 
under the License. // +#include "common.h" #include "function_list.h" #include "test_functions.h" #include "utility.h" @@ -21,8 +22,10 @@ #include <climits> #include <cstring> -static int BuildKernel(const char *name, int vectorSize, cl_uint kernel_count, - cl_kernel *k, cl_program *p, bool relaxedMode) +namespace { + +int BuildKernel(const char *name, int vectorSize, cl_uint kernel_count, + cl_kernel *k, cl_program *p, bool relaxedMode) { const char *c[] = { "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n", "__kernel void math_kernel", @@ -108,61 +111,63 @@ static int BuildKernel(const char *name, int vectorSize, cl_uint kernel_count, relaxedMode); } -typedef struct BuildKernelInfo -{ - cl_uint offset; // the first vector size to build - cl_uint kernel_count; - cl_kernel **kernels; - cl_program *programs; - const char *nameInCode; - bool relaxedMode; // Whether to build with -cl-fast-relaxed-math. -} BuildKernelInfo; - -static cl_int BuildKernelFn(cl_uint job_id, cl_uint thread_id UNUSED, void *p) +cl_int BuildKernelFn(cl_uint job_id, cl_uint thread_id UNUSED, void *p) { BuildKernelInfo *info = (BuildKernelInfo *)p; - cl_uint i = info->offset + job_id; - return BuildKernel(info->nameInCode, i, info->kernel_count, - info->kernels[i], info->programs + i, info->relaxedMode); + cl_uint vectorSize = gMinVectorSizeIndex + job_id; + return BuildKernel(info->nameInCode, vectorSize, info->threadCount, + info->kernels[vectorSize].data(), + &(info->programs[vectorSize]), info->relaxedMode); } // Thread specific data for a worker thread -typedef struct ThreadInfo +struct ThreadInfo { - cl_mem inBuf; // input buffer for the thread - cl_mem inBuf2; // input buffer for the thread - cl_mem outBuf[VECTOR_SIZE_COUNT]; // output buffers for the thread + // Input and output buffers for the thread + clMemWrapper inBuf; + clMemWrapper inBuf2; + Buffers outBuf; + float maxError; // max error value. Init to 0. double maxErrorValue; // position of the max error value (param 1). 
Init to 0. cl_int maxErrorValue2; // position of the max error value (param 2). Init // to 0. - MTdata d; - cl_command_queue tQueue; // per thread command queue to improve performance -} ThreadInfo; + MTdataHolder d; + + // Per thread command queue to improve performance + clCommandQueueWrapper tQueue; +}; -typedef struct TestInfo +struct TestInfo { size_t subBufferSize; // Size of the sub-buffer in elements const Func *f; // A pointer to the function info - cl_program programs[VECTOR_SIZE_COUNT]; // programs for various vector sizes - cl_kernel - *k[VECTOR_SIZE_COUNT]; // arrays of thread-specific kernels for each - // worker thread: k[vector_size][thread_id] - ThreadInfo * - tinfo; // An array of thread specific information for each worker thread + + // Programs for various vector sizes. + Programs programs; + + // Thread-specific kernels for each vector size: + // k[vector_size][thread_id] + KernelMatrix k; + + // Array of thread specific information + std::vector<ThreadInfo> tinfo; + cl_uint threadCount; // Number of worker threads cl_uint jobCount; // Number of jobs cl_uint step; // step between each chunk and the next. cl_uint scale; // stride between individual test values float ulps; // max_allowed ulps int ftz; // non-zero if running in flush to zero mode + bool relaxedMode; // True if test is running in relaxed mode, false + // otherwise. 
// no special values -} TestInfo; +}; // A table of more difficult cases to get right -static const double specialValues[] = { +const double specialValues[] = { -NAN, -INFINITY, -DBL_MAX, @@ -272,210 +277,28 @@ static const double specialValues[] = { +0.0, }; -static size_t specialValuesCount = +constexpr size_t specialValuesCount = sizeof(specialValues) / sizeof(specialValues[0]); -static const int specialValuesInt[] = { +const int specialValuesInt[] = { 0, 1, 2, 3, 1022, 1023, 1024, INT_MIN, INT_MAX, -1, -2, -3, -1022, -1023, -11024, -INT_MAX, }; -static constexpr size_t specialValuesIntCount = - sizeof(specialValuesInt) / sizeof(specialValuesInt[0]); - -static cl_int Test(cl_uint job_id, cl_uint thread_id, void *data); - -int TestFunc_Double_Double_Int(const Func *f, MTdata d, bool relaxedMode) -{ - TestInfo test_info; - cl_int error; - float maxError = 0.0f; - double maxErrorVal = 0.0; - cl_int maxErrorVal2 = 0; - - logFunctionInfo(f->name, sizeof(cl_double), relaxedMode); - - // Init test_info - memset(&test_info, 0, sizeof(test_info)); - test_info.threadCount = GetThreadCount(); - test_info.subBufferSize = BUFFER_SIZE - / (sizeof(cl_double) * RoundUpToNextPowerOfTwo(test_info.threadCount)); - test_info.scale = getTestScale(sizeof(cl_double)); - - test_info.step = (cl_uint)test_info.subBufferSize * test_info.scale; - if (test_info.step / test_info.subBufferSize != test_info.scale) - { - // there was overflow - test_info.jobCount = 1; - } - else - { - test_info.jobCount = (cl_uint)((1ULL << 32) / test_info.step); - } - - test_info.f = f; - test_info.ulps = f->double_ulps; - test_info.ftz = f->ftz || gForceFTZ; - - // cl_kernels aren't thread safe, so we make one for each vector size for - // every thread - for (auto i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++) - { - size_t array_size = test_info.threadCount * sizeof(cl_kernel); - test_info.k[i] = (cl_kernel *)malloc(array_size); - if (NULL == test_info.k[i]) - { - vlog_error("Error: Unable to allocate 
storage for kernels!\n"); - error = CL_OUT_OF_HOST_MEMORY; - goto exit; - } - memset(test_info.k[i], 0, array_size); - } - test_info.tinfo = - (ThreadInfo *)malloc(test_info.threadCount * sizeof(*test_info.tinfo)); - if (NULL == test_info.tinfo) - { - vlog_error( - "Error: Unable to allocate storage for thread specific data.\n"); - error = CL_OUT_OF_HOST_MEMORY; - goto exit; - } - memset(test_info.tinfo, 0, - test_info.threadCount * sizeof(*test_info.tinfo)); - for (cl_uint i = 0; i < test_info.threadCount; i++) - { - cl_buffer_region region = { - i * test_info.subBufferSize * sizeof(cl_double), - test_info.subBufferSize * sizeof(cl_double) - }; - test_info.tinfo[i].inBuf = - clCreateSubBuffer(gInBuffer, CL_MEM_READ_ONLY, - CL_BUFFER_CREATE_TYPE_REGION, ®ion, &error); - if (error || NULL == test_info.tinfo[i].inBuf) - { - vlog_error("Error: Unable to create sub-buffer of gInBuffer for " - "region {%zd, %zd}\n", - region.origin, region.size); - goto exit; - } - cl_buffer_region region2 = { i * test_info.subBufferSize - * sizeof(cl_int), - test_info.subBufferSize * sizeof(cl_int) }; - test_info.tinfo[i].inBuf2 = - clCreateSubBuffer(gInBuffer2, CL_MEM_READ_ONLY, - CL_BUFFER_CREATE_TYPE_REGION, ®ion2, &error); - if (error || NULL == test_info.tinfo[i].inBuf2) - { - vlog_error("Error: Unable to create sub-buffer of gInBuffer2 for " - "region {%zd, %zd}\n", - region.origin, region.size); - goto exit; - } - - for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) - { - test_info.tinfo[i].outBuf[j] = clCreateSubBuffer( - gOutBuffer[j], CL_MEM_WRITE_ONLY, CL_BUFFER_CREATE_TYPE_REGION, - ®ion, &error); - if (error || NULL == test_info.tinfo[i].outBuf[j]) - { - vlog_error("Error: Unable to create sub-buffer of " - "gOutBuffer[%d] for region {%zd, %zd}\n", - (int)j, region.origin, region.size); - goto exit; - } - } - test_info.tinfo[i].tQueue = - clCreateCommandQueue(gContext, gDevice, 0, &error); - if (NULL == test_info.tinfo[i].tQueue || error) - { - 
vlog_error("clCreateCommandQueue failed. (%d)\n", error); - goto exit; - } - test_info.tinfo[i].d = init_genrand(genrand_int32(d)); - } - - // Init the kernels - { - BuildKernelInfo build_info = { - gMinVectorSizeIndex, test_info.threadCount, test_info.k, - test_info.programs, f->nameInCode, relaxedMode - }; - if ((error = ThreadPool_Do(BuildKernelFn, - gMaxVectorSizeIndex - gMinVectorSizeIndex, - &build_info))) - goto exit; - } - - // Run the kernels - if (!gSkipCorrectnessTesting) - { - error = ThreadPool_Do(Test, test_info.jobCount, &test_info); - - // Accumulate the arithmetic errors - for (cl_uint i = 0; i < test_info.threadCount; i++) - { - if (test_info.tinfo[i].maxError > maxError) - { - maxError = test_info.tinfo[i].maxError; - maxErrorVal = test_info.tinfo[i].maxErrorValue; - maxErrorVal2 = test_info.tinfo[i].maxErrorValue2; - } - } - - if (error) goto exit; - - if (gWimpyMode) - vlog("Wimp pass"); - else - vlog("passed"); - - vlog("\t%8.2f @ {%a, %d}", maxError, maxErrorVal, maxErrorVal2); - } - - vlog("\n"); - -exit: - // Release - for (auto i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++) - { - clReleaseProgram(test_info.programs[i]); - if (test_info.k[i]) - { - for (cl_uint j = 0; j < test_info.threadCount; j++) - clReleaseKernel(test_info.k[i][j]); - - free(test_info.k[i]); - } - } - if (test_info.tinfo) - { - for (cl_uint i = 0; i < test_info.threadCount; i++) - { - free_mtdata(test_info.tinfo[i].d); - clReleaseMemObject(test_info.tinfo[i].inBuf); - clReleaseMemObject(test_info.tinfo[i].inBuf2); - for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) - clReleaseMemObject(test_info.tinfo[i].outBuf[j]); - clReleaseCommandQueue(test_info.tinfo[i].tQueue); - } - - free(test_info.tinfo); - } - - return error; -} +constexpr size_t specialValuesIntCount = + sizeof(specialValuesInt) / sizeof(specialValuesInt[0]); -static cl_int Test(cl_uint job_id, cl_uint thread_id, void *data) +cl_int Test(cl_uint job_id, cl_uint thread_id, void *data) { 
- const TestInfo *job = (const TestInfo *)data; + TestInfo *job = (TestInfo *)data; size_t buffer_elements = job->subBufferSize; size_t buffer_size = buffer_elements * sizeof(cl_double); cl_uint base = job_id * (cl_uint)job->step; - ThreadInfo *tinfo = job->tinfo + thread_id; + ThreadInfo *tinfo = &(job->tinfo[thread_id]); float ulps = job->ulps; dptr func = job->f->dfunc; int ftz = job->ftz; + bool relaxedMode = job->relaxedMode; MTdata d = tinfo->d; cl_int error; const char *name = job->f->name; @@ -576,7 +399,8 @@ static cl_int Test(cl_uint job_id, cl_uint thread_id, void *data) if ((error = clEnqueueUnmapMemObject(tinfo->tQueue, tinfo->outBuf[j], out[j], 0, NULL, NULL))) { - vlog_error("Error: clEnqueueMapBuffer failed! err: %d\n", error); + vlog_error("Error: clEnqueueUnmapMemObject failed! err: %d\n", + error); goto exit; } @@ -658,7 +482,7 @@ static cl_int Test(cl_uint job_id, cl_uint thread_id, void *data) float err = Bruteforce_Ulp_Error_Double(test, correct); int fail = !(fabsf(err) <= ulps); - if (fail && ftz) + if (fail && (ftz || relaxedMode)) { // retry per section 6.5.3.2 if (IsDoubleResultSubnormal(correct, ulps)) @@ -744,3 +568,151 @@ static cl_int Test(cl_uint job_id, cl_uint thread_id, void *data) exit: return error; } + +} // anonymous namespace + +int TestFunc_Double_Double_Int(const Func *f, MTdata d, bool relaxedMode) +{ + TestInfo test_info{}; + cl_int error; + float maxError = 0.0f; + double maxErrorVal = 0.0; + cl_int maxErrorVal2 = 0; + + logFunctionInfo(f->name, sizeof(cl_double), relaxedMode); + + // Init test_info + test_info.threadCount = GetThreadCount(); + test_info.subBufferSize = BUFFER_SIZE + / (sizeof(cl_double) * RoundUpToNextPowerOfTwo(test_info.threadCount)); + test_info.scale = getTestScale(sizeof(cl_double)); + + test_info.step = (cl_uint)test_info.subBufferSize * test_info.scale; + if (test_info.step / test_info.subBufferSize != test_info.scale) + { + // there was overflow + test_info.jobCount = 1; + } + else + { + 
test_info.jobCount = (cl_uint)((1ULL << 32) / test_info.step); + } + + test_info.f = f; + test_info.ulps = f->double_ulps; + test_info.ftz = f->ftz || gForceFTZ; + test_info.relaxedMode = relaxedMode; + + // cl_kernels aren't thread safe, so we make one for each vector size for + // every thread + for (auto i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++) + { + test_info.k[i].resize(test_info.threadCount, nullptr); + } + + test_info.tinfo.resize(test_info.threadCount); + for (cl_uint i = 0; i < test_info.threadCount; i++) + { + cl_buffer_region region = { + i * test_info.subBufferSize * sizeof(cl_double), + test_info.subBufferSize * sizeof(cl_double) + }; + test_info.tinfo[i].inBuf = + clCreateSubBuffer(gInBuffer, CL_MEM_READ_ONLY, + CL_BUFFER_CREATE_TYPE_REGION, ®ion, &error); + if (error || NULL == test_info.tinfo[i].inBuf) + { + vlog_error("Error: Unable to create sub-buffer of gInBuffer for " + "region {%zd, %zd}\n", + region.origin, region.size); + goto exit; + } + cl_buffer_region region2 = { i * test_info.subBufferSize + * sizeof(cl_int), + test_info.subBufferSize * sizeof(cl_int) }; + test_info.tinfo[i].inBuf2 = + clCreateSubBuffer(gInBuffer2, CL_MEM_READ_ONLY, + CL_BUFFER_CREATE_TYPE_REGION, ®ion2, &error); + if (error || NULL == test_info.tinfo[i].inBuf2) + { + vlog_error("Error: Unable to create sub-buffer of gInBuffer2 for " + "region {%zd, %zd}\n", + region.origin, region.size); + goto exit; + } + + for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) + { + test_info.tinfo[i].outBuf[j] = clCreateSubBuffer( + gOutBuffer[j], CL_MEM_WRITE_ONLY, CL_BUFFER_CREATE_TYPE_REGION, + ®ion, &error); + if (error || NULL == test_info.tinfo[i].outBuf[j]) + { + vlog_error("Error: Unable to create sub-buffer of " + "gOutBuffer[%d] for region {%zd, %zd}\n", + (int)j, region.origin, region.size); + goto exit; + } + } + test_info.tinfo[i].tQueue = + clCreateCommandQueue(gContext, gDevice, 0, &error); + if (NULL == test_info.tinfo[i].tQueue || error) + { 
+ vlog_error("clCreateCommandQueue failed. (%d)\n", error); + goto exit; + } + + test_info.tinfo[i].d = MTdataHolder(genrand_int32(d)); + } + + // Init the kernels + { + BuildKernelInfo build_info{ test_info.threadCount, test_info.k, + test_info.programs, f->nameInCode, + relaxedMode }; + if ((error = ThreadPool_Do(BuildKernelFn, + gMaxVectorSizeIndex - gMinVectorSizeIndex, + &build_info))) + goto exit; + } + + // Run the kernels + if (!gSkipCorrectnessTesting) + { + error = ThreadPool_Do(Test, test_info.jobCount, &test_info); + + // Accumulate the arithmetic errors + for (cl_uint i = 0; i < test_info.threadCount; i++) + { + if (test_info.tinfo[i].maxError > maxError) + { + maxError = test_info.tinfo[i].maxError; + maxErrorVal = test_info.tinfo[i].maxErrorValue; + maxErrorVal2 = test_info.tinfo[i].maxErrorValue2; + } + } + + if (error) goto exit; + + if (gWimpyMode) + vlog("Wimp pass"); + else + vlog("passed"); + + vlog("\t%8.2f @ {%a, %d}", maxError, maxErrorVal, maxErrorVal2); + } + + vlog("\n"); + +exit: + // Release + for (auto i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++) + { + for (auto &kernel : test_info.k[i]) + { + clReleaseKernel(kernel); + } + } + + return error; +} diff --git a/test_conformance/math_brute_force/binary_i_float.cpp b/test_conformance/math_brute_force/binary_i_float.cpp index e65a9aaf..d855f447 100644 --- a/test_conformance/math_brute_force/binary_i_float.cpp +++ b/test_conformance/math_brute_force/binary_i_float.cpp @@ -14,6 +14,7 @@ // limitations under the License. 
// +#include "common.h" #include "function_list.h" #include "test_functions.h" #include "utility.h" @@ -21,8 +22,10 @@ #include <climits> #include <cstring> -static int BuildKernel(const char *name, int vectorSize, cl_uint kernel_count, - cl_kernel *k, cl_program *p, bool relaxedMode) +namespace { + +int BuildKernel(const char *name, int vectorSize, cl_uint kernel_count, + cl_kernel *k, cl_program *p, bool relaxedMode) { const char *c[] = { "__kernel void math_kernel", sizeNames[vectorSize], @@ -106,61 +109,62 @@ static int BuildKernel(const char *name, int vectorSize, cl_uint kernel_count, relaxedMode); } -typedef struct BuildKernelInfo -{ - cl_uint offset; // the first vector size to build - cl_uint kernel_count; - cl_kernel **kernels; - cl_program *programs; - const char *nameInCode; - bool relaxedMode; // Whether to build with -cl-fast-relaxed-math. -} BuildKernelInfo; - -static cl_int BuildKernelFn(cl_uint job_id, cl_uint thread_id UNUSED, void *p) +cl_int BuildKernelFn(cl_uint job_id, cl_uint thread_id UNUSED, void *p) { BuildKernelInfo *info = (BuildKernelInfo *)p; - cl_uint i = info->offset + job_id; - return BuildKernel(info->nameInCode, i, info->kernel_count, - info->kernels[i], info->programs + i, info->relaxedMode); + cl_uint vectorSize = gMinVectorSizeIndex + job_id; + return BuildKernel(info->nameInCode, vectorSize, info->threadCount, + info->kernels[vectorSize].data(), + &(info->programs[vectorSize]), info->relaxedMode); } // Thread specific data for a worker thread -typedef struct ThreadInfo +struct ThreadInfo { - cl_mem inBuf; // input buffer for the thread - cl_mem inBuf2; // input buffer for the thread - cl_mem outBuf[VECTOR_SIZE_COUNT]; // output buffers for the thread + // Input and output buffers for the thread + clMemWrapper inBuf; + clMemWrapper inBuf2; + Buffers outBuf; + float maxError; // max error value. Init to 0. double maxErrorValue; // position of the max error value (param 1). Init to 0. 
cl_int maxErrorValue2; // position of the max error value (param 2). Init // to 0. - MTdata d; - cl_command_queue tQueue; // per thread command queue to improve performance -} ThreadInfo; + MTdataHolder d; + + // Per thread command queue to improve performance + clCommandQueueWrapper tQueue; +}; -typedef struct TestInfo +struct TestInfo { size_t subBufferSize; // Size of the sub-buffer in elements const Func *f; // A pointer to the function info - cl_program programs[VECTOR_SIZE_COUNT]; // programs for various vector sizes - cl_kernel - *k[VECTOR_SIZE_COUNT]; // arrays of thread-specific kernels for each - // worker thread: k[vector_size][thread_id] - ThreadInfo * - tinfo; // An array of thread specific information for each worker thread + + // Programs for various vector sizes. + Programs programs; + + // Thread-specific kernels for each vector size: + // k[vector_size][thread_id] + KernelMatrix k; + + // Array of thread specific information + std::vector<ThreadInfo> tinfo; + cl_uint threadCount; // Number of worker threads cl_uint jobCount; // Number of jobs cl_uint step; // step between each chunk and the next. cl_uint scale; // stride between individual test values float ulps; // max_allowed ulps int ftz; // non-zero if running in flush to zero mode - + bool relaxedMode; // True if test is running in relaxed mode, false + // otherwise. 
// no special values -} TestInfo; +}; // A table of more difficult cases to get right -static const float specialValues[] = { +const float specialValues[] = { -NAN, -INFINITY, -FLT_MAX, @@ -262,212 +266,29 @@ static const float specialValues[] = { +0.0f, }; -static const size_t specialValuesCount = +constexpr size_t specialValuesCount = sizeof(specialValues) / sizeof(specialValues[0]); -static const int specialValuesInt[] = { +const int specialValuesInt[] = { 0, 1, 2, 3, 126, 127, 128, 0x02000001, 0x04000001, 1465264071, 1488522147, -1, -2, -3, -126, -127, -128, -0x02000001, -0x04000001, -1465264071, -1488522147, }; -static size_t specialValuesIntCount = - sizeof(specialValuesInt) / sizeof(specialValuesInt[0]); - -static cl_int Test(cl_uint job_id, cl_uint thread_id, void *data); - -int TestFunc_Float_Float_Int(const Func *f, MTdata d, bool relaxedMode) -{ - TestInfo test_info; - cl_int error; - float maxError = 0.0f; - double maxErrorVal = 0.0; - cl_int maxErrorVal2 = 0; - - logFunctionInfo(f->name, sizeof(cl_float), relaxedMode); - - // Init test_info - memset(&test_info, 0, sizeof(test_info)); - test_info.threadCount = GetThreadCount(); - test_info.subBufferSize = BUFFER_SIZE - / (sizeof(cl_float) * RoundUpToNextPowerOfTwo(test_info.threadCount)); - test_info.scale = getTestScale(sizeof(cl_float)); - - test_info.step = (cl_uint)test_info.subBufferSize * test_info.scale; - if (test_info.step / test_info.subBufferSize != test_info.scale) - { - // there was overflow - test_info.jobCount = 1; - } - else - { - test_info.jobCount = (cl_uint)((1ULL << 32) / test_info.step); - } - - test_info.f = f; - test_info.ulps = gIsEmbedded ? 
f->float_embedded_ulps : f->float_ulps; - test_info.ftz = - f->ftz || gForceFTZ || 0 == (CL_FP_DENORM & gFloatCapabilities); - - // cl_kernels aren't thread safe, so we make one for each vector size for - // every thread - for (auto i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++) - { - size_t array_size = test_info.threadCount * sizeof(cl_kernel); - test_info.k[i] = (cl_kernel *)malloc(array_size); - if (NULL == test_info.k[i]) - { - vlog_error("Error: Unable to allocate storage for kernels!\n"); - error = CL_OUT_OF_HOST_MEMORY; - goto exit; - } - memset(test_info.k[i], 0, array_size); - } - test_info.tinfo = - (ThreadInfo *)malloc(test_info.threadCount * sizeof(*test_info.tinfo)); - if (NULL == test_info.tinfo) - { - vlog_error( - "Error: Unable to allocate storage for thread specific data.\n"); - error = CL_OUT_OF_HOST_MEMORY; - goto exit; - } - memset(test_info.tinfo, 0, - test_info.threadCount * sizeof(*test_info.tinfo)); - for (cl_uint i = 0; i < test_info.threadCount; i++) - { - cl_buffer_region region = { - i * test_info.subBufferSize * sizeof(cl_float), - test_info.subBufferSize * sizeof(cl_float) - }; - test_info.tinfo[i].inBuf = - clCreateSubBuffer(gInBuffer, CL_MEM_READ_ONLY, - CL_BUFFER_CREATE_TYPE_REGION, &region, &error); - if (error || NULL == test_info.tinfo[i].inBuf) - { - vlog_error("Error: Unable to create sub-buffer of gInBuffer for " - "region {%zd, %zd}\n", - region.origin, region.size); - goto exit; - } - cl_buffer_region region2 = { i * test_info.subBufferSize - * sizeof(cl_int), - test_info.subBufferSize * sizeof(cl_int) }; - test_info.tinfo[i].inBuf2 = - clCreateSubBuffer(gInBuffer2, CL_MEM_READ_ONLY, - CL_BUFFER_CREATE_TYPE_REGION, &region2, &error); - if (error || NULL == test_info.tinfo[i].inBuf2) - { - vlog_error("Error: Unable to create sub-buffer of gInBuffer2 for " - "region {%zd, %zd}\n", - region.origin, region.size); - goto exit; - } - - for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) - { - 
test_info.tinfo[i].outBuf[j] = clCreateSubBuffer( - gOutBuffer[j], CL_MEM_WRITE_ONLY, CL_BUFFER_CREATE_TYPE_REGION, - &region, &error); - if (error || NULL == test_info.tinfo[i].outBuf[j]) - { - vlog_error("Error: Unable to create sub-buffer of " - "gOutBuffer[%d] for region {%zd, %zd}\n", - (int)j, region.origin, region.size); - goto exit; - } - } - test_info.tinfo[i].tQueue = - clCreateCommandQueue(gContext, gDevice, 0, &error); - if (NULL == test_info.tinfo[i].tQueue || error) - { - vlog_error("clCreateCommandQueue failed. (%d)\n", error); - goto exit; - } - - test_info.tinfo[i].d = init_genrand(genrand_int32(d)); - } - - // Init the kernels - { - BuildKernelInfo build_info = { - gMinVectorSizeIndex, test_info.threadCount, test_info.k, - test_info.programs, f->nameInCode, relaxedMode - }; - if ((error = ThreadPool_Do(BuildKernelFn, - gMaxVectorSizeIndex - gMinVectorSizeIndex, - &build_info))) - goto exit; - } - - // Run the kernels - if (!gSkipCorrectnessTesting) - { - error = ThreadPool_Do(Test, test_info.jobCount, &test_info); - - // Accumulate the arithmetic errors - for (cl_uint i = 0; i < test_info.threadCount; i++) - { - if (test_info.tinfo[i].maxError > maxError) - { - maxError = test_info.tinfo[i].maxError; - maxErrorVal = test_info.tinfo[i].maxErrorValue; - maxErrorVal2 = test_info.tinfo[i].maxErrorValue2; - } - } - - if (error) goto exit; - - if (gWimpyMode) - vlog("Wimp pass"); - else - vlog("passed"); - - vlog("\t%8.2f @ {%a, %d}", maxError, maxErrorVal, maxErrorVal2); - } - - vlog("\n"); - -exit: - // Release - for (auto i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++) - { - clReleaseProgram(test_info.programs[i]); - if (test_info.k[i]) - { - for (cl_uint j = 0; j < test_info.threadCount; j++) - clReleaseKernel(test_info.k[i][j]); - free(test_info.k[i]); - } - } - if (test_info.tinfo) - { - for (cl_uint i = 0; i < test_info.threadCount; i++) - { - free_mtdata(test_info.tinfo[i].d); - clReleaseMemObject(test_info.tinfo[i].inBuf); - 
clReleaseMemObject(test_info.tinfo[i].inBuf2); - for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) - clReleaseMemObject(test_info.tinfo[i].outBuf[j]); - clReleaseCommandQueue(test_info.tinfo[i].tQueue); - } - - free(test_info.tinfo); - } - - return error; -} +constexpr size_t specialValuesIntCount = + sizeof(specialValuesInt) / sizeof(specialValuesInt[0]); -static cl_int Test(cl_uint job_id, cl_uint thread_id, void *data) +cl_int Test(cl_uint job_id, cl_uint thread_id, void *data) { - const TestInfo *job = (const TestInfo *)data; + TestInfo *job = (TestInfo *)data; size_t buffer_elements = job->subBufferSize; size_t buffer_size = buffer_elements * sizeof(cl_float); cl_uint base = job_id * (cl_uint)job->step; - ThreadInfo *tinfo = job->tinfo + thread_id; + ThreadInfo *tinfo = &(job->tinfo[thread_id]); fptr func = job->f->func; int ftz = job->ftz; + bool relaxedMode = job->relaxedMode; float ulps = job->ulps; MTdata d = tinfo->d; cl_int error; @@ -568,7 +389,8 @@ static cl_int Test(cl_uint job_id, cl_uint thread_id, void *data) if ((error = clEnqueueUnmapMemObject(tinfo->tQueue, tinfo->outBuf[j], out[j], 0, NULL, NULL))) { - vlog_error("Error: clEnqueueMapBuffer failed! err: %d\n", error); + vlog_error("Error: clEnqueueUnmapMemObject failed! err: %d\n", + error); goto exit; } @@ -650,7 +472,7 @@ static cl_int Test(cl_uint job_id, cl_uint thread_id, void *data) float err = Ulp_Error(test, correct); int fail = !(fabsf(err) <= ulps); - if (fail && ftz) + if (fail && (ftz || relaxedMode)) { // retry per section 6.5.3.2 if (IsFloatResultSubnormal(correct, ulps)) @@ -694,7 +516,7 @@ static cl_int Test(cl_uint job_id, cl_uint thread_id, void *data) { vlog_error( "\nERROR: %s%s: %f ulp error at {%a (0x%8.8x), %d}: " - "*%a (0x%8.8x) vs. %a (0x%8.8x) at index: %d\n", + "*%a (0x%8.8x) vs. 
%a (0x%8.8x) at index: %zu\n", name, sizeNames[k], err, s[j], ((uint32_t *)s)[j], s2[j], r[j], ((uint32_t *)r)[j], test, ((cl_uint *)&test)[0], j); @@ -723,7 +545,7 @@ static cl_int Test(cl_uint job_id, cl_uint thread_id, void *data) { if (gVerboseBruteForce) { - vlog("base:%14u step:%10u scale:%10zu buf_elements:%10u ulps:%5.3f " + vlog("base:%14u step:%10u scale:%10u buf_elements:%10zu ulps:%5.3f " "ThreadCount:%2u\n", base, job->step, job->scale, buffer_elements, job->ulps, job->threadCount); @@ -738,3 +560,152 @@ static cl_int Test(cl_uint job_id, cl_uint thread_id, void *data) exit: return error; } + +} // anonymous namespace + +int TestFunc_Float_Float_Int(const Func *f, MTdata d, bool relaxedMode) +{ + TestInfo test_info{}; + cl_int error; + float maxError = 0.0f; + double maxErrorVal = 0.0; + cl_int maxErrorVal2 = 0; + + logFunctionInfo(f->name, sizeof(cl_float), relaxedMode); + + // Init test_info + test_info.threadCount = GetThreadCount(); + test_info.subBufferSize = BUFFER_SIZE + / (sizeof(cl_float) * RoundUpToNextPowerOfTwo(test_info.threadCount)); + test_info.scale = getTestScale(sizeof(cl_float)); + + test_info.step = (cl_uint)test_info.subBufferSize * test_info.scale; + if (test_info.step / test_info.subBufferSize != test_info.scale) + { + // there was overflow + test_info.jobCount = 1; + } + else + { + test_info.jobCount = (cl_uint)((1ULL << 32) / test_info.step); + } + + test_info.f = f; + test_info.ulps = gIsEmbedded ? 
f->float_embedded_ulps : f->float_ulps; + test_info.ftz = + f->ftz || gForceFTZ || 0 == (CL_FP_DENORM & gFloatCapabilities); + test_info.relaxedMode = relaxedMode; + + // cl_kernels aren't thread safe, so we make one for each vector size for + // every thread + for (auto i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++) + { + test_info.k[i].resize(test_info.threadCount, nullptr); + } + + test_info.tinfo.resize(test_info.threadCount); + for (cl_uint i = 0; i < test_info.threadCount; i++) + { + cl_buffer_region region = { + i * test_info.subBufferSize * sizeof(cl_float), + test_info.subBufferSize * sizeof(cl_float) + }; + test_info.tinfo[i].inBuf = + clCreateSubBuffer(gInBuffer, CL_MEM_READ_ONLY, + CL_BUFFER_CREATE_TYPE_REGION, &region, &error); + if (error || NULL == test_info.tinfo[i].inBuf) + { + vlog_error("Error: Unable to create sub-buffer of gInBuffer for " + "region {%zd, %zd}\n", + region.origin, region.size); + goto exit; + } + cl_buffer_region region2 = { i * test_info.subBufferSize + * sizeof(cl_int), + test_info.subBufferSize * sizeof(cl_int) }; + test_info.tinfo[i].inBuf2 = + clCreateSubBuffer(gInBuffer2, CL_MEM_READ_ONLY, + CL_BUFFER_CREATE_TYPE_REGION, &region2, &error); + if (error || NULL == test_info.tinfo[i].inBuf2) + { + vlog_error("Error: Unable to create sub-buffer of gInBuffer2 for " + "region {%zd, %zd}\n", + region.origin, region.size); + goto exit; + } + + for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) + { + test_info.tinfo[i].outBuf[j] = clCreateSubBuffer( + gOutBuffer[j], CL_MEM_WRITE_ONLY, CL_BUFFER_CREATE_TYPE_REGION, + &region, &error); + if (error || NULL == test_info.tinfo[i].outBuf[j]) + { + vlog_error("Error: Unable to create sub-buffer of " + "gOutBuffer[%d] for region {%zd, %zd}\n", + (int)j, region.origin, region.size); + goto exit; + } + } + test_info.tinfo[i].tQueue = + clCreateCommandQueue(gContext, gDevice, 0, &error); + if (NULL == test_info.tinfo[i].tQueue || error) + { + vlog_error("clCreateCommandQueue 
failed. (%d)\n", error); + goto exit; + } + + test_info.tinfo[i].d = MTdataHolder(genrand_int32(d)); + } + + // Init the kernels + { + BuildKernelInfo build_info{ test_info.threadCount, test_info.k, + test_info.programs, f->nameInCode, + relaxedMode }; + if ((error = ThreadPool_Do(BuildKernelFn, + gMaxVectorSizeIndex - gMinVectorSizeIndex, + &build_info))) + goto exit; + } + + // Run the kernels + if (!gSkipCorrectnessTesting) + { + error = ThreadPool_Do(Test, test_info.jobCount, &test_info); + + // Accumulate the arithmetic errors + for (cl_uint i = 0; i < test_info.threadCount; i++) + { + if (test_info.tinfo[i].maxError > maxError) + { + maxError = test_info.tinfo[i].maxError; + maxErrorVal = test_info.tinfo[i].maxErrorValue; + maxErrorVal2 = test_info.tinfo[i].maxErrorValue2; + } + } + + if (error) goto exit; + + if (gWimpyMode) + vlog("Wimp pass"); + else + vlog("passed"); + + vlog("\t%8.2f @ {%a, %d}", maxError, maxErrorVal, maxErrorVal2); + } + + vlog("\n"); + +exit: + // Release + for (auto i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++) + { + for (auto &kernel : test_info.k[i]) + { + clReleaseKernel(kernel); + } + } + + return error; +} diff --git a/test_conformance/math_brute_force/binary_operator_double.cpp b/test_conformance/math_brute_force/binary_operator_double.cpp index 21e76c85..bbe5c438 100644 --- a/test_conformance/math_brute_force/binary_operator_double.cpp +++ b/test_conformance/math_brute_force/binary_operator_double.cpp @@ -14,15 +14,18 @@ // limitations under the License. 
// +#include "common.h" #include "function_list.h" #include "test_functions.h" #include "utility.h" #include <cstring> -static int BuildKernel(const char *operator_symbol, int vectorSize, - cl_uint kernel_count, cl_kernel *k, cl_program *p, - bool relaxedMode) +namespace { + +int BuildKernel(const char *operator_symbol, int vectorSize, + cl_uint kernel_count, cl_kernel *k, cl_program *p, + bool relaxedMode) { const char *c[] = { "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n", "__kernel void math_kernel", @@ -108,49 +111,49 @@ static int BuildKernel(const char *operator_symbol, int vectorSize, relaxedMode); } -typedef struct BuildKernelInfo -{ - cl_uint offset; // the first vector size to build - cl_uint kernel_count; - cl_kernel **kernels; - cl_program *programs; - const char *operator_symbol; - bool relaxedMode; // Whether to build with -cl-fast-relaxed-math. -} BuildKernelInfo; - -static cl_int BuildKernelFn(cl_uint job_id, cl_uint thread_id UNUSED, void *p) +cl_int BuildKernelFn(cl_uint job_id, cl_uint thread_id UNUSED, void *p) { BuildKernelInfo *info = (BuildKernelInfo *)p; - cl_uint i = info->offset + job_id; - return BuildKernel(info->operator_symbol, i, info->kernel_count, - info->kernels[i], info->programs + i, info->relaxedMode); + cl_uint vectorSize = gMinVectorSizeIndex + job_id; + return BuildKernel(info->nameInCode, vectorSize, info->threadCount, + info->kernels[vectorSize].data(), + &(info->programs[vectorSize]), info->relaxedMode); } // Thread specific data for a worker thread -typedef struct ThreadInfo +struct ThreadInfo { - cl_mem inBuf; // input buffer for the thread - cl_mem inBuf2; // input buffer for the thread - cl_mem outBuf[VECTOR_SIZE_COUNT]; // output buffers for the thread + // Input and output buffers for the thread + clMemWrapper inBuf; + clMemWrapper inBuf2; + Buffers outBuf; + float maxError; // max error value. Init to 0. double maxErrorValue; // position of the max error value (param 1). Init to 0. 
double maxErrorValue2; // position of the max error value (param 2). Init // to 0. - MTdata d; - cl_command_queue tQueue; // per thread command queue to improve performance -} ThreadInfo; + MTdataHolder d; + + // Per thread command queue to improve performance + clCommandQueueWrapper tQueue; +}; -typedef struct TestInfo +struct TestInfo { size_t subBufferSize; // Size of the sub-buffer in elements const Func *f; // A pointer to the function info - cl_program programs[VECTOR_SIZE_COUNT]; // programs for various vector sizes - cl_kernel - *k[VECTOR_SIZE_COUNT]; // arrays of thread-specific kernels for each - // worker thread: k[vector_size][thread_id] - ThreadInfo * - tinfo; // An array of thread specific information for each worker thread + + // Programs for various vector sizes. + Programs programs; + + // Thread-specific kernels for each vector size: + // k[vector_size][thread_id] + KernelMatrix k; + + // Array of thread specific information + std::vector<ThreadInfo> tinfo; + cl_uint threadCount; // Number of worker threads cl_uint jobCount; // Number of jobs cl_uint step; // step between each chunk and the next. @@ -161,10 +164,10 @@ typedef struct TestInfo // otherwise. 
// no special fields -} TestInfo; +}; // A table of more difficult cases to get right -static const double specialValues[] = { +const double specialValues[] = { -NAN, -INFINITY, -DBL_MAX, @@ -274,201 +277,20 @@ static const double specialValues[] = { +0.0, }; -static const size_t specialValuesCount = +constexpr size_t specialValuesCount = sizeof(specialValues) / sizeof(specialValues[0]); -static cl_int Test(cl_uint job_id, cl_uint thread_id, void *data); - -int TestFunc_Double_Double_Double_Operator(const Func *f, MTdata d, - bool relaxedMode) -{ - TestInfo test_info; - cl_int error; - float maxError = 0.0f; - double maxErrorVal = 0.0; - double maxErrorVal2 = 0.0; - - logFunctionInfo(f->name, sizeof(cl_double), relaxedMode); - - // Init test_info - memset(&test_info, 0, sizeof(test_info)); - test_info.threadCount = GetThreadCount(); - test_info.subBufferSize = BUFFER_SIZE - / (sizeof(cl_double) * RoundUpToNextPowerOfTwo(test_info.threadCount)); - test_info.scale = getTestScale(sizeof(cl_double)); - - test_info.step = (cl_uint)test_info.subBufferSize * test_info.scale; - if (test_info.step / test_info.subBufferSize != test_info.scale) - { - // there was overflow - test_info.jobCount = 1; - } - else - { - test_info.jobCount = (cl_uint)((1ULL << 32) / test_info.step); - } - - test_info.f = f; - test_info.ulps = f->double_ulps; - test_info.ftz = f->ftz || gForceFTZ; - - // cl_kernels aren't thread safe, so we make one for each vector size for - // every thread - for (auto i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++) - { - size_t array_size = test_info.threadCount * sizeof(cl_kernel); - test_info.k[i] = (cl_kernel *)malloc(array_size); - if (NULL == test_info.k[i]) - { - vlog_error("Error: Unable to allocate storage for kernels!\n"); - error = CL_OUT_OF_HOST_MEMORY; - goto exit; - } - memset(test_info.k[i], 0, array_size); - } - test_info.tinfo = - (ThreadInfo *)malloc(test_info.threadCount * sizeof(*test_info.tinfo)); - if (NULL == test_info.tinfo) - { - 
vlog_error( - "Error: Unable to allocate storage for thread specific data.\n"); - error = CL_OUT_OF_HOST_MEMORY; - goto exit; - } - memset(test_info.tinfo, 0, - test_info.threadCount * sizeof(*test_info.tinfo)); - for (cl_uint i = 0; i < test_info.threadCount; i++) - { - cl_buffer_region region = { - i * test_info.subBufferSize * sizeof(cl_double), - test_info.subBufferSize * sizeof(cl_double) - }; - test_info.tinfo[i].inBuf = - clCreateSubBuffer(gInBuffer, CL_MEM_READ_ONLY, - CL_BUFFER_CREATE_TYPE_REGION, &region, &error); - if (error || NULL == test_info.tinfo[i].inBuf) - { - vlog_error("Error: Unable to create sub-buffer of gInBuffer for " - "region {%zd, %zd}\n", - region.origin, region.size); - goto exit; - } - test_info.tinfo[i].inBuf2 = - clCreateSubBuffer(gInBuffer2, CL_MEM_READ_ONLY, - CL_BUFFER_CREATE_TYPE_REGION, &region, &error); - if (error || NULL == test_info.tinfo[i].inBuf2) - { - vlog_error("Error: Unable to create sub-buffer of gInBuffer2 for " - "region {%zd, %zd}\n", - region.origin, region.size); - goto exit; - } - - for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) - { - test_info.tinfo[i].outBuf[j] = clCreateSubBuffer( - gOutBuffer[j], CL_MEM_WRITE_ONLY, CL_BUFFER_CREATE_TYPE_REGION, - &region, &error); - if (error || NULL == test_info.tinfo[i].outBuf[j]) - { - vlog_error("Error: Unable to create sub-buffer of " - "gOutBuffer[%d] for region {%zd, %zd}\n", - (int)j, region.origin, region.size); - goto exit; - } - } - test_info.tinfo[i].tQueue = - clCreateCommandQueue(gContext, gDevice, 0, &error); - if (NULL == test_info.tinfo[i].tQueue || error) - { - vlog_error("clCreateCommandQueue failed. 
(%d)\n", error); - goto exit; - } - - test_info.tinfo[i].d = init_genrand(genrand_int32(d)); - } - - // Init the kernels - { - BuildKernelInfo build_info = { - gMinVectorSizeIndex, test_info.threadCount, test_info.k, - test_info.programs, f->nameInCode, relaxedMode - }; - if ((error = ThreadPool_Do(BuildKernelFn, - gMaxVectorSizeIndex - gMinVectorSizeIndex, - &build_info))) - goto exit; - } - - // Run the kernels - if (!gSkipCorrectnessTesting) - { - error = ThreadPool_Do(Test, test_info.jobCount, &test_info); - - // Accumulate the arithmetic errors - for (cl_uint i = 0; i < test_info.threadCount; i++) - { - if (test_info.tinfo[i].maxError > maxError) - { - maxError = test_info.tinfo[i].maxError; - maxErrorVal = test_info.tinfo[i].maxErrorValue; - maxErrorVal2 = test_info.tinfo[i].maxErrorValue2; - } - } - - if (error) goto exit; - - if (gWimpyMode) - vlog("Wimp pass"); - else - vlog("passed"); - - vlog("\t%8.2f @ {%a, %a}", maxError, maxErrorVal, maxErrorVal2); - } - - vlog("\n"); - -exit: - // Release - for (auto i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++) - { - clReleaseProgram(test_info.programs[i]); - if (test_info.k[i]) - { - for (cl_uint j = 0; j < test_info.threadCount; j++) - clReleaseKernel(test_info.k[i][j]); - - free(test_info.k[i]); - } - } - if (test_info.tinfo) - { - for (cl_uint i = 0; i < test_info.threadCount; i++) - { - free_mtdata(test_info.tinfo[i].d); - clReleaseMemObject(test_info.tinfo[i].inBuf); - clReleaseMemObject(test_info.tinfo[i].inBuf2); - for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) - clReleaseMemObject(test_info.tinfo[i].outBuf[j]); - clReleaseCommandQueue(test_info.tinfo[i].tQueue); - } - - free(test_info.tinfo); - } - - return error; -} - -static cl_int Test(cl_uint job_id, cl_uint thread_id, void *data) +cl_int Test(cl_uint job_id, cl_uint thread_id, void *data) { - const TestInfo *job = (const TestInfo *)data; + TestInfo *job = (TestInfo *)data; size_t buffer_elements = job->subBufferSize; size_t 
buffer_size = buffer_elements * sizeof(cl_double); cl_uint base = job_id * (cl_uint)job->step; - ThreadInfo *tinfo = job->tinfo + thread_id; + ThreadInfo *tinfo = &(job->tinfo[thread_id]); float ulps = job->ulps; dptr func = job->f->dfunc; int ftz = job->ftz; + bool relaxedMode = job->relaxedMode; MTdata d = tinfo->d; cl_int error; const char *name = job->f->name; @@ -569,7 +391,8 @@ static cl_int Test(cl_uint job_id, cl_uint thread_id, void *data) if ((error = clEnqueueUnmapMemObject(tinfo->tQueue, tinfo->outBuf[j], out[j], 0, NULL, NULL))) { - vlog_error("Error: clEnqueueMapBuffer failed! err: %d\n", error); + vlog_error("Error: clEnqueueUnmapMemObject failed! err: %d\n", + error); goto exit; } @@ -651,7 +474,7 @@ static cl_int Test(cl_uint job_id, cl_uint thread_id, void *data) float err = Bruteforce_Ulp_Error_Double(test, correct); int fail = !(fabsf(err) <= ulps); - if (fail && ftz) + if (fail && (ftz || relaxedMode)) { // retry per section 6.5.3.2 if (IsDoubleResultSubnormal(correct, ulps)) @@ -778,7 +601,7 @@ static cl_int Test(cl_uint job_id, cl_uint thread_id, void *data) { if (gVerboseBruteForce) { - vlog("base:%14u step:%10u scale:%10zu buf_elements:%10u ulps:%5.3f " + vlog("base:%14u step:%10u scale:%10u buf_elements:%10zu ulps:%5.3f " "ThreadCount:%2u\n", base, job->step, job->scale, buffer_elements, job->ulps, job->threadCount); @@ -793,3 +616,148 @@ static cl_int Test(cl_uint job_id, cl_uint thread_id, void *data) exit: return error; } + +} // anonymous namespace + +int TestFunc_Double_Double_Double_Operator(const Func *f, MTdata d, + bool relaxedMode) +{ + TestInfo test_info{}; + cl_int error; + float maxError = 0.0f; + double maxErrorVal = 0.0; + double maxErrorVal2 = 0.0; + + logFunctionInfo(f->name, sizeof(cl_double), relaxedMode); + + // Init test_info + test_info.threadCount = GetThreadCount(); + test_info.subBufferSize = BUFFER_SIZE + / (sizeof(cl_double) * RoundUpToNextPowerOfTwo(test_info.threadCount)); + test_info.scale = 
getTestScale(sizeof(cl_double)); + + test_info.step = (cl_uint)test_info.subBufferSize * test_info.scale; + if (test_info.step / test_info.subBufferSize != test_info.scale) + { + // there was overflow + test_info.jobCount = 1; + } + else + { + test_info.jobCount = (cl_uint)((1ULL << 32) / test_info.step); + } + + test_info.f = f; + test_info.ulps = f->double_ulps; + test_info.ftz = f->ftz || gForceFTZ; + + // cl_kernels aren't thread safe, so we make one for each vector size for + // every thread + for (auto i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++) + { + test_info.k[i].resize(test_info.threadCount, nullptr); + } + + test_info.tinfo.resize(test_info.threadCount); + for (cl_uint i = 0; i < test_info.threadCount; i++) + { + cl_buffer_region region = { + i * test_info.subBufferSize * sizeof(cl_double), + test_info.subBufferSize * sizeof(cl_double) + }; + test_info.tinfo[i].inBuf = + clCreateSubBuffer(gInBuffer, CL_MEM_READ_ONLY, + CL_BUFFER_CREATE_TYPE_REGION, &region, &error); + if (error || NULL == test_info.tinfo[i].inBuf) + { + vlog_error("Error: Unable to create sub-buffer of gInBuffer for " + "region {%zd, %zd}\n", + region.origin, region.size); + goto exit; + } + test_info.tinfo[i].inBuf2 = + clCreateSubBuffer(gInBuffer2, CL_MEM_READ_ONLY, + CL_BUFFER_CREATE_TYPE_REGION, &region, &error); + if (error || NULL == test_info.tinfo[i].inBuf2) + { + vlog_error("Error: Unable to create sub-buffer of gInBuffer2 for " + "region {%zd, %zd}\n", + region.origin, region.size); + goto exit; + } + + for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) + { + test_info.tinfo[i].outBuf[j] = clCreateSubBuffer( + gOutBuffer[j], CL_MEM_WRITE_ONLY, CL_BUFFER_CREATE_TYPE_REGION, + &region, &error); + if (error || NULL == test_info.tinfo[i].outBuf[j]) + { + vlog_error("Error: Unable to create sub-buffer of " + "gOutBuffer[%d] for region {%zd, %zd}\n", + (int)j, region.origin, region.size); + goto exit; + } + } + test_info.tinfo[i].tQueue = + 
clCreateCommandQueue(gContext, gDevice, 0, &error); + if (NULL == test_info.tinfo[i].tQueue || error) + { + vlog_error("clCreateCommandQueue failed. (%d)\n", error); + goto exit; + } + + test_info.tinfo[i].d = MTdataHolder(genrand_int32(d)); + } + + // Init the kernels + { + BuildKernelInfo build_info{ test_info.threadCount, test_info.k, + test_info.programs, f->nameInCode, + relaxedMode }; + if ((error = ThreadPool_Do(BuildKernelFn, + gMaxVectorSizeIndex - gMinVectorSizeIndex, + &build_info))) + goto exit; + } + + // Run the kernels + if (!gSkipCorrectnessTesting) + { + error = ThreadPool_Do(Test, test_info.jobCount, &test_info); + + // Accumulate the arithmetic errors + for (cl_uint i = 0; i < test_info.threadCount; i++) + { + if (test_info.tinfo[i].maxError > maxError) + { + maxError = test_info.tinfo[i].maxError; + maxErrorVal = test_info.tinfo[i].maxErrorValue; + maxErrorVal2 = test_info.tinfo[i].maxErrorValue2; + } + } + + if (error) goto exit; + + if (gWimpyMode) + vlog("Wimp pass"); + else + vlog("passed"); + + vlog("\t%8.2f @ {%a, %a}", maxError, maxErrorVal, maxErrorVal2); + } + + vlog("\n"); + +exit: + // Release + for (auto i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++) + { + for (auto &kernel : test_info.k[i]) + { + clReleaseKernel(kernel); + } + } + + return error; +} diff --git a/test_conformance/math_brute_force/binary_operator_float.cpp b/test_conformance/math_brute_force/binary_operator_float.cpp index ccaef604..1a28d8d8 100644 --- a/test_conformance/math_brute_force/binary_operator_float.cpp +++ b/test_conformance/math_brute_force/binary_operator_float.cpp @@ -14,15 +14,18 @@ // limitations under the License. 
// +#include "common.h" #include "function_list.h" #include "test_functions.h" #include "utility.h" #include <cstring> -static int BuildKernel(const char *operator_symbol, int vectorSize, - cl_uint kernel_count, cl_kernel *k, cl_program *p, - bool relaxedMode) +namespace { + +int BuildKernel(const char *operator_symbol, int vectorSize, + cl_uint kernel_count, cl_kernel *k, cl_program *p, + bool relaxedMode) { const char *c[] = { "__kernel void math_kernel", sizeNames[vectorSize], @@ -106,49 +109,49 @@ static int BuildKernel(const char *operator_symbol, int vectorSize, relaxedMode); } -typedef struct BuildKernelInfo -{ - cl_uint offset; // the first vector size to build - cl_uint kernel_count; - cl_kernel **kernels; - cl_program *programs; - const char *operator_symbol; - bool relaxedMode; // Whether to build with -cl-fast-relaxed-math. -} BuildKernelInfo; - -static cl_int BuildKernelFn(cl_uint job_id, cl_uint thread_id UNUSED, void *p) +cl_int BuildKernelFn(cl_uint job_id, cl_uint thread_id UNUSED, void *p) { BuildKernelInfo *info = (BuildKernelInfo *)p; - cl_uint i = info->offset + job_id; - return BuildKernel(info->operator_symbol, i, info->kernel_count, - info->kernels[i], info->programs + i, info->relaxedMode); + cl_uint vectorSize = gMinVectorSizeIndex + job_id; + return BuildKernel(info->nameInCode, vectorSize, info->threadCount, + info->kernels[vectorSize].data(), + &(info->programs[vectorSize]), info->relaxedMode); } // Thread specific data for a worker thread -typedef struct ThreadInfo +struct ThreadInfo { - cl_mem inBuf; // input buffer for the thread - cl_mem inBuf2; // input buffer for the thread - cl_mem outBuf[VECTOR_SIZE_COUNT]; // output buffers for the thread + // Input and output buffers for the thread + clMemWrapper inBuf; + clMemWrapper inBuf2; + Buffers outBuf; + float maxError; // max error value. Init to 0. double maxErrorValue; // position of the max error value (param 1). Init to 0. 
double maxErrorValue2; // position of the max error value (param 2). Init // to 0. - MTdata d; - cl_command_queue tQueue; // per thread command queue to improve performance -} ThreadInfo; + MTdataHolder d; + + // Per thread command queue to improve performance + clCommandQueueWrapper tQueue; +}; -typedef struct TestInfo +struct TestInfo { size_t subBufferSize; // Size of the sub-buffer in elements const Func *f; // A pointer to the function info - cl_program programs[VECTOR_SIZE_COUNT]; // programs for various vector sizes - cl_kernel - *k[VECTOR_SIZE_COUNT]; // arrays of thread-specific kernels for each - // worker thread: k[vector_size][thread_id] - ThreadInfo * - tinfo; // An array of thread specific information for each worker thread + + // Programs for various vector sizes. + Programs programs; + + // Thread-specific kernels for each vector size: + // k[vector_size][thread_id] + KernelMatrix k; + + // Array of thread specific information + std::vector<ThreadInfo> tinfo; + cl_uint threadCount; // Number of worker threads cl_uint jobCount; // Number of jobs cl_uint step; // step between each chunk and the next. @@ -159,10 +162,10 @@ typedef struct TestInfo // otherwise. 
// no special fields -} TestInfo; +}; // A table of more difficult cases to get right -static const float specialValues[] = { +const float specialValues[] = { -NAN, -INFINITY, -FLT_MAX, @@ -264,207 +267,23 @@ static const float specialValues[] = { +0.0f, }; -static const size_t specialValuesCount = +constexpr size_t specialValuesCount = sizeof(specialValues) / sizeof(specialValues[0]); -static cl_int Test(cl_uint job_id, cl_uint thread_id, void *data); - -int TestFunc_Float_Float_Float_Operator(const Func *f, MTdata d, - bool relaxedMode) -{ - TestInfo test_info; - cl_int error; - float maxError = 0.0f; - double maxErrorVal = 0.0; - double maxErrorVal2 = 0.0; - - logFunctionInfo(f->name, sizeof(cl_float), relaxedMode); - - // Init test_info - memset(&test_info, 0, sizeof(test_info)); - test_info.threadCount = GetThreadCount(); - test_info.subBufferSize = BUFFER_SIZE - / (sizeof(cl_float) * RoundUpToNextPowerOfTwo(test_info.threadCount)); - test_info.scale = getTestScale(sizeof(cl_float)); - - test_info.step = (cl_uint)test_info.subBufferSize * test_info.scale; - if (test_info.step / test_info.subBufferSize != test_info.scale) - { - // there was overflow - test_info.jobCount = 1; - } - else - { - test_info.jobCount = (cl_uint)((1ULL << 32) / test_info.step); - } - - test_info.f = f; - test_info.ulps = gIsEmbedded ? 
f->float_embedded_ulps : f->float_ulps; - test_info.ftz = - f->ftz || gForceFTZ || 0 == (CL_FP_DENORM & gFloatCapabilities); - test_info.relaxedMode = relaxedMode; - - // cl_kernels aren't thread safe, so we make one for each vector size for - // every thread - for (auto i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++) - { - size_t array_size = test_info.threadCount * sizeof(cl_kernel); - test_info.k[i] = (cl_kernel *)malloc(array_size); - if (NULL == test_info.k[i]) - { - vlog_error("Error: Unable to allocate storage for kernels!\n"); - error = CL_OUT_OF_HOST_MEMORY; - goto exit; - } - memset(test_info.k[i], 0, array_size); - } - test_info.tinfo = - (ThreadInfo *)malloc(test_info.threadCount * sizeof(*test_info.tinfo)); - if (NULL == test_info.tinfo) - { - vlog_error( - "Error: Unable to allocate storage for thread specific data.\n"); - error = CL_OUT_OF_HOST_MEMORY; - goto exit; - } - memset(test_info.tinfo, 0, - test_info.threadCount * sizeof(*test_info.tinfo)); - for (cl_uint i = 0; i < test_info.threadCount; i++) - { - cl_buffer_region region = { - i * test_info.subBufferSize * sizeof(cl_float), - test_info.subBufferSize * sizeof(cl_float) - }; - test_info.tinfo[i].inBuf = - clCreateSubBuffer(gInBuffer, CL_MEM_READ_ONLY, - CL_BUFFER_CREATE_TYPE_REGION, &region, &error); - if (error || NULL == test_info.tinfo[i].inBuf) - { - vlog_error("Error: Unable to create sub-buffer of gInBuffer for " - "region {%zd, %zd}\n", - region.origin, region.size); - goto exit; - } - test_info.tinfo[i].inBuf2 = - clCreateSubBuffer(gInBuffer2, CL_MEM_READ_ONLY, - CL_BUFFER_CREATE_TYPE_REGION, &region, &error); - if (error || NULL == test_info.tinfo[i].inBuf2) - { - vlog_error("Error: Unable to create sub-buffer of gInBuffer2 for " - "region {%zd, %zd}\n", - region.origin, region.size); - goto exit; - } - - for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) - { - test_info.tinfo[i].outBuf[j] = clCreateSubBuffer( - gOutBuffer[j], CL_MEM_READ_WRITE, 
CL_BUFFER_CREATE_TYPE_REGION, - &region, &error); - if (error || NULL == test_info.tinfo[i].outBuf[j]) - { - vlog_error("Error: Unable to create sub-buffer of " - "gOutBuffer[%d] for region {%zd, %zd}\n", - (int)j, region.origin, region.size); - goto exit; - } - } - test_info.tinfo[i].tQueue = - clCreateCommandQueue(gContext, gDevice, 0, &error); - if (NULL == test_info.tinfo[i].tQueue || error) - { - vlog_error("clCreateCommandQueue failed. (%d)\n", error); - goto exit; - } - - test_info.tinfo[i].d = init_genrand(genrand_int32(d)); - } - - // Init the kernels - { - BuildKernelInfo build_info = { - gMinVectorSizeIndex, test_info.threadCount, test_info.k, - test_info.programs, f->nameInCode, relaxedMode - }; - if ((error = ThreadPool_Do(BuildKernelFn, - gMaxVectorSizeIndex - gMinVectorSizeIndex, - &build_info))) - goto exit; - } - - // Run the kernels - if (!gSkipCorrectnessTesting) - { - error = ThreadPool_Do(Test, test_info.jobCount, &test_info); - - // Accumulate the arithmetic errors - for (cl_uint i = 0; i < test_info.threadCount; i++) - { - if (test_info.tinfo[i].maxError > maxError) - { - maxError = test_info.tinfo[i].maxError; - maxErrorVal = test_info.tinfo[i].maxErrorValue; - maxErrorVal2 = test_info.tinfo[i].maxErrorValue2; - } - } - - if (error) goto exit; - - if (gWimpyMode) - vlog("Wimp pass"); - else - vlog("passed"); - - vlog("\t%8.2f @ {%a, %a}", maxError, maxErrorVal, maxErrorVal2); - } - - vlog("\n"); - -exit: - // Release - for (auto i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++) - { - clReleaseProgram(test_info.programs[i]); - if (test_info.k[i]) - { - for (cl_uint j = 0; j < test_info.threadCount; j++) - clReleaseKernel(test_info.k[i][j]); - - free(test_info.k[i]); - } - } - if (test_info.tinfo) - { - for (cl_uint i = 0; i < test_info.threadCount; i++) - { - free_mtdata(test_info.tinfo[i].d); - clReleaseMemObject(test_info.tinfo[i].inBuf); - clReleaseMemObject(test_info.tinfo[i].inBuf2); - for (auto j = gMinVectorSizeIndex; j < 
gMaxVectorSizeIndex; j++) - clReleaseMemObject(test_info.tinfo[i].outBuf[j]); - clReleaseCommandQueue(test_info.tinfo[i].tQueue); - } - - free(test_info.tinfo); - } - - return error; -} - -static cl_int Test(cl_uint job_id, cl_uint thread_id, void *data) +cl_int Test(cl_uint job_id, cl_uint thread_id, void *data) { - const TestInfo *job = (const TestInfo *)data; + TestInfo *job = (TestInfo *)data; size_t buffer_elements = job->subBufferSize; size_t buffer_size = buffer_elements * sizeof(cl_float); cl_uint base = job_id * (cl_uint)job->step; - ThreadInfo *tinfo = job->tinfo + thread_id; + ThreadInfo *tinfo = &(job->tinfo[thread_id]); fptr func = job->f->func; int ftz = job->ftz; bool relaxedMode = job->relaxedMode; float ulps = getAllowedUlpError(job->f, relaxedMode); MTdata d = tinfo->d; cl_int error; - cl_uchar *overflow = (cl_uchar *)malloc(buffer_size); + std::vector<bool> overflow(buffer_elements, false); const char *name = job->f->name; cl_uint *t = 0; cl_float *r = 0; @@ -584,7 +403,8 @@ static cl_int Test(cl_uint job_id, cl_uint thread_id, void *data) if ((error = clEnqueueUnmapMemObject(tinfo->tQueue, tinfo->outBuf[j], out[j], 0, NULL, NULL))) { - vlog_error("Error: clEnqueueMapBuffer failed! err: %d\n", error); + vlog_error("Error: clEnqueueUnmapMemObject failed! 
err: %d\n", + error); goto exit; } @@ -627,14 +447,13 @@ static cl_int Test(cl_uint job_id, cl_uint thread_id, void *data) if (gSkipCorrectnessTesting) { - free(overflow); return CL_SUCCESS; } // Calculate the correctly rounded reference result FPU_mode_type oldMode; memset(&oldMode, 0, sizeof(oldMode)); - if (ftz) ForceFTZ(&oldMode); + if (ftz || relaxedMode) ForceFTZ(&oldMode); // Set the rounding mode to match the device oldRoundMode = kRoundToNearestEven; @@ -662,7 +481,7 @@ static cl_int Test(cl_uint job_id, cl_uint thread_id, void *data) if (gIsInRTZMode) (void)set_round(oldRoundMode, kfloat); - if (ftz) RestoreFPState(&oldMode); + if (ftz || relaxedMode) RestoreFPState(&oldMode); // Read the data back -- no need to wait for the first N-1 buffers but wait // for the last buffer. This is an in order queue. @@ -719,7 +538,7 @@ static cl_int Test(cl_uint job_id, cl_uint thread_id, void *data) ((!(fabsf(err) <= ulps)) && (!(fabsf(errB) <= ulps))); if (fabsf(errB) < fabsf(err)) err = errB; - if (fail && ftz) + if (fail && (ftz || relaxedMode)) { // retry per section 6.5.3.2 if (IsFloatResultSubnormal(correct, ulps)) @@ -879,7 +698,7 @@ static cl_int Test(cl_uint job_id, cl_uint thread_id, void *data) if (fail) { vlog_error("\nERROR: %s%s: %f ulp error at {%a, %a}: *%a " - "vs. %a (0x%8.8x) at index: %d\n", + "vs. 
%a (0x%8.8x) at index: %zu\n", name, sizeNames[k], err, s[j], s2[j], r[j], test, ((cl_uint *)&test)[0], j); error = -1; @@ -907,7 +726,7 @@ static cl_int Test(cl_uint job_id, cl_uint thread_id, void *data) { if (gVerboseBruteForce) { - vlog("base:%14u step:%10u scale:%10zu buf_elements:%10u ulps:%5.3f " + vlog("base:%14u step:%10u scale:%10u buf_elements:%10zu ulps:%5.3f " "ThreadCount:%2u\n", base, job->step, job->scale, buffer_elements, job->ulps, job->threadCount); @@ -920,6 +739,152 @@ static cl_int Test(cl_uint job_id, cl_uint thread_id, void *data) } exit: - if (overflow) free(overflow); + return error; +} + +} // anonymous namespace + +int TestFunc_Float_Float_Float_Operator(const Func *f, MTdata d, + bool relaxedMode) +{ + TestInfo test_info{}; + cl_int error; + float maxError = 0.0f; + double maxErrorVal = 0.0; + double maxErrorVal2 = 0.0; + + logFunctionInfo(f->name, sizeof(cl_float), relaxedMode); + + // Init test_info + test_info.threadCount = GetThreadCount(); + test_info.subBufferSize = BUFFER_SIZE + / (sizeof(cl_float) * RoundUpToNextPowerOfTwo(test_info.threadCount)); + test_info.scale = getTestScale(sizeof(cl_float)); + + test_info.step = (cl_uint)test_info.subBufferSize * test_info.scale; + if (test_info.step / test_info.subBufferSize != test_info.scale) + { + // there was overflow + test_info.jobCount = 1; + } + else + { + test_info.jobCount = (cl_uint)((1ULL << 32) / test_info.step); + } + + test_info.f = f; + test_info.ulps = gIsEmbedded ? 
f->float_embedded_ulps : f->float_ulps; + test_info.ftz = + f->ftz || gForceFTZ || 0 == (CL_FP_DENORM & gFloatCapabilities); + test_info.relaxedMode = relaxedMode; + + // cl_kernels aren't thread safe, so we make one for each vector size for + // every thread + for (auto i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++) + { + test_info.k[i].resize(test_info.threadCount, nullptr); + } + + test_info.tinfo.resize(test_info.threadCount); + for (cl_uint i = 0; i < test_info.threadCount; i++) + { + cl_buffer_region region = { + i * test_info.subBufferSize * sizeof(cl_float), + test_info.subBufferSize * sizeof(cl_float) + }; + test_info.tinfo[i].inBuf = + clCreateSubBuffer(gInBuffer, CL_MEM_READ_ONLY, + CL_BUFFER_CREATE_TYPE_REGION, &region, &error); + if (error || NULL == test_info.tinfo[i].inBuf) + { + vlog_error("Error: Unable to create sub-buffer of gInBuffer for " + "region {%zd, %zd}\n", + region.origin, region.size); + goto exit; + } + test_info.tinfo[i].inBuf2 = + clCreateSubBuffer(gInBuffer2, CL_MEM_READ_ONLY, + CL_BUFFER_CREATE_TYPE_REGION, &region, &error); + if (error || NULL == test_info.tinfo[i].inBuf2) + { + vlog_error("Error: Unable to create sub-buffer of gInBuffer2 for " + "region {%zd, %zd}\n", + region.origin, region.size); + goto exit; + } + + for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) + { + test_info.tinfo[i].outBuf[j] = clCreateSubBuffer( + gOutBuffer[j], CL_MEM_READ_WRITE, CL_BUFFER_CREATE_TYPE_REGION, + &region, &error); + if (error || NULL == test_info.tinfo[i].outBuf[j]) + { + vlog_error("Error: Unable to create sub-buffer of " + "gOutBuffer[%d] for region {%zd, %zd}\n", + (int)j, region.origin, region.size); + goto exit; + } + } + test_info.tinfo[i].tQueue = + clCreateCommandQueue(gContext, gDevice, 0, &error); + if (NULL == test_info.tinfo[i].tQueue || error) + { + vlog_error("clCreateCommandQueue failed. 
(%d)\n", error); + goto exit; + } + + test_info.tinfo[i].d = MTdataHolder(genrand_int32(d)); + } + + // Init the kernels + { + BuildKernelInfo build_info{ test_info.threadCount, test_info.k, + test_info.programs, f->nameInCode, + relaxedMode }; + if ((error = ThreadPool_Do(BuildKernelFn, + gMaxVectorSizeIndex - gMinVectorSizeIndex, + &build_info))) + goto exit; + } + + // Run the kernels + if (!gSkipCorrectnessTesting) + { + error = ThreadPool_Do(Test, test_info.jobCount, &test_info); + + // Accumulate the arithmetic errors + for (cl_uint i = 0; i < test_info.threadCount; i++) + { + if (test_info.tinfo[i].maxError > maxError) + { + maxError = test_info.tinfo[i].maxError; + maxErrorVal = test_info.tinfo[i].maxErrorValue; + maxErrorVal2 = test_info.tinfo[i].maxErrorValue2; + } + } + + if (error) goto exit; + + if (gWimpyMode) + vlog("Wimp pass"); + else + vlog("passed"); + + vlog("\t%8.2f @ {%a, %a}", maxError, maxErrorVal, maxErrorVal2); + } + + vlog("\n"); + +exit: + // Release + for (auto i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++) + { + for (auto &kernel : test_info.k[i]) + { + clReleaseKernel(kernel); + } + } + return error; } diff --git a/test_conformance/math_brute_force/binary_two_results_i_double.cpp b/test_conformance/math_brute_force/binary_two_results_i_double.cpp index 14f41092..bbfd707b 100644 --- a/test_conformance/math_brute_force/binary_two_results_i_double.cpp +++ b/test_conformance/math_brute_force/binary_two_results_i_double.cpp @@ -14,15 +14,19 @@ // limitations under the License. 
// +#include "common.h" #include "function_list.h" #include "test_functions.h" #include "utility.h" +#include <cinttypes> #include <climits> #include <cstring> -static int BuildKernel(const char *name, int vectorSize, cl_kernel *k, - cl_program *p, bool relaxedMode) +namespace { + +int BuildKernel(const char *name, int vectorSize, cl_kernel *k, cl_program *p, + bool relaxedMode) { const char *c[] = { "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n", "__kernel void math_kernel", @@ -115,24 +119,23 @@ static int BuildKernel(const char *name, int vectorSize, cl_kernel *k, return MakeKernel(kern, (cl_uint)kernSize, testName, k, p, relaxedMode); } -typedef struct BuildKernelInfo +struct BuildKernelInfo2 { - cl_uint offset; // the first vector size to build cl_kernel *kernels; - cl_program *programs; + Programs &programs; const char *nameInCode; bool relaxedMode; // Whether to build with -cl-fast-relaxed-math. -} BuildKernelInfo; +}; -static cl_int BuildKernelFn(cl_uint job_id, cl_uint thread_id UNUSED, void *p) +cl_int BuildKernelFn(cl_uint job_id, cl_uint thread_id UNUSED, void *p) { - BuildKernelInfo *info = (BuildKernelInfo *)p; - cl_uint i = info->offset + job_id; - return BuildKernel(info->nameInCode, i, info->kernels + i, - info->programs + i, info->relaxedMode); + BuildKernelInfo2 *info = (BuildKernelInfo2 *)p; + cl_uint vectorSize = gMinVectorSizeIndex + job_id; + return BuildKernel(info->nameInCode, vectorSize, info->kernels + vectorSize, + &(info->programs[vectorSize]), info->relaxedMode); } -typedef struct ComputeReferenceInfoD_ +struct ComputeReferenceInfoD { const double *x; const double *y; @@ -141,9 +144,9 @@ typedef struct ComputeReferenceInfoD_ long double (*f_ffpI)(long double, long double, int *); cl_uint lim; cl_uint count; -} ComputeReferenceInfoD; +}; -static cl_int ReferenceD(cl_uint jid, cl_uint tid, void *userInfo) +cl_int ReferenceD(cl_uint jid, cl_uint tid, void *userInfo) { ComputeReferenceInfoD *cri = (ComputeReferenceInfoD *)userInfo; 
cl_uint lim = cri->lim; @@ -165,10 +168,12 @@ static cl_int ReferenceD(cl_uint jid, cl_uint tid, void *userInfo) return CL_SUCCESS; } +} // anonymous namespace + int TestFunc_DoubleI_Double_Double(const Func *f, MTdata d, bool relaxedMode) { int error; - cl_program programs[VECTOR_SIZE_COUNT]; + Programs programs; cl_kernel kernels[VECTOR_SIZE_COUNT]; float maxError = 0.0f; int64_t maxError2 = 0; @@ -187,8 +192,8 @@ int TestFunc_DoubleI_Double_Double(const Func *f, MTdata d, bool relaxedMode) // Init the kernels { - BuildKernelInfo build_info = { gMinVectorSizeIndex, kernels, programs, - f->nameInCode, relaxedMode }; + BuildKernelInfo2 build_info{ kernels, programs, f->nameInCode, + relaxedMode }; if ((error = ThreadPool_Do(BuildKernelFn, gMaxVectorSizeIndex - gMinVectorSizeIndex, &build_info))) @@ -375,7 +380,7 @@ int TestFunc_DoubleI_Double_Double(const Func *f, MTdata d, bool relaxedMode) if (iptrUndefined) iErr = 0; int fail = !(fabsf(err) <= f->double_ulps && iErr == 0); - if (ftz && fail) + if ((ftz || relaxedMode) && fail) { // retry per section 6.5.3.2 if (IsDoubleResultSubnormal(correct, f->double_ulps)) @@ -523,17 +528,20 @@ int TestFunc_DoubleI_Double_Double(const Func *f, MTdata d, bool relaxedMode) if (fail) { - vlog_error( - "\nERROR: %sD%s: {%f, %lld} ulp error at {%.13la, " - "%.13la} ({ 0x%16.16llx, 0x%16.16llx}): *{%.13la, " - "%d} ({ 0x%16.16llx, 0x%8.8x}) vs. {%.13la, %d} ({ " - "0x%16.16llx, 0x%8.8x})\n", - f->name, sizeNames[k], err, iErr, ((double *)gIn)[j], - ((double *)gIn2)[j], ((cl_ulong *)gIn)[j], - ((cl_ulong *)gIn2)[j], ((double *)gOut_Ref)[j], - ((int *)gOut_Ref2)[j], ((cl_ulong *)gOut_Ref)[j], - ((cl_uint *)gOut_Ref2)[j], test, q2[j], - ((cl_ulong *)q)[j], ((cl_uint *)q2)[j]); + vlog_error("\nERROR: %sD%s: {%f, %" PRId64 + "} ulp error at {%.13la, " + "%.13la} ({ 0x%16.16" PRIx64 ", 0x%16.16" PRIx64 + "}): *{%.13la, " + "%d} ({ 0x%16.16" PRIx64 + ", 0x%8.8x}) vs. 
{%.13la, %d} ({ " + "0x%16.16" PRIx64 ", 0x%8.8x})\n", + f->name, sizeNames[k], err, iErr, + ((double *)gIn)[j], ((double *)gIn2)[j], + ((cl_ulong *)gIn)[j], ((cl_ulong *)gIn2)[j], + ((double *)gOut_Ref)[j], ((int *)gOut_Ref2)[j], + ((cl_ulong *)gOut_Ref)[j], + ((cl_uint *)gOut_Ref2)[j], test, q2[j], + ((cl_ulong *)q)[j], ((cl_uint *)q2)[j]); error = -1; goto exit; } @@ -544,8 +552,9 @@ int TestFunc_DoubleI_Double_Double(const Func *f, MTdata d, bool relaxedMode) { if (gVerboseBruteForce) { - vlog("base:%14u step:%10zu bufferSize:%10zd \n", i, step, - BUFFER_SIZE); + vlog("base:%14" PRIu64 " step:%10" PRIu64 + " bufferSize:%10d \n", + i, step, BUFFER_SIZE); } else { @@ -562,8 +571,8 @@ int TestFunc_DoubleI_Double_Double(const Func *f, MTdata d, bool relaxedMode) else vlog("passed"); - vlog("\t{%8.2f, %lld} @ {%a, %a}", maxError, maxError2, maxErrorVal, - maxErrorVal2); + vlog("\t{%8.2f, %" PRId64 "} @ {%a, %a}", maxError, maxError2, + maxErrorVal, maxErrorVal2); } vlog("\n"); @@ -573,7 +582,6 @@ exit: for (auto k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++) { clReleaseKernel(kernels[k]); - clReleaseProgram(programs[k]); } return error; diff --git a/test_conformance/math_brute_force/binary_two_results_i_float.cpp b/test_conformance/math_brute_force/binary_two_results_i_float.cpp index 5ef44b6e..07473376 100644 --- a/test_conformance/math_brute_force/binary_two_results_i_float.cpp +++ b/test_conformance/math_brute_force/binary_two_results_i_float.cpp @@ -14,15 +14,19 @@ // limitations under the License. 
// +#include "common.h" #include "function_list.h" #include "test_functions.h" #include "utility.h" +#include <cinttypes> #include <climits> #include <cstring> -static int BuildKernel(const char *name, int vectorSize, cl_kernel *k, - cl_program *p, bool relaxedMode) +namespace { + +int BuildKernel(const char *name, int vectorSize, cl_kernel *k, cl_program *p, + bool relaxedMode) { const char *c[] = { "__kernel void math_kernel", sizeNames[vectorSize], @@ -113,24 +117,23 @@ static int BuildKernel(const char *name, int vectorSize, cl_kernel *k, return MakeKernel(kern, (cl_uint)kernSize, testName, k, p, relaxedMode); } -typedef struct BuildKernelInfo +struct BuildKernelInfo2 { - cl_uint offset; // the first vector size to build cl_kernel *kernels; - cl_program *programs; + Programs &programs; const char *nameInCode; bool relaxedMode; // Whether to build with -cl-fast-relaxed-math. -} BuildKernelInfo; +}; -static cl_int BuildKernelFn(cl_uint job_id, cl_uint thread_id UNUSED, void *p) +cl_int BuildKernelFn(cl_uint job_id, cl_uint thread_id UNUSED, void *p) { - BuildKernelInfo *info = (BuildKernelInfo *)p; - cl_uint i = info->offset + job_id; - return BuildKernel(info->nameInCode, i, info->kernels + i, - info->programs + i, info->relaxedMode); + BuildKernelInfo2 *info = (BuildKernelInfo2 *)p; + cl_uint vectorSize = gMinVectorSizeIndex + job_id; + return BuildKernel(info->nameInCode, vectorSize, info->kernels + vectorSize, + &(info->programs[vectorSize]), info->relaxedMode); } -typedef struct ComputeReferenceInfoF_ +struct ComputeReferenceInfoF { const float *x; const float *y; @@ -139,9 +142,9 @@ typedef struct ComputeReferenceInfoF_ double (*f_ffpI)(double, double, int *); cl_uint lim; cl_uint count; -} ComputeReferenceInfoF; +}; -static cl_int ReferenceF(cl_uint jid, cl_uint tid, void *userInfo) +cl_int ReferenceF(cl_uint jid, cl_uint tid, void *userInfo) { ComputeReferenceInfoF *cri = (ComputeReferenceInfoF *)userInfo; cl_uint lim = cri->lim; @@ -161,13 +164,15 @@ 
static cl_int ReferenceF(cl_uint jid, cl_uint tid, void *userInfo) return CL_SUCCESS; } +} // anonymous namespace + int TestFunc_FloatI_Float_Float(const Func *f, MTdata d, bool relaxedMode) { int error; logFunctionInfo(f->name, sizeof(cl_float), relaxedMode); - cl_program programs[VECTOR_SIZE_COUNT]; + Programs programs; cl_kernel kernels[VECTOR_SIZE_COUNT]; float maxError = 0.0f; int ftz = f->ftz || gForceFTZ || 0 == (CL_FP_DENORM & gFloatCapabilities); @@ -188,8 +193,8 @@ int TestFunc_FloatI_Float_Float(const Func *f, MTdata d, bool relaxedMode) // Init the kernels { - BuildKernelInfo build_info = { gMinVectorSizeIndex, kernels, programs, - f->nameInCode, relaxedMode }; + BuildKernelInfo2 build_info{ kernels, programs, f->nameInCode, + relaxedMode }; if ((error = ThreadPool_Do(BuildKernelFn, gMaxVectorSizeIndex - gMinVectorSizeIndex, &build_info))) @@ -375,7 +380,7 @@ int TestFunc_FloatI_Float_Float(const Func *f, MTdata d, bool relaxedMode) if (iptrUndefined) iErr = 0; int fail = !(fabsf(err) <= float_ulps && iErr == 0); - if (ftz && fail) + if ((ftz || relaxedMode) && fail) { // retry per section 6.5.3.2 if (IsFloatResultSubnormal(correct, float_ulps)) @@ -509,16 +514,17 @@ int TestFunc_FloatI_Float_Float(const Func *f, MTdata d, bool relaxedMode) if (fail) { - vlog_error( - "\nERROR: %s%s: {%f, %lld} ulp error at {%a, %a} " - "({0x%8.8x, 0x%8.8x}): *{%a, %d} ({0x%8.8x, " - "0x%8.8x}) vs. {%a, %d} ({0x%8.8x, 0x%8.8x})\n", - f->name, sizeNames[k], err, iErr, ((float *)gIn)[j], - ((float *)gIn2)[j], ((cl_uint *)gIn)[j], - ((cl_uint *)gIn2)[j], ((float *)gOut_Ref)[j], - ((int *)gOut_Ref2)[j], ((cl_uint *)gOut_Ref)[j], - ((cl_uint *)gOut_Ref2)[j], test, q2[j], - ((cl_uint *)&test)[0], ((cl_uint *)q2)[j]); + vlog_error("\nERROR: %s%s: {%f, %" PRId64 + "} ulp error at {%a, %a} " + "({0x%8.8x, 0x%8.8x}): *{%a, %d} ({0x%8.8x, " + "0x%8.8x}) vs. 
{%a, %d} ({0x%8.8x, 0x%8.8x})\n", + f->name, sizeNames[k], err, iErr, + ((float *)gIn)[j], ((float *)gIn2)[j], + ((cl_uint *)gIn)[j], ((cl_uint *)gIn2)[j], + ((float *)gOut_Ref)[j], ((int *)gOut_Ref2)[j], + ((cl_uint *)gOut_Ref)[j], + ((cl_uint *)gOut_Ref2)[j], test, q2[j], + ((cl_uint *)&test)[0], ((cl_uint *)q2)[j]); error = -1; goto exit; } @@ -529,8 +535,9 @@ int TestFunc_FloatI_Float_Float(const Func *f, MTdata d, bool relaxedMode) { if (gVerboseBruteForce) { - vlog("base:%14u step:%10zu bufferSize:%10zd \n", i, step, - BUFFER_SIZE); + vlog("base:%14" PRIu64 " step:%10" PRIu64 + " bufferSize:%10d \n", + i, step, BUFFER_SIZE); } else { @@ -547,8 +554,8 @@ int TestFunc_FloatI_Float_Float(const Func *f, MTdata d, bool relaxedMode) else vlog("passed"); - vlog("\t{%8.2f, %lld} @ {%a, %a}", maxError, maxError2, maxErrorVal, - maxErrorVal2); + vlog("\t{%8.2f, %" PRId64 "} @ {%a, %a}", maxError, maxError2, + maxErrorVal, maxErrorVal2); } vlog("\n"); @@ -558,7 +565,6 @@ exit: for (auto k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++) { clReleaseKernel(kernels[k]); - clReleaseProgram(programs[k]); } return error; diff --git a/test_conformance/math_brute_force/common.cpp b/test_conformance/math_brute_force/common.cpp new file mode 100644 index 00000000..f5e9f993 --- /dev/null +++ b/test_conformance/math_brute_force/common.cpp @@ -0,0 +1,170 @@ +// +// Copyright (c) 2022 The Khronos Group Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+// + +#include "common.h" + +#include "utility.h" // for sizeNames and sizeValues. + +#include <sstream> +#include <string> + +namespace { + +const char *GetTypeName(ParameterType type) +{ + switch (type) + { + case ParameterType::Float: return "float"; + case ParameterType::Double: return "double"; + } + return nullptr; +} + +const char *GetUndefValue(ParameterType type) +{ + switch (type) + { + case ParameterType::Float: + case ParameterType::Double: return "NAN"; + } + return nullptr; +} + +void EmitDefineType(std::ostringstream &kernel, const char *name, + ParameterType type, int vector_size_index) +{ + kernel << "#define " << name << " " << GetTypeName(type) + << sizeNames[vector_size_index] << '\n'; + kernel << "#define " << name << "_SCALAR " << GetTypeName(type) << '\n'; +} + +void EmitDefineUndef(std::ostringstream &kernel, const char *name, + ParameterType type) +{ + kernel << "#define " << name << " " << GetUndefValue(type) << '\n'; +} + +void EmitEnableExtension(std::ostringstream &kernel, ParameterType type) +{ + switch (type) + { + case ParameterType::Double: + kernel << "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n"; + break; + + case ParameterType::Float: + // No extension required. + break; + } +} + +} // anonymous namespace + +std::string GetKernelName(int vector_size_index) +{ + return std::string("math_kernel") + sizeNames[vector_size_index]; +} + +std::string GetTernaryKernel(const std::string &kernel_name, + const char *builtin, ParameterType retType, + ParameterType type1, ParameterType type2, + ParameterType type3, int vector_size_index) +{ + // To keep the kernel code readable, use macros for types and undef values. 
+ std::ostringstream kernel; + EmitDefineType(kernel, "RETTYPE", retType, vector_size_index); + EmitDefineType(kernel, "TYPE1", type1, vector_size_index); + EmitDefineType(kernel, "TYPE2", type2, vector_size_index); + EmitDefineType(kernel, "TYPE3", type3, vector_size_index); + EmitDefineUndef(kernel, "UNDEF1", type1); + EmitDefineUndef(kernel, "UNDEF2", type2); + EmitDefineUndef(kernel, "UNDEF3", type3); + EmitEnableExtension(kernel, type1); + + // clang-format off + const char *kernel_nonvec3[] = { R"( +__kernel void )", kernel_name.c_str(), R"((__global RETTYPE* out, + __global TYPE1* in1, + __global TYPE2* in2, + __global TYPE3* in3) +{ + size_t i = get_global_id(0); + out[i] = )", builtin, R"((in1[i], in2[i], in3[i]); +} +)" }; + + const char *kernel_vec3[] = { R"( +__kernel void )", kernel_name.c_str(), R"((__global RETTYPE_SCALAR* out, + __global TYPE1_SCALAR* in1, + __global TYPE2_SCALAR* in2, + __global TYPE3_SCALAR* in3) +{ + size_t i = get_global_id(0); + + if (i + 1 < get_global_size(0)) + { + TYPE1 a = vload3(0, in1 + 3 * i); + TYPE2 b = vload3(0, in2 + 3 * i); + TYPE3 c = vload3(0, in3 + 3 * i); + RETTYPE res = )", builtin, R"((a, b, c); + vstore3(res, 0, out + 3 * i); + } + else + { + // Figure out how many elements are left over after + // BUFFER_SIZE % (3 * sizeof(type)). + // Assume power of two buffer size. 
+ size_t parity = i & 1; + TYPE1 a = (TYPE1)(UNDEF1, UNDEF1, UNDEF1); + TYPE2 b = (TYPE2)(UNDEF2, UNDEF2, UNDEF2); + TYPE3 c = (TYPE3)(UNDEF3, UNDEF3, UNDEF3); + switch (parity) + { + case 0: + a.y = in1[3 * i + 1]; + b.y = in2[3 * i + 1]; + c.y = in3[3 * i + 1]; + // fall through + case 1: + a.x = in1[3 * i]; + b.x = in2[3 * i]; + c.x = in3[3 * i]; + break; + } + + RETTYPE res = )", builtin, R"((a, b, c); + + switch (parity) + { + case 0: + out[3 * i + 1] = res.y; + // fall through + case 1: + out[3 * i] = res.x; + break; + } + } +} +)" }; + // clang-format on + + if (sizeValues[vector_size_index] != 3) + for (const auto &chunk : kernel_nonvec3) kernel << chunk; + else + for (const auto &chunk : kernel_vec3) kernel << chunk; + + return kernel.str(); +} diff --git a/test_conformance/math_brute_force/common.h b/test_conformance/math_brute_force/common.h new file mode 100644 index 00000000..143814ca --- /dev/null +++ b/test_conformance/math_brute_force/common.h @@ -0,0 +1,68 @@ +// +// Copyright (c) 2021 The Khronos Group Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +#ifndef COMMON_H +#define COMMON_H + +#include "harness/typeWrappers.h" +#include "utility.h" + +#include <array> +#include <string> +#include <vector> + +// Array of thread-specific kernels for each vector size. +using KernelMatrix = std::array<std::vector<cl_kernel>, VECTOR_SIZE_COUNT>; + +// Array of programs for each vector size. 
+using Programs = std::array<clProgramWrapper, VECTOR_SIZE_COUNT>; + +// Array of buffers for each vector size. +using Buffers = std::array<clMemWrapper, VECTOR_SIZE_COUNT>; + +// Types supported for kernel code generation. +enum class ParameterType +{ + Float, + Double, +}; + +// Return kernel name suffixed with vector size. +std::string GetKernelName(int vector_size_index); + +// Generate kernel code for the given builtin function/operator. +std::string GetTernaryKernel(const std::string &kernel_name, + const char *builtin, ParameterType retType, + ParameterType type1, ParameterType type2, + ParameterType type3, int vector_size_index); + +// Information to generate OpenCL kernels. +struct BuildKernelInfo +{ + // Number of kernels to build, one for each thread to avoid data races. + cl_uint threadCount; + + KernelMatrix &kernels; + + Programs &programs; + + // Function, macro or symbol tested by the kernel. + const char *nameInCode; + + // Whether to build with -cl-fast-relaxed-math. + bool relaxedMode; +}; + +#endif /* COMMON_H */ diff --git a/test_conformance/math_brute_force/function_list.cpp b/test_conformance/math_brute_force/function_list.cpp index 3edbb485..91736285 100644 --- a/test_conformance/math_brute_force/function_list.cpp +++ b/test_conformance/math_brute_force/function_list.cpp @@ -53,6 +53,7 @@ STRINGIFY(_name), _operator, { NULL }, { NULL }, { NULL }, _ulp, _ulp, \ _embedded_ulp, INFINITY, INFINITY, _rmode, RELAXED_OFF, _type \ } + #define unaryF NULL #define i_unaryF NULL #define unaryF_u NULL diff --git a/test_conformance/math_brute_force/function_list.h b/test_conformance/math_brute_force/function_list.h index 38f739ce..95a29459 100644 --- a/test_conformance/math_brute_force/function_list.h +++ b/test_conformance/math_brute_force/function_list.h @@ -30,7 +30,7 @@ #include "harness/mt19937.h" -typedef union fptr { +union fptr { void *p; double (*f_f)(double); double (*f_u)(cl_uint); @@ -45,9 +45,9 @@ typedef union fptr { double 
(*f_ffpI)(double, double, int *); double (*f_fff)(double, double, double); float (*f_fma)(float, float, float, int); -} fptr; +}; -typedef union dptr { +union dptr { void *p; long double (*f_f)(long double); long double (*f_u)(cl_ulong); @@ -59,20 +59,20 @@ typedef union dptr { long double (*f_fpI)(long double, int *); long double (*f_ffpI)(long double, long double, int *); long double (*f_fff)(long double, long double, long double); -} dptr; +}; struct Func; -typedef struct vtbl +struct vtbl { const char *type_name; int (*TestFunc)(const struct Func *, MTdata, bool); int (*DoubleTestFunc)( const struct Func *, MTdata, bool); // may be NULL if function is single precision only -} vtbl; +}; -typedef struct Func +struct Func { const char *name; // common name, to be used as an argument in the shell const char *nameInCode; // name as it appears in the __kernel, usually the @@ -88,7 +88,7 @@ typedef struct Func int ftz; int relaxed; const vtbl *vtbl_ptr; -} Func; +}; extern const Func functionList[]; diff --git a/test_conformance/math_brute_force/i_unary_double.cpp b/test_conformance/math_brute_force/i_unary_double.cpp index 4383fa8b..0cbcf86e 100644 --- a/test_conformance/math_brute_force/i_unary_double.cpp +++ b/test_conformance/math_brute_force/i_unary_double.cpp @@ -14,14 +14,18 @@ // limitations under the License. 
// +#include "common.h" #include "function_list.h" #include "test_functions.h" #include "utility.h" +#include <cinttypes> #include <cstring> -static int BuildKernel(const char *name, int vectorSize, cl_kernel *k, - cl_program *p, bool relaxedMode) +namespace { + +int BuildKernel(const char *name, int vectorSize, cl_kernel *k, cl_program *p, + bool relaxedMode) { const char *c[] = { "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n", "__kernel void math_kernel", @@ -100,27 +104,28 @@ static int BuildKernel(const char *name, int vectorSize, cl_kernel *k, return MakeKernel(kern, (cl_uint)kernSize, testName, k, p, relaxedMode); } -typedef struct BuildKernelInfo +struct BuildKernelInfo2 { - cl_uint offset; // the first vector size to build cl_kernel *kernels; - cl_program *programs; + Programs &programs; const char *nameInCode; bool relaxedMode; // Whether to build with -cl-fast-relaxed-math. -} BuildKernelInfo; +}; -static cl_int BuildKernelFn(cl_uint job_id, cl_uint thread_id UNUSED, void *p) +cl_int BuildKernelFn(cl_uint job_id, cl_uint thread_id UNUSED, void *p) { - BuildKernelInfo *info = (BuildKernelInfo *)p; - cl_uint i = info->offset + job_id; - return BuildKernel(info->nameInCode, i, info->kernels + i, - info->programs + i, info->relaxedMode); + BuildKernelInfo2 *info = (BuildKernelInfo2 *)p; + cl_uint vectorSize = gMinVectorSizeIndex + job_id; + return BuildKernel(info->nameInCode, vectorSize, info->kernels + vectorSize, + &(info->programs[vectorSize]), info->relaxedMode); } +} // anonymous namespace + int TestFunc_Int_Double(const Func *f, MTdata d, bool relaxedMode) { int error; - cl_program programs[VECTOR_SIZE_COUNT]; + Programs programs; cl_kernel kernels[VECTOR_SIZE_COUNT]; int ftz = f->ftz || gForceFTZ; uint64_t step = getTestStep(sizeof(cl_double), BUFFER_SIZE); @@ -138,8 +143,8 @@ int TestFunc_Int_Double(const Func *f, MTdata d, bool relaxedMode) // Init the kernels { - BuildKernelInfo build_info = { gMinVectorSizeIndex, kernels, programs, - 
f->nameInCode, relaxedMode }; + BuildKernelInfo2 build_info{ kernels, programs, f->nameInCode, + relaxedMode }; if ((error = ThreadPool_Do(BuildKernelFn, gMaxVectorSizeIndex - gMinVectorSizeIndex, &build_info))) @@ -244,7 +249,7 @@ int TestFunc_Int_Double(const Func *f, MTdata d, bool relaxedMode) // If we aren't getting the correctly rounded result if (t[j] != q[j]) { - if (ftz && IsDoubleSubnormal(s[j])) + if ((ftz || relaxedMode) && IsDoubleSubnormal(s[j])) { unsigned int correct0 = f->dfunc.i_f(0.0); unsigned int correct1 = f->dfunc.i_f(-0.0); @@ -267,8 +272,9 @@ int TestFunc_Int_Double(const Func *f, MTdata d, bool relaxedMode) { if (gVerboseBruteForce) { - vlog("base:%14u step:%10zu bufferSize:%10zd \n", i, step, - BUFFER_SIZE); + vlog("base:%14" PRIu64 " step:%10" PRIu64 + " bufferSize:%10d \n", + i, step, BUFFER_SIZE); } else { @@ -295,7 +301,6 @@ exit: for (auto k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++) { clReleaseKernel(kernels[k]); - clReleaseProgram(programs[k]); } return error; diff --git a/test_conformance/math_brute_force/i_unary_float.cpp b/test_conformance/math_brute_force/i_unary_float.cpp index c803aa32..90bb1e16 100644 --- a/test_conformance/math_brute_force/i_unary_float.cpp +++ b/test_conformance/math_brute_force/i_unary_float.cpp @@ -14,14 +14,18 @@ // limitations under the License. 
// +#include "common.h" #include "function_list.h" #include "test_functions.h" #include "utility.h" +#include <cinttypes> #include <cstring> -static int BuildKernel(const char *name, int vectorSize, cl_kernel *k, - cl_program *p, bool relaxedMode) +namespace { + +int BuildKernel(const char *name, int vectorSize, cl_kernel *k, cl_program *p, + bool relaxedMode) { const char *c[] = { "__kernel void math_kernel", sizeNames[vectorSize], @@ -98,27 +102,28 @@ static int BuildKernel(const char *name, int vectorSize, cl_kernel *k, return MakeKernel(kern, (cl_uint)kernSize, testName, k, p, relaxedMode); } -typedef struct BuildKernelInfo +struct BuildKernelInfo2 { - cl_uint offset; // the first vector size to build cl_kernel *kernels; - cl_program *programs; + Programs &programs; const char *nameInCode; bool relaxedMode; // Whether to build with -cl-fast-relaxed-math. -} BuildKernelInfo; +}; -static cl_int BuildKernelFn(cl_uint job_id, cl_uint thread_id UNUSED, void *p) +cl_int BuildKernelFn(cl_uint job_id, cl_uint thread_id UNUSED, void *p) { - BuildKernelInfo *info = (BuildKernelInfo *)p; - cl_uint i = info->offset + job_id; - return BuildKernel(info->nameInCode, i, info->kernels + i, - info->programs + i, info->relaxedMode); + BuildKernelInfo2 *info = (BuildKernelInfo2 *)p; + cl_uint vectorSize = gMinVectorSizeIndex + job_id; + return BuildKernel(info->nameInCode, vectorSize, info->kernels + vectorSize, + &(info->programs[vectorSize]), info->relaxedMode); } +} // anonymous namespace + int TestFunc_Int_Float(const Func *f, MTdata d, bool relaxedMode) { int error; - cl_program programs[VECTOR_SIZE_COUNT]; + Programs programs; cl_kernel kernels[VECTOR_SIZE_COUNT]; int ftz = f->ftz || gForceFTZ || 0 == (CL_FP_DENORM & gFloatCapabilities); uint64_t step = getTestStep(sizeof(float), BUFFER_SIZE); @@ -135,8 +140,8 @@ int TestFunc_Int_Float(const Func *f, MTdata d, bool relaxedMode) // Init the kernels { - BuildKernelInfo build_info = { gMinVectorSizeIndex, kernels, programs, - 
f->nameInCode, relaxedMode }; + BuildKernelInfo2 build_info{ kernels, programs, f->nameInCode, + relaxedMode }; if ((error = ThreadPool_Do(BuildKernelFn, gMaxVectorSizeIndex - gMinVectorSizeIndex, &build_info))) @@ -241,7 +246,7 @@ int TestFunc_Int_Float(const Func *f, MTdata d, bool relaxedMode) // If we aren't getting the correctly rounded result if (t[j] != q[j]) { - if (ftz && IsFloatSubnormal(s[j])) + if ((ftz || relaxedMode) && IsFloatSubnormal(s[j])) { unsigned int correct0 = f->func.i_f(0.0); unsigned int correct1 = f->func.i_f(-0.0); @@ -264,8 +269,9 @@ int TestFunc_Int_Float(const Func *f, MTdata d, bool relaxedMode) { if (gVerboseBruteForce) { - vlog("base:%14u step:%10zu bufferSize:%10zd \n", i, step, - BUFFER_SIZE); + vlog("base:%14" PRIu64 " step:%10" PRIu64 + " bufferSize:%10d \n", + i, step, BUFFER_SIZE); } else { @@ -291,7 +297,6 @@ exit: for (auto k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++) { clReleaseKernel(kernels[k]); - clReleaseProgram(programs[k]); } return error; diff --git a/test_conformance/math_brute_force/macro_binary_double.cpp b/test_conformance/math_brute_force/macro_binary_double.cpp index d09915f6..412f210b 100644 --- a/test_conformance/math_brute_force/macro_binary_double.cpp +++ b/test_conformance/math_brute_force/macro_binary_double.cpp @@ -14,14 +14,18 @@ // limitations under the License. 
// +#include "common.h" #include "function_list.h" #include "test_functions.h" #include "utility.h" +#include <cinttypes> #include <cstring> -static int BuildKernel(const char *name, int vectorSize, cl_uint kernel_count, - cl_kernel *k, cl_program *p, bool relaxedMode) +namespace { + +int BuildKernel(const char *name, int vectorSize, cl_uint kernel_count, + cl_kernel *k, cl_program *p, bool relaxedMode) { const char *c[] = { "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n", "__kernel void math_kernel", @@ -107,54 +111,55 @@ static int BuildKernel(const char *name, int vectorSize, cl_uint kernel_count, relaxedMode); } -typedef struct BuildKernelInfo -{ - cl_uint offset; // the first vector size to build - cl_uint kernel_count; - cl_kernel **kernels; - cl_program *programs; - const char *nameInCode; - bool relaxedMode; // Whether to build with -cl-fast-relaxed-math. -} BuildKernelInfo; - -static cl_int BuildKernelFn(cl_uint job_id, cl_uint thread_id UNUSED, void *p) +cl_int BuildKernelFn(cl_uint job_id, cl_uint thread_id UNUSED, void *p) { BuildKernelInfo *info = (BuildKernelInfo *)p; - cl_uint i = info->offset + job_id; - return BuildKernel(info->nameInCode, i, info->kernel_count, - info->kernels[i], info->programs + i, info->relaxedMode); + cl_uint vectorSize = gMinVectorSizeIndex + job_id; + return BuildKernel(info->nameInCode, vectorSize, info->threadCount, + info->kernels[vectorSize].data(), + &(info->programs[vectorSize]), info->relaxedMode); } // Thread specific data for a worker thread -typedef struct ThreadInfo +struct ThreadInfo { - cl_mem inBuf; // input buffer for the thread - cl_mem inBuf2; // input buffer for the thread - cl_mem outBuf[VECTOR_SIZE_COUNT]; // output buffers for the thread - MTdata d; - cl_command_queue tQueue; // per thread command queue to improve performance -} ThreadInfo; - -typedef struct TestInfo + // Input and output buffers for the thread + clMemWrapper inBuf; + clMemWrapper inBuf2; + Buffers outBuf; + + MTdataHolder d; + + // 
Per thread command queue to improve performance + clCommandQueueWrapper tQueue; +}; + +struct TestInfo { size_t subBufferSize; // Size of the sub-buffer in elements const Func *f; // A pointer to the function info - cl_program programs[VECTOR_SIZE_COUNT]; // programs for various vector sizes - cl_kernel - *k[VECTOR_SIZE_COUNT]; // arrays of thread-specific kernels for each - // worker thread: k[vector_size][thread_id] - ThreadInfo * - tinfo; // An array of thread specific information for each worker thread + + // Programs for various vector sizes. + Programs programs; + + // Thread-specific kernels for each vector size: + // k[vector_size][thread_id] + KernelMatrix k; + + // Array of thread specific information + std::vector<ThreadInfo> tinfo; + cl_uint threadCount; // Number of worker threads cl_uint jobCount; // Number of jobs cl_uint step; // step between each chunk and the next. cl_uint scale; // stride between individual test values int ftz; // non-zero if running in flush to zero mode - -} TestInfo; + bool relaxedMode; // True if test is running in relaxed mode, false + // otherwise. 
+}; // A table of more difficult cases to get right -static const double specialValues[] = { +const double specialValues[] = { -NAN, -INFINITY, -DBL_MAX, @@ -264,182 +269,19 @@ static const double specialValues[] = { +0.0, }; -static const size_t specialValuesCount = +constexpr size_t specialValuesCount = sizeof(specialValues) / sizeof(specialValues[0]); -static cl_int Test(cl_uint job_id, cl_uint thread_id, void *data); - -int TestMacro_Int_Double_Double(const Func *f, MTdata d, bool relaxedMode) +cl_int Test(cl_uint job_id, cl_uint thread_id, void *data) { - TestInfo test_info; - cl_int error; - - logFunctionInfo(f->name, sizeof(cl_double), relaxedMode); - - // Init test_info - memset(&test_info, 0, sizeof(test_info)); - test_info.threadCount = GetThreadCount(); - test_info.subBufferSize = BUFFER_SIZE - / (sizeof(cl_double) * RoundUpToNextPowerOfTwo(test_info.threadCount)); - test_info.scale = getTestScale(sizeof(cl_double)); - - test_info.step = (cl_uint)test_info.subBufferSize * test_info.scale; - if (test_info.step / test_info.subBufferSize != test_info.scale) - { - // there was overflow - test_info.jobCount = 1; - } - else - { - test_info.jobCount = (cl_uint)((1ULL << 32) / test_info.step); - } - - test_info.f = f; - test_info.ftz = f->ftz || gForceFTZ; - - // cl_kernels aren't thread safe, so we make one for each vector size for - // every thread - for (auto i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++) - { - size_t array_size = test_info.threadCount * sizeof(cl_kernel); - test_info.k[i] = (cl_kernel *)malloc(array_size); - if (NULL == test_info.k[i]) - { - vlog_error("Error: Unable to allocate storage for kernels!\n"); - error = CL_OUT_OF_HOST_MEMORY; - goto exit; - } - memset(test_info.k[i], 0, array_size); - } - test_info.tinfo = - (ThreadInfo *)malloc(test_info.threadCount * sizeof(*test_info.tinfo)); - if (NULL == test_info.tinfo) - { - vlog_error( - "Error: Unable to allocate storage for thread specific data.\n"); - error = 
CL_OUT_OF_HOST_MEMORY; - goto exit; - } - memset(test_info.tinfo, 0, - test_info.threadCount * sizeof(*test_info.tinfo)); - for (size_t i = 0; i < test_info.threadCount; i++) - { - cl_buffer_region region = { - i * test_info.subBufferSize * sizeof(cl_double), - test_info.subBufferSize * sizeof(cl_double) - }; - test_info.tinfo[i].inBuf = - clCreateSubBuffer(gInBuffer, CL_MEM_READ_ONLY, - CL_BUFFER_CREATE_TYPE_REGION, &region, &error); - if (error || NULL == test_info.tinfo[i].inBuf) - { - vlog_error("Error: Unable to create sub-buffer of gInBuffer for " - "region {%zd, %zd}\n", - region.origin, region.size); - goto exit; - } - test_info.tinfo[i].inBuf2 = - clCreateSubBuffer(gInBuffer2, CL_MEM_READ_ONLY, - CL_BUFFER_CREATE_TYPE_REGION, &region, &error); - if (error || NULL == test_info.tinfo[i].inBuf2) - { - vlog_error("Error: Unable to create sub-buffer of gInBuffer2 for " - "region {%zd, %zd}\n", - region.origin, region.size); - goto exit; - } - - for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) - { - test_info.tinfo[i].outBuf[j] = clCreateSubBuffer( - gOutBuffer[j], CL_MEM_WRITE_ONLY, CL_BUFFER_CREATE_TYPE_REGION, - &region, &error); - if (error || NULL == test_info.tinfo[i].outBuf[j]) - { - vlog_error("Error: Unable to create sub-buffer of " - "gOutBuffer[%d] for region {%zd, %zd}\n", - (int)j, region.origin, region.size); - goto exit; - } - } - test_info.tinfo[i].tQueue = - clCreateCommandQueue(gContext, gDevice, 0, &error); - if (NULL == test_info.tinfo[i].tQueue || error) - { - vlog_error("clCreateCommandQueue failed.
(%d)\n", error); - goto exit; - } - - test_info.tinfo[i].d = init_genrand(genrand_int32(d)); - } - - // Init the kernels - { - BuildKernelInfo build_info = { - gMinVectorSizeIndex, test_info.threadCount, test_info.k, - test_info.programs, f->nameInCode, relaxedMode - }; - if ((error = ThreadPool_Do(BuildKernelFn, - gMaxVectorSizeIndex - gMinVectorSizeIndex, - &build_info))) - goto exit; - } - - // Run the kernels - if (!gSkipCorrectnessTesting) - { - error = ThreadPool_Do(Test, test_info.jobCount, &test_info); - - if (error) goto exit; - - if (gWimpyMode) - vlog("Wimp pass"); - else - vlog("passed"); - } - - vlog("\n"); - -exit: - // Release - for (auto i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++) - { - clReleaseProgram(test_info.programs[i]); - if (test_info.k[i]) - { - for (cl_uint j = 0; j < test_info.threadCount; j++) - clReleaseKernel(test_info.k[i][j]); - - free(test_info.k[i]); - } - } - if (test_info.tinfo) - { - for (cl_uint i = 0; i < test_info.threadCount; i++) - { - free_mtdata(test_info.tinfo[i].d); - clReleaseMemObject(test_info.tinfo[i].inBuf); - clReleaseMemObject(test_info.tinfo[i].inBuf2); - for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) - clReleaseMemObject(test_info.tinfo[i].outBuf[j]); - clReleaseCommandQueue(test_info.tinfo[i].tQueue); - } - - free(test_info.tinfo); - } - - return error; -} - -static cl_int Test(cl_uint job_id, cl_uint thread_id, void *data) -{ - const TestInfo *job = (const TestInfo *)data; + TestInfo *job = (TestInfo *)data; size_t buffer_elements = job->subBufferSize; size_t buffer_size = buffer_elements * sizeof(cl_double); cl_uint base = job_id * (cl_uint)job->step; - ThreadInfo *tinfo = job->tinfo + thread_id; + ThreadInfo *tinfo = &(job->tinfo[thread_id]); dptr dfunc = job->f->dfunc; int ftz = job->ftz; + bool relaxedMode = job->relaxedMode; MTdata d = tinfo->d; cl_int error; const char *name = job->f->name; @@ -538,7 +380,8 @@ static cl_int Test(cl_uint job_id, cl_uint thread_id, void *data) 
if ((error = clEnqueueUnmapMemObject(tinfo->tQueue, tinfo->outBuf[j], out[j], 0, NULL, NULL))) { - vlog_error("Error: clEnqueueMapBuffer failed! err: %d\n", error); + vlog_error("Error: clEnqueueUnmapMemObject failed! err: %d\n", + error); goto exit; } @@ -613,7 +456,7 @@ static cl_int Test(cl_uint job_id, cl_uint thread_id, void *data) if (gMinVectorSizeIndex == 0 && t[j] != q[j]) { // If we aren't getting the correctly rounded result - if (ftz) + if (ftz || relaxedMode) { if (IsDoubleSubnormal(s[j])) { @@ -645,8 +488,9 @@ static cl_int Test(cl_uint job_id, cl_uint thread_id, void *data) cl_ulong err = t[j] - q[j]; if (q[j] > t[j]) err = q[j] - t[j]; - vlog_error("\nERROR: %s: %lld ulp error at {%.13la, %.13la}: *%lld " - "vs. %lld (index: %d)\n", + vlog_error("\nERROR: %s: %" PRId64 + " ulp error at {%.13la, %.13la}: *%" PRId64 " " + "vs. %" PRId64 " (index: %zu)\n", name, err, ((double *)s)[j], ((double *)s2)[j], t[j], q[j], j); error = -1; @@ -654,13 +498,14 @@ static cl_int Test(cl_uint job_id, cl_uint thread_id, void *data) } - for (auto k = MAX(1, gMinVectorSizeIndex); k < gMaxVectorSizeIndex; k++) + for (auto k = std::max(1U, gMinVectorSizeIndex); + k < gMaxVectorSizeIndex; k++) { q = (cl_long *)out[k]; // If we aren't getting the correctly rounded result if (-t[j] != q[j]) { - if (ftz) + if (ftz || relaxedMode) { if (IsDoubleSubnormal(s[j])) { @@ -692,8 +537,9 @@ static cl_int Test(cl_uint job_id, cl_uint thread_id, void *data) cl_ulong err = -t[j] - q[j]; if (q[j] > -t[j]) err = q[j] + t[j]; - vlog_error("\nERROR: %sD%s: %lld ulp error at {%.13la, " - "%.13la}: *%lld vs. %lld (index: %d)\n", + vlog_error("\nERROR: %sD%s: %" PRId64 " ulp error at {%.13la, " + "%.13la}: *%" PRId64 " vs. 
%" PRId64 + " (index: %zu)\n", name, sizeNames[k], err, ((double *)s)[j], ((double *)s2)[j], -t[j], q[j], j); error = -1; @@ -735,3 +581,131 @@ static cl_int Test(cl_uint job_id, cl_uint thread_id, void *data) exit: return error; } + +} // anonymous namespace + +int TestMacro_Int_Double_Double(const Func *f, MTdata d, bool relaxedMode) +{ + TestInfo test_info{}; + cl_int error; + + logFunctionInfo(f->name, sizeof(cl_double), relaxedMode); + + // Init test_info + test_info.threadCount = GetThreadCount(); + test_info.subBufferSize = BUFFER_SIZE + / (sizeof(cl_double) * RoundUpToNextPowerOfTwo(test_info.threadCount)); + test_info.scale = getTestScale(sizeof(cl_double)); + + test_info.step = (cl_uint)test_info.subBufferSize * test_info.scale; + if (test_info.step / test_info.subBufferSize != test_info.scale) + { + // there was overflow + test_info.jobCount = 1; + } + else + { + test_info.jobCount = (cl_uint)((1ULL << 32) / test_info.step); + } + + test_info.f = f; + test_info.ftz = f->ftz || gForceFTZ; + test_info.relaxedMode = relaxedMode; + + // cl_kernels aren't thread safe, so we make one for each vector size for + // every thread + for (auto i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++) + { + test_info.k[i].resize(test_info.threadCount, nullptr); + } + + test_info.tinfo.resize(test_info.threadCount); + for (cl_uint i = 0; i < test_info.threadCount; i++) + { + cl_buffer_region region = { + i * test_info.subBufferSize * sizeof(cl_double), + test_info.subBufferSize * sizeof(cl_double) + }; + test_info.tinfo[i].inBuf = + clCreateSubBuffer(gInBuffer, CL_MEM_READ_ONLY, + CL_BUFFER_CREATE_TYPE_REGION, &region, &error); + if (error || NULL == test_info.tinfo[i].inBuf) + { + vlog_error("Error: Unable to create sub-buffer of gInBuffer for " + "region {%zd, %zd}\n", + region.origin, region.size); + goto exit; + } + test_info.tinfo[i].inBuf2 = + clCreateSubBuffer(gInBuffer2, CL_MEM_READ_ONLY, + CL_BUFFER_CREATE_TYPE_REGION, &region, &error); + if (error || NULL ==
test_info.tinfo[i].inBuf2) + { + vlog_error("Error: Unable to create sub-buffer of gInBuffer2 for " + "region {%zd, %zd}\n", + region.origin, region.size); + goto exit; + } + + for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) + { + test_info.tinfo[i].outBuf[j] = clCreateSubBuffer( + gOutBuffer[j], CL_MEM_WRITE_ONLY, CL_BUFFER_CREATE_TYPE_REGION, + &region, &error); + if (error || NULL == test_info.tinfo[i].outBuf[j]) + { + vlog_error("Error: Unable to create sub-buffer of " + "gOutBuffer[%d] for region {%zd, %zd}\n", + (int)j, region.origin, region.size); + goto exit; + } + } + test_info.tinfo[i].tQueue = + clCreateCommandQueue(gContext, gDevice, 0, &error); + if (NULL == test_info.tinfo[i].tQueue || error) + { + vlog_error("clCreateCommandQueue failed. (%d)\n", error); + goto exit; + } + + test_info.tinfo[i].d = MTdataHolder(genrand_int32(d)); + } + + // Init the kernels + { + BuildKernelInfo build_info{ test_info.threadCount, test_info.k, + test_info.programs, f->nameInCode, + relaxedMode }; + if ((error = ThreadPool_Do(BuildKernelFn, + gMaxVectorSizeIndex - gMinVectorSizeIndex, + &build_info))) + goto exit; + } + + // Run the kernels + if (!gSkipCorrectnessTesting) + { + error = ThreadPool_Do(Test, test_info.jobCount, &test_info); + + if (error) goto exit; + + if (gWimpyMode) + vlog("Wimp pass"); + else + vlog("passed"); + } + + vlog("\n"); + +exit: + // Release + for (auto i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++) + { + for (auto &kernel : test_info.k[i]) + { + clReleaseKernel(kernel); + } + } + + return error; +} diff --git a/test_conformance/math_brute_force/macro_binary_float.cpp b/test_conformance/math_brute_force/macro_binary_float.cpp index c530cdaf..cb915fc7 100644 --- a/test_conformance/math_brute_force/macro_binary_float.cpp +++ b/test_conformance/math_brute_force/macro_binary_float.cpp @@ -14,14 +14,17 @@ // limitations under the License.
// +#include "common.h" #include "function_list.h" #include "test_functions.h" #include "utility.h" #include <cstring> -static int BuildKernel(const char *name, int vectorSize, cl_uint kernel_count, - cl_kernel *k, cl_program *p, bool relaxedMode) +namespace { + +int BuildKernel(const char *name, int vectorSize, cl_uint kernel_count, + cl_kernel *k, cl_program *p, bool relaxedMode) { const char *c[] = { "__kernel void math_kernel", sizeNames[vectorSize], @@ -105,54 +108,55 @@ static int BuildKernel(const char *name, int vectorSize, cl_uint kernel_count, relaxedMode); } -typedef struct BuildKernelInfo -{ - cl_uint offset; // the first vector size to build - cl_uint kernel_count; - cl_kernel **kernels; - cl_program *programs; - const char *nameInCode; - bool relaxedMode; // Whether to build with -cl-fast-relaxed-math. -} BuildKernelInfo; - -static cl_int BuildKernelFn(cl_uint job_id, cl_uint thread_id UNUSED, void *p) +cl_int BuildKernelFn(cl_uint job_id, cl_uint thread_id UNUSED, void *p) { BuildKernelInfo *info = (BuildKernelInfo *)p; - cl_uint i = info->offset + job_id; - return BuildKernel(info->nameInCode, i, info->kernel_count, - info->kernels[i], info->programs + i, info->relaxedMode); + cl_uint vectorSize = gMinVectorSizeIndex + job_id; + return BuildKernel(info->nameInCode, vectorSize, info->threadCount, + info->kernels[vectorSize].data(), + &(info->programs[vectorSize]), info->relaxedMode); } // Thread specific data for a worker thread -typedef struct ThreadInfo +struct ThreadInfo { - cl_mem inBuf; // input buffer for the thread - cl_mem inBuf2; // input buffer for the thread - cl_mem outBuf[VECTOR_SIZE_COUNT]; // output buffers for the thread - MTdata d; - cl_command_queue tQueue; // per thread command queue to improve performance -} ThreadInfo; - -typedef struct TestInfo + // Input and output buffers for the thread + clMemWrapper inBuf; + clMemWrapper inBuf2; + Buffers outBuf; + + MTdataHolder d; + + // Per thread command queue to improve performance + 
clCommandQueueWrapper tQueue; +}; + +struct TestInfo { size_t subBufferSize; // Size of the sub-buffer in elements const Func *f; // A pointer to the function info - cl_program programs[VECTOR_SIZE_COUNT]; // programs for various vector sizes - cl_kernel - *k[VECTOR_SIZE_COUNT]; // arrays of thread-specific kernels for each - // worker thread: k[vector_size][thread_id] - ThreadInfo * - tinfo; // An array of thread specific information for each worker thread + + // Programs for various vector sizes. + Programs programs; + + // Thread-specific kernels for each vector size: + // k[vector_size][thread_id] + KernelMatrix k; + + // Array of thread specific information + std::vector<ThreadInfo> tinfo; + cl_uint threadCount; // Number of worker threads cl_uint jobCount; // Number of jobs cl_uint step; // step between each chunk and the next. cl_uint scale; // stride between individual test values int ftz; // non-zero if running in flush to zero mode - -} TestInfo; + bool relaxedMode; // True if test is running in relaxed mode, false + // otherwise. 
+}; // A table of more difficult cases to get right -static const float specialValues[] = { +const float specialValues[] = { -NAN, -INFINITY, -FLT_MAX, @@ -254,183 +258,19 @@ static const float specialValues[] = { +0.0f, }; -static const size_t specialValuesCount = +constexpr size_t specialValuesCount = sizeof(specialValues) / sizeof(specialValues[0]); -static cl_int Test(cl_uint job_id, cl_uint thread_id, void *data); - -int TestMacro_Int_Float_Float(const Func *f, MTdata d, bool relaxedMode) +cl_int Test(cl_uint job_id, cl_uint thread_id, void *data) { - TestInfo test_info; - cl_int error; - - logFunctionInfo(f->name, sizeof(cl_float), relaxedMode); - - // Init test_info - memset(&test_info, 0, sizeof(test_info)); - test_info.threadCount = GetThreadCount(); - test_info.subBufferSize = BUFFER_SIZE - / (sizeof(cl_float) * RoundUpToNextPowerOfTwo(test_info.threadCount)); - test_info.scale = getTestScale(sizeof(cl_float)); - - test_info.step = (cl_uint)test_info.subBufferSize * test_info.scale; - if (test_info.step / test_info.subBufferSize != test_info.scale) - { - // there was overflow - test_info.jobCount = 1; - } - else - { - test_info.jobCount = (cl_uint)((1ULL << 32) / test_info.step); - } - - test_info.f = f; - test_info.ftz = - f->ftz || gForceFTZ || 0 == (CL_FP_DENORM & gFloatCapabilities); - - // cl_kernels aren't thread safe, so we make one for each vector size for - // every thread - for (auto i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++) - { - size_t array_size = test_info.threadCount * sizeof(cl_kernel); - test_info.k[i] = (cl_kernel *)malloc(array_size); - if (NULL == test_info.k[i]) - { - vlog_error("Error: Unable to allocate storage for kernels!\n"); - error = CL_OUT_OF_HOST_MEMORY; - goto exit; - } - memset(test_info.k[i], 0, array_size); - } - test_info.tinfo = - (ThreadInfo *)malloc(test_info.threadCount * sizeof(*test_info.tinfo)); - if (NULL == test_info.tinfo) - { - vlog_error( - "Error: Unable to allocate storage for thread specific 
data.\n"); - error = CL_OUT_OF_HOST_MEMORY; - goto exit; - } - memset(test_info.tinfo, 0, - test_info.threadCount * sizeof(*test_info.tinfo)); - for (cl_uint i = 0; i < test_info.threadCount; i++) - { - cl_buffer_region region = { - i * test_info.subBufferSize * sizeof(cl_float), - test_info.subBufferSize * sizeof(cl_float) - }; - test_info.tinfo[i].inBuf = - clCreateSubBuffer(gInBuffer, CL_MEM_READ_ONLY, - CL_BUFFER_CREATE_TYPE_REGION, &region, &error); - if (error || NULL == test_info.tinfo[i].inBuf) - { - vlog_error("Error: Unable to create sub-buffer of gInBuffer for " - "region {%zd, %zd}\n", - region.origin, region.size); - goto exit; - } - test_info.tinfo[i].inBuf2 = - clCreateSubBuffer(gInBuffer2, CL_MEM_READ_ONLY, - CL_BUFFER_CREATE_TYPE_REGION, &region, &error); - if (error || NULL == test_info.tinfo[i].inBuf2) - { - vlog_error("Error: Unable to create sub-buffer of gInBuffer2 for " - "region {%zd, %zd}\n", - region.origin, region.size); - goto exit; - } - - for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) - { - test_info.tinfo[i].outBuf[j] = clCreateSubBuffer( - gOutBuffer[j], CL_MEM_WRITE_ONLY, CL_BUFFER_CREATE_TYPE_REGION, - &region, &error); - if (error || NULL == test_info.tinfo[i].outBuf[j]) - { - vlog_error("Error: Unable to create sub-buffer of " - "gOutBuffer[%d] for region {%zd, %zd}\n", - (int)j, region.origin, region.size); - goto exit; - } - } - test_info.tinfo[i].tQueue = - clCreateCommandQueue(gContext, gDevice, 0, &error); - if (NULL == test_info.tinfo[i].tQueue || error) - { - vlog_error("clCreateCommandQueue failed.
(%d)\n", error); - goto exit; - } - - test_info.tinfo[i].d = init_genrand(genrand_int32(d)); - } - - // Init the kernels - { - BuildKernelInfo build_info = { - gMinVectorSizeIndex, test_info.threadCount, test_info.k, - test_info.programs, f->nameInCode, relaxedMode - }; - if ((error = ThreadPool_Do(BuildKernelFn, - gMaxVectorSizeIndex - gMinVectorSizeIndex, - &build_info))) - goto exit; - } - - // Run the kernels - if (!gSkipCorrectnessTesting) - { - error = ThreadPool_Do(Test, test_info.jobCount, &test_info); - - if (error) goto exit; - - if (gWimpyMode) - vlog("Wimp pass"); - else - vlog("passed"); - } - - vlog("\n"); - -exit: - // Release - for (auto i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++) - { - clReleaseProgram(test_info.programs[i]); - if (test_info.k[i]) - { - for (cl_uint j = 0; j < test_info.threadCount; j++) - clReleaseKernel(test_info.k[i][j]); - - free(test_info.k[i]); - } - } - if (test_info.tinfo) - { - for (cl_uint i = 0; i < test_info.threadCount; i++) - { - free_mtdata(test_info.tinfo[i].d); - clReleaseMemObject(test_info.tinfo[i].inBuf); - clReleaseMemObject(test_info.tinfo[i].inBuf2); - for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) - clReleaseMemObject(test_info.tinfo[i].outBuf[j]); - clReleaseCommandQueue(test_info.tinfo[i].tQueue); - } - - free(test_info.tinfo); - } - - return error; -} - -static cl_int Test(cl_uint job_id, cl_uint thread_id, void *data) -{ - const TestInfo *job = (const TestInfo *)data; + TestInfo *job = (TestInfo *)data; size_t buffer_elements = job->subBufferSize; size_t buffer_size = buffer_elements * sizeof(cl_float); cl_uint base = job_id * (cl_uint)job->step; - ThreadInfo *tinfo = job->tinfo + thread_id; + ThreadInfo *tinfo = &(job->tinfo[thread_id]); fptr func = job->f->func; int ftz = job->ftz; + bool relaxedMode = job->relaxedMode; MTdata d = tinfo->d; cl_int error; const char *name = job->f->name; @@ -531,7 +371,8 @@ static cl_int Test(cl_uint job_id, cl_uint thread_id, void *data) if 
((error = clEnqueueUnmapMemObject(tinfo->tQueue, tinfo->outBuf[j], out[j], 0, NULL, NULL))) { - vlog_error("Error: clEnqueueMapBuffer failed! err: %d\n", error); + vlog_error("Error: clEnqueueUnmapMemObject failed! err: %d\n", + error); goto exit; } @@ -604,7 +445,7 @@ static cl_int Test(cl_uint job_id, cl_uint thread_id, void *data) if (gMinVectorSizeIndex == 0 && t[j] != q[j]) { - if (ftz) + if (ftz || relaxedMode) { if (IsFloatSubnormal(s[j])) { @@ -637,20 +478,21 @@ static cl_int Test(cl_uint job_id, cl_uint thread_id, void *data) uint32_t err = t[j] - q[j]; if (q[j] > t[j]) err = q[j] - t[j]; vlog_error("\nERROR: %s: %d ulp error at {%a, %a}: *0x%8.8x vs. " - "0x%8.8x (index: %d)\n", + "0x%8.8x (index: %zu)\n", name, err, ((float *)s)[j], ((float *)s2)[j], t[j], q[j], j); error = -1; goto exit; } - for (auto k = MAX(1, gMinVectorSizeIndex); k < gMaxVectorSizeIndex; k++) + for (auto k = std::max(1U, gMinVectorSizeIndex); + k < gMaxVectorSizeIndex; k++) { q = out[k]; // If we aren't getting the correctly rounded result if (-t[j] != q[j]) { - if (ftz) + if (ftz || relaxedMode) { if (IsFloatSubnormal(s[j])) { @@ -682,7 +524,7 @@ static cl_int Test(cl_uint job_id, cl_uint thread_id, void *data) cl_uint err = -t[j] - q[j]; if (q[j] > -t[j]) err = q[j] + t[j]; vlog_error("\nERROR: %s%s: %d ulp error at {%a, %a}: *0x%8.8x " - "vs. 0x%8.8x (index: %d)\n", + "vs. 
0x%8.8x (index: %zu)\n", name, sizeNames[k], err, ((float *)s)[j], ((float *)s2)[j], -t[j], q[j], j); error = -1; @@ -724,3 +566,132 @@ static cl_int Test(cl_uint job_id, cl_uint thread_id, void *data) exit: return error; } + +} // anonymous namespace + +int TestMacro_Int_Float_Float(const Func *f, MTdata d, bool relaxedMode) +{ + TestInfo test_info{}; + cl_int error; + + logFunctionInfo(f->name, sizeof(cl_float), relaxedMode); + + // Init test_info + test_info.threadCount = GetThreadCount(); + test_info.subBufferSize = BUFFER_SIZE + / (sizeof(cl_float) * RoundUpToNextPowerOfTwo(test_info.threadCount)); + test_info.scale = getTestScale(sizeof(cl_float)); + + test_info.step = (cl_uint)test_info.subBufferSize * test_info.scale; + if (test_info.step / test_info.subBufferSize != test_info.scale) + { + // there was overflow + test_info.jobCount = 1; + } + else + { + test_info.jobCount = (cl_uint)((1ULL << 32) / test_info.step); + } + + test_info.f = f; + test_info.ftz = + f->ftz || gForceFTZ || 0 == (CL_FP_DENORM & gFloatCapabilities); + test_info.relaxedMode = relaxedMode; + + // cl_kernels aren't thread safe, so we make one for each vector size for + // every thread + for (auto i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++) + { + test_info.k[i].resize(test_info.threadCount, nullptr); + } + + test_info.tinfo.resize(test_info.threadCount); + for (cl_uint i = 0; i < test_info.threadCount; i++) + { + cl_buffer_region region = { + i * test_info.subBufferSize * sizeof(cl_float), + test_info.subBufferSize * sizeof(cl_float) + }; + test_info.tinfo[i].inBuf = + clCreateSubBuffer(gInBuffer, CL_MEM_READ_ONLY, + CL_BUFFER_CREATE_TYPE_REGION, &region, &error); + if (error || NULL == test_info.tinfo[i].inBuf) + { + vlog_error("Error: Unable to create sub-buffer of gInBuffer for " + "region {%zd, %zd}\n", + region.origin, region.size); + goto exit; + } + test_info.tinfo[i].inBuf2 = + clCreateSubBuffer(gInBuffer2, CL_MEM_READ_ONLY, + CL_BUFFER_CREATE_TYPE_REGION, &region, &error);
+ if (error || NULL == test_info.tinfo[i].inBuf2) + { + vlog_error("Error: Unable to create sub-buffer of gInBuffer2 for " + "region {%zd, %zd}\n", + region.origin, region.size); + goto exit; + } + + for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) + { + test_info.tinfo[i].outBuf[j] = clCreateSubBuffer( + gOutBuffer[j], CL_MEM_WRITE_ONLY, CL_BUFFER_CREATE_TYPE_REGION, + &region, &error); + if (error || NULL == test_info.tinfo[i].outBuf[j]) + { + vlog_error("Error: Unable to create sub-buffer of " + "gOutBuffer[%d] for region {%zd, %zd}\n", + (int)j, region.origin, region.size); + goto exit; + } + } + test_info.tinfo[i].tQueue = + clCreateCommandQueue(gContext, gDevice, 0, &error); + if (NULL == test_info.tinfo[i].tQueue || error) + { + vlog_error("clCreateCommandQueue failed. (%d)\n", error); + goto exit; + } + + test_info.tinfo[i].d = MTdataHolder(genrand_int32(d)); + } + + // Init the kernels + { + BuildKernelInfo build_info{ test_info.threadCount, test_info.k, + test_info.programs, f->nameInCode, + relaxedMode }; + if ((error = ThreadPool_Do(BuildKernelFn, + gMaxVectorSizeIndex - gMinVectorSizeIndex, + &build_info))) + goto exit; + } + + // Run the kernels + if (!gSkipCorrectnessTesting) + { + error = ThreadPool_Do(Test, test_info.jobCount, &test_info); + + if (error) goto exit; + + if (gWimpyMode) + vlog("Wimp pass"); + else + vlog("passed"); + } + + vlog("\n"); + +exit: + // Release + for (auto i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++) + { + for (auto &kernel : test_info.k[i]) + { + clReleaseKernel(kernel); + } + } + + return error; +} diff --git a/test_conformance/math_brute_force/macro_unary_double.cpp b/test_conformance/math_brute_force/macro_unary_double.cpp index 00e65a2c..c2e7cdcc 100644 --- a/test_conformance/math_brute_force/macro_unary_double.cpp +++ b/test_conformance/math_brute_force/macro_unary_double.cpp @@ -14,14 +14,18 @@ // limitations under the License.
// +#include "common.h" #include "function_list.h" #include "test_functions.h" #include "utility.h" +#include <cinttypes> #include <cstring> -static int BuildKernel(const char *name, int vectorSize, cl_uint kernel_count, - cl_kernel *k, cl_program *p, bool relaxedMode) +namespace { + +int BuildKernel(const char *name, int vectorSize, cl_uint kernel_count, + cl_kernel *k, cl_program *p, bool relaxedMode) { const char *c[] = { "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n", "__kernel void math_kernel", @@ -101,210 +105,61 @@ static int BuildKernel(const char *name, int vectorSize, cl_uint kernel_count, relaxedMode); } -typedef struct BuildKernelInfo -{ - cl_uint offset; // the first vector size to build - cl_uint kernel_count; - cl_kernel **kernels; - cl_program *programs; - const char *nameInCode; - bool relaxedMode; // Whether to build with -cl-fast-relaxed-math. -} BuildKernelInfo; - -static cl_int BuildKernelFn(cl_uint job_id, cl_uint thread_id UNUSED, void *p) +cl_int BuildKernelFn(cl_uint job_id, cl_uint thread_id UNUSED, void *p) { BuildKernelInfo *info = (BuildKernelInfo *)p; - cl_uint i = info->offset + job_id; - return BuildKernel(info->nameInCode, i, info->kernel_count, - info->kernels[i], info->programs + i, info->relaxedMode); + cl_uint vectorSize = gMinVectorSizeIndex + job_id; + return BuildKernel(info->nameInCode, vectorSize, info->threadCount, + info->kernels[vectorSize].data(), + &(info->programs[vectorSize]), info->relaxedMode); } // Thread specific data for a worker thread -typedef struct ThreadInfo +struct ThreadInfo { - cl_mem inBuf; // input buffer for the thread - cl_mem outBuf[VECTOR_SIZE_COUNT]; // output buffers for the thread - cl_command_queue tQueue; // per thread command queue to improve performance -} ThreadInfo; + // Input and output buffers for the thread + clMemWrapper inBuf; + Buffers outBuf; -typedef struct TestInfo + // Per thread command queue to improve performance + clCommandQueueWrapper tQueue; +}; + +struct TestInfo { 
size_t subBufferSize; // Size of the sub-buffer in elements const Func *f; // A pointer to the function info - cl_program programs[VECTOR_SIZE_COUNT]; // programs for various vector sizes - cl_kernel - *k[VECTOR_SIZE_COUNT]; // arrays of thread-specific kernels for each - // worker thread: k[vector_size][thread_id] - ThreadInfo * - tinfo; // An array of thread specific information for each worker thread + + // Programs for various vector sizes. + Programs programs; + + // Thread-specific kernels for each vector size: + // k[vector_size][thread_id] + KernelMatrix k; + + // Array of thread specific information + std::vector<ThreadInfo> tinfo; + cl_uint threadCount; // Number of worker threads cl_uint jobCount; // Number of jobs cl_uint step; // step between each chunk and the next. cl_uint scale; // stride between individual test values int ftz; // non-zero if running in flush to zero mode + bool relaxedMode; // True if test is running in relaxed mode, false + // otherwise. +}; -} TestInfo; - -static cl_int Test(cl_uint job_id, cl_uint thread_id, void *data); - -int TestMacro_Int_Double(const Func *f, MTdata d, bool relaxedMode) -{ - TestInfo test_info; - cl_int error; - - logFunctionInfo(f->name, sizeof(cl_double), relaxedMode); - - // Init test_info - memset(&test_info, 0, sizeof(test_info)); - test_info.threadCount = GetThreadCount(); - test_info.subBufferSize = BUFFER_SIZE - / (sizeof(cl_double) * RoundUpToNextPowerOfTwo(test_info.threadCount)); - test_info.scale = getTestScale(sizeof(cl_double)); - - test_info.step = (cl_uint)test_info.subBufferSize * test_info.scale; - if (test_info.step / test_info.subBufferSize != test_info.scale) - { - // there was overflow - test_info.jobCount = 1; - } - else - { - test_info.jobCount = (cl_uint)((1ULL << 32) / test_info.step); - } - - test_info.f = f; - test_info.ftz = f->ftz || gForceFTZ; - - // cl_kernels aren't thread safe, so we make one for each vector size for - // every thread - for (auto i = gMinVectorSizeIndex; i < 
gMaxVectorSizeIndex; i++) - { - size_t array_size = test_info.threadCount * sizeof(cl_kernel); - test_info.k[i] = (cl_kernel *)malloc(array_size); - if (NULL == test_info.k[i]) - { - vlog_error("Error: Unable to allocate storage for kernels!\n"); - error = CL_OUT_OF_HOST_MEMORY; - goto exit; - } - memset(test_info.k[i], 0, array_size); - } - test_info.tinfo = - (ThreadInfo *)malloc(test_info.threadCount * sizeof(*test_info.tinfo)); - if (NULL == test_info.tinfo) - { - vlog_error( - "Error: Unable to allocate storage for thread specific data.\n"); - error = CL_OUT_OF_HOST_MEMORY; - goto exit; - } - memset(test_info.tinfo, 0, - test_info.threadCount * sizeof(*test_info.tinfo)); - for (cl_uint i = 0; i < test_info.threadCount; i++) - { - cl_buffer_region region = { - i * test_info.subBufferSize * sizeof(cl_double), - test_info.subBufferSize * sizeof(cl_double) - }; - test_info.tinfo[i].inBuf = - clCreateSubBuffer(gInBuffer, CL_MEM_READ_ONLY, - CL_BUFFER_CREATE_TYPE_REGION, &region, &error); - if (error || NULL == test_info.tinfo[i].inBuf) - { - vlog_error("Error: Unable to create sub-buffer of gInBuffer for " - "region {%zd, %zd}\n", - region.origin, region.size); - goto exit; - } - - for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) - { - test_info.tinfo[i].outBuf[j] = clCreateSubBuffer( - gOutBuffer[j], CL_MEM_WRITE_ONLY, CL_BUFFER_CREATE_TYPE_REGION, - &region, &error); - if (error || NULL == test_info.tinfo[i].outBuf[j]) - { - vlog_error("Error: Unable to create sub-buffer of " - "gOutBuffer[%d] for region {%zd, %zd}\n", - (int)j, region.origin, region.size); - goto exit; - } - } - test_info.tinfo[i].tQueue = - clCreateCommandQueue(gContext, gDevice, 0, &error); - if (NULL == test_info.tinfo[i].tQueue || error) - { - vlog_error("clCreateCommandQueue failed. 
(%d)\n", error); - goto exit; - } - } - - // Init the kernels - { - BuildKernelInfo build_info = { - gMinVectorSizeIndex, test_info.threadCount, test_info.k, - test_info.programs, f->nameInCode, relaxedMode - }; - if ((error = ThreadPool_Do(BuildKernelFn, - gMaxVectorSizeIndex - gMinVectorSizeIndex, - &build_info))) - goto exit; - } - - // Run the kernels - if (!gSkipCorrectnessTesting) - { - error = ThreadPool_Do(Test, test_info.jobCount, &test_info); - - if (error) goto exit; - - if (gWimpyMode) - vlog("Wimp pass"); - else - vlog("passed"); - } - - vlog("\n"); - -exit: - // Release - for (auto i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++) - { - clReleaseProgram(test_info.programs[i]); - if (test_info.k[i]) - { - for (cl_uint j = 0; j < test_info.threadCount; j++) - clReleaseKernel(test_info.k[i][j]); - - free(test_info.k[i]); - } - } - if (test_info.tinfo) - { - for (cl_uint i = 0; i < test_info.threadCount; i++) - { - clReleaseMemObject(test_info.tinfo[i].inBuf); - for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) - clReleaseMemObject(test_info.tinfo[i].outBuf[j]); - clReleaseCommandQueue(test_info.tinfo[i].tQueue); - } - - free(test_info.tinfo); - } - - return error; -} - -static cl_int Test(cl_uint job_id, cl_uint thread_id, void *data) +cl_int Test(cl_uint job_id, cl_uint thread_id, void *data) { - const TestInfo *job = (const TestInfo *)data; + TestInfo *job = (TestInfo *)data; size_t buffer_elements = job->subBufferSize; size_t buffer_size = buffer_elements * sizeof(cl_double); cl_uint scale = job->scale; cl_uint base = job_id * (cl_uint)job->step; - ThreadInfo *tinfo = job->tinfo + thread_id; + ThreadInfo *tinfo = &(job->tinfo[thread_id]); dptr dfunc = job->f->dfunc; int ftz = job->ftz; + bool relaxedMode = job->relaxedMode; cl_int error; const char *name = job->f->name; @@ -362,7 +217,8 @@ static cl_int Test(cl_uint job_id, cl_uint thread_id, void *data) if ((error = clEnqueueUnmapMemObject(tinfo->tQueue, tinfo->outBuf[j], out[j], 
0, NULL, NULL))) { - vlog_error("Error: clEnqueueMapBuffer failed! err: %d\n", error); + vlog_error("Error: clEnqueueUnmapMemObject failed! err: %d\n", + error); return error; } @@ -430,7 +286,7 @@ static cl_int Test(cl_uint job_id, cl_uint thread_id, void *data) if (gMinVectorSizeIndex == 0 && t[j] != q[j]) { // If we aren't getting the correctly rounded result - if (ftz) + if (ftz || relaxedMode) { if (IsDoubleSubnormal(s[j])) { @@ -442,19 +298,21 @@ static cl_int Test(cl_uint job_id, cl_uint thread_id, void *data) cl_ulong err = t[j] - q[j]; if (q[j] > t[j]) err = q[j] - t[j]; - vlog_error("\nERROR: %sD: %zd ulp error at %.13la: *%zd vs. %zd\n", + vlog_error("\nERROR: %sD: %" PRId64 + " ulp error at %.13la: *%" PRId64 " vs. %" PRId64 "\n", name, err, ((double *)gIn)[j], t[j], q[j]); return -1; } - for (auto k = MAX(1, gMinVectorSizeIndex); k < gMaxVectorSizeIndex; k++) + for (auto k = std::max(1U, gMinVectorSizeIndex); + k < gMaxVectorSizeIndex; k++) { q = out[k]; // If we aren't getting the correctly rounded result if (-t[j] != q[j]) { - if (ftz) + if (ftz || relaxedMode) { if (IsDoubleSubnormal(s[j])) { @@ -467,7 +325,8 @@ static cl_int Test(cl_uint job_id, cl_uint thread_id, void *data) cl_ulong err = -t[j] - q[j]; if (q[j] > -t[j]) err = q[j] + t[j]; vlog_error( - "\nERROR: %sD%s: %zd ulp error at %.13la: *%zd vs. %zd\n", + "\nERROR: %sD%s: %" PRId64 " ulp error at %.13la: *%" PRId64 + " vs. 
%" PRId64 "\n", name, sizeNames[k], err, ((double *)gIn)[j], -t[j], q[j]); return -1; } @@ -506,3 +365,119 @@ static cl_int Test(cl_uint job_id, cl_uint thread_id, void *data) return CL_SUCCESS; } + +} // anonymous namespace + +int TestMacro_Int_Double(const Func *f, MTdata d, bool relaxedMode) +{ + TestInfo test_info{}; + cl_int error; + + logFunctionInfo(f->name, sizeof(cl_double), relaxedMode); + + // Init test_info + test_info.threadCount = GetThreadCount(); + test_info.subBufferSize = BUFFER_SIZE + / (sizeof(cl_double) * RoundUpToNextPowerOfTwo(test_info.threadCount)); + test_info.scale = getTestScale(sizeof(cl_double)); + + test_info.step = (cl_uint)test_info.subBufferSize * test_info.scale; + if (test_info.step / test_info.subBufferSize != test_info.scale) + { + // there was overflow + test_info.jobCount = 1; + } + else + { + test_info.jobCount = (cl_uint)((1ULL << 32) / test_info.step); + } + + test_info.f = f; + test_info.ftz = f->ftz || gForceFTZ; + test_info.relaxedMode = relaxedMode; + + // cl_kernels aren't thread safe, so we make one for each vector size for + // every thread + for (auto i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++) + { + test_info.k[i].resize(test_info.threadCount, nullptr); + } + + test_info.tinfo.resize(test_info.threadCount); + for (cl_uint i = 0; i < test_info.threadCount; i++) + { + cl_buffer_region region = { + i * test_info.subBufferSize * sizeof(cl_double), + test_info.subBufferSize * sizeof(cl_double) + }; + test_info.tinfo[i].inBuf = + clCreateSubBuffer(gInBuffer, CL_MEM_READ_ONLY, + CL_BUFFER_CREATE_TYPE_REGION, &region, &error); + if (error || NULL == test_info.tinfo[i].inBuf) + { + vlog_error("Error: Unable to create sub-buffer of gInBuffer for " + "region {%zd, %zd}\n", + region.origin, region.size); + goto exit; + } + + for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) + { + test_info.tinfo[i].outBuf[j] = clCreateSubBuffer( + gOutBuffer[j], CL_MEM_WRITE_ONLY, CL_BUFFER_CREATE_TYPE_REGION, + &region, 
&error); + if (error || NULL == test_info.tinfo[i].outBuf[j]) + { + vlog_error("Error: Unable to create sub-buffer of " + "gOutBuffer[%d] for region {%zd, %zd}\n", + (int)j, region.origin, region.size); + goto exit; + } + } + test_info.tinfo[i].tQueue = + clCreateCommandQueue(gContext, gDevice, 0, &error); + if (NULL == test_info.tinfo[i].tQueue || error) + { + vlog_error("clCreateCommandQueue failed. (%d)\n", error); + goto exit; + } + } + + // Init the kernels + { + BuildKernelInfo build_info{ test_info.threadCount, test_info.k, + test_info.programs, f->nameInCode, + relaxedMode }; + if ((error = ThreadPool_Do(BuildKernelFn, + gMaxVectorSizeIndex - gMinVectorSizeIndex, + &build_info))) + goto exit; + } + + // Run the kernels + if (!gSkipCorrectnessTesting) + { + error = ThreadPool_Do(Test, test_info.jobCount, &test_info); + + if (error) goto exit; + + if (gWimpyMode) + vlog("Wimp pass"); + else + vlog("passed"); + } + + vlog("\n"); + +exit: + // Release + for (auto i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++) + { + for (auto &kernel : test_info.k[i]) + { + clReleaseKernel(kernel); + } + } + + return error; +} diff --git a/test_conformance/math_brute_force/macro_unary_float.cpp b/test_conformance/math_brute_force/macro_unary_float.cpp index 3c1717ac..6a1b9b9a 100644 --- a/test_conformance/math_brute_force/macro_unary_float.cpp +++ b/test_conformance/math_brute_force/macro_unary_float.cpp @@ -14,14 +14,17 @@ // limitations under the License. 
// +#include "common.h" #include "function_list.h" #include "test_functions.h" #include "utility.h" #include <cstring> -static int BuildKernel(const char *name, int vectorSize, cl_uint kernel_count, - cl_kernel *k, cl_program *p, bool relaxedMode) +namespace { + +int BuildKernel(const char *name, int vectorSize, cl_uint kernel_count, + cl_kernel *k, cl_program *p, bool relaxedMode) { const char *c[] = { "__kernel void math_kernel", sizeNames[vectorSize], @@ -100,211 +103,61 @@ static int BuildKernel(const char *name, int vectorSize, cl_uint kernel_count, relaxedMode); } -typedef struct BuildKernelInfo -{ - cl_uint offset; // the first vector size to build - cl_uint kernel_count; - cl_kernel **kernels; - cl_program *programs; - const char *nameInCode; - bool relaxedMode; // Whether to build with -cl-fast-relaxed-math. -} BuildKernelInfo; - -static cl_int BuildKernelFn(cl_uint job_id, cl_uint thread_id UNUSED, void *p) +cl_int BuildKernelFn(cl_uint job_id, cl_uint thread_id UNUSED, void *p) { BuildKernelInfo *info = (BuildKernelInfo *)p; - cl_uint i = info->offset + job_id; - return BuildKernel(info->nameInCode, i, info->kernel_count, - info->kernels[i], info->programs + i, info->relaxedMode); + cl_uint vectorSize = gMinVectorSizeIndex + job_id; + return BuildKernel(info->nameInCode, vectorSize, info->threadCount, + info->kernels[vectorSize].data(), + &(info->programs[vectorSize]), info->relaxedMode); } // Thread specific data for a worker thread -typedef struct ThreadInfo +struct ThreadInfo { - cl_mem inBuf; // input buffer for the thread - cl_mem outBuf[VECTOR_SIZE_COUNT]; // output buffers for the thread - cl_command_queue tQueue; // per thread command queue to improve performance -} ThreadInfo; + // Input and output buffers for the thread + clMemWrapper inBuf; + Buffers outBuf; -typedef struct TestInfo + // Per thread command queue to improve performance + clCommandQueueWrapper tQueue; +}; + +struct TestInfo { size_t subBufferSize; // Size of the sub-buffer in 
elements const Func *f; // A pointer to the function info - cl_program programs[VECTOR_SIZE_COUNT]; // programs for various vector sizes - cl_kernel - *k[VECTOR_SIZE_COUNT]; // arrays of thread-specific kernels for each - // worker thread: k[vector_size][thread_id] - ThreadInfo * - tinfo; // An array of thread specific information for each worker thread + + // Programs for various vector sizes. + Programs programs; + + // Thread-specific kernels for each vector size: + // k[vector_size][thread_id] + KernelMatrix k; + + // Array of thread specific information + std::vector<ThreadInfo> tinfo; + cl_uint threadCount; // Number of worker threads cl_uint jobCount; // Number of jobs cl_uint step; // step between each chunk and the next. cl_uint scale; // stride between individual test values int ftz; // non-zero if running in flush to zero mode + bool relaxedMode; // True if test is running in relaxed mode, false + // otherwise. +}; -} TestInfo; - -static cl_int Test(cl_uint job_id, cl_uint thread_id, void *data); - -int TestMacro_Int_Float(const Func *f, MTdata d, bool relaxedMode) -{ - TestInfo test_info; - cl_int error; - - logFunctionInfo(f->name, sizeof(cl_float), relaxedMode); - - // Init test_info - memset(&test_info, 0, sizeof(test_info)); - test_info.threadCount = GetThreadCount(); - test_info.subBufferSize = BUFFER_SIZE - / (sizeof(cl_float) * RoundUpToNextPowerOfTwo(test_info.threadCount)); - test_info.scale = getTestScale(sizeof(cl_float)); - - test_info.step = (cl_uint)test_info.subBufferSize * test_info.scale; - if (test_info.step / test_info.subBufferSize != test_info.scale) - { - // there was overflow - test_info.jobCount = 1; - } - else - { - test_info.jobCount = (cl_uint)((1ULL << 32) / test_info.step); - } - - test_info.f = f; - test_info.ftz = - f->ftz || gForceFTZ || 0 == (CL_FP_DENORM & gFloatCapabilities); - - // cl_kernels aren't thread safe, so we make one for each vector size for - // every thread - for (auto i = gMinVectorSizeIndex; i < 
gMaxVectorSizeIndex; i++) - { - size_t array_size = test_info.threadCount * sizeof(cl_kernel); - test_info.k[i] = (cl_kernel *)malloc(array_size); - if (NULL == test_info.k[i]) - { - vlog_error("Error: Unable to allocate storage for kernels!\n"); - error = CL_OUT_OF_HOST_MEMORY; - goto exit; - } - memset(test_info.k[i], 0, array_size); - } - test_info.tinfo = - (ThreadInfo *)malloc(test_info.threadCount * sizeof(*test_info.tinfo)); - if (NULL == test_info.tinfo) - { - vlog_error( - "Error: Unable to allocate storage for thread specific data.\n"); - error = CL_OUT_OF_HOST_MEMORY; - goto exit; - } - memset(test_info.tinfo, 0, - test_info.threadCount * sizeof(*test_info.tinfo)); - for (cl_uint i = 0; i < test_info.threadCount; i++) - { - cl_buffer_region region = { - i * test_info.subBufferSize * sizeof(cl_float), - test_info.subBufferSize * sizeof(cl_float) - }; - test_info.tinfo[i].inBuf = - clCreateSubBuffer(gInBuffer, CL_MEM_READ_ONLY, - CL_BUFFER_CREATE_TYPE_REGION, &region, &error); - if (error || NULL == test_info.tinfo[i].inBuf) - { - vlog_error("Error: Unable to create sub-buffer of gInBuffer for " - "region {%zd, %zd}\n", - region.origin, region.size); - goto exit; - } - - for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) - { - test_info.tinfo[i].outBuf[j] = clCreateSubBuffer( - gOutBuffer[j], CL_MEM_WRITE_ONLY, CL_BUFFER_CREATE_TYPE_REGION, - &region, &error); - if (error || NULL == test_info.tinfo[i].outBuf[j]) - { - vlog_error("Error: Unable to create sub-buffer of " - "gOutBuffer[%d] for region {%zd, %zd}\n", - (int)j, region.origin, region.size); - goto exit; - } - } - test_info.tinfo[i].tQueue = - clCreateCommandQueue(gContext, gDevice, 0, &error); - if (NULL == test_info.tinfo[i].tQueue || error) - { - vlog_error("clCreateCommandQueue failed. 
(%d)\n", error); - goto exit; - } - } - - // Init the kernels - { - BuildKernelInfo build_info = { - gMinVectorSizeIndex, test_info.threadCount, test_info.k, - test_info.programs, f->nameInCode, relaxedMode - }; - if ((error = ThreadPool_Do(BuildKernelFn, - gMaxVectorSizeIndex - gMinVectorSizeIndex, - &build_info))) - goto exit; - } - - // Run the kernels - if (!gSkipCorrectnessTesting) - { - error = ThreadPool_Do(Test, test_info.jobCount, &test_info); - - if (error) goto exit; - - if (gWimpyMode) - vlog("Wimp pass"); - else - vlog("passed"); - } - - vlog("\n"); - -exit: - // Release - for (auto i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++) - { - clReleaseProgram(test_info.programs[i]); - if (test_info.k[i]) - { - for (cl_uint j = 0; j < test_info.threadCount; j++) - clReleaseKernel(test_info.k[i][j]); - - free(test_info.k[i]); - } - } - if (test_info.tinfo) - { - for (cl_uint i = 0; i < test_info.threadCount; i++) - { - clReleaseMemObject(test_info.tinfo[i].inBuf); - for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) - clReleaseMemObject(test_info.tinfo[i].outBuf[j]); - clReleaseCommandQueue(test_info.tinfo[i].tQueue); - } - - free(test_info.tinfo); - } - - return error; -} - -static cl_int Test(cl_uint job_id, cl_uint thread_id, void *data) +cl_int Test(cl_uint job_id, cl_uint thread_id, void *data) { - const TestInfo *job = (const TestInfo *)data; + TestInfo *job = (TestInfo *)data; size_t buffer_elements = job->subBufferSize; size_t buffer_size = buffer_elements * sizeof(cl_float); cl_uint scale = job->scale; cl_uint base = job_id * (cl_uint)job->step; - ThreadInfo *tinfo = job->tinfo + thread_id; + ThreadInfo *tinfo = &(job->tinfo[thread_id]); fptr func = job->f->func; int ftz = job->ftz; + bool relaxedMode = job->relaxedMode; cl_int error = CL_SUCCESS; cl_int ret = CL_SUCCESS; const char *name = job->f->name; @@ -365,7 +218,8 @@ static cl_int Test(cl_uint job_id, cl_uint thread_id, void *data) if ((error = 
clEnqueueUnmapMemObject(tinfo->tQueue, tinfo->outBuf[j], out[j], 0, NULL, NULL))) { - vlog_error("Error: clEnqueueMapBuffer failed! err: %d\n", error); + vlog_error("Error: clEnqueueUnmapMemObject failed! err: %d\n", + error); return error; } @@ -435,7 +289,7 @@ static cl_int Test(cl_uint job_id, cl_uint thread_id, void *data) if (gMinVectorSizeIndex == 0 && t[j] != q[j]) { // If we aren't getting the correctly rounded result - if (ftz) + if (ftz || relaxedMode) { if (IsFloatSubnormal(s[j])) { @@ -454,14 +308,14 @@ static cl_int Test(cl_uint job_id, cl_uint thread_id, void *data) } - for (auto k = MAX(1, gMinVectorSizeIndex); k < gMaxVectorSizeIndex; - k++) + for (auto k = std::max(1U, gMinVectorSizeIndex); + k < gMaxVectorSizeIndex; k++) { q = out[k]; // If we aren't getting the correctly rounded result if (-t[j] != q[j]) { - if (ftz) + if (ftz || relaxedMode) { if (IsFloatSubnormal(s[j])) { @@ -521,3 +375,120 @@ exit: return ret; } + +} // anonymous namespace + +int TestMacro_Int_Float(const Func *f, MTdata d, bool relaxedMode) +{ + TestInfo test_info{}; + cl_int error; + + logFunctionInfo(f->name, sizeof(cl_float), relaxedMode); + + // Init test_info + test_info.threadCount = GetThreadCount(); + test_info.subBufferSize = BUFFER_SIZE + / (sizeof(cl_float) * RoundUpToNextPowerOfTwo(test_info.threadCount)); + test_info.scale = getTestScale(sizeof(cl_float)); + + test_info.step = (cl_uint)test_info.subBufferSize * test_info.scale; + if (test_info.step / test_info.subBufferSize != test_info.scale) + { + // there was overflow + test_info.jobCount = 1; + } + else + { + test_info.jobCount = (cl_uint)((1ULL << 32) / test_info.step); + } + + test_info.f = f; + test_info.ftz = + f->ftz || gForceFTZ || 0 == (CL_FP_DENORM & gFloatCapabilities); + test_info.relaxedMode = relaxedMode; + + // cl_kernels aren't thread safe, so we make one for each vector size for + // every thread + for (auto i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++) + { + 
test_info.k[i].resize(test_info.threadCount, nullptr); + } + + test_info.tinfo.resize(test_info.threadCount); + for (cl_uint i = 0; i < test_info.threadCount; i++) + { + cl_buffer_region region = { + i * test_info.subBufferSize * sizeof(cl_float), + test_info.subBufferSize * sizeof(cl_float) + }; + test_info.tinfo[i].inBuf = + clCreateSubBuffer(gInBuffer, CL_MEM_READ_ONLY, + CL_BUFFER_CREATE_TYPE_REGION, &region, &error); + if (error || NULL == test_info.tinfo[i].inBuf) + { + vlog_error("Error: Unable to create sub-buffer of gInBuffer for " + "region {%zd, %zd}\n", + region.origin, region.size); + goto exit; + } + + for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) + { + test_info.tinfo[i].outBuf[j] = clCreateSubBuffer( + gOutBuffer[j], CL_MEM_WRITE_ONLY, CL_BUFFER_CREATE_TYPE_REGION, + &region, &error); + if (error || NULL == test_info.tinfo[i].outBuf[j]) + { + vlog_error("Error: Unable to create sub-buffer of " + "gOutBuffer[%d] for region {%zd, %zd}\n", + (int)j, region.origin, region.size); + goto exit; + } + } + test_info.tinfo[i].tQueue = + clCreateCommandQueue(gContext, gDevice, 0, &error); + if (NULL == test_info.tinfo[i].tQueue || error) + { + vlog_error("clCreateCommandQueue failed. 
(%d)\n", error); + goto exit; + } + } + + // Init the kernels + { + BuildKernelInfo build_info{ test_info.threadCount, test_info.k, + test_info.programs, f->nameInCode, + relaxedMode }; + if ((error = ThreadPool_Do(BuildKernelFn, + gMaxVectorSizeIndex - gMinVectorSizeIndex, + &build_info))) + goto exit; + } + + // Run the kernels + if (!gSkipCorrectnessTesting) + { + error = ThreadPool_Do(Test, test_info.jobCount, &test_info); + + if (error) goto exit; + + if (gWimpyMode) + vlog("Wimp pass"); + else + vlog("passed"); + } + + vlog("\n"); + +exit: + // Release + for (auto i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++) + { + for (auto &kernel : test_info.k[i]) + { + clReleaseKernel(kernel); + } + } + + return error; +} diff --git a/test_conformance/math_brute_force/mad_double.cpp b/test_conformance/math_brute_force/mad_double.cpp index a32cd5a8..8d8fec52 100644 --- a/test_conformance/math_brute_force/mad_double.cpp +++ b/test_conformance/math_brute_force/mad_double.cpp @@ -14,126 +14,49 @@ // limitations under the License. 
// +#include "common.h" #include "function_list.h" #include "test_functions.h" #include "utility.h" #include <cstring> -static int BuildKernel(const char *name, int vectorSize, cl_kernel *k, - cl_program *p, bool relaxedMode) -{ - const char *c[] = { "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n", - "__kernel void math_kernel", - sizeNames[vectorSize], - "( __global double", - sizeNames[vectorSize], - "* out, __global double", - sizeNames[vectorSize], - "* in1, __global double", - sizeNames[vectorSize], - "* in2, __global double", - sizeNames[vectorSize], - "* in3 )\n" - "{\n" - " size_t i = get_global_id(0);\n" - " out[i] = ", - name, - "( in1[i], in2[i], in3[i] );\n" - "}\n" }; - - const char *c3[] = { - "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n", - "__kernel void math_kernel", - sizeNames[vectorSize], - "( __global double* out, __global double* in, __global double* in2, " - "__global double* in3)\n" - "{\n" - " size_t i = get_global_id(0);\n" - " if( i + 1 < get_global_size(0) )\n" - " {\n" - " double3 d0 = vload3( 0, in + 3 * i );\n" - " double3 d1 = vload3( 0, in2 + 3 * i );\n" - " double3 d2 = vload3( 0, in3 + 3 * i );\n" - " d0 = ", - name, - "( d0, d1, d2 );\n" - " vstore3( d0, 0, out + 3*i );\n" - " }\n" - " else\n" - " {\n" - " size_t parity = i & 1; // Figure out how many elements are " - "left over after BUFFER_SIZE % (3*sizeof(float)). 
Assume power of two " - "buffer size \n" - " double3 d0;\n" - " double3 d1;\n" - " double3 d2;\n" - " switch( parity )\n" - " {\n" - " case 1:\n" - " d0 = (double3)( in[3*i], NAN, NAN ); \n" - " d1 = (double3)( in2[3*i], NAN, NAN ); \n" - " d2 = (double3)( in3[3*i], NAN, NAN ); \n" - " break;\n" - " case 0:\n" - " d0 = (double3)( in[3*i], in[3*i+1], NAN ); \n" - " d1 = (double3)( in2[3*i], in2[3*i+1], NAN ); \n" - " d2 = (double3)( in3[3*i], in3[3*i+1], NAN ); \n" - " break;\n" - " }\n" - " d0 = ", - name, - "( d0, d1, d2 );\n" - " switch( parity )\n" - " {\n" - " case 0:\n" - " out[3*i+1] = d0.y; \n" - " // fall through\n" - " case 1:\n" - " out[3*i] = d0.x; \n" - " break;\n" - " }\n" - " }\n" - "}\n" - }; - - const char **kern = c; - size_t kernSize = sizeof(c) / sizeof(c[0]); +namespace { - if (sizeValues[vectorSize] == 3) - { - kern = c3; - kernSize = sizeof(c3) / sizeof(c3[0]); - } - - char testName[32]; - snprintf(testName, sizeof(testName) - 1, "math_kernel%s", - sizeNames[vectorSize]); - - return MakeKernel(kern, (cl_uint)kernSize, testName, k, p, relaxedMode); +int BuildKernel(const char *name, int vectorSize, cl_kernel *k, cl_program *p, + bool relaxedMode) +{ + auto kernel_name = GetKernelName(vectorSize); + auto source = GetTernaryKernel(kernel_name, name, ParameterType::Double, + ParameterType::Double, ParameterType::Double, + ParameterType::Double, vectorSize); + std::array<const char *, 1> sources{ source.c_str() }; + return MakeKernel(sources.data(), sources.size(), kernel_name.c_str(), k, p, + relaxedMode); } -typedef struct BuildKernelInfo +struct BuildKernelInfo2 { - cl_uint offset; // the first vector size to build cl_kernel *kernels; - cl_program *programs; + Programs &programs; const char *nameInCode; bool relaxedMode; // Whether to build with -cl-fast-relaxed-math. 
-} BuildKernelInfo; +}; -static cl_int BuildKernelFn(cl_uint job_id, cl_uint thread_id UNUSED, void *p) +cl_int BuildKernelFn(cl_uint job_id, cl_uint thread_id UNUSED, void *p) { - BuildKernelInfo *info = (BuildKernelInfo *)p; - cl_uint i = info->offset + job_id; - return BuildKernel(info->nameInCode, i, info->kernels + i, - info->programs + i, info->relaxedMode); + BuildKernelInfo2 *info = (BuildKernelInfo2 *)p; + cl_uint vectorSize = gMinVectorSizeIndex + job_id; + return BuildKernel(info->nameInCode, vectorSize, info->kernels + vectorSize, + &(info->programs[vectorSize]), info->relaxedMode); } +} // anonymous namespace + int TestFunc_mad_Double(const Func *f, MTdata d, bool relaxedMode) { int error; - cl_program programs[VECTOR_SIZE_COUNT]; + Programs programs; cl_kernel kernels[VECTOR_SIZE_COUNT]; float maxError = 0.0f; double maxErrorVal = 0.0f; @@ -145,8 +68,8 @@ int TestFunc_mad_Double(const Func *f, MTdata d, bool relaxedMode) // Init the kernels { - BuildKernelInfo build_info = { gMinVectorSizeIndex, kernels, programs, - f->nameInCode, relaxedMode }; + BuildKernelInfo2 build_info{ kernels, programs, f->nameInCode, + relaxedMode }; if ((error = ThreadPool_Do(BuildKernelFn, gMaxVectorSizeIndex - gMinVectorSizeIndex, &build_info))) @@ -294,7 +217,6 @@ exit: for (auto k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++) { clReleaseKernel(kernels[k]); - clReleaseProgram(programs[k]); } return error; diff --git a/test_conformance/math_brute_force/mad_float.cpp b/test_conformance/math_brute_force/mad_float.cpp index 095a22ff..04ac5aa6 100644 --- a/test_conformance/math_brute_force/mad_float.cpp +++ b/test_conformance/math_brute_force/mad_float.cpp @@ -14,127 +14,52 @@ // limitations under the License. 
// +#include "common.h" #include "function_list.h" #include "test_functions.h" #include "utility.h" #include <cstring> -static int BuildKernel(const char *name, int vectorSize, cl_kernel *k, - cl_program *p, bool relaxedMode) -{ - const char *c[] = { "__kernel void math_kernel", - sizeNames[vectorSize], - "( __global float", - sizeNames[vectorSize], - "* out, __global float", - sizeNames[vectorSize], - "* in1, __global float", - sizeNames[vectorSize], - "* in2, __global float", - sizeNames[vectorSize], - "* in3 )\n" - "{\n" - " size_t i = get_global_id(0);\n" - " out[i] = ", - name, - "( in1[i], in2[i], in3[i] );\n" - "}\n" }; - - const char *c3[] = { - "__kernel void math_kernel", - sizeNames[vectorSize], - "( __global float* out, __global float* in, __global float* in2, " - "__global float* in3)\n" - "{\n" - " size_t i = get_global_id(0);\n" - " if( i + 1 < get_global_size(0) )\n" - " {\n" - " float3 f0 = vload3( 0, in + 3 * i );\n" - " float3 f1 = vload3( 0, in2 + 3 * i );\n" - " float3 f2 = vload3( 0, in3 + 3 * i );\n" - " f0 = ", - name, - "( f0, f1, f2 );\n" - " vstore3( f0, 0, out + 3*i );\n" - " }\n" - " else\n" - " {\n" - " size_t parity = i & 1; // Figure out how many elements are " - "left over after BUFFER_SIZE % (3*sizeof(float)). 
Assume power of two " - "buffer size \n" - " float3 f0;\n" - " float3 f1;\n" - " float3 f2;\n" - " switch( parity )\n" - " {\n" - " case 1:\n" - " f0 = (float3)( in[3*i], NAN, NAN ); \n" - " f1 = (float3)( in2[3*i], NAN, NAN ); \n" - " f2 = (float3)( in3[3*i], NAN, NAN ); \n" - " break;\n" - " case 0:\n" - " f0 = (float3)( in[3*i], in[3*i+1], NAN ); \n" - " f1 = (float3)( in2[3*i], in2[3*i+1], NAN ); \n" - " f2 = (float3)( in3[3*i], in3[3*i+1], NAN ); \n" - " break;\n" - " }\n" - " f0 = ", - name, - "( f0, f1, f2 );\n" - " switch( parity )\n" - " {\n" - " case 0:\n" - " out[3*i+1] = f0.y; \n" - " // fall through\n" - " case 1:\n" - " out[3*i] = f0.x; \n" - " break;\n" - " }\n" - " }\n" - "}\n" - }; - - const char **kern = c; - size_t kernSize = sizeof(c) / sizeof(c[0]); +namespace { - if (sizeValues[vectorSize] == 3) - { - kern = c3; - kernSize = sizeof(c3) / sizeof(c3[0]); - } - - char testName[32]; - snprintf(testName, sizeof(testName) - 1, "math_kernel%s", - sizeNames[vectorSize]); - - return MakeKernel(kern, (cl_uint)kernSize, testName, k, p, relaxedMode); +int BuildKernel(const char *name, int vectorSize, cl_kernel *k, cl_program *p, + bool relaxedMode) +{ + auto kernel_name = GetKernelName(vectorSize); + auto source = GetTernaryKernel(kernel_name, name, ParameterType::Float, + ParameterType::Float, ParameterType::Float, + ParameterType::Float, vectorSize); + std::array<const char *, 1> sources{ source.c_str() }; + return MakeKernel(sources.data(), sources.size(), kernel_name.c_str(), k, p, + relaxedMode); } -typedef struct BuildKernelInfo +struct BuildKernelInfo2 { - cl_uint offset; // the first vector size to build cl_kernel *kernels; - cl_program *programs; + Programs &programs; const char *nameInCode; bool relaxedMode; // Whether to build with -cl-fast-relaxed-math. 
-} BuildKernelInfo; +}; -static cl_int BuildKernelFn(cl_uint job_id, cl_uint thread_id UNUSED, void *p) +cl_int BuildKernelFn(cl_uint job_id, cl_uint thread_id UNUSED, void *p) { - BuildKernelInfo *info = (BuildKernelInfo *)p; - cl_uint i = info->offset + job_id; - return BuildKernel(info->nameInCode, i, info->kernels + i, - info->programs + i, info->relaxedMode); + BuildKernelInfo2 *info = (BuildKernelInfo2 *)p; + cl_uint vectorSize = gMinVectorSizeIndex + job_id; + return BuildKernel(info->nameInCode, vectorSize, info->kernels + vectorSize, + &(info->programs[vectorSize]), info->relaxedMode); } +} // anonymous namespace + int TestFunc_mad_Float(const Func *f, MTdata d, bool relaxedMode) { int error; logFunctionInfo(f->name, sizeof(cl_float), relaxedMode); - cl_program programs[VECTOR_SIZE_COUNT]; + Programs programs; cl_kernel kernels[VECTOR_SIZE_COUNT]; float maxError = 0.0f; float maxErrorVal = 0.0f; @@ -144,8 +69,8 @@ int TestFunc_mad_Float(const Func *f, MTdata d, bool relaxedMode) // Init the kernels { - BuildKernelInfo build_info = { gMinVectorSizeIndex, kernels, programs, - f->nameInCode, relaxedMode }; + BuildKernelInfo2 build_info{ kernels, programs, f->nameInCode, + relaxedMode }; if ((error = ThreadPool_Do(BuildKernelFn, gMaxVectorSizeIndex - gMinVectorSizeIndex, &build_info))) @@ -293,7 +218,6 @@ exit: for (auto k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++) { clReleaseKernel(kernels[k]); - clReleaseProgram(programs[k]); } return error; diff --git a/test_conformance/math_brute_force/main.cpp b/test_conformance/math_brute_force/main.cpp index d6c2f11f..64491bd4 100644 --- a/test_conformance/math_brute_force/main.cpp +++ b/test_conformance/math_brute_force/main.cpp @@ -18,6 +18,7 @@ #include "sleep.h" #include "utility.h" +#include <algorithm> #include <cstdio> #include <cstdlib> #include <ctime> @@ -57,8 +58,8 @@ static char appName[MAXPATHLEN] = ""; cl_device_id gDevice = NULL; cl_context gContext = NULL; cl_command_queue gQueue = NULL; 
-static int32_t gStartTestNumber = -1; -static int32_t gEndTestNumber = -1; +static size_t gStartTestNumber = ~0u; +static size_t gEndTestNumber = ~0u; int gSkipCorrectnessTesting = 0; static int gStopOnError = 0; static bool gSkipRestOfTests; @@ -97,7 +98,7 @@ cl_mem gInBuffer2 = NULL; cl_mem gInBuffer3 = NULL; cl_mem gOutBuffer[VECTOR_SIZE_COUNT] = { NULL, NULL, NULL, NULL, NULL, NULL }; cl_mem gOutBuffer2[VECTOR_SIZE_COUNT] = { NULL, NULL, NULL, NULL, NULL, NULL }; -static MTdata gMTdata; +static MTdataHolder gMTdata; cl_device_fp_config gFloatCapabilities = 0; int gWimpyReductionFactor = 32; int gVerboseBruteForce = 0; @@ -128,10 +129,10 @@ static int doTest(const char *name) const Func *const temp_func = functionList + i; if (strcmp(temp_func->name, name) == 0) { - if ((gStartTestNumber != -1 && i < gStartTestNumber) + if ((gStartTestNumber != ~0u && i < gStartTestNumber) || i > gEndTestNumber) { - vlog("Skipping function #%d\n", i); + vlog("Skipping function #%zu\n", i); return 0; } @@ -167,7 +168,6 @@ static int doTest(const char *name) } { - extern int my_ilogb(double); if (0 == strcmp("ilogb", func_data->name)) { InitILogbConstants(); @@ -326,7 +326,7 @@ int main(int argc, const char *argv[]) vlog("\n-------------------------------------------------------------------" "----------------------------------------\n"); - gMTdata = init_genrand(gRandomSeed); + gMTdata = MTdataHolder(gRandomSeed); FPU_mode_type oldMode; DisableFTZ(&oldMode); @@ -336,8 +336,6 @@ int main(int argc, const char *argv[]) RestoreFPState(&oldMode); - free_mtdata(gMTdata); - if (gQueue) { int error_code = clFinish(gQueue); @@ -360,16 +358,18 @@ static int ParseArgs(int argc, const char **argv) int singleThreaded = 0; { // Extract the app name - strncpy(appName, argv[0], MAXPATHLEN); + strncpy(appName, argv[0], MAXPATHLEN - 1); + appName[MAXPATHLEN - 1] = '\0'; #if defined(__APPLE__) char baseName[MAXPATHLEN]; char *base = NULL; - strncpy(baseName, argv[0], MAXPATHLEN); + 
strncpy(baseName, argv[0], MAXPATHLEN - 1); + baseName[MAXPATHLEN - 1] = '\0'; base = basename(baseName); if (NULL != base) { - strncpy(appName, base, sizeof(appName)); + strncpy(appName, base, sizeof(appName) - 1); appName[sizeof(appName) - 1] = '\0'; } #endif @@ -467,7 +467,7 @@ static int ParseArgs(int argc, const char **argv) long number = strtol(arg, &t, 0); if (t != arg) { - if (-1 == gStartTestNumber) + if (~0u == gStartTestNumber) gStartTestNumber = (int32_t)number; else gEndTestNumber = gStartTestNumber + (int32_t)number; @@ -502,8 +502,6 @@ static int ParseArgs(int argc, const char **argv) gWimpyMode = 1; } - vlog("\nTest binary built %s %s\n", __DATE__, __TIME__); - PrintArch(); if (gWimpyMode) @@ -524,7 +522,7 @@ static int ParseArgs(int argc, const char **argv) static void PrintFunctions(void) { vlog("\nMath function names:\n"); - for (int i = 0; i < functionListCount; i++) + for (size_t i = 0; i < functionListCount; i++) { vlog("\t%s\n", functionList[i].name); } @@ -1056,8 +1054,6 @@ int MakeKernels(const char **c, cl_uint count, const char *name, cl_uint kernel_count, cl_kernel *k, cl_program *p, bool relaxedMode) { - int error = 0; - cl_uint i; char options[200] = ""; if (gForceFTZ) @@ -1075,7 +1071,7 @@ int MakeKernels(const char **c, cl_uint count, const char *name, strcat(options, " -cl-fast-relaxed-math"); } - error = + int error = create_single_kernel_helper(gContext, p, NULL, count, c, NULL, options); if (error != CL_SUCCESS) { @@ -1083,9 +1079,7 @@ int MakeKernels(const char **c, cl_uint count, const char *name, return error; } - - memset(k, 0, kernel_count * sizeof(*k)); - for (i = 0; i < kernel_count; i++) + for (cl_uint i = 0; i < kernel_count; i++) { k[i] = clCreateKernel(*p, name, &error); if (NULL == k[i] || error) @@ -1096,7 +1090,6 @@ int MakeKernels(const char **c, cl_uint count, const char *name, clGetProgramBuildInfo(*p, gDevice, CL_PROGRAM_BUILD_LOG, sizeof(buffer), buffer, NULL); vlog_error("Log: %s\n", buffer); - 
clReleaseProgram(*p); return error; } } @@ -1244,7 +1237,7 @@ float Bruteforce_Ulp_Error_Double(double test, long double reference) // The unbiased exponent of the ulp unit place int ulp_exp = - DBL_MANT_DIG - 1 - MAX(ilogbl(reference), DBL_MIN_EXP - 1); + DBL_MANT_DIG - 1 - std::max(ilogbl(reference), DBL_MIN_EXP - 1); // Scale the exponent of the error float result = (float)scalbnl(testVal - reference, ulp_exp); @@ -1260,7 +1253,7 @@ float Bruteforce_Ulp_Error_Double(double test, long double reference) // reference is a normal power of two or a zero // The unbiased exponent of the ulp unit place int ulp_exp = - DBL_MANT_DIG - 1 - MAX(ilogbl(reference) - 1, DBL_MIN_EXP - 1); + DBL_MANT_DIG - 1 - std::max(ilogbl(reference) - 1, DBL_MIN_EXP - 1); // allow correctly rounded results to pass through unmolested. (We might add // error to it below.) There is something of a performance optimization here diff --git a/test_conformance/math_brute_force/reference_math.cpp b/test_conformance/math_brute_force/reference_math.cpp index 3a6516ba..afa072f8 100644 --- a/test_conformance/math_brute_force/reference_math.cpp +++ b/test_conformance/math_brute_force/reference_math.cpp @@ -41,10 +41,10 @@ #pragma STDC FP_CONTRACT OFF static void __log2_ep(double *hi, double *lo, double x); -typedef union { +union uint64d_t { uint64_t i; double d; -} uint64d_t; +}; static const uint64d_t _CL_NAN = { 0x7ff8000000000000ULL }; @@ -1949,7 +1949,8 @@ double reference_lgamma(double x) w6 = -1.63092934096575273989e-03; /* 0xBF5AB89D, 0x0B9E43E4 */ static const double zero = 0.00000000000000000000e+00; - double t, y, z, nadj, p, p1, p2, p3, q, r, w; + double nadj = zero; + double t, y, z, p, p1, p2, p3, q, r, w; cl_int i, hx, lx, ix; union { @@ -2259,10 +2260,10 @@ long double reference_dividel(long double x, long double y) return dx / dy; } -typedef struct +struct double_double { double hi, lo; -} double_double; +}; // Split doubles_double into a series of consecutive 26-bit precise doubles and 
// a remainder. Note for later -- for multiplication, it might be better to @@ -2321,7 +2322,7 @@ static inline double_double accum_d(double_double a, double b) static inline double_double add_dd(double_double a, double_double b) { - double_double r = { -0.0 - 0.0 }; + double_double r = { -0.0, -0.0 }; if (isinf(a.hi) || isinf(b.hi) || isnan(a.hi) || isnan(b.hi) || 0.0 == a.hi || 0.0 == b.hi) @@ -3767,10 +3768,10 @@ static uint32_t two_over_pi[] = { static uint32_t pi_over_two[] = { 0x1, 0x2487ed51, 0x42d1846, 0x26263314, 0x1701b839, 0x28948127 }; -typedef union { +union d_ui64_t { uint64_t u; double d; -} d_ui64_t; +}; // radix or base of representation #define RADIX (30) @@ -3786,13 +3787,13 @@ d_ui64_t two_pow_two_mradix = { (uint64_t)(1023 - 2 * RADIX) << 52 }; // extended fixed point representation of double precision // floating point number. // x = sign * [ sum_{i = 0 to 2} ( X[i] * 2^(index - i)*RADIX ) ] -typedef struct +struct eprep_t { uint32_t X[3]; // three 32 bit integers are sufficient to represnt double in // base_30 int index; // exponent bias int sign; // sign of double -} eprep_t; +}; static eprep_t double_to_eprep(double x) { @@ -4549,8 +4550,8 @@ long double reference_powl(long double x, long double y) if (x != x || y != y) return x + y; // do the work required to sort out edge cases - double fabsy = reference_fabs(y); - double fabsx = reference_fabs(x); + double fabsy = (double)reference_fabsl(y); + double fabsx = (double)reference_fabsl(x); double iy = reference_rint( fabsy); // we do round to nearest here so that |fy| <= 0.5 if (iy > fabsy) // convert nearbyint to floor @@ -4637,13 +4638,13 @@ long double reference_powl(long double x, long double y) // compute product of y*log2(x) // scale to avoid overflow in double-double multiplication - if (reference_fabs(y) > HEX_DBL(+, 1, 0, +, 970)) + if (fabsy > HEX_DBL(+, 1, 0, +, 970)) { y_hi = reference_ldexp(y_hi, -53); y_lo = reference_ldexp(y_lo, -53); } MulDD(&ylog2x_hi, &ylog2x_lo, log2x_hi, 
log2x_lo, y_hi, y_lo); - if (fabs(y) > HEX_DBL(+, 1, 0, +, 970)) + if (fabsy > HEX_DBL(+, 1, 0, +, 970)) { ylog2x_hi = reference_ldexp(ylog2x_hi, 53); ylog2x_lo = reference_ldexp(ylog2x_lo, 53); @@ -5357,10 +5358,10 @@ long double reference_acosl(long double x) 0x3243F6A8885A308DULL, 0x313198A2E0370734ULL }; // first 126 bits of pi // http://www.super-computing.org/pi-hexa_current.html - long double head, tail, temp; + long double head, tail; #if __LDBL_MANT_DIG__ >= 64 // long double has 64-bits of precision or greater - temp = (long double)pi_bits[0] * 0x1.0p64L; + long double temp = (long double)pi_bits[0] * 0x1.0p64L; head = temp + (long double)pi_bits[1]; temp -= head; // rounding err rounding pi_bits[1] into head tail = (long double)pi_bits[1] + temp; diff --git a/test_conformance/math_brute_force/ternary_double.cpp b/test_conformance/math_brute_force/ternary_double.cpp index 606fdc5a..b5f1ab09 100644 --- a/test_conformance/math_brute_force/ternary_double.cpp +++ b/test_conformance/math_brute_force/ternary_double.cpp @@ -14,127 +14,49 @@ // limitations under the License. 
// +#include "common.h" #include "function_list.h" #include "test_functions.h" #include "utility.h" +#include <cinttypes> #include <cstring> #define CORRECTLY_ROUNDED 0 #define FLUSHED 1 -static int BuildKernel(const char *name, int vectorSize, cl_kernel *k, - cl_program *p, bool relaxedMode) -{ - const char *c[] = { "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n", - "__kernel void math_kernel", - sizeNames[vectorSize], - "( __global double", - sizeNames[vectorSize], - "* out, __global double", - sizeNames[vectorSize], - "* in1, __global double", - sizeNames[vectorSize], - "* in2, __global double", - sizeNames[vectorSize], - "* in3 )\n" - "{\n" - " size_t i = get_global_id(0);\n" - " out[i] = ", - name, - "( in1[i], in2[i], in3[i] );\n" - "}\n" }; - - const char *c3[] = { - "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n", - "__kernel void math_kernel", - sizeNames[vectorSize], - "( __global double* out, __global double* in, __global double* in2, " - "__global double* in3)\n" - "{\n" - " size_t i = get_global_id(0);\n" - " if( i + 1 < get_global_size(0) )\n" - " {\n" - " double3 d0 = vload3( 0, in + 3 * i );\n" - " double3 d1 = vload3( 0, in2 + 3 * i );\n" - " double3 d2 = vload3( 0, in3 + 3 * i );\n" - " d0 = ", - name, - "( d0, d1, d2 );\n" - " vstore3( d0, 0, out + 3*i );\n" - " }\n" - " else\n" - " {\n" - " size_t parity = i & 1; // Figure out how many elements are " - "left over after BUFFER_SIZE % (3*sizeof(float)). 
Assume power of two " - "buffer size \n" - " double3 d0;\n" - " double3 d1;\n" - " double3 d2;\n" - " switch( parity )\n" - " {\n" - " case 1:\n" - " d0 = (double3)( in[3*i], NAN, NAN ); \n" - " d1 = (double3)( in2[3*i], NAN, NAN ); \n" - " d2 = (double3)( in3[3*i], NAN, NAN ); \n" - " break;\n" - " case 0:\n" - " d0 = (double3)( in[3*i], in[3*i+1], NAN ); \n" - " d1 = (double3)( in2[3*i], in2[3*i+1], NAN ); \n" - " d2 = (double3)( in3[3*i], in3[3*i+1], NAN ); \n" - " break;\n" - " }\n" - " d0 = ", - name, - "( d0, d1, d2 );\n" - " switch( parity )\n" - " {\n" - " case 0:\n" - " out[3*i+1] = d0.y; \n" - " // fall through\n" - " case 1:\n" - " out[3*i] = d0.x; \n" - " break;\n" - " }\n" - " }\n" - "}\n" - }; - - const char **kern = c; - size_t kernSize = sizeof(c) / sizeof(c[0]); - - if (sizeValues[vectorSize] == 3) - { - kern = c3; - kernSize = sizeof(c3) / sizeof(c3[0]); - } - - char testName[32]; - snprintf(testName, sizeof(testName) - 1, "math_kernel%s", - sizeNames[vectorSize]); +namespace { - return MakeKernel(kern, (cl_uint)kernSize, testName, k, p, relaxedMode); +int BuildKernel(const char *name, int vectorSize, cl_kernel *k, cl_program *p, + bool relaxedMode) +{ + auto kernel_name = GetKernelName(vectorSize); + auto source = GetTernaryKernel(kernel_name, name, ParameterType::Double, + ParameterType::Double, ParameterType::Double, + ParameterType::Double, vectorSize); + std::array<const char *, 1> sources{ source.c_str() }; + return MakeKernel(sources.data(), sources.size(), kernel_name.c_str(), k, p, + relaxedMode); } -typedef struct BuildKernelInfo +struct BuildKernelInfo2 { - cl_uint offset; // the first vector size to build cl_kernel *kernels; - cl_program *programs; + Programs &programs; const char *nameInCode; bool relaxedMode; // Whether to build with -cl-fast-relaxed-math. 
-} BuildKernelInfo; +}; -static cl_int BuildKernelFn(cl_uint job_id, cl_uint thread_id UNUSED, void *p) +cl_int BuildKernelFn(cl_uint job_id, cl_uint thread_id UNUSED, void *p) { - BuildKernelInfo *info = (BuildKernelInfo *)p; - cl_uint i = info->offset + job_id; - return BuildKernel(info->nameInCode, i, info->kernels + i, - info->programs + i, info->relaxedMode); + BuildKernelInfo2 *info = (BuildKernelInfo2 *)p; + cl_uint vectorSize = gMinVectorSizeIndex + job_id; + return BuildKernel(info->nameInCode, vectorSize, info->kernels + vectorSize, + &(info->programs[vectorSize]), info->relaxedMode); } // A table of more difficult cases to get right -static const double specialValues[] = { +const double specialValues[] = { -NAN, -INFINITY, -DBL_MAX, @@ -202,14 +124,16 @@ static const double specialValues[] = { +0.0, }; -static const size_t specialValuesCount = +constexpr size_t specialValuesCount = sizeof(specialValues) / sizeof(specialValues[0]); +} // anonymous namespace + int TestFunc_Double_Double_Double_Double(const Func *f, MTdata d, bool relaxedMode) { int error; - cl_program programs[VECTOR_SIZE_COUNT]; + Programs programs; cl_kernel kernels[VECTOR_SIZE_COUNT]; float maxError = 0.0f; int ftz = f->ftz || gForceFTZ; @@ -224,8 +148,8 @@ int TestFunc_Double_Double_Double_Double(const Func *f, MTdata d, // Init the kernels { - BuildKernelInfo build_info = { gMinVectorSizeIndex, kernels, programs, - f->nameInCode, relaxedMode }; + BuildKernelInfo2 build_info{ kernels, programs, f->nameInCode, + relaxedMode }; if ((error = ThreadPool_Do(BuildKernelFn, gMaxVectorSizeIndex - gMinVectorSizeIndex, &build_info))) @@ -387,7 +311,7 @@ int TestFunc_Double_Double_Double_Double(const Func *f, MTdata d, float err = Bruteforce_Ulp_Error_Double(test, correct); int fail = !(fabsf(err) <= f->double_ulps); - if (fail && ftz) + if (fail && (ftz || relaxedMode)) { // retry per section 6.5.3.2 if (IsDoubleSubnormal(correct)) @@ -704,8 +628,9 @@ int 
TestFunc_Double_Double_Double_Double(const Func *f, MTdata d, { if (gVerboseBruteForce) { - vlog("base:%14u step:%10zu bufferSize:%10zd \n", i, step, - BUFFER_SIZE); + vlog("base:%14" PRIu64 " step:%10" PRIu64 + " bufferSize:%10d \n", + i, step, BUFFER_SIZE); } else { @@ -733,7 +658,6 @@ exit: for (auto k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++) { clReleaseKernel(kernels[k]); - clReleaseProgram(programs[k]); } return error; diff --git a/test_conformance/math_brute_force/ternary_float.cpp b/test_conformance/math_brute_force/ternary_float.cpp index e52c0a0f..cf361841 100644 --- a/test_conformance/math_brute_force/ternary_float.cpp +++ b/test_conformance/math_brute_force/ternary_float.cpp @@ -14,125 +14,49 @@ // limitations under the License. // +#include "common.h" #include "function_list.h" #include "test_functions.h" #include "utility.h" +#include <cinttypes> #include <cstring> #define CORRECTLY_ROUNDED 0 #define FLUSHED 1 -static int BuildKernel(const char *name, int vectorSize, cl_kernel *k, - cl_program *p, bool relaxedMode) -{ - const char *c[] = { "__kernel void math_kernel", - sizeNames[vectorSize], - "( __global float", - sizeNames[vectorSize], - "* out, __global float", - sizeNames[vectorSize], - "* in1, __global float", - sizeNames[vectorSize], - "* in2, __global float", - sizeNames[vectorSize], - "* in3 )\n" - "{\n" - " size_t i = get_global_id(0);\n" - " out[i] = ", - name, - "( in1[i], in2[i], in3[i] );\n" - "}\n" }; - - const char *c3[] = { - "__kernel void math_kernel", - sizeNames[vectorSize], - "( __global float* out, __global float* in, __global float* in2, " - "__global float* in3)\n" - "{\n" - " size_t i = get_global_id(0);\n" - " if( i + 1 < get_global_size(0) )\n" - " {\n" - " float3 f0 = vload3( 0, in + 3 * i );\n" - " float3 f1 = vload3( 0, in2 + 3 * i );\n" - " float3 f2 = vload3( 0, in3 + 3 * i );\n" - " f0 = ", - name, - "( f0, f1, f2 );\n" - " vstore3( f0, 0, out + 3*i );\n" - " }\n" - " else\n" - " {\n" - " size_t parity = i 
& 1; // Figure out how many elements are " - "left over after BUFFER_SIZE % (3*sizeof(float)). Assume power of two " - "buffer size \n" - " float3 f0;\n" - " float3 f1;\n" - " float3 f2;\n" - " switch( parity )\n" - " {\n" - " case 1:\n" - " f0 = (float3)( in[3*i], NAN, NAN ); \n" - " f1 = (float3)( in2[3*i], NAN, NAN ); \n" - " f2 = (float3)( in3[3*i], NAN, NAN ); \n" - " break;\n" - " case 0:\n" - " f0 = (float3)( in[3*i], in[3*i+1], NAN ); \n" - " f1 = (float3)( in2[3*i], in2[3*i+1], NAN ); \n" - " f2 = (float3)( in3[3*i], in3[3*i+1], NAN ); \n" - " break;\n" - " }\n" - " f0 = ", - name, - "( f0, f1, f2 );\n" - " switch( parity )\n" - " {\n" - " case 0:\n" - " out[3*i+1] = f0.y; \n" - " // fall through\n" - " case 1:\n" - " out[3*i] = f0.x; \n" - " break;\n" - " }\n" - " }\n" - "}\n" - }; - - const char **kern = c; - size_t kernSize = sizeof(c) / sizeof(c[0]); - - if (sizeValues[vectorSize] == 3) - { - kern = c3; - kernSize = sizeof(c3) / sizeof(c3[0]); - } - - char testName[32]; - snprintf(testName, sizeof(testName) - 1, "math_kernel%s", - sizeNames[vectorSize]); +namespace { - return MakeKernel(kern, (cl_uint)kernSize, testName, k, p, relaxedMode); +int BuildKernel(const char *name, int vectorSize, cl_kernel *k, cl_program *p, + bool relaxedMode) +{ + auto kernel_name = GetKernelName(vectorSize); + auto source = GetTernaryKernel(kernel_name, name, ParameterType::Float, + ParameterType::Float, ParameterType::Float, + ParameterType::Float, vectorSize); + std::array<const char *, 1> sources{ source.c_str() }; + return MakeKernel(sources.data(), sources.size(), kernel_name.c_str(), k, p, + relaxedMode); } -typedef struct BuildKernelInfo +struct BuildKernelInfo2 { - cl_uint offset; // the first vector size to build cl_kernel *kernels; - cl_program *programs; + Programs &programs; const char *nameInCode; bool relaxedMode; // Whether to build with -cl-fast-relaxed-math. 
-} BuildKernelInfo; +}; -static cl_int BuildKernelFn(cl_uint job_id, cl_uint thread_id UNUSED, void *p) +cl_int BuildKernelFn(cl_uint job_id, cl_uint thread_id UNUSED, void *p) { - BuildKernelInfo *info = (BuildKernelInfo *)p; - cl_uint i = info->offset + job_id; - return BuildKernel(info->nameInCode, i, info->kernels + i, - info->programs + i, info->relaxedMode); + BuildKernelInfo2 *info = (BuildKernelInfo2 *)p; + cl_uint vectorSize = gMinVectorSizeIndex + job_id; + return BuildKernel(info->nameInCode, vectorSize, info->kernels + vectorSize, + &(info->programs[vectorSize]), info->relaxedMode); } // A table of more difficult cases to get right -static const float specialValues[] = { +const float specialValues[] = { -NAN, -INFINITY, -FLT_MAX, @@ -210,16 +134,18 @@ static const float specialValues[] = { +0.0f, }; -static const size_t specialValuesCount = +constexpr size_t specialValuesCount = sizeof(specialValues) / sizeof(specialValues[0]); +} // anonymous namespace + int TestFunc_Float_Float_Float_Float(const Func *f, MTdata d, bool relaxedMode) { int error; logFunctionInfo(f->name, sizeof(cl_float), relaxedMode); - cl_program programs[VECTOR_SIZE_COUNT]; + Programs programs; cl_kernel kernels[VECTOR_SIZE_COUNT]; float maxError = 0.0f; int ftz = f->ftz || gForceFTZ || 0 == (CL_FP_DENORM & gFloatCapabilities); @@ -240,8 +166,8 @@ int TestFunc_Float_Float_Float_Float(const Func *f, MTdata d, bool relaxedMode) // Init the kernels { - BuildKernelInfo build_info = { gMinVectorSizeIndex, kernels, programs, - f->nameInCode, relaxedMode }; + BuildKernelInfo2 build_info{ kernels, programs, f->nameInCode, + relaxedMode }; if ((error = ThreadPool_Do(BuildKernelFn, gMaxVectorSizeIndex - gMinVectorSizeIndex, &build_info))) @@ -439,7 +365,7 @@ int TestFunc_Float_Float_Float_Float(const Func *f, MTdata d, bool relaxedMode) err = Ulp_Error(test, correct); fail = !(fabsf(err) <= float_ulps); - if (fail && ftz) + if (fail && (ftz || relaxedMode)) { float correct2, err2; @@ -839,8 
+765,8 @@ int TestFunc_Float_Float_Float_Float(const Func *f, MTdata d, bool relaxedMode) { if (gVerboseBruteForce) { - vlog("base:%14u step:%10u bufferSize:%10zd \n", i, step, - BUFFER_SIZE); + vlog("base:%14" PRIu64 " step:%10" PRIu64 " bufferSize:%10d \n", + i, step, BUFFER_SIZE); } else { @@ -868,7 +794,6 @@ exit: for (auto k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++) { clReleaseKernel(kernels[k]); - clReleaseProgram(programs[k]); } return error; diff --git a/test_conformance/math_brute_force/unary_double.cpp b/test_conformance/math_brute_force/unary_double.cpp index f6fa3264..177cfe5b 100644 --- a/test_conformance/math_brute_force/unary_double.cpp +++ b/test_conformance/math_brute_force/unary_double.cpp @@ -14,14 +14,18 @@ // limitations under the License. // +#include "common.h" #include "function_list.h" #include "test_functions.h" #include "utility.h" +#include <cinttypes> #include <cstring> -static int BuildKernel(const char *name, int vectorSize, cl_uint kernel_count, - cl_kernel *k, cl_program *p, bool relaxedMode) +namespace { + +int BuildKernel(const char *name, int vectorSize, cl_uint kernel_count, + cl_kernel *k, cl_program *p, bool relaxedMode) { const char *c[] = { "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n", "__kernel void math_kernel", @@ -101,44 +105,44 @@ static int BuildKernel(const char *name, int vectorSize, cl_uint kernel_count, relaxedMode); } -typedef struct BuildKernelInfo -{ - cl_uint offset; // the first vector size to build - cl_uint kernel_count; - cl_kernel **kernels; - cl_program *programs; - const char *nameInCode; - bool relaxedMode; // Whether to build with -cl-fast-relaxed-math. 
-} BuildKernelInfo; - -static cl_int BuildKernelFn(cl_uint job_id, cl_uint thread_id UNUSED, void *p) +cl_int BuildKernelFn(cl_uint job_id, cl_uint thread_id UNUSED, void *p) { BuildKernelInfo *info = (BuildKernelInfo *)p; - cl_uint i = info->offset + job_id; - return BuildKernel(info->nameInCode, i, info->kernel_count, - info->kernels[i], info->programs + i, info->relaxedMode); + cl_uint vectorSize = gMinVectorSizeIndex + job_id; + return BuildKernel(info->nameInCode, vectorSize, info->threadCount, + info->kernels[vectorSize].data(), + &(info->programs[vectorSize]), info->relaxedMode); } // Thread specific data for a worker thread -typedef struct ThreadInfo +struct ThreadInfo { - cl_mem inBuf; // input buffer for the thread - cl_mem outBuf[VECTOR_SIZE_COUNT]; // output buffers for the thread + // Input and output buffers for the thread + clMemWrapper inBuf; + Buffers outBuf; + float maxError; // max error value. Init to 0. double maxErrorValue; // position of the max error value. Init to 0. - cl_command_queue tQueue; // per thread command queue to improve performance -} ThreadInfo; -typedef struct TestInfo + // Per thread command queue to improve performance + clCommandQueueWrapper tQueue; +}; + +struct TestInfo { size_t subBufferSize; // Size of the sub-buffer in elements const Func *f; // A pointer to the function info - cl_program programs[VECTOR_SIZE_COUNT]; // programs for various vector sizes - cl_kernel - *k[VECTOR_SIZE_COUNT]; // arrays of thread-specific kernels for each - // worker thread: k[vector_size][thread_id] - ThreadInfo * - tinfo; // An array of thread specific information for each worker thread + + // Programs for various vector sizes. 
+ Programs programs; + + // Thread-specific kernels for each vector size: + // k[vector_size][thread_id] + KernelMatrix k; + + // Array of thread specific information + std::vector<ThreadInfo> tinfo; + cl_uint threadCount; // Number of worker threads cl_uint jobCount; // Number of jobs cl_uint step; // step between each chunk and the next. @@ -151,185 +155,21 @@ typedef struct TestInfo float half_sin_cos_tan_limit; bool relaxedMode; // True if test is running in relaxed mode, false // otherwise. -} TestInfo; - -static cl_int Test(cl_uint job_id, cl_uint thread_id, void *data); +}; -int TestFunc_Double_Double(const Func *f, MTdata d, bool relaxedMode) +cl_int Test(cl_uint job_id, cl_uint thread_id, void *data) { - TestInfo test_info; - cl_int error; - float maxError = 0.0f; - double maxErrorVal = 0.0; - - logFunctionInfo(f->name, sizeof(cl_double), relaxedMode); - // Init test_info - memset(&test_info, 0, sizeof(test_info)); - test_info.threadCount = GetThreadCount(); - test_info.subBufferSize = BUFFER_SIZE - / (sizeof(cl_double) * RoundUpToNextPowerOfTwo(test_info.threadCount)); - test_info.scale = getTestScale(sizeof(cl_double)); - - test_info.step = (cl_uint)test_info.subBufferSize * test_info.scale; - if (test_info.step / test_info.subBufferSize != test_info.scale) - { - // there was overflow - test_info.jobCount = 1; - } - else - { - test_info.jobCount = (cl_uint)((1ULL << 32) / test_info.step); - } - - test_info.f = f; - test_info.ulps = f->double_ulps; - test_info.ftz = f->ftz || gForceFTZ; - test_info.relaxedMode = relaxedMode; - - // cl_kernels aren't thread safe, so we make one for each vector size for - // every thread - for (auto i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++) - { - size_t array_size = test_info.threadCount * sizeof(cl_kernel); - test_info.k[i] = (cl_kernel *)malloc(array_size); - if (NULL == test_info.k[i]) - { - vlog_error("Error: Unable to allocate storage for kernels!\n"); - error = CL_OUT_OF_HOST_MEMORY; - goto exit; - } - 
memset(test_info.k[i], 0, array_size); - } - test_info.tinfo = - (ThreadInfo *)malloc(test_info.threadCount * sizeof(*test_info.tinfo)); - if (NULL == test_info.tinfo) - { - vlog_error( - "Error: Unable to allocate storage for thread specific data.\n"); - error = CL_OUT_OF_HOST_MEMORY; - goto exit; - } - memset(test_info.tinfo, 0, - test_info.threadCount * sizeof(*test_info.tinfo)); - for (cl_uint i = 0; i < test_info.threadCount; i++) - { - cl_buffer_region region = { - i * test_info.subBufferSize * sizeof(cl_double), - test_info.subBufferSize * sizeof(cl_double) - }; - test_info.tinfo[i].inBuf = - clCreateSubBuffer(gInBuffer, CL_MEM_READ_ONLY, - CL_BUFFER_CREATE_TYPE_REGION, ®ion, &error); - if (error || NULL == test_info.tinfo[i].inBuf) - { - vlog_error("Error: Unable to create sub-buffer of gInBuffer for " - "region {%zd, %zd}\n", - region.origin, region.size); - goto exit; - } - - for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) - { - test_info.tinfo[i].outBuf[j] = clCreateSubBuffer( - gOutBuffer[j], CL_MEM_WRITE_ONLY, CL_BUFFER_CREATE_TYPE_REGION, - ®ion, &error); - if (error || NULL == test_info.tinfo[i].outBuf[j]) - { - vlog_error("Error: Unable to create sub-buffer of " - "gOutBuffer[%d] for region {%zd, %zd}\n", - (int)j, region.origin, region.size); - goto exit; - } - } - test_info.tinfo[i].tQueue = - clCreateCommandQueue(gContext, gDevice, 0, &error); - if (NULL == test_info.tinfo[i].tQueue || error) - { - vlog_error("clCreateCommandQueue failed. 
(%d)\n", error); - goto exit; - } - } - - // Init the kernels - { - BuildKernelInfo build_info = { - gMinVectorSizeIndex, test_info.threadCount, test_info.k, - test_info.programs, f->nameInCode, relaxedMode - }; - if ((error = ThreadPool_Do(BuildKernelFn, - gMaxVectorSizeIndex - gMinVectorSizeIndex, - &build_info))) - goto exit; - } - - // Run the kernels - if (!gSkipCorrectnessTesting) - { - error = ThreadPool_Do(Test, test_info.jobCount, &test_info); - - // Accumulate the arithmetic errors - for (cl_uint i = 0; i < test_info.threadCount; i++) - { - if (test_info.tinfo[i].maxError > maxError) - { - maxError = test_info.tinfo[i].maxError; - maxErrorVal = test_info.tinfo[i].maxErrorValue; - } - } - - if (error) goto exit; - - if (gWimpyMode) - vlog("Wimp pass"); - else - vlog("passed"); - - vlog("\t%8.2f @ %a", maxError, maxErrorVal); - } - - vlog("\n"); - -exit: - // Release - for (auto i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++) - { - clReleaseProgram(test_info.programs[i]); - if (test_info.k[i]) - { - for (cl_uint j = 0; j < test_info.threadCount; j++) - clReleaseKernel(test_info.k[i][j]); - - free(test_info.k[i]); - } - } - if (test_info.tinfo) - { - for (cl_uint i = 0; i < test_info.threadCount; i++) - { - clReleaseMemObject(test_info.tinfo[i].inBuf); - for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) - clReleaseMemObject(test_info.tinfo[i].outBuf[j]); - clReleaseCommandQueue(test_info.tinfo[i].tQueue); - } - - free(test_info.tinfo); - } - - return error; -} - -static cl_int Test(cl_uint job_id, cl_uint thread_id, void *data) -{ - const TestInfo *job = (const TestInfo *)data; + TestInfo *job = (TestInfo *)data; size_t buffer_elements = job->subBufferSize; size_t buffer_size = buffer_elements * sizeof(cl_double); cl_uint scale = job->scale; cl_uint base = job_id * (cl_uint)job->step; - ThreadInfo *tinfo = job->tinfo + thread_id; + ThreadInfo *tinfo = &(job->tinfo[thread_id]); float ulps = job->ulps; dptr func = job->f->dfunc; cl_int 
error; int ftz = job->ftz; + bool relaxedMode = job->relaxedMode; Force64BitFPUPrecision(); @@ -385,7 +225,8 @@ static cl_int Test(cl_uint job_id, cl_uint thread_id, void *data) if ((error = clEnqueueUnmapMemObject(tinfo->tQueue, tinfo->outBuf[j], out[j], 0, NULL, NULL))) { - vlog_error("Error: clEnqueueMapBuffer failed! err: %d\n", error); + vlog_error("Error: clEnqueueUnmapMemObject failed! err: %d\n", + error); return error; } @@ -463,7 +304,7 @@ static cl_int Test(cl_uint job_id, cl_uint thread_id, void *data) if (fail) { - if (ftz) + if (ftz || relaxedMode) { // retry per section 6.5.3.2 if (IsDoubleResultSubnormal(correct, ulps)) @@ -505,7 +346,7 @@ static cl_int Test(cl_uint job_id, cl_uint thread_id, void *data) if (fail) { vlog_error("\nERROR: %s%s: %f ulp error at %.13la " - "(0x%16.16llx): *%.13la vs. %.13la\n", + "(0x%16.16" PRIx64 "): *%.13la vs. %.13la\n", job->f->name, sizeNames[k], err, ((cl_double *)gIn)[j], ((cl_ulong *)gIn)[j], ((cl_double *)gOut_Ref)[j], test); @@ -547,3 +388,133 @@ static cl_int Test(cl_uint job_id, cl_uint thread_id, void *data) return CL_SUCCESS; } + +} // anonymous namespace + +int TestFunc_Double_Double(const Func *f, MTdata d, bool relaxedMode) +{ + TestInfo test_info{}; + cl_int error; + float maxError = 0.0f; + double maxErrorVal = 0.0; + + logFunctionInfo(f->name, sizeof(cl_double), relaxedMode); + // Init test_info + test_info.threadCount = GetThreadCount(); + test_info.subBufferSize = BUFFER_SIZE + / (sizeof(cl_double) * RoundUpToNextPowerOfTwo(test_info.threadCount)); + test_info.scale = getTestScale(sizeof(cl_double)); + + test_info.step = (cl_uint)test_info.subBufferSize * test_info.scale; + if (test_info.step / test_info.subBufferSize != test_info.scale) + { + // there was overflow + test_info.jobCount = 1; + } + else + { + test_info.jobCount = (cl_uint)((1ULL << 32) / test_info.step); + } + + test_info.f = f; + test_info.ulps = f->double_ulps; + test_info.ftz = f->ftz || gForceFTZ; + test_info.relaxedMode = 
relaxedMode; + + // cl_kernels aren't thread safe, so we make one for each vector size for + // every thread + for (auto i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++) + { + test_info.k[i].resize(test_info.threadCount, nullptr); + } + + test_info.tinfo.resize(test_info.threadCount); + for (cl_uint i = 0; i < test_info.threadCount; i++) + { + cl_buffer_region region = { + i * test_info.subBufferSize * sizeof(cl_double), + test_info.subBufferSize * sizeof(cl_double) + }; + test_info.tinfo[i].inBuf = + clCreateSubBuffer(gInBuffer, CL_MEM_READ_ONLY, + CL_BUFFER_CREATE_TYPE_REGION, ®ion, &error); + if (error || NULL == test_info.tinfo[i].inBuf) + { + vlog_error("Error: Unable to create sub-buffer of gInBuffer for " + "region {%zd, %zd}\n", + region.origin, region.size); + goto exit; + } + + for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) + { + test_info.tinfo[i].outBuf[j] = clCreateSubBuffer( + gOutBuffer[j], CL_MEM_WRITE_ONLY, CL_BUFFER_CREATE_TYPE_REGION, + ®ion, &error); + if (error || NULL == test_info.tinfo[i].outBuf[j]) + { + vlog_error("Error: Unable to create sub-buffer of " + "gOutBuffer[%d] for region {%zd, %zd}\n", + (int)j, region.origin, region.size); + goto exit; + } + } + test_info.tinfo[i].tQueue = + clCreateCommandQueue(gContext, gDevice, 0, &error); + if (NULL == test_info.tinfo[i].tQueue || error) + { + vlog_error("clCreateCommandQueue failed. 
(%d)\n", error); + goto exit; + } + } + + // Init the kernels + { + BuildKernelInfo build_info{ test_info.threadCount, test_info.k, + test_info.programs, f->nameInCode, + relaxedMode }; + if ((error = ThreadPool_Do(BuildKernelFn, + gMaxVectorSizeIndex - gMinVectorSizeIndex, + &build_info))) + goto exit; + } + + // Run the kernels + if (!gSkipCorrectnessTesting) + { + error = ThreadPool_Do(Test, test_info.jobCount, &test_info); + + // Accumulate the arithmetic errors + for (cl_uint i = 0; i < test_info.threadCount; i++) + { + if (test_info.tinfo[i].maxError > maxError) + { + maxError = test_info.tinfo[i].maxError; + maxErrorVal = test_info.tinfo[i].maxErrorValue; + } + } + + if (error) goto exit; + + if (gWimpyMode) + vlog("Wimp pass"); + else + vlog("passed"); + + vlog("\t%8.2f @ %a", maxError, maxErrorVal); + } + + vlog("\n"); + +exit: + // Release + for (auto i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++) + { + for (auto &kernel : test_info.k[i]) + { + clReleaseKernel(kernel); + } + } + + return error; +} diff --git a/test_conformance/math_brute_force/unary_float.cpp b/test_conformance/math_brute_force/unary_float.cpp index 17edc58d..4c1f1a1d 100644 --- a/test_conformance/math_brute_force/unary_float.cpp +++ b/test_conformance/math_brute_force/unary_float.cpp @@ -14,14 +14,17 @@ // limitations under the License. 
// +#include "common.h" #include "function_list.h" #include "test_functions.h" #include "utility.h" #include <cstring> -static int BuildKernel(const char *name, int vectorSize, cl_uint kernel_count, - cl_kernel *k, cl_program *p, bool relaxedMode) +namespace { + +int BuildKernel(const char *name, int vectorSize, cl_uint kernel_count, + cl_kernel *k, cl_program *p, bool relaxedMode) { const char *c[] = { "__kernel void math_kernel", sizeNames[vectorSize], @@ -99,44 +102,44 @@ static int BuildKernel(const char *name, int vectorSize, cl_uint kernel_count, relaxedMode); } -typedef struct BuildKernelInfo -{ - cl_uint offset; // the first vector size to build - cl_uint kernel_count; - cl_kernel **kernels; - cl_program *programs; - const char *nameInCode; - bool relaxedMode; // Whether to build with -cl-fast-relaxed-math. -} BuildKernelInfo; - -static cl_int BuildKernelFn(cl_uint job_id, cl_uint thread_id UNUSED, void *p) +cl_int BuildKernelFn(cl_uint job_id, cl_uint thread_id UNUSED, void *p) { BuildKernelInfo *info = (BuildKernelInfo *)p; - cl_uint i = info->offset + job_id; - return BuildKernel(info->nameInCode, i, info->kernel_count, - info->kernels[i], info->programs + i, info->relaxedMode); + cl_uint vectorSize = gMinVectorSizeIndex + job_id; + return BuildKernel(info->nameInCode, vectorSize, info->threadCount, + info->kernels[vectorSize].data(), + &(info->programs[vectorSize]), info->relaxedMode); } // Thread specific data for a worker thread -typedef struct ThreadInfo +struct ThreadInfo { - cl_mem inBuf; // input buffer for the thread - cl_mem outBuf[VECTOR_SIZE_COUNT]; // output buffers for the thread + // Input and output buffers for the thread + clMemWrapper inBuf; + Buffers outBuf; + float maxError; // max error value. Init to 0. double maxErrorValue; // position of the max error value. Init to 0. 
- cl_command_queue tQueue; // per thread command queue to improve performance -} ThreadInfo; -typedef struct TestInfo + // Per thread command queue to improve performance + clCommandQueueWrapper tQueue; +}; + +struct TestInfo { size_t subBufferSize; // Size of the sub-buffer in elements const Func *f; // A pointer to the function info - cl_program programs[VECTOR_SIZE_COUNT]; // programs for various vector sizes - cl_kernel - *k[VECTOR_SIZE_COUNT]; // arrays of thread-specific kernels for each - // worker thread: k[vector_size][thread_id] - ThreadInfo * - tinfo; // An array of thread specific information for each worker thread + + // Programs for various vector sizes. + Programs programs; + + // Thread-specific kernels for each vector size: + // k[vector_size][thread_id] + KernelMatrix k; + + // Array of thread specific information + std::vector<ThreadInfo> tinfo; + cl_uint threadCount; // Number of worker threads cl_uint jobCount; // Number of jobs cl_uint step; // step between each chunk and the next. @@ -149,207 +152,16 @@ typedef struct TestInfo float half_sin_cos_tan_limit; bool relaxedMode; // True if test is running in relaxed mode, false // otherwise. 
-} TestInfo; +}; -static cl_int Test(cl_uint job_id, cl_uint thread_id, void *data); - -int TestFunc_Float_Float(const Func *f, MTdata d, bool relaxedMode) +cl_int Test(cl_uint job_id, cl_uint thread_id, void *data) { - TestInfo test_info; - cl_int error; - float maxError = 0.0f; - double maxErrorVal = 0.0; - int skipTestingRelaxed = (relaxedMode && strcmp(f->name, "tan") == 0); - - logFunctionInfo(f->name, sizeof(cl_float), relaxedMode); - - // Init test_info - memset(&test_info, 0, sizeof(test_info)); - test_info.threadCount = GetThreadCount(); - test_info.subBufferSize = BUFFER_SIZE - / (sizeof(cl_float) * RoundUpToNextPowerOfTwo(test_info.threadCount)); - test_info.scale = getTestScale(sizeof(cl_float)); - - test_info.step = (cl_uint)test_info.subBufferSize * test_info.scale; - if (test_info.step / test_info.subBufferSize != test_info.scale) - { - // there was overflow - test_info.jobCount = 1; - } - else - { - test_info.jobCount = (cl_uint)((1ULL << 32) / test_info.step); - } - - test_info.f = f; - test_info.ulps = gIsEmbedded ? 
f->float_embedded_ulps : f->float_ulps; - test_info.ftz = - f->ftz || gForceFTZ || 0 == (CL_FP_DENORM & gFloatCapabilities); - test_info.relaxedMode = relaxedMode; - // cl_kernels aren't thread safe, so we make one for each vector size for - // every thread - for (auto i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++) - { - size_t array_size = test_info.threadCount * sizeof(cl_kernel); - test_info.k[i] = (cl_kernel *)malloc(array_size); - if (NULL == test_info.k[i]) - { - vlog_error("Error: Unable to allocate storage for kernels!\n"); - error = CL_OUT_OF_HOST_MEMORY; - goto exit; - } - memset(test_info.k[i], 0, array_size); - } - test_info.tinfo = - (ThreadInfo *)malloc(test_info.threadCount * sizeof(*test_info.tinfo)); - if (NULL == test_info.tinfo) - { - vlog_error( - "Error: Unable to allocate storage for thread specific data.\n"); - error = CL_OUT_OF_HOST_MEMORY; - goto exit; - } - memset(test_info.tinfo, 0, - test_info.threadCount * sizeof(*test_info.tinfo)); - for (cl_uint i = 0; i < test_info.threadCount; i++) - { - cl_buffer_region region = { - i * test_info.subBufferSize * sizeof(cl_float), - test_info.subBufferSize * sizeof(cl_float) - }; - test_info.tinfo[i].inBuf = - clCreateSubBuffer(gInBuffer, CL_MEM_READ_ONLY, - CL_BUFFER_CREATE_TYPE_REGION, ®ion, &error); - if (error || NULL == test_info.tinfo[i].inBuf) - { - vlog_error("Error: Unable to create sub-buffer of gInBuffer for " - "region {%zd, %zd}\n", - region.origin, region.size); - goto exit; - } - - for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) - { - test_info.tinfo[i].outBuf[j] = clCreateSubBuffer( - gOutBuffer[j], CL_MEM_WRITE_ONLY, CL_BUFFER_CREATE_TYPE_REGION, - ®ion, &error); - if (error || NULL == test_info.tinfo[i].outBuf[j]) - { - vlog_error("Error: Unable to create sub-buffer of " - "gOutBuffer[%d] for region {%zd, %zd}\n", - (int)j, region.origin, region.size); - goto exit; - } - } - test_info.tinfo[i].tQueue = - clCreateCommandQueue(gContext, gDevice, 0, &error); - 
if (NULL == test_info.tinfo[i].tQueue || error) - { - vlog_error("clCreateCommandQueue failed. (%d)\n", error); - goto exit; - } - } - - // Check for special cases for unary float - test_info.isRangeLimited = 0; - test_info.half_sin_cos_tan_limit = 0; - if (0 == strcmp(f->name, "half_sin") || 0 == strcmp(f->name, "half_cos")) - { - test_info.isRangeLimited = 1; - test_info.half_sin_cos_tan_limit = 1.0f - + test_info.ulps - * (FLT_EPSILON / 2.0f); // out of range results from finite - // inputs must be in [-1,1] - } - else if (0 == strcmp(f->name, "half_tan")) - { - test_info.isRangeLimited = 1; - test_info.half_sin_cos_tan_limit = - INFINITY; // out of range resut from finite inputs must be numeric - } - - // Init the kernels - { - BuildKernelInfo build_info = { - gMinVectorSizeIndex, test_info.threadCount, test_info.k, - test_info.programs, f->nameInCode, relaxedMode - }; - if ((error = ThreadPool_Do(BuildKernelFn, - gMaxVectorSizeIndex - gMinVectorSizeIndex, - &build_info))) - goto exit; - } - - // Run the kernels - if (!gSkipCorrectnessTesting || skipTestingRelaxed) - { - error = ThreadPool_Do(Test, test_info.jobCount, &test_info); - - // Accumulate the arithmetic errors - for (cl_uint i = 0; i < test_info.threadCount; i++) - { - if (test_info.tinfo[i].maxError > maxError) - { - maxError = test_info.tinfo[i].maxError; - maxErrorVal = test_info.tinfo[i].maxErrorValue; - } - } - - if (error) goto exit; - - if (gWimpyMode) - vlog("Wimp pass"); - else - vlog("passed"); - - if (skipTestingRelaxed) - { - vlog(" (rlx skip correctness testing)\n"); - goto exit; - } - - vlog("\t%8.2f @ %a", maxError, maxErrorVal); - } - - vlog("\n"); - -exit: - // Release - for (auto i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++) - { - clReleaseProgram(test_info.programs[i]); - if (test_info.k[i]) - { - for (cl_uint j = 0; j < test_info.threadCount; j++) - clReleaseKernel(test_info.k[i][j]); - - free(test_info.k[i]); - } - } - if (test_info.tinfo) - { - for (cl_uint i = 0; i < 
test_info.threadCount; i++) - { - clReleaseMemObject(test_info.tinfo[i].inBuf); - for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) - clReleaseMemObject(test_info.tinfo[i].outBuf[j]); - clReleaseCommandQueue(test_info.tinfo[i].tQueue); - } - - free(test_info.tinfo); - } - - return error; -} - -static cl_int Test(cl_uint job_id, cl_uint thread_id, void *data) -{ - const TestInfo *job = (const TestInfo *)data; + TestInfo *job = (TestInfo *)data; size_t buffer_elements = job->subBufferSize; size_t buffer_size = buffer_elements * sizeof(cl_float); cl_uint scale = job->scale; cl_uint base = job_id * (cl_uint)job->step; - ThreadInfo *tinfo = job->tinfo + thread_id; + ThreadInfo *tinfo = &(job->tinfo[thread_id]); fptr func = job->f->func; const char *fname = job->f->name; bool relaxedMode = job->relaxedMode; @@ -440,7 +252,8 @@ static cl_int Test(cl_uint job_id, cl_uint thread_id, void *data) if ((error = clEnqueueUnmapMemObject(tinfo->tQueue, tinfo->outBuf[j], out[j], 0, NULL, NULL))) { - vlog_error("Error: clEnqueueMapBuffer failed! err: %d\n", error); + vlog_error("Error: clEnqueueUnmapMemObject failed! 
err: %d\n", + error); return error; } @@ -619,7 +432,7 @@ static cl_int Test(cl_uint job_id, cl_uint thread_id, void *data) if (fail) { - if (ftz) + if (ftz || relaxedMode) { typedef int (*CheckForSubnormal)( double, float); // If we are in fast relaxed math, @@ -725,3 +538,159 @@ static cl_int Test(cl_uint job_id, cl_uint thread_id, void *data) return CL_SUCCESS; } + +} // anonymous namespace + +int TestFunc_Float_Float(const Func *f, MTdata d, bool relaxedMode) +{ + TestInfo test_info{}; + cl_int error; + float maxError = 0.0f; + double maxErrorVal = 0.0; + int skipTestingRelaxed = (relaxedMode && strcmp(f->name, "tan") == 0); + + logFunctionInfo(f->name, sizeof(cl_float), relaxedMode); + + // Init test_info + test_info.threadCount = GetThreadCount(); + test_info.subBufferSize = BUFFER_SIZE + / (sizeof(cl_float) * RoundUpToNextPowerOfTwo(test_info.threadCount)); + test_info.scale = getTestScale(sizeof(cl_float)); + + test_info.step = (cl_uint)test_info.subBufferSize * test_info.scale; + if (test_info.step / test_info.subBufferSize != test_info.scale) + { + // there was overflow + test_info.jobCount = 1; + } + else + { + test_info.jobCount = (cl_uint)((1ULL << 32) / test_info.step); + } + + test_info.f = f; + test_info.ulps = gIsEmbedded ? 
f->float_embedded_ulps : f->float_ulps; + test_info.ftz = + f->ftz || gForceFTZ || 0 == (CL_FP_DENORM & gFloatCapabilities); + test_info.relaxedMode = relaxedMode; + // cl_kernels aren't thread safe, so we make one for each vector size for + // every thread + for (auto i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++) + { + test_info.k[i].resize(test_info.threadCount, nullptr); + } + + test_info.tinfo.resize(test_info.threadCount); + for (cl_uint i = 0; i < test_info.threadCount; i++) + { + cl_buffer_region region = { + i * test_info.subBufferSize * sizeof(cl_float), + test_info.subBufferSize * sizeof(cl_float) + }; + test_info.tinfo[i].inBuf = + clCreateSubBuffer(gInBuffer, CL_MEM_READ_ONLY, + CL_BUFFER_CREATE_TYPE_REGION, ®ion, &error); + if (error || NULL == test_info.tinfo[i].inBuf) + { + vlog_error("Error: Unable to create sub-buffer of gInBuffer for " + "region {%zd, %zd}\n", + region.origin, region.size); + goto exit; + } + + for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) + { + test_info.tinfo[i].outBuf[j] = clCreateSubBuffer( + gOutBuffer[j], CL_MEM_WRITE_ONLY, CL_BUFFER_CREATE_TYPE_REGION, + ®ion, &error); + if (error || NULL == test_info.tinfo[i].outBuf[j]) + { + vlog_error("Error: Unable to create sub-buffer of " + "gOutBuffer[%d] for region {%zd, %zd}\n", + (int)j, region.origin, region.size); + goto exit; + } + } + test_info.tinfo[i].tQueue = + clCreateCommandQueue(gContext, gDevice, 0, &error); + if (NULL == test_info.tinfo[i].tQueue || error) + { + vlog_error("clCreateCommandQueue failed. 
(%d)\n", error); + goto exit; + } + } + + // Check for special cases for unary float + test_info.isRangeLimited = 0; + test_info.half_sin_cos_tan_limit = 0; + if (0 == strcmp(f->name, "half_sin") || 0 == strcmp(f->name, "half_cos")) + { + test_info.isRangeLimited = 1; + test_info.half_sin_cos_tan_limit = 1.0f + + test_info.ulps + * (FLT_EPSILON / 2.0f); // out of range results from finite + // inputs must be in [-1,1] + } + else if (0 == strcmp(f->name, "half_tan")) + { + test_info.isRangeLimited = 1; + test_info.half_sin_cos_tan_limit = + INFINITY; // out of range resut from finite inputs must be numeric + } + + // Init the kernels + { + BuildKernelInfo build_info{ test_info.threadCount, test_info.k, + test_info.programs, f->nameInCode, + relaxedMode }; + if ((error = ThreadPool_Do(BuildKernelFn, + gMaxVectorSizeIndex - gMinVectorSizeIndex, + &build_info))) + goto exit; + } + + // Run the kernels + if (!gSkipCorrectnessTesting || skipTestingRelaxed) + { + error = ThreadPool_Do(Test, test_info.jobCount, &test_info); + + // Accumulate the arithmetic errors + for (cl_uint i = 0; i < test_info.threadCount; i++) + { + if (test_info.tinfo[i].maxError > maxError) + { + maxError = test_info.tinfo[i].maxError; + maxErrorVal = test_info.tinfo[i].maxErrorValue; + } + } + + if (error) goto exit; + + if (gWimpyMode) + vlog("Wimp pass"); + else + vlog("passed"); + + if (skipTestingRelaxed) + { + vlog(" (rlx skip correctness testing)\n"); + goto exit; + } + + vlog("\t%8.2f @ %a", maxError, maxErrorVal); + } + + vlog("\n"); + +exit: + // Release + for (auto i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++) + { + for (auto &kernel : test_info.k[i]) + { + clReleaseKernel(kernel); + } + } + + return error; +} diff --git a/test_conformance/math_brute_force/unary_two_results_double.cpp b/test_conformance/math_brute_force/unary_two_results_double.cpp index 71dd4f44..6d7c61d6 100644 --- a/test_conformance/math_brute_force/unary_two_results_double.cpp +++ 
b/test_conformance/math_brute_force/unary_two_results_double.cpp @@ -14,14 +14,18 @@ // limitations under the License. // +#include "common.h" #include "function_list.h" #include "test_functions.h" #include "utility.h" +#include <cinttypes> #include <cstring> -static int BuildKernel(const char *name, int vectorSize, cl_kernel *k, - cl_program *p, bool relaxedMode) +namespace { + +int BuildKernel(const char *name, int vectorSize, cl_kernel *k, cl_program *p, + bool relaxedMode) { const char *c[] = { "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n", "__kernel void math_kernel", @@ -107,27 +111,28 @@ static int BuildKernel(const char *name, int vectorSize, cl_kernel *k, return MakeKernel(kern, (cl_uint)kernSize, testName, k, p, relaxedMode); } -typedef struct BuildKernelInfo +struct BuildKernelInfo2 { - cl_uint offset; // the first vector size to build cl_kernel *kernels; - cl_program *programs; + Programs &programs; const char *nameInCode; bool relaxedMode; // Whether to build with -cl-fast-relaxed-math. 
-} BuildKernelInfo; +}; -static cl_int BuildKernelFn(cl_uint job_id, cl_uint thread_id UNUSED, void *p) +cl_int BuildKernelFn(cl_uint job_id, cl_uint thread_id UNUSED, void *p) { - BuildKernelInfo *info = (BuildKernelInfo *)p; - cl_uint i = info->offset + job_id; - return BuildKernel(info->nameInCode, i, info->kernels + i, - info->programs + i, info->relaxedMode); + BuildKernelInfo2 *info = (BuildKernelInfo2 *)p; + cl_uint vectorSize = gMinVectorSizeIndex + job_id; + return BuildKernel(info->nameInCode, vectorSize, info->kernels + vectorSize, + &(info->programs[vectorSize]), info->relaxedMode); } +} // anonymous namespace + int TestFunc_Double2_Double(const Func *f, MTdata d, bool relaxedMode) { int error; - cl_program programs[VECTOR_SIZE_COUNT]; + Programs programs; cl_kernel kernels[VECTOR_SIZE_COUNT]; float maxError0 = 0.0f; float maxError1 = 0.0f; @@ -144,8 +149,8 @@ int TestFunc_Double2_Double(const Func *f, MTdata d, bool relaxedMode) // Init the kernels { - BuildKernelInfo build_info = { gMinVectorSizeIndex, kernels, programs, - f->nameInCode, relaxedMode }; + BuildKernelInfo2 build_info{ kernels, programs, f->nameInCode, + relaxedMode }; if ((error = ThreadPool_Do(BuildKernelFn, gMaxVectorSizeIndex - gMinVectorSizeIndex, &build_info))) @@ -287,7 +292,7 @@ int TestFunc_Double2_Double(const Func *f, MTdata d, bool relaxedMode) float err2 = Bruteforce_Ulp_Error_Double(test2, correct2); int fail = !(fabsf(err) <= f->double_ulps && fabsf(err2) <= f->double_ulps); - if (ftz) + if (ftz || relaxedMode) { // retry per section 6.5.3.2 if (IsDoubleResultSubnormal(correct, f->double_ulps)) @@ -410,8 +415,9 @@ int TestFunc_Double2_Double(const Func *f, MTdata d, bool relaxedMode) { if (gVerboseBruteForce) { - vlog("base:%14u step:%10zu bufferSize:%10zd \n", i, step, - BUFFER_SIZE); + vlog("base:%14" PRIu64 " step:%10" PRIu64 + " bufferSize:%10d \n", + i, step, BUFFER_SIZE); } else { @@ -439,7 +445,6 @@ exit: for (auto k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; 
k++) { clReleaseKernel(kernels[k]); - clReleaseProgram(programs[k]); } return error; diff --git a/test_conformance/math_brute_force/unary_two_results_float.cpp b/test_conformance/math_brute_force/unary_two_results_float.cpp index 4a375ce3..42e858c4 100644 --- a/test_conformance/math_brute_force/unary_two_results_float.cpp +++ b/test_conformance/math_brute_force/unary_two_results_float.cpp @@ -14,14 +14,18 @@ // limitations under the License. // +#include "common.h" #include "function_list.h" #include "test_functions.h" #include "utility.h" +#include <cinttypes> #include <cstring> -static int BuildKernel(const char *name, int vectorSize, cl_kernel *k, - cl_program *p, bool relaxedMode) +namespace { + +int BuildKernel(const char *name, int vectorSize, cl_kernel *k, cl_program *p, + bool relaxedMode) { const char *c[] = { "__kernel void math_kernel", sizeNames[vectorSize], @@ -105,27 +109,28 @@ static int BuildKernel(const char *name, int vectorSize, cl_kernel *k, return MakeKernel(kern, (cl_uint)kernSize, testName, k, p, relaxedMode); } -typedef struct BuildKernelInfo +struct BuildKernelInfo2 { - cl_uint offset; // the first vector size to build cl_kernel *kernels; - cl_program *programs; + Programs &programs; const char *nameInCode; bool relaxedMode; // Whether to build with -cl-fast-relaxed-math. 
-} BuildKernelInfo; +}; -static cl_int BuildKernelFn(cl_uint job_id, cl_uint thread_id UNUSED, void *p) +cl_int BuildKernelFn(cl_uint job_id, cl_uint thread_id UNUSED, void *p) { - BuildKernelInfo *info = (BuildKernelInfo *)p; - cl_uint i = info->offset + job_id; - return BuildKernel(info->nameInCode, i, info->kernels + i, - info->programs + i, info->relaxedMode); + BuildKernelInfo2 *info = (BuildKernelInfo2 *)p; + cl_uint vectorSize = gMinVectorSizeIndex + job_id; + return BuildKernel(info->nameInCode, vectorSize, info->kernels + vectorSize, + &(info->programs[vectorSize]), info->relaxedMode); } +} // anonymous namespace + int TestFunc_Float2_Float(const Func *f, MTdata d, bool relaxedMode) { int error; - cl_program programs[VECTOR_SIZE_COUNT]; + Programs programs; cl_kernel kernels[VECTOR_SIZE_COUNT]; float maxError0 = 0.0f; float maxError1 = 0.0f; @@ -143,8 +148,8 @@ int TestFunc_Float2_Float(const Func *f, MTdata d, bool relaxedMode) float float_ulps = getAllowedUlpError(f, relaxedMode); // Init the kernels { - BuildKernelInfo build_info = { gMinVectorSizeIndex, kernels, programs, - f->nameInCode, relaxedMode }; + BuildKernelInfo2 build_info{ kernels, programs, f->nameInCode, + relaxedMode }; if ((error = ThreadPool_Do(BuildKernelFn, gMaxVectorSizeIndex - gMinVectorSizeIndex, &build_info))) @@ -254,7 +259,7 @@ int TestFunc_Float2_Float(const Func *f, MTdata d, bool relaxedMode) { // Calculate the correctly rounded reference result memset(&oldMode, 0, sizeof(oldMode)); - if (ftz) ForceFTZ(&oldMode); + if (ftz || relaxedMode) ForceFTZ(&oldMode); // Set the rounding mode to match the device if (gIsInRTZMode) @@ -381,7 +386,7 @@ int TestFunc_Float2_Float(const Func *f, MTdata d, bool relaxedMode) int fail = !(fabsf(err) <= float_ulps && fabsf(err2) <= float_ulps); - if (ftz) + if (ftz || relaxedMode) { // retry per section 6.5.3.2 if ((*isFloatResultSubnormalPtr)(correct, float_ulps)) @@ -542,8 +547,9 @@ int TestFunc_Float2_Float(const Func *f, MTdata d, bool 
relaxedMode) { if (gVerboseBruteForce) { - vlog("base:%14u step:%10zu bufferSize:%10zd \n", i, step, - BUFFER_SIZE); + vlog("base:%14" PRIu64 " step:%10" PRIu64 + " bufferSize:%10d \n", + i, step, BUFFER_SIZE); } else { @@ -571,7 +577,6 @@ exit: for (auto k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++) { clReleaseKernel(kernels[k]); - clReleaseProgram(programs[k]); } return error; diff --git a/test_conformance/math_brute_force/unary_two_results_i_double.cpp b/test_conformance/math_brute_force/unary_two_results_i_double.cpp index 14d1fb99..8b751944 100644 --- a/test_conformance/math_brute_force/unary_two_results_i_double.cpp +++ b/test_conformance/math_brute_force/unary_two_results_i_double.cpp @@ -14,15 +14,19 @@ // limitations under the License. // +#include "common.h" #include "function_list.h" #include "test_functions.h" #include "utility.h" +#include <cinttypes> #include <climits> #include <cstring> -static int BuildKernel(const char *name, int vectorSize, cl_kernel *k, - cl_program *p, bool relaxedMode) +namespace { + +int BuildKernel(const char *name, int vectorSize, cl_kernel *k, cl_program *p, + bool relaxedMode) { const char *c[] = { "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n", "__kernel void math_kernel", @@ -108,33 +112,34 @@ static int BuildKernel(const char *name, int vectorSize, cl_kernel *k, return MakeKernel(kern, (cl_uint)kernSize, testName, k, p, relaxedMode); } -typedef struct BuildKernelInfo +struct BuildKernelInfo2 { - cl_uint offset; // the first vector size to build cl_kernel *kernels; - cl_program *programs; + Programs &programs; const char *nameInCode; bool relaxedMode; // Whether to build with -cl-fast-relaxed-math. 
-} BuildKernelInfo; +}; -static cl_int BuildKernelFn(cl_uint job_id, cl_uint thread_id UNUSED, void *p) +cl_int BuildKernelFn(cl_uint job_id, cl_uint thread_id UNUSED, void *p) { - BuildKernelInfo *info = (BuildKernelInfo *)p; - cl_uint i = info->offset + job_id; - return BuildKernel(info->nameInCode, i, info->kernels + i, - info->programs + i, info->relaxedMode); + BuildKernelInfo2 *info = (BuildKernelInfo2 *)p; + cl_uint vectorSize = gMinVectorSizeIndex + job_id; + return BuildKernel(info->nameInCode, vectorSize, info->kernels + vectorSize, + &(info->programs[vectorSize]), info->relaxedMode); } -static cl_ulong abs_cl_long(cl_long i) +cl_ulong abs_cl_long(cl_long i) { cl_long mask = i >> 63; return (i ^ mask) - mask; } +} // anonymous namespace + int TestFunc_DoubleI_Double(const Func *f, MTdata d, bool relaxedMode) { int error; - cl_program programs[VECTOR_SIZE_COUNT]; + Programs programs; cl_kernel kernels[VECTOR_SIZE_COUNT]; float maxError = 0.0f; int64_t maxError2 = 0; @@ -152,8 +157,8 @@ int TestFunc_DoubleI_Double(const Func *f, MTdata d, bool relaxedMode) // Init the kernels { - BuildKernelInfo build_info = { gMinVectorSizeIndex, kernels, programs, - f->nameInCode, relaxedMode }; + BuildKernelInfo2 build_info{ kernels, programs, f->nameInCode, + relaxedMode }; if ((error = ThreadPool_Do(BuildKernelFn, gMaxVectorSizeIndex - gMinVectorSizeIndex, &build_info))) @@ -290,7 +295,7 @@ int TestFunc_DoubleI_Double(const Func *f, MTdata d, bool relaxedMode) cl_long iErr = (long long)q2[j] - (long long)correct2; int fail = !(fabsf(err) <= f->double_ulps && abs_cl_long(iErr) <= maxiError); - if (ftz) + if (ftz || relaxedMode) { // retry per section 6.5.3.2 if (IsDoubleResultSubnormal(correct, f->double_ulps)) @@ -382,8 +387,9 @@ int TestFunc_DoubleI_Double(const Func *f, MTdata d, bool relaxedMode) { if (gVerboseBruteForce) { - vlog("base:%14u step:%10zu bufferSize:%10zd \n", i, step, - BUFFER_SIZE); + vlog("base:%14" PRIu64 " step:%10" PRIu64 + " bufferSize:%10d \n", 
+ i, step, BUFFER_SIZE); } else { @@ -400,8 +406,8 @@ int TestFunc_DoubleI_Double(const Func *f, MTdata d, bool relaxedMode) else vlog("passed"); - vlog("\t{%8.2f, %lld} @ {%a, %a}", maxError, maxError2, maxErrorVal, - maxErrorVal2); + vlog("\t{%8.2f, %" PRId64 "} @ {%a, %a}", maxError, maxError2, + maxErrorVal, maxErrorVal2); } vlog("\n"); @@ -411,7 +417,6 @@ exit: for (auto k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++) { clReleaseKernel(kernels[k]); - clReleaseProgram(programs[k]); } return error; diff --git a/test_conformance/math_brute_force/unary_two_results_i_float.cpp b/test_conformance/math_brute_force/unary_two_results_i_float.cpp index 23b0d707..54843a29 100644 --- a/test_conformance/math_brute_force/unary_two_results_i_float.cpp +++ b/test_conformance/math_brute_force/unary_two_results_i_float.cpp @@ -14,15 +14,19 @@ // limitations under the License. // +#include "common.h" #include "function_list.h" #include "test_functions.h" #include "utility.h" +#include <cinttypes> #include <climits> #include <cstring> -static int BuildKernel(const char *name, int vectorSize, cl_kernel *k, - cl_program *p, bool relaxedMode) +namespace { + +int BuildKernel(const char *name, int vectorSize, cl_kernel *k, cl_program *p, + bool relaxedMode) { const char *c[] = { "__kernel void math_kernel", sizeNames[vectorSize], @@ -106,33 +110,34 @@ static int BuildKernel(const char *name, int vectorSize, cl_kernel *k, return MakeKernel(kern, (cl_uint)kernSize, testName, k, p, relaxedMode); } -typedef struct BuildKernelInfo +struct BuildKernelInfo2 { - cl_uint offset; // the first vector size to build cl_kernel *kernels; - cl_program *programs; + Programs &programs; const char *nameInCode; bool relaxedMode; // Whether to build with -cl-fast-relaxed-math. 
-} BuildKernelInfo; +}; -static cl_int BuildKernelFn(cl_uint job_id, cl_uint thread_id UNUSED, void *p) +cl_int BuildKernelFn(cl_uint job_id, cl_uint thread_id UNUSED, void *p) { - BuildKernelInfo *info = (BuildKernelInfo *)p; - cl_uint i = info->offset + job_id; - return BuildKernel(info->nameInCode, i, info->kernels + i, - info->programs + i, info->relaxedMode); + BuildKernelInfo2 *info = (BuildKernelInfo2 *)p; + cl_uint vectorSize = gMinVectorSizeIndex + job_id; + return BuildKernel(info->nameInCode, vectorSize, info->kernels + vectorSize, + &(info->programs[vectorSize]), info->relaxedMode); } -static cl_ulong abs_cl_long(cl_long i) +cl_ulong abs_cl_long(cl_long i) { cl_long mask = i >> 63; return (i ^ mask) - mask; } +} // anonymous namespace + int TestFunc_FloatI_Float(const Func *f, MTdata d, bool relaxedMode) { int error; - cl_program programs[VECTOR_SIZE_COUNT]; + Programs programs; cl_kernel kernels[VECTOR_SIZE_COUNT]; float maxError = 0.0f; int64_t maxError2 = 0; @@ -155,8 +160,8 @@ int TestFunc_FloatI_Float(const Func *f, MTdata d, bool relaxedMode) // Init the kernels { - BuildKernelInfo build_info = { gMinVectorSizeIndex, kernels, programs, - f->nameInCode, relaxedMode }; + BuildKernelInfo2 build_info{ kernels, programs, f->nameInCode, + relaxedMode }; if ((error = ThreadPool_Do(BuildKernelFn, gMaxVectorSizeIndex - gMinVectorSizeIndex, &build_info))) @@ -293,7 +298,7 @@ int TestFunc_FloatI_Float(const Func *f, MTdata d, bool relaxedMode) cl_long iErr = (int64_t)q2[j] - (int64_t)correct2; int fail = !(fabsf(err) <= float_ulps && abs_cl_long(iErr) <= maxiError); - if (ftz) + if (ftz || relaxedMode) { // retry per section 6.5.3.2 if (IsFloatResultSubnormal(correct, float_ulps)) @@ -380,8 +385,9 @@ int TestFunc_FloatI_Float(const Func *f, MTdata d, bool relaxedMode) { if (gVerboseBruteForce) { - vlog("base:%14u step:%10zu bufferSize:%10zd \n", i, step, - BUFFER_SIZE); + vlog("base:%14" PRIu64 " step:%10" PRIu64 + " bufferSize:%10d \n", + i, step, 
BUFFER_SIZE); } else { @@ -398,8 +404,8 @@ int TestFunc_FloatI_Float(const Func *f, MTdata d, bool relaxedMode) else vlog("passed"); - vlog("\t{%8.2f, %lld} @ {%a, %a}", maxError, maxError2, maxErrorVal, - maxErrorVal2); + vlog("\t{%8.2f, %" PRId64 "} @ {%a, %a}", maxError, maxError2, + maxErrorVal, maxErrorVal2); } vlog("\n"); @@ -409,7 +415,6 @@ exit: for (auto k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++) { clReleaseKernel(kernels[k]); - clReleaseProgram(programs[k]); } return error; diff --git a/test_conformance/math_brute_force/unary_u_double.cpp b/test_conformance/math_brute_force/unary_u_double.cpp index 3c5f99da..9b60904a 100644 --- a/test_conformance/math_brute_force/unary_u_double.cpp +++ b/test_conformance/math_brute_force/unary_u_double.cpp @@ -14,14 +14,18 @@ // limitations under the License. // +#include "common.h" #include "function_list.h" #include "test_functions.h" #include "utility.h" +#include <cinttypes> #include <cstring> -static int BuildKernel(const char *name, int vectorSize, cl_kernel *k, - cl_program *p, bool relaxedMode) +namespace { + +int BuildKernel(const char *name, int vectorSize, cl_kernel *k, cl_program *p, + bool relaxedMode) { const char *c[] = { "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n", "__kernel void math_kernel", @@ -102,32 +106,33 @@ static int BuildKernel(const char *name, int vectorSize, cl_kernel *k, return MakeKernel(kern, (cl_uint)kernSize, testName, k, p, relaxedMode); } -typedef struct BuildKernelInfo +struct BuildKernelInfo2 { - cl_uint offset; // the first vector size to build cl_kernel *kernels; - cl_program *programs; + Programs &programs; const char *nameInCode; bool relaxedMode; // Whether to build with -cl-fast-relaxed-math. 
-} BuildKernelInfo; +}; -static cl_int BuildKernelFn(cl_uint job_id, cl_uint thread_id UNUSED, void *p) +cl_int BuildKernelFn(cl_uint job_id, cl_uint thread_id UNUSED, void *p) { - BuildKernelInfo *info = (BuildKernelInfo *)p; - cl_uint i = info->offset + job_id; - return BuildKernel(info->nameInCode, i, info->kernels + i, - info->programs + i, info->relaxedMode); + BuildKernelInfo2 *info = (BuildKernelInfo2 *)p; + cl_uint vectorSize = gMinVectorSizeIndex + job_id; + return BuildKernel(info->nameInCode, vectorSize, info->kernels + vectorSize, + &(info->programs[vectorSize]), info->relaxedMode); } -static cl_ulong random64(MTdata d) +cl_ulong random64(MTdata d) { return (cl_ulong)genrand_int32(d) | ((cl_ulong)genrand_int32(d) << 32); } +} // anonymous namespace + int TestFunc_Double_ULong(const Func *f, MTdata d, bool relaxedMode) { int error; - cl_program programs[VECTOR_SIZE_COUNT]; + Programs programs; cl_kernel kernels[VECTOR_SIZE_COUNT]; float maxError = 0.0f; int ftz = f->ftz || gForceFTZ; @@ -140,8 +145,8 @@ int TestFunc_Double_ULong(const Func *f, MTdata d, bool relaxedMode) // Init the kernels { - BuildKernelInfo build_info = { gMinVectorSizeIndex, kernels, programs, - f->nameInCode, relaxedMode }; + BuildKernelInfo2 build_info{ kernels, programs, f->nameInCode, + relaxedMode }; if ((error = ThreadPool_Do(BuildKernelFn, gMaxVectorSizeIndex - gMinVectorSizeIndex, &build_info))) @@ -245,7 +250,7 @@ int TestFunc_Double_ULong(const Func *f, MTdata d, bool relaxedMode) if (fail) { - if (ftz) + if (ftz || relaxedMode) { // retry per section 6.5.3.2 if (IsDoubleResultSubnormal(correct, @@ -263,11 +268,11 @@ int TestFunc_Double_ULong(const Func *f, MTdata d, bool relaxedMode) } if (fail) { - vlog_error("\n%s%sD: %f ulp error at 0x%16.16llx: " - "*%.13la vs. %.13la\n", - f->name, sizeNames[k], err, - ((uint64_t *)gIn)[j], - ((double *)gOut_Ref)[j], test); + vlog_error( + "\n%s%sD: %f ulp error at 0x%16.16" PRIx64 ": " + "*%.13la vs. 
%.13la\n", + f->name, sizeNames[k], err, ((uint64_t *)gIn)[j], + ((double *)gOut_Ref)[j], test); error = -1; goto exit; } @@ -279,8 +284,9 @@ int TestFunc_Double_ULong(const Func *f, MTdata d, bool relaxedMode) { if (gVerboseBruteForce) { - vlog("base:%14u step:%10zu bufferSize:%10zd \n", i, step, - BUFFER_SIZE); + vlog("base:%14" PRIu64 " step:%10" PRIu64 + " bufferSize:%10d \n", + i, step, BUFFER_SIZE); } else { @@ -307,7 +313,6 @@ exit: for (auto k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++) { clReleaseKernel(kernels[k]); - clReleaseProgram(programs[k]); } return error; diff --git a/test_conformance/math_brute_force/unary_u_float.cpp b/test_conformance/math_brute_force/unary_u_float.cpp index 44c5af47..b67a9bda 100644 --- a/test_conformance/math_brute_force/unary_u_float.cpp +++ b/test_conformance/math_brute_force/unary_u_float.cpp @@ -14,14 +14,18 @@ // limitations under the License. // +#include "common.h" #include "function_list.h" #include "test_functions.h" #include "utility.h" +#include <cinttypes> #include <cstring> -static int BuildKernel(const char *name, int vectorSize, cl_kernel *k, - cl_program *p, bool relaxedMode) +namespace { + +int BuildKernel(const char *name, int vectorSize, cl_kernel *k, cl_program *p, + bool relaxedMode) { const char *c[] = { "__kernel void math_kernel", sizeNames[vectorSize], @@ -99,27 +103,28 @@ static int BuildKernel(const char *name, int vectorSize, cl_kernel *k, return MakeKernel(kern, (cl_uint)kernSize, testName, k, p, relaxedMode); } -typedef struct BuildKernelInfo +struct BuildKernelInfo2 { - cl_uint offset; // the first vector size to build cl_kernel *kernels; - cl_program *programs; + Programs &programs; const char *nameInCode; bool relaxedMode; // Whether to build with -cl-fast-relaxed-math. 
-} BuildKernelInfo; +}; -static cl_int BuildKernelFn(cl_uint job_id, cl_uint thread_id UNUSED, void *p) +cl_int BuildKernelFn(cl_uint job_id, cl_uint thread_id UNUSED, void *p) { - BuildKernelInfo *info = (BuildKernelInfo *)p; - cl_uint i = info->offset + job_id; - return BuildKernel(info->nameInCode, i, info->kernels + i, - info->programs + i, info->relaxedMode); + BuildKernelInfo2 *info = (BuildKernelInfo2 *)p; + cl_uint vectorSize = gMinVectorSizeIndex + job_id; + return BuildKernel(info->nameInCode, vectorSize, info->kernels + vectorSize, + &(info->programs[vectorSize]), info->relaxedMode); } +} // anonymous namespace + int TestFunc_Float_UInt(const Func *f, MTdata d, bool relaxedMode) { int error; - cl_program programs[VECTOR_SIZE_COUNT]; + Programs programs; cl_kernel kernels[VECTOR_SIZE_COUNT]; float maxError = 0.0f; int ftz = f->ftz || gForceFTZ || 0 == (CL_FP_DENORM & gFloatCapabilities); @@ -137,8 +142,8 @@ int TestFunc_Float_UInt(const Func *f, MTdata d, bool relaxedMode) // Init the kernels { - BuildKernelInfo build_info = { gMinVectorSizeIndex, kernels, programs, - f->nameInCode, relaxedMode }; + BuildKernelInfo2 build_info{ kernels, programs, f->nameInCode, + relaxedMode }; if ((error = ThreadPool_Do(BuildKernelFn, gMaxVectorSizeIndex - gMinVectorSizeIndex, &build_info))) @@ -249,7 +254,7 @@ int TestFunc_Float_UInt(const Func *f, MTdata d, bool relaxedMode) if (fail) { - if (ftz) + if (ftz || relaxedMode) { // retry per section 6.5.3.2 if (IsFloatResultSubnormal(correct, float_ulps)) @@ -281,8 +286,9 @@ int TestFunc_Float_UInt(const Func *f, MTdata d, bool relaxedMode) { if (gVerboseBruteForce) { - vlog("base:%14u step:%10zu bufferSize:%10zd \n", i, step, - BUFFER_SIZE); + vlog("base:%14" PRIu64 " step:%10" PRIu64 + " bufferSize:%10d \n", + i, step, BUFFER_SIZE); } else { @@ -309,7 +315,6 @@ exit: for (auto k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++) { clReleaseKernel(kernels[k]); - clReleaseProgram(programs[k]); } return error; diff --git 
a/test_conformance/math_brute_force/utility.h b/test_conformance/math_brute_force/utility.h index ac4db9c8..b4a59edb 100644 --- a/test_conformance/math_brute_force/utility.h +++ b/test_conformance/math_brute_force/utility.h @@ -90,8 +90,7 @@ int MakeKernels(const char **c, cl_uint count, const char *name, bool relaxedMode); // used to convert a bucket of bits into a search pattern through double -static inline double DoubleFromUInt32(uint32_t bits); -static inline double DoubleFromUInt32(uint32_t bits) +inline double DoubleFromUInt32(uint32_t bits) { union { uint64_t u; @@ -117,25 +116,25 @@ void _LogBuildError(cl_program p, int line, const char *file); // premature flushing to zero. // However, to avoid conflict for 1.0, we are letting results at TYPE_MIN + // ulp_limit to be flushed to zero. -static inline int IsFloatResultSubnormal(double x, float ulps) +inline int IsFloatResultSubnormal(double x, float ulps) { x = fabs(x) - MAKE_HEX_DOUBLE(0x1.0p-149, 0x1, -149) * (double)ulps; return x < MAKE_HEX_DOUBLE(0x1.0p-126, 0x1, -126); } -static inline int IsFloatResultSubnormalAbsError(double x, float abs_err) +inline int IsFloatResultSubnormalAbsError(double x, float abs_err) { x = x - abs_err; return x < MAKE_HEX_DOUBLE(0x1.0p-126, 0x1, -126); } -static inline int IsDoubleResultSubnormal(long double x, float ulps) +inline int IsDoubleResultSubnormal(long double x, float ulps) { x = fabsl(x) - MAKE_HEX_LONG(0x1.0p-1074, 0x1, -1074) * (long double)ulps; return x < MAKE_HEX_LONG(0x1.0p-1022, 0x1, -1022); } -static inline int IsFloatInfinity(double x) +inline int IsFloatInfinity(double x) { union { cl_float d; @@ -145,7 +144,7 @@ static inline int IsFloatInfinity(double x) return ((u.u & 0x7fffffffU) == 0x7F800000U); } -static inline int IsFloatMaxFloat(double x) +inline int IsFloatMaxFloat(double x) { union { cl_float d; @@ -155,7 +154,7 @@ static inline int IsFloatMaxFloat(double x) return ((u.u & 0x7fffffffU) == 0x7F7FFFFFU); } -static inline int IsFloatNaN(double x) 
+inline int IsFloatNaN(double x) { union { cl_float d; @@ -165,13 +164,13 @@ static inline int IsFloatNaN(double x) return ((u.u & 0x7fffffffU) > 0x7F800000U); } -extern cl_uint RoundUpToNextPowerOfTwo(cl_uint x); +cl_uint RoundUpToNextPowerOfTwo(cl_uint x); // Windows (since long double got deprecated) sets the x87 to 53-bit precision // (that's x87 default state). This causes problems with the tests that // convert long and ulong to float and double or otherwise deal with values // that need more precision than 53-bit. So, set the x87 to 64-bit precision. -static inline void Force64BitFPUPrecision(void) +inline void Force64BitFPUPrecision(void) { #if __MINGW32__ // The usual method is to use _controlfp as follows: @@ -202,17 +201,17 @@ static inline void Force64BitFPUPrecision(void) #endif } -extern void memset_pattern4(void *dest, const void *src_pattern, size_t bytes); +void memset_pattern4(void *dest, const void *src_pattern, size_t bytes); -typedef union { +union int32f_t { int32_t i; float f; -} int32f_t; +}; -typedef union { +union int64d_t { int64_t l; double d; -} int64d_t; +}; void MulD(double *rhi, double *rlo, double u, double v); void AddD(double *rhi, double *rlo, double a, double b); @@ -229,7 +228,7 @@ void logFunctionInfo(const char *fname, unsigned int float_size, float getAllowedUlpError(const Func *f, const bool relaxed); -static inline cl_uint getTestScale(size_t typeSize) +inline cl_uint getTestScale(size_t typeSize) { if (gWimpyMode) { @@ -245,7 +244,7 @@ static inline cl_uint getTestScale(size_t typeSize) } } -static inline uint64_t getTestStep(size_t typeSize, size_t bufferSize) +inline uint64_t getTestStep(size_t typeSize, size_t bufferSize) { if (gWimpyMode) { diff --git a/test_conformance/multiple_device_context/test_multiple_devices.cpp b/test_conformance/multiple_device_context/test_multiple_devices.cpp index 59543ade..4f187b9c 100644 --- a/test_conformance/multiple_device_context/test_multiple_devices.cpp +++ 
b/test_conformance/multiple_device_context/test_multiple_devices.cpp @@ -175,9 +175,8 @@ int test_device_set(size_t deviceCount, size_t queueCount, cl_device_id *devices } /* All done now! */ - if (errors) - return -1; - return 0; + if (errors) return -1; + return 0; } int test_two_devices(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements) diff --git a/test_conformance/pipes/test_pipe_info.cpp b/test_conformance/pipes/test_pipe_info.cpp index 7543c6cd..e7b486db 100644 --- a/test_conformance/pipes/test_pipe_info.cpp +++ b/test_conformance/pipes/test_pipe_info.cpp @@ -14,6 +14,7 @@ // limitations under the License. // #include "procs.h" +#include "harness/parseParameters.h" const char* pipe_kernel_code = { "__kernel void pipe_kernel(__write_only pipe int out_pipe)\n" @@ -39,8 +40,7 @@ int test_pipe_info( cl_device_id deviceID, cl_context context, cl_command_queue if (pipe_width != returnVal) { - log_error("Error in clGetPipeInfo() check of pipe packet size\n"); - return -1; + test_fail("Error in clGetPipeInfo() check of pipe packet size\n"); } else { @@ -52,29 +52,37 @@ int test_pipe_info( cl_device_id deviceID, cl_context context, cl_command_queue if(pipe_depth != returnVal) { - log_error( "Error in clGetPipeInfo() check of pipe max packets\n" ); - return -1; + test_fail("Error in clGetPipeInfo() check of pipe max packets\n"); } else { log_info( " CL_PIPE_MAX_PACKETS passed.\n" ); } - err = create_single_kernel_helper_with_build_options(context, &program, &kernel, 1, (const char**)&pipe_kernel_code, "pipe_kernel", "-cl-std=CL2.0 -cl-kernel-arg-info"); - test_error_ret(err, " Error creating program", -1); + err = create_single_kernel_helper_with_build_options( + context, &program, &kernel, 1, &pipe_kernel_code, "pipe_kernel", + "-cl-std=CL2.0 -cl-kernel-arg-info"); + test_error_fail(err, "Error creating program"); cl_kernel_arg_type_qualifier arg_type_qualifier = 0; - cl_kernel_arg_type_qualifier expected_type_qualifier = 
CL_KERNEL_ARG_TYPE_PIPE; - err = clGetKernelArgInfo( kernel, 0, CL_KERNEL_ARG_TYPE_QUALIFIER, sizeof(arg_type_qualifier), &arg_type_qualifier, NULL ); - test_error_ret(err, " clSetKernelArgInfo failed", -1); - err = (arg_type_qualifier != expected_type_qualifier); - - if(err) + err = clGetKernelArgInfo(kernel, 0, CL_KERNEL_ARG_TYPE_QUALIFIER, + sizeof(arg_type_qualifier), &arg_type_qualifier, + NULL); + if (gCompilationMode == kOnline) { - print_error(err, "ERROR: Bad type qualifier\n"); - return -1; + test_error_fail(err, "clGetKernelArgInfo failed"); + if (arg_type_qualifier != CL_KERNEL_ARG_TYPE_PIPE) + { + test_fail("ERROR: Incorrect type qualifier: %i\n", + arg_type_qualifier); + } + } + else + { + test_failure_error_ret(err, CL_KERNEL_ARG_INFO_NOT_AVAILABLE, + "clGetKernelArgInfo error not as expected", + TEST_FAIL); } - return err; - + return TEST_PASS; } diff --git a/test_conformance/pipes/test_pipe_limits.cpp b/test_conformance/pipes/test_pipe_limits.cpp index 169ab80c..e1048f5f 100644 --- a/test_conformance/pipes/test_pipe_limits.cpp +++ b/test_conformance/pipes/test_pipe_limits.cpp @@ -69,7 +69,7 @@ void createKernelSourceCode(std::stringstream &stream, int num_pipes) } } )"; - // clang-format om + // clang-format on } stream << R"( } @@ -163,7 +163,7 @@ int test_pipe_max_args(cl_device_id deviceID, cl_context context, cl_command_que cl_int err; cl_int size; int num_pipe_elements = 1024; - int i, j; + int i; int max_pipe_args; std::stringstream source; clEventWrapper producer_sync_event = NULL; @@ -648,4 +648,4 @@ int test_pipe_max_active_reservations(cl_device_id deviceID, cl_context context, } return 0; -}
\ No newline at end of file +} diff --git a/test_conformance/pipes/test_pipe_read_write.cpp b/test_conformance/pipes/test_pipe_read_write.cpp index dd0d1216..425c7aee 100644 --- a/test_conformance/pipes/test_pipe_read_write.cpp +++ b/test_conformance/pipes/test_pipe_read_write.cpp @@ -414,9 +414,9 @@ static int verify_readwrite_ulong(void *ptr1, void *ptr2, int n) static int verify_readwrite_double(void *ptr1, void *ptr2, int n) { int i; - long long int sum_input = 0, sum_output = 0; - long long int *inptr = (long long int *)ptr1; - long long int *outptr = (long long int *)ptr2; + cl_long sum_input = 0, sum_output = 0; + cl_long *inptr = (cl_long *)ptr1; + cl_long *outptr = (cl_long *)ptr2; for(i = 0; i < n; i++) { @@ -626,7 +626,6 @@ int test_pipe_readwrite_struct_generic( cl_device_id deviceID, cl_context contex size_t size = sizeof(TestStruct); size_t global_work_size[3]; cl_int err; - int total_errors = 0; int i; MTdataHolder d(gRandomSeed); clEventWrapper producer_sync_event = NULL; @@ -1076,7 +1075,8 @@ int test_pipe_readwrite_half( cl_device_id deviceID, cl_context context, cl_comm if(!is_extension_available(deviceID, "cl_khr_fp16")) { - log_info("cl_khr_fp16 is not supported on this platoform. Skipping test.\n"); + log_info( + "cl_khr_fp16 is not supported on this platform. Skipping test.\n"); return CL_SUCCESS; } ptrSizes[0] = sizeof(cl_float) / 2; @@ -1246,7 +1246,7 @@ int test_pipe_readwrite_double( cl_device_id deviceID, cl_context context, cl_co size_t min_alignment = get_min_alignment(context); - foo = verify_readwrite_long; + foo = verify_readwrite_double; ptrSizes[0] = sizeof(cl_double); ptrSizes[1] = ptrSizes[0] << 1; @@ -1257,7 +1257,8 @@ int test_pipe_readwrite_double( cl_device_id deviceID, cl_context context, cl_co //skip devices that don't support double if(!is_extension_available(deviceID, "cl_khr_fp64")) { - log_info("cl_khr_fp64 is not supported on this platoform. 
Skipping test.\n"); + log_info( + "cl_khr_fp64 is not supported on this platform. Skipping test.\n"); return CL_SUCCESS; } @@ -1404,7 +1405,8 @@ int test_pipe_subgroup_readwrite_int( cl_device_id deviceID, cl_context context, if(!is_extension_available(deviceID, "cl_khr_subgroups")) { - log_info("cl_khr_subgroups is not supported on this platoform. Skipping test.\n"); + log_info("cl_khr_subgroups is not supported on this platform. Skipping " + "test.\n"); return CL_SUCCESS; } return test_pipe_readwrite_int(deviceID, context, queue, num_elements); @@ -1418,7 +1420,8 @@ int test_pipe_subgroup_readwrite_uint( cl_device_id deviceID, cl_context context if(!is_extension_available(deviceID, "cl_khr_subgroups")) { - log_info("cl_khr_subgroups is not supported on this platoform. Skipping test.\n"); + log_info("cl_khr_subgroups is not supported on this platform. Skipping " + "test.\n"); return CL_SUCCESS; } return test_pipe_readwrite_uint(deviceID, context, queue, num_elements); @@ -1432,7 +1435,8 @@ int test_pipe_subgroup_readwrite_short( cl_device_id deviceID, cl_context contex if(!is_extension_available(deviceID, "cl_khr_subgroups")) { - log_info("cl_khr_subgroups is not supported on this platoform. Skipping test.\n"); + log_info("cl_khr_subgroups is not supported on this platform. Skipping " + "test.\n"); return CL_SUCCESS; } return test_pipe_readwrite_short(deviceID, context, queue, num_elements); @@ -1446,7 +1450,8 @@ int test_pipe_subgroup_readwrite_ushort( cl_device_id deviceID, cl_context conte if(!is_extension_available(deviceID, "cl_khr_subgroups")) { - log_info("cl_khr_subgroups is not supported on this platoform. Skipping test.\n"); + log_info("cl_khr_subgroups is not supported on this platform. 
Skipping " + "test.\n"); return CL_SUCCESS; } return test_pipe_readwrite_ushort(deviceID, context, queue, num_elements); @@ -1460,7 +1465,8 @@ int test_pipe_subgroup_readwrite_char( cl_device_id deviceID, cl_context context if(!is_extension_available(deviceID, "cl_khr_subgroups")) { - log_info("cl_khr_subgroups is not supported on this platoform. Skipping test.\n"); + log_info("cl_khr_subgroups is not supported on this platform. Skipping " + "test.\n"); return CL_SUCCESS; } return test_pipe_readwrite_char(deviceID, context, queue, num_elements); @@ -1474,7 +1480,8 @@ int test_pipe_subgroup_readwrite_uchar( cl_device_id deviceID, cl_context contex if(!is_extension_available(deviceID, "cl_khr_subgroups")) { - log_info("cl_khr_subgroups is not supported on this platoform. Skipping test.\n"); + log_info("cl_khr_subgroups is not supported on this platform. Skipping " + "test.\n"); return CL_SUCCESS; } return test_pipe_readwrite_uchar(deviceID, context, queue, num_elements); @@ -1489,7 +1496,8 @@ int test_pipe_subgroup_readwrite_float( cl_device_id deviceID, cl_context contex if(!is_extension_available(deviceID, "cl_khr_subgroups")) { - log_info("cl_khr_subgroups is not supported on this platoform. Skipping test.\n"); + log_info("cl_khr_subgroups is not supported on this platform. Skipping " + "test.\n"); return CL_SUCCESS; } return test_pipe_readwrite_float(deviceID, context, queue, num_elements); @@ -1503,7 +1511,8 @@ int test_pipe_subgroup_readwrite_half( cl_device_id deviceID, cl_context context if(!is_extension_available(deviceID, "cl_khr_subgroups")) { - log_info("cl_khr_subgroups is not supported on this platoform. Skipping test.\n"); + log_info("cl_khr_subgroups is not supported on this platform. 
Skipping " + "test.\n"); return CL_SUCCESS; } return test_pipe_readwrite_half(deviceID, context, queue, num_elements); @@ -1517,7 +1526,8 @@ int test_pipe_subgroup_readwrite_long( cl_device_id deviceID, cl_context context if(!is_extension_available(deviceID, "cl_khr_subgroups")) { - log_info("cl_khr_subgroups is not supported on this platoform. Skipping test.\n"); + log_info("cl_khr_subgroups is not supported on this platform. Skipping " + "test.\n"); return CL_SUCCESS; } return test_pipe_readwrite_long(deviceID, context, queue, num_elements); @@ -1531,7 +1541,8 @@ int test_pipe_subgroup_readwrite_ulong( cl_device_id deviceID, cl_context contex if(!is_extension_available(deviceID, "cl_khr_subgroups")) { - log_info("cl_khr_subgroups is not supported on this platoform. Skipping test.\n"); + log_info("cl_khr_subgroups is not supported on this platform. Skipping " + "test.\n"); return CL_SUCCESS; } return test_pipe_readwrite_ulong(deviceID, context, queue, num_elements); @@ -1545,7 +1556,8 @@ int test_pipe_subgroup_readwrite_double( cl_device_id deviceID, cl_context conte if(!is_extension_available(deviceID, "cl_khr_subgroups")) { - log_info("cl_khr_subgroups is not supported on this platoform. Skipping test.\n"); + log_info("cl_khr_subgroups is not supported on this platform. Skipping " + "test.\n"); return CL_SUCCESS; } return test_pipe_readwrite_double(deviceID, context, queue, num_elements); @@ -1555,7 +1567,8 @@ int test_pipe_subgroup_readwrite_struct( cl_device_id deviceID, cl_context conte { if(!is_extension_available(deviceID, "cl_khr_subgroups")) { - log_info("cl_khr_subgroups is not supported on this platoform. Skipping test.\n"); + log_info("cl_khr_subgroups is not supported on this platform. 
Skipping " + "test.\n"); return CL_SUCCESS; } const char *kernelNames[] = {"test_pipe_subgroup_write_struct","test_pipe_subgroup_read_struct"}; diff --git a/test_conformance/pipes/test_pipe_subgroups.cpp b/test_conformance/pipes/test_pipe_subgroups.cpp index b3e17183..8e2f6e57 100644 --- a/test_conformance/pipes/test_pipe_subgroups.cpp +++ b/test_conformance/pipes/test_pipe_subgroups.cpp @@ -114,9 +114,8 @@ int test_pipe_subgroups_divergence(cl_device_id deviceID, cl_context context, cl if (!is_extension_available(deviceID, "cl_khr_subgroups")) { - log_info( - "cl_khr_subgroups is not supported on this platoform. Skipping " - "test.\n"); + log_info("cl_khr_subgroups is not supported on this platform. Skipping " + "test.\n"); return CL_SUCCESS; } diff --git a/test_conformance/printf/test_printf.cpp b/test_conformance/printf/test_printf.cpp index 2b804e40..d638cd46 100644 --- a/test_conformance/printf/test_printf.cpp +++ b/test_conformance/printf/test_printf.cpp @@ -1,6 +1,6 @@ // // Copyright (c) 2017 The Khronos Group Inc. -// +// // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
// You may obtain a copy of the License at @@ -232,10 +232,8 @@ int waitForEvent(cl_event* event) //----------------------------------------- static cl_program makePrintfProgram(cl_kernel *kernel_ptr, const cl_context context,const unsigned int testId,const unsigned int testNum,bool isLongSupport,bool is64bAddrSpace) { - int err,i; + int err; cl_program program; - cl_device_id devID; - char buildLog[ 1024 * 128 ]; char testname[256] = {0}; char addrSpaceArgument[256] = {0}; char addrSpacePAddArgument[256] = {0}; @@ -825,73 +823,75 @@ int test_address_space_4(cl_device_id deviceID, cl_context context, cl_command_q return doTest(gQueue, gContext, TYPE_ADDRESS_SPACE, 4, deviceID); } +int test_buffer_size(cl_device_id deviceID, cl_context context, + cl_command_queue queue, int num_elements) +{ + size_t printf_buff_size = 0; + const size_t printf_buff_size_req = !gIsEmbedded ? (1024 * 1024UL) : 1024UL; + const size_t config_size = sizeof(printf_buff_size); + cl_int err = CL_SUCCESS; + + err = clGetDeviceInfo(deviceID, CL_DEVICE_PRINTF_BUFFER_SIZE, config_size, + &printf_buff_size, NULL); + if (err != CL_SUCCESS) + { + log_error("Unable to query CL_DEVICE_PRINTF_BUFFER_SIZE"); + return TEST_FAIL; + } + + if (printf_buff_size < printf_buff_size_req) + { + log_error("CL_DEVICE_PRINTF_BUFFER_SIZE does not meet requirements"); + return TEST_FAIL; + } + + return TEST_PASS; +} + test_definition test_list[] = { - ADD_TEST( int_0 ), - ADD_TEST( int_1 ), - ADD_TEST( int_2 ), - ADD_TEST( int_3 ), - ADD_TEST( int_4 ), - ADD_TEST( int_5 ), - ADD_TEST( int_6 ), - ADD_TEST( int_7 ), - ADD_TEST( int_8 ), - - ADD_TEST( float_0 ), - ADD_TEST( float_1 ), - ADD_TEST( float_2 ), - ADD_TEST( float_3 ), - ADD_TEST( float_4 ), - ADD_TEST( float_5 ), - ADD_TEST( float_6 ), - ADD_TEST( float_7 ), - ADD_TEST( float_8 ), - ADD_TEST( float_9 ), - ADD_TEST( float_10 ), - ADD_TEST( float_11 ), - ADD_TEST( float_12 ), - ADD_TEST( float_13 ), - ADD_TEST( float_14 ), - ADD_TEST( float_15 ), - ADD_TEST( 
float_16 ), - ADD_TEST( float_17 ), - - ADD_TEST( float_limits_0 ), - ADD_TEST( float_limits_1 ), - ADD_TEST( float_limits_2 ), - - ADD_TEST( octal_0 ), - ADD_TEST( octal_1 ), - ADD_TEST( octal_2 ), - ADD_TEST( octal_3 ), - - ADD_TEST( unsigned_0 ), - ADD_TEST( unsigned_1 ), - - ADD_TEST( hexadecimal_0 ), - ADD_TEST( hexadecimal_1 ), - ADD_TEST( hexadecimal_2 ), - ADD_TEST( hexadecimal_3 ), - ADD_TEST( hexadecimal_4 ), - - ADD_TEST( char_0 ), - ADD_TEST( char_1 ), - ADD_TEST( char_2 ), - - ADD_TEST( string_0 ), - ADD_TEST( string_1 ), - ADD_TEST( string_2 ), - - ADD_TEST( vector_0 ), - ADD_TEST( vector_1 ), - ADD_TEST( vector_2 ), - ADD_TEST( vector_3 ), - ADD_TEST( vector_4 ), - - ADD_TEST( address_space_0 ), - ADD_TEST( address_space_1 ), - ADD_TEST( address_space_2 ), - ADD_TEST( address_space_3 ), - ADD_TEST( address_space_4 ), + ADD_TEST(int_0), ADD_TEST(int_1), + ADD_TEST(int_2), ADD_TEST(int_3), + ADD_TEST(int_4), ADD_TEST(int_5), + ADD_TEST(int_6), ADD_TEST(int_7), + ADD_TEST(int_8), + + ADD_TEST(float_0), ADD_TEST(float_1), + ADD_TEST(float_2), ADD_TEST(float_3), + ADD_TEST(float_4), ADD_TEST(float_5), + ADD_TEST(float_6), ADD_TEST(float_7), + ADD_TEST(float_8), ADD_TEST(float_9), + ADD_TEST(float_10), ADD_TEST(float_11), + ADD_TEST(float_12), ADD_TEST(float_13), + ADD_TEST(float_14), ADD_TEST(float_15), + ADD_TEST(float_16), ADD_TEST(float_17), + + ADD_TEST(float_limits_0), ADD_TEST(float_limits_1), + ADD_TEST(float_limits_2), + + ADD_TEST(octal_0), ADD_TEST(octal_1), + ADD_TEST(octal_2), ADD_TEST(octal_3), + + ADD_TEST(unsigned_0), ADD_TEST(unsigned_1), + + ADD_TEST(hexadecimal_0), ADD_TEST(hexadecimal_1), + ADD_TEST(hexadecimal_2), ADD_TEST(hexadecimal_3), + ADD_TEST(hexadecimal_4), + + ADD_TEST(char_0), ADD_TEST(char_1), + ADD_TEST(char_2), + + ADD_TEST(string_0), ADD_TEST(string_1), + ADD_TEST(string_2), + + ADD_TEST(vector_0), ADD_TEST(vector_1), + ADD_TEST(vector_2), ADD_TEST(vector_3), + ADD_TEST(vector_4), + + ADD_TEST(address_space_0), 
ADD_TEST(address_space_1), + ADD_TEST(address_space_2), ADD_TEST(address_space_3), + ADD_TEST(address_space_4), + + ADD_TEST(buffer_size), }; const int test_num = ARRAY_SIZE( test_list ); @@ -1030,8 +1030,6 @@ test_status InitCL( cl_device_id device ) return TEST_SKIP; } - log_info( "Test binary built %s %s\n", __DATE__, __TIME__ ); - gFd = acquireOutputStream(&err); if (err != 0) { diff --git a/test_conformance/printf/util_printf.cpp b/test_conformance/printf/util_printf.cpp index 3546c5f5..d45e1d43 100644 --- a/test_conformance/printf/util_printf.cpp +++ b/test_conformance/printf/util_printf.cpp @@ -842,8 +842,6 @@ static void hexRefBuilder(printDataGenParameters& params, char* refResult, const */ void generateRef(const cl_device_id device) { - int fd = -1; - char _refBuffer[ANALYSIS_BUFFER_SIZE]; const cl_device_fp_config fpConfig = get_default_rounding_mode(device); const RoundingMode hostRound = get_round(); RoundingMode deviceRound; diff --git a/test_conformance/profiling/execute.cpp b/test_conformance/profiling/execute.cpp index edfc043c..44b1bcd4 100644 --- a/test_conformance/profiling/execute.cpp +++ b/test_conformance/profiling/execute.cpp @@ -21,6 +21,8 @@ #include <sys/types.h> #include <sys/stat.h> +#include <algorithm> + #include "procs.h" #include "harness/testHarness.h" #include "harness/errorHelpers.h" @@ -29,12 +31,6 @@ typedef unsigned char uchar; #endif -#undef MIN -#define MIN(x,y) ( (x) < (y) ? (x) : (y) ) - -#undef MAX -#define MAX(x,y) ( (x) > (y) ? 
(x) : (y) ) - //#define CREATE_OUTPUT 1 extern int writePPM( const char *filename, uchar *buf, int xsize, int ysize ); @@ -73,8 +69,8 @@ static const char *image_filter_src = static void read_imagef( int x, int y, int w, int h, int nChannels, uchar *src, float *srcRgb ) { // clamp the coords - int x0 = MIN( MAX( x, 0 ), w - 1 ); - int y0 = MIN( MAX( y, 0 ), h - 1 ); + int x0 = std::min(std::max(x, 0), w - 1); + int y0 = std::min(std::max(y, 0), h - 1); // get tine index int indx = ( y0 * w + x0 ) * nChannels; @@ -339,8 +335,8 @@ static int kernelFilter( cl_device_id device, cl_context context, cl_command_que clReleaseMemObject( memobjs[1] ); clReleaseMemObject( memobjs[0] ); - if (check_times(queueStart, submitStart, writeStart, writeEnd, device)) - err = -1; + if (check_times(queueStart, submitStart, writeStart, writeEnd, device)) + err = -1; return err; diff --git a/test_conformance/profiling/writeImage.cpp b/test_conformance/profiling/writeImage.cpp index fbc8fbcd..ec2fbdaf 100644 --- a/test_conformance/profiling/writeImage.cpp +++ b/test_conformance/profiling/writeImage.cpp @@ -628,8 +628,8 @@ int write_image( cl_device_id device, cl_context context, cl_command_queue queue free( dst ); free( inptr ); - if (check_times(queueStart, submitStart, writeStart, writeEnd, device)) - err = -1; + if (check_times(queueStart, submitStart, writeStart, writeEnd, device)) + err = -1; return err; diff --git a/test_conformance/run_conformance.py b/test_conformance/run_conformance.py index ea7f6775..974491e1 100755 --- a/test_conformance/run_conformance.py +++ b/test_conformance/run_conformance.py @@ -8,295 +8,304 @@ #// #******************************************************************/ -import os, re, sys, subprocess, time, commands, tempfile, math, string +from __future__ import print_function + +import os +import re +import sys +import subprocess +import time +import tempfile DEBUG = 0 -log_file_name = "opencl_conformance_results_" + time.strftime("%Y-%m-%d_%H-%M", 
time.localtime())+ ".log" +log_file_name = "opencl_conformance_results_" + time.strftime("%Y-%m-%d_%H-%M", time.localtime()) + ".log" process_pid = 0 # The amount of time between printing a "." (if no output from test) or ":" (if output) # to the screen while the tests are running. -seconds_between_status_updates = 60*60*24*7 # effectively never +seconds_between_status_updates = 60 * 60 * 24 * 7 # effectively never # Help info -def write_help_info() : - print("run_conformance.py test_list [CL_DEVICE_TYPE(s) to test] [partial-test-names, ...] [log=path/to/log/file/]") - print(" test_list - the .csv file containing the test names and commands to run the tests.") - print(" [partial-test-names, ...] - optional partial strings to select a subset of the tests to run.") - print(" [CL_DEVICE_TYPE(s) to test] - list of CL device types to test, default is CL_DEVICE_TYPE_DEFAULT.") - print(" [log=path/to/log/file/] - provide a path for the test log file, default is in the current directory.") - print(" (Note: spaces are not allowed in the log file path.") +def write_help_info(): + print("run_conformance.py test_list [CL_DEVICE_TYPE(s) to test] [partial-test-names, ...] [log=path/to/log/file/]") + print(" test_list - the .csv file containing the test names and commands to run the tests.") + print(" [partial-test-names, ...] 
- optional partial strings to select a subset of the tests to run.") + print(" [CL_DEVICE_TYPE(s) to test] - list of CL device types to test, default is CL_DEVICE_TYPE_DEFAULT.") + print(" [log=path/to/log/file/] - provide a path for the test log file, default is in the current directory.") + print(" (Note: spaces are not allowed in the log file path.") # Get the time formatted nicely -def get_time() : - return time.strftime("%d-%b %H:%M:%S", time.localtime()) +def get_time(): + return time.strftime("%d-%b %H:%M:%S", time.localtime()) + # Write text to the screen and the log file -def write_screen_log(text) : - global log_file - print(text) - log_file.write(text+"\n") +def write_screen_log(text): + global log_file + print(text) + log_file.write(text + "\n") + # Load the tests from a csv formated file of the form name,command def get_tests(filename, devices_to_test): - tests = [] - if (os.path.exists(filename) == False): - print("FAILED: test_list \"" + filename + "\" does not exist.") - print("") - write_help_info() - sys.exit(-1) - file = open(filename, 'r') - for line in file.readlines(): - comment = re.search("^#.*", line) - if (comment): - continue - device_specific_match = re.search("^\s*(.+?)\s*,\s*(.+?)\s*,\s*(.+?)\s*$", line) - if (device_specific_match): - if (device_specific_match.group(1) in devices_to_test): - test_path = string.replace(device_specific_match.group(3), '/', os.sep) - test_name = string.replace(device_specific_match.group(2), '/', os.sep) - tests.append((test_name, test_path)) - else: - print("Skipping " + device_specific_match.group(2) + " because " + device_specific_match.group(1) + " is not in the list of devices to test.") - continue - match = re.search("^\s*(.+?)\s*,\s*(.+?)\s*$", line) - if (match): - test_path = string.replace(match.group(2), '/', os.sep) - test_name = string.replace(match.group(1), '/', os.sep) - tests.append((test_name, test_path)) - return tests + tests = [] + if os.path.exists(filename) == False: + 
print("FAILED: test_list \"" + filename + "\" does not exist.") + print("") + write_help_info() + sys.exit(-1) + file = open(filename, 'r') + for line in file.readlines(): + comment = re.search("^#.*", line) + if comment: + continue + device_specific_match = re.search("^\s*(.+?)\s*,\s*(.+?)\s*,\s*(.+?)\s*$", line) + if device_specific_match: + if device_specific_match.group(1) in devices_to_test: + test_path = str.replace(device_specific_match.group(3), '/', os.sep) + test_name = str.replace(device_specific_match.group(2), '/', os.sep) + tests.append((test_name, test_path)) + else: + print("Skipping " + device_specific_match.group(2) + " because " + device_specific_match.group(1) + " is not in the list of devices to test.") + continue + match = re.search("^\s*(.+?)\s*,\s*(.+?)\s*$", line) + if match: + test_path = str.replace(match.group(2), '/', os.sep) + test_name = str.replace(match.group(1), '/', os.sep) + tests.append((test_name, test_path)) + return tests def run_test_checking_output(current_directory, test_dir, log_file): - global process_pid, seconds_between_status_updates - failures_this_run = 0 - start_time = time.time() - # Create a temporary file for capturing the output from the test - (output_fd, output_name) = tempfile.mkstemp() - if ( not os.path.exists(output_name)) : - write_screen_log("\n ==> ERROR: could not create temporary file %s ." 
% output_name) - os.close(output_fd) - return -1 - # Execute the test - program_to_run = test_dir_without_args = test_dir.split(None, 1)[0] - if ( os.sep == '\\' ) : program_to_run += ".exe" - if (os.path.exists(current_directory + os.sep + program_to_run)) : - os.chdir(os.path.dirname(current_directory+os.sep+test_dir_without_args) ) - try: - if (DEBUG): p = subprocess.Popen("", stderr=subprocess.STDOUT, stdout=subprocess.PIPE, shell=True) - else : p = subprocess.Popen(current_directory + os.sep + test_dir, stderr=output_fd, stdout=output_fd, shell=True) - except OSError: - write_screen_log("\n ==> ERROR: failed to execute test. Failing test. : " + str(OSError)) - os.close(output_fd) - return -1 - else: - write_screen_log("\n ==> ERROR: test file (" + current_directory + os.sep + program_to_run +") does not exist. Failing test.") - os.close(output_fd) - return -1 - # Set the global pid so we can kill it if this is aborted - process_pid = p.pid - # Read one character at a time from the temporary output file while the process is running. - # When we get an end-of-line, look for errors and write the results to the log file. - # This allows us to process the file as it is being produced. 
- # Keep track of the state for reading - # Whether we are done, if we have more to read, and where in the file we last read - done = False - more_to_read = True - pointer = 0 - pointer_at_last_user_update = 0 - output_this_run = False - try: - read_output = open(output_name, 'r') - except IOError: - write_screen_log("\n ==> ERROR: could not open output file from test.") - os.close(output_fd) - return -1 - line = "" - while (not done or more_to_read): - os.fsync(output_fd) - # Determine if we should display some output - elapsed_time = (time.time() - start_time) - if (elapsed_time > seconds_between_status_updates): - start_time = time.time() - # If we've received output from the test since the last update, display a # - if (pointer != pointer_at_last_user_update): - sys.stdout.write(":") - else: - sys.stdout.write(".") - pointer_at_last_user_update = pointer - sys.stdout.flush() - # Check if we're done - p.poll() - if (not done and p.returncode != None): - if (p.returncode < 0): - if (not output_this_run): - print "" - output_this_run = True - write_screen_log(" ==> ERROR: test killed/crashed: " + str(p.returncode)+ ".") - done = True - # Try reading + global process_pid, seconds_between_status_updates + failures_this_run = 0 + start_time = time.time() + # Create a temporary file for capturing the output from the test + (output_fd, output_name) = tempfile.mkstemp() + if not os.path.exists(output_name): + write_screen_log("\n ==> ERROR: could not create temporary file %s ." 
% output_name) + os.close(output_fd) + return -1 + # Execute the test + program_to_run = test_dir_without_args = test_dir.split(None, 1)[0] + if os.sep == '\\': + program_to_run += ".exe" + if os.path.exists(current_directory + os.sep + program_to_run): + os.chdir(os.path.dirname(current_directory + os.sep + test_dir_without_args)) + try: + if DEBUG: p = subprocess.Popen("", stderr=subprocess.STDOUT, stdout=subprocess.PIPE, shell=True) + else: p = subprocess.Popen(current_directory + os.sep + test_dir, stderr=output_fd, stdout=output_fd, shell=True) + except OSError: + write_screen_log("\n ==> ERROR: failed to execute test. Failing test. : " + str(OSError)) + os.close(output_fd) + return -1 + else: + write_screen_log("\n ==> ERROR: test file (" + current_directory + os.sep + program_to_run + ") does not exist. Failing test.") + os.close(output_fd) + return -1 + # Set the global pid so we can kill it if this is aborted + process_pid = p.pid + # Read one character at a time from the temporary output file while the process is running. + # When we get an end-of-line, look for errors and write the results to the log file. + # This allows us to process the file as it is being produced. 
+ # Keep track of the state for reading + # Whether we are done, if we have more to read, and where in the file we last read + done = False + more_to_read = True + pointer = 0 + pointer_at_last_user_update = 0 + output_this_run = False try: - read_output.seek(pointer) - char_read = read_output.read(1) - except IOError: - time.sleep(1) - continue - # If we got a full line then process it - if (char_read == "\n"): - # Look for failures and report them as such - match = re.search(".*(FAILED|ERROR).*", line) - if (match): - if (not output_this_run): - print "" - output_this_run = True - print(" ==> " + line.replace('\n','')) - match = re.search(".*FAILED.*", line) - if (match): - failures_this_run = failures_this_run + 1 - match = re.search(".*(PASSED).*", line) - if (match): - if (not output_this_run): - print "" - output_this_run = True - print(" " + line.replace('\n','')) - # Write it to the log - log_file.write(" " + line +"\n") - log_file.flush() - line = "" - pointer = pointer + 1 - # If we are at the end of the file, then re-open it to get new data - elif (char_read == ""): - more_to_read = False - read_output.close() - time.sleep(1) - try: - os.fsync(output_fd) read_output = open(output_name, 'r') - # See if there is more to read. This happens if the process ends and we have data left. - read_output.seek(pointer) - if (read_output.read(1) != ""): - more_to_read = True - except IOError: - write_screen_log("\n ==> ERROR: could not reopen output file from test.") + except IOError: + write_screen_log("\n ==> ERROR: could not open output file from test.") + os.close(output_fd) return -1 - done = True - else: - line = line + char_read - pointer = pointer + 1 - # Now we are done, so write out any remaining data in the file: - # This should only happen if the process exited with an error. 
- os.fsync(output_fd) - while (read_output.read(1) != ""): - log_file.write(read_output.read(1)) - # Return the total number of failures - if (p.returncode == 0 and failures_this_run > 0): - write_screen_log("\n ==> ERROR: Test returned 0, but number of FAILED lines reported is " + str(failures_this_run) +".") - return failures_this_run - return p.returncode - - -def run_tests(tests) : - global curent_directory - global process_pid - # Run the tests - failures = 0 - previous_test = None - test_number = 1 - for test in tests: - # Print the name of the test we're running and the time - (test_name, test_dir) = test - if (test_dir != previous_test): - print("========== " + test_dir) - log_file.write("========================================================================================\n") - log_file.write("========================================================================================\n") - log_file.write("(" + get_time() + ") Running Tests: " + test_dir +"\n") - log_file.write("========================================================================================\n") - log_file.write("========================================================================================\n") - previous_test = test_dir - print("("+get_time()+") BEGIN " + test_name.ljust(40) +": "), - log_file.write(" ----------------------------------------------------------------------------------------\n") - log_file.write(" (" + get_time() + ") Running Sub Test: " + test_name + "\n") - log_file.write(" ----------------------------------------------------------------------------------------\n") - log_file.flush() - sys.stdout.flush() - - # Run the test - result = 0 - start_time = time.time() - try: - process_pid = 0 - result = run_test_checking_output(current_directory, test_dir, log_file) - except KeyboardInterrupt: - # Catch an interrupt from the user - write_screen_log("\nFAILED: Execution interrupted. 
Killing test process, but not aborting full test run.") - os.kill(process_pid, 9) - answer = raw_input("Abort all tests? (y/n)") - if (answer.find("y") != -1): - write_screen_log("\nUser chose to abort all tests.") - log_file.close() - sys.exit(-1) - else: - write_screen_log("\nUser chose to continue with other tests. Reporting this test as failed.") - result = 1 - run_time = (time.time() - start_time) - - # Move print the finish status - if (result == 0): - print("("+get_time()+") PASSED " + test_name.ljust(40) +": (" + str(int(run_time)).rjust(3) + "s, test " + str(test_number).rjust(3) + os.sep + str(len(tests)) +")"), - else: - print("("+get_time()+") FAILED " + test_name.ljust(40) +": (" + str(int(run_time)).rjust(3) + "s, test " + str(test_number).rjust(3) + os.sep + str(len(tests)) +")"), - - test_number = test_number + 1 - log_file.write(" ----------------------------------------------------------------------------------------\n") - log_file.flush() - - print("") - if (result != 0): - log_file.write(" *******************************************************************************************\n") - log_file.write(" * ("+get_time()+") Test " + test_name + " ==> FAILED: " + str(result)+"\n") - log_file.write(" *******************************************************************************************\n") - failures = failures + 1 - else: - log_file.write(" ("+get_time()+") Test " + test_name +" passed in " + str(run_time) + "s\n") - - log_file.write(" ----------------------------------------------------------------------------------------\n") - log_file.write("\n") - return failures - - - + line = "" + while not done or more_to_read: + os.fsync(output_fd) + # Determine if we should display some output + elapsed_time = (time.time() - start_time) + if elapsed_time > seconds_between_status_updates: + start_time = time.time() + # If we've received output from the test since the last update, display a # + if pointer != pointer_at_last_user_update: + 
sys.stdout.write(":") + else: + sys.stdout.write(".") + pointer_at_last_user_update = pointer + sys.stdout.flush() + # Check if we're done + p.poll() + if not done and p.returncode != None: + if p.returncode < 0: + if not output_this_run: + print("") + output_this_run = True + write_screen_log(" ==> ERROR: test killed/crashed: " + str(p.returncode) + ".") + done = True + # Try reading + try: + read_output.seek(pointer) + char_read = read_output.read(1) + except IOError: + time.sleep(1) + continue + # If we got a full line then process it + if char_read == "\n": + # Look for failures and report them as such + match = re.search(".*(FAILED|ERROR).*", line) + if match: + if not output_this_run: + print("") + output_this_run = True + print(" ==> " + line.replace('\n', '')) + match = re.search(".*FAILED.*", line) + if match: + failures_this_run = failures_this_run + 1 + match = re.search(".*(PASSED).*", line) + if match: + if not output_this_run: + print("") + output_this_run = True + print(" " + line.replace('\n', '')) + # Write it to the log + log_file.write(" " + line + "\n") + log_file.flush() + line = "" + pointer = pointer + 1 + # If we are at the end of the file, then re-open it to get new data + elif char_read == "": + more_to_read = False + read_output.close() + time.sleep(1) + try: + os.fsync(output_fd) + read_output = open(output_name, 'r') + # See if there is more to read. This happens if the process ends and we have data left. + read_output.seek(pointer) + if read_output.read(1) != "": + more_to_read = True + except IOError: + write_screen_log("\n ==> ERROR: could not reopen output file from test.") + return -1 + else: + line = line + char_read + pointer = pointer + 1 + # Now we are done, so write out any remaining data in the file: + # This should only happen if the process exited with an error. 
+ os.fsync(output_fd) + while read_output.read(1) != "": + log_file.write(read_output.read(1)) + # Return the total number of failures + if (p.returncode == 0 and failures_this_run > 0): + write_screen_log("\n ==> ERROR: Test returned 0, but number of FAILED lines reported is " + str(failures_this_run) + ".") + return failures_this_run + return p.returncode + + +def run_tests(tests): + global curent_directory + global process_pid + # Run the tests + failures = 0 + previous_test = None + test_number = 1 + for test in tests: + # Print the name of the test we're running and the time + (test_name, test_dir) = test + if test_dir != previous_test: + print("========== " + test_dir) + log_file.write("========================================================================================\n") + log_file.write("========================================================================================\n") + log_file.write("(" + get_time() + ") Running Tests: " + test_dir + "\n") + log_file.write("========================================================================================\n") + log_file.write("========================================================================================\n") + previous_test = test_dir + print("(" + get_time() + ") BEGIN " + test_name.ljust(40) + ": ", end='') + log_file.write(" ----------------------------------------------------------------------------------------\n") + log_file.write(" (" + get_time() + ") Running Sub Test: " + test_name + "\n") + log_file.write(" ----------------------------------------------------------------------------------------\n") + log_file.flush() + sys.stdout.flush() + + # Run the test + result = 0 + start_time = time.time() + try: + process_pid = 0 + result = run_test_checking_output(current_directory, test_dir, log_file) + except KeyboardInterrupt: + # Catch an interrupt from the user + write_screen_log("\nFAILED: Execution interrupted. 
Killing test process, but not aborting full test run.") + os.kill(process_pid, 9) + if sys.version_info[0] < 3: + answer = raw_input("Abort all tests? (y/n)") + else: + answer = input("Abort all tests? (y/n)") + if answer.find("y") != -1: + write_screen_log("\nUser chose to abort all tests.") + log_file.close() + sys.exit(-1) + else: + write_screen_log("\nUser chose to continue with other tests. Reporting this test as failed.") + result = 1 + run_time = (time.time() - start_time) + + # Move print the finish status + if result == 0: + print("(" + get_time() + ") PASSED " + test_name.ljust(40) + ": (" + str(int(run_time)).rjust(3) + "s, test " + str(test_number).rjust(3) + os.sep + str(len(tests)) + ")", end='') + else: + print("(" + get_time() + ") FAILED " + test_name.ljust(40) + ": (" + str(int(run_time)).rjust(3) + "s, test " + str(test_number).rjust(3) + os.sep + str(len(tests)) + ")", end='') + + test_number = test_number + 1 + log_file.write(" ----------------------------------------------------------------------------------------\n") + log_file.flush() + + print("") + if result != 0: + log_file.write(" *******************************************************************************************\n") + log_file.write(" * (" + get_time() + ") Test " + test_name + " ==> FAILED: " + str(result) + "\n") + log_file.write(" *******************************************************************************************\n") + failures = failures + 1 + else: + log_file.write(" (" + get_time() + ") Test " + test_name + " passed in " + str(run_time) + "s\n") + + log_file.write(" ----------------------------------------------------------------------------------------\n") + log_file.write("\n") + return failures # ######################## # Begin OpenCL conformance run script # ######################## -if (len(sys.argv) < 2): - write_help_info() - sys.exit(-1) - +if len(sys.argv) < 2: + write_help_info() + sys.exit(-1) current_directory = os.getcwd() # Open the log file for arg 
in sys.argv: - match = re.search("log=(\S+)", arg) - if (match): - log_file_name = match.group(1).rstrip('/') + os.sep + log_file_name + match = re.search("log=(\S+)", arg) + if match: + log_file_name = match.group(1).rstrip('/') + os.sep + log_file_name try: - log_file = open(log_file_name, "w") + log_file = open(log_file_name, "w") except IOError: - print "Could not open log file " + log_file_name + print("Could not open log file " + log_file_name) + sys.exit(-1) # Determine which devices to test device_types = ["CL_DEVICE_TYPE_DEFAULT", "CL_DEVICE_TYPE_CPU", "CL_DEVICE_TYPE_GPU", "CL_DEVICE_TYPE_ACCELERATOR", "CL_DEVICE_TYPE_ALL"] devices_to_test = [] for device in device_types: - if device in sys.argv[2:]: - devices_to_test.append(device) -if (len(devices_to_test) == 0): - devices_to_test = ["CL_DEVICE_TYPE_DEFAULT"] + if device in sys.argv[2:]: + devices_to_test.append(device) +if len(devices_to_test) == 0: + devices_to_test = ["CL_DEVICE_TYPE_DEFAULT"] write_screen_log("Testing on: " + str(devices_to_test)) # Get the tests @@ -306,52 +315,52 @@ tests = get_tests(sys.argv[1], devices_to_test) tests_to_use = [] num_of_patterns_to_match = 0 for arg in sys.argv[2:]: - if arg in device_types: - continue - if re.search("log=(\S+)", arg): - continue - num_of_patterns_to_match = num_of_patterns_to_match + 1 - found_it = False - for test in tests: - (test_name, test_dir) = test - if (test_name.find(arg) != -1 or test_dir.find(arg) != -1): - found_it = True - if (test not in tests_to_use): - tests_to_use.append(test) - if (found_it == False): - print("Failed to find a test matching " + arg) -if (len(tests_to_use) == 0): - if (num_of_patterns_to_match > 0): - print("FAILED: Failed to find any tests matching the given command-line options.") - print("") - write_help_info() - sys.exit(-1) + if arg in device_types: + continue + if re.search("log=(\S+)", arg): + continue + num_of_patterns_to_match = num_of_patterns_to_match + 1 + found_it = False + for test in tests: + 
(test_name, test_dir) = test + if (test_name.find(arg) != -1 or test_dir.find(arg) != -1): + found_it = True + if test not in tests_to_use: + tests_to_use.append(test) + if found_it == False: + print("Failed to find a test matching " + arg) +if len(tests_to_use) == 0: + if num_of_patterns_to_match > 0: + print("FAILED: Failed to find any tests matching the given command-line options.") + print("") + write_help_info() + sys.exit(-1) else: - tests = tests_to_use[:] + tests = tests_to_use[:] write_screen_log("Test execution arguments: " + str(sys.argv)) -write_screen_log("Logging to file " + log_file_name +".") +write_screen_log("Logging to file " + log_file_name + ".") write_screen_log("Loaded tests from " + sys.argv[1] + ", total of " + str(len(tests)) + " tests selected to run:") for (test_name, test_command) in tests: - write_screen_log(test_name.ljust(50) + " (" + test_command +")") + write_screen_log(test_name.ljust(50) + " (" + test_command + ")") # Run the tests total_failures = 0 for device_to_test in devices_to_test: - os.environ['CL_DEVICE_TYPE'] = device_to_test - write_screen_log("========================================================================================") - write_screen_log("========================================================================================") - write_screen_log(("Setting CL_DEVICE_TYPE to " + device_to_test).center(90)) - write_screen_log("========================================================================================") - write_screen_log("========================================================================================") - failures = run_tests(tests) - write_screen_log("========================================================================================") - if (failures == 0): - write_screen_log(">> TEST on " + device_to_test + " PASSED") - else: - write_screen_log(">> TEST on " + device_to_test + " FAILED (" + str(failures) + " FAILURES)") - 
write_screen_log("========================================================================================") - total_failures = total_failures + failures - -write_screen_log("("+get_time()+") Testing complete. " + str(total_failures) + " failures for " + str(len(tests)) + " tests.") + os.environ['CL_DEVICE_TYPE'] = device_to_test + write_screen_log("========================================================================================") + write_screen_log("========================================================================================") + write_screen_log(("Setting CL_DEVICE_TYPE to " + device_to_test).center(90)) + write_screen_log("========================================================================================") + write_screen_log("========================================================================================") + failures = run_tests(tests) + write_screen_log("========================================================================================") + if failures == 0: + write_screen_log(">> TEST on " + device_to_test + " PASSED") + else: + write_screen_log(">> TEST on " + device_to_test + " FAILED (" + str(failures) + " FAILURES)") + write_screen_log("========================================================================================") + total_failures = total_failures + failures + +write_screen_log("(" + get_time() + ") Testing complete. " + str(total_failures) + " failures for " + str(len(tests)) + " tests.") log_file.close() diff --git a/test_conformance/select/test_select.cpp b/test_conformance/select/test_select.cpp index 35f154ac..972a53c6 100644 --- a/test_conformance/select/test_select.cpp +++ b/test_conformance/select/test_select.cpp @@ -1,6 +1,6 @@ // // Copyright (c) 2017 The Khronos Group Inc. -// +// // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
// You may obtain a copy of the License at @@ -79,7 +79,6 @@ static int s_wimpy_reduction_factor = 256; // sub tests which is for each individual test. The following // tracks the subtests int s_test_cnt = 0; -int s_test_fail = 0; //----------------------------------------- // Static helper functions @@ -174,8 +173,6 @@ static cl_program makeSelectProgram(cl_kernel *kernel_ptr, const cl_context cont char extension[128] = ""; int err = 0; - int i; // generic, re-usable loop variable - const char *source[] = { extension, "__kernel void ", testname, @@ -297,6 +294,7 @@ static cl_program makeSelectProgram(cl_kernel *kernel_ptr, const cl_context cont static int doTest(cl_command_queue queue, cl_context context, Type stype, Type cmptype, cl_device_id device) { int err = CL_SUCCESS; + int s_test_fail = 0; MTdata d; const size_t element_count[VECTOR_SIZE_COUNT] = { 1, 2, 3, 4, 8, 16 }; cl_mem src1 = NULL; @@ -468,6 +466,11 @@ exit: clReleaseProgram(programs[vecsize]); } ++s_test_cnt; + if (s_test_fail) + { + err = TEST_FAIL; + gFailCount++; + } return err; } @@ -636,7 +639,6 @@ int main(int argc, const char* argv[]) s_wimpy_mode = true; } - log_info( "Test binary built %s %s\n", __DATE__, __TIME__ ); if (s_wimpy_mode) { log_info("\n"); log_info("*** WARNING: Testing in Wimpy mode! 
***\n"); @@ -665,4 +667,3 @@ static void printUsage( void ) log_info( "\t%s\n", test_list[i].name ); } } - diff --git a/test_conformance/spir/main.cpp b/test_conformance/spir/main.cpp index 3a18988c..06caf33b 100644 --- a/test_conformance/spir/main.cpp +++ b/test_conformance/spir/main.cpp @@ -6615,40 +6615,45 @@ struct sub_suite }; static const sub_suite spir_suites[] = { - {"api", "api", test_api}, - {"api_double", "api", test_api_double}, - {"atomics", "atomics", test_atomics}, - {"basic", "basic", test_basic}, - {"basic_double", "basic", test_basic_double}, - {"commonfns", "commonfns", test_commonfns}, - {"commonfns_double", "commonfns", test_commonfns_double}, - {"conversions", "conversions", test_conversions}, - {"conversions_double", "conversions", test_conversions_double}, - {"geometrics", "geometrics", test_geometrics}, - {"geometrics_double", "geometrics", test_geometrics_double}, - {"half", "half", test_half}, - {"half_double", "half", test_half_double}, - {"kernel_image_methods", "kernel_image_methods", test_kernel_image_methods}, - {"images_kernel_read_write", "images_kernel_read_write", test_images_kernel_read_write}, - {"images_samplerlessRead", "images_samplerlessRead", test_images_samplerless_read}, - {"integer_ops", "integer_ops", test_integer_ops}, - {"math_brute_force", "math_brute_force", test_math_brute_force}, - {"math_brute_force_double", "math_brute_force", test_math_brute_force_double}, - {"printf", "printf", test_printf}, - {"profiling", "profiling", test_profiling}, - {"relationals", "relationals", test_relationals}, - {"relationals_double", "relationals", test_relationals_double}, - {"select", "select", test_select}, - {"select_double", "select", test_select_double}, - {"vec_align", "vec_align", test_vec_align}, - {"vec_align_double", "vec_align", test_vec_align_double}, - {"vec_step", "vec_step", test_vec_step}, - {"vec_step_double", "vec_step", test_vec_step_double}, - {"compile_and_link", "compile_and_link", test_compile_and_link}, - 
{"sampler_enumeration", "sampler_enumeration", test_sampler_enumeration}, - {"enum_values", "enum_values", test_enum_values}, - {"kernel_attributes", "kernel_attributes", test_kernel_attributes}, - {"binary_type", "binary_type", test_binary_type}, + { "api", "api", test_api }, + { "api_double", "api", test_api_double }, + { "atomics", "atomics", test_atomics }, + { "basic", "basic", test_basic }, + { "basic_double", "basic", test_basic_double }, + { "commonfns", "commonfns", test_commonfns }, + { "commonfns_double", "commonfns", test_commonfns_double }, + { "conversions", "conversions", test_conversions }, + { "conversions_double", "conversions", test_conversions_double }, + { "geometrics", "geometrics", test_geometrics }, + { "geometrics_double", "geometrics", test_geometrics_double }, + { "half", "half", test_half }, + { "half_double", "half", test_half_double }, + { "kernel_image_methods", "kernel_image_methods", + test_kernel_image_methods }, + { "images_kernel_read_write", "images_kernel_read_write", + test_images_kernel_read_write }, + { "images_samplerlessRead", "images_samplerlessRead", + test_images_samplerless_read }, + { "integer_ops", "integer_ops", test_integer_ops }, + { "math_brute_force", "math_brute_force", test_math_brute_force }, + { "math_brute_force_double", "math_brute_force", + test_math_brute_force_double }, + { "printf", "printf", test_printf }, + { "profiling", "profiling", test_profiling }, + { "relationals", "relationals", test_relationals }, + { "relationals_double", "relationals", test_relationals_double }, + { "select", "select", test_select }, + { "select_double", "select", test_select_double }, + { "vec_align", "vec_align", test_vec_align }, + { "vec_align_double", "vec_align", test_vec_align_double }, + { "vec_step", "vec_step", test_vec_step }, + { "vec_step_double", "vec_step", test_vec_step_double }, + { "compile_and_link", "compile_and_link", test_compile_and_link }, + { "sampler_enumeration", "sampler_enumeration", 
test_sampler_enumeration }, + { "enum_values", "enum_values", test_enum_values }, + // {"kernel_attributes", "kernel_attributes", + // test_kernel_attributes}, // disabling temporarily, see GitHub #1284 + { "binary_type", "binary_type", test_binary_type }, }; diff --git a/test_conformance/spir/run_services.cpp b/test_conformance/spir/run_services.cpp index 06fc418d..6e06d53c 100644 --- a/test_conformance/spir/run_services.cpp +++ b/test_conformance/spir/run_services.cpp @@ -213,7 +213,6 @@ cl_kernel create_kernel_helper( cl_program program, const std::string& kernel_na { int error = CL_SUCCESS; cl_kernel kernel = NULL; - cl_device_id device = get_program_device(program); /* And create a kernel from it */ kernel = clCreateKernel( program, kernel_name.c_str(), &error ); if( kernel == NULL || error != CL_SUCCESS) @@ -389,6 +388,7 @@ OclExtensions OclExtensions::getDeviceCapabilities(cl_device_id devId) { ret = ret | OclExtensions::fromString(*it); } + return ret; } @@ -399,75 +399,80 @@ OclExtensions OclExtensions::empty() OclExtensions OclExtensions::fromString(const std::string& e) { - std::string s = "OclExtensions::" + e; - RETURN_IF_ENUM(s, OclExtensions::cl_khr_int64_base_atomics); - RETURN_IF_ENUM(s, OclExtensions::cl_khr_int64_extended_atomics); - RETURN_IF_ENUM(s, OclExtensions::cl_khr_3d_image_writes); - RETURN_IF_ENUM(s, OclExtensions::cl_khr_fp16); - RETURN_IF_ENUM(s, OclExtensions::cl_khr_gl_sharing); - RETURN_IF_ENUM(s, OclExtensions::cl_khr_gl_event); - RETURN_IF_ENUM(s, OclExtensions::cl_khr_d3d10_sharing); - RETURN_IF_ENUM(s, OclExtensions::cl_khr_dx9_media_sharing); - RETURN_IF_ENUM(s, OclExtensions::cl_khr_d3d11_sharing); - RETURN_IF_ENUM(s, OclExtensions::cl_khr_depth_images); - RETURN_IF_ENUM(s, OclExtensions::cl_khr_gl_depth_images); - RETURN_IF_ENUM(s, OclExtensions::cl_khr_gl_msaa_sharing); - RETURN_IF_ENUM(s, OclExtensions::cl_khr_image2d_from_buffer); - RETURN_IF_ENUM(s, OclExtensions::cl_khr_initialize_memory); - RETURN_IF_ENUM(s, 
OclExtensions::cl_khr_spir); - RETURN_IF_ENUM(s, OclExtensions::cl_khr_fp64); - RETURN_IF_ENUM(s, OclExtensions::cl_khr_global_int32_base_atomics); - RETURN_IF_ENUM(s, OclExtensions::cl_khr_global_int32_extended_atomics); - RETURN_IF_ENUM(s, OclExtensions::cl_khr_local_int32_base_atomics); - RETURN_IF_ENUM(s, OclExtensions::cl_khr_local_int32_extended_atomics); - RETURN_IF_ENUM(s, OclExtensions::cl_khr_byte_addressable_store); - RETURN_IF_ENUM(s, OclExtensions::cles_khr_int64); - RETURN_IF_ENUM(s, OclExtensions::cles_khr_2d_image_array_writes); + std::string s = "OclExtensions::has_" + e; + RETURN_IF_ENUM(s, OclExtensions::has_cl_khr_int64_base_atomics); + RETURN_IF_ENUM(s, OclExtensions::has_cl_khr_int64_extended_atomics); + RETURN_IF_ENUM(s, OclExtensions::has_cl_khr_3d_image_writes); + RETURN_IF_ENUM(s, OclExtensions::has_cl_khr_fp16); + RETURN_IF_ENUM(s, OclExtensions::has_cl_khr_gl_sharing); + RETURN_IF_ENUM(s, OclExtensions::has_cl_khr_gl_event); + RETURN_IF_ENUM(s, OclExtensions::has_cl_khr_d3d10_sharing); + RETURN_IF_ENUM(s, OclExtensions::has_cl_khr_dx9_media_sharing); + RETURN_IF_ENUM(s, OclExtensions::has_cl_khr_d3d11_sharing); + RETURN_IF_ENUM(s, OclExtensions::has_cl_khr_depth_images); + RETURN_IF_ENUM(s, OclExtensions::has_cl_khr_gl_depth_images); + RETURN_IF_ENUM(s, OclExtensions::has_cl_khr_gl_msaa_sharing); + RETURN_IF_ENUM(s, OclExtensions::has_cl_khr_image2d_from_buffer); + RETURN_IF_ENUM(s, OclExtensions::has_cl_khr_initialize_memory); + RETURN_IF_ENUM(s, OclExtensions::has_cl_khr_spir); + RETURN_IF_ENUM(s, OclExtensions::has_cl_khr_fp64); + RETURN_IF_ENUM(s, OclExtensions::has_cl_khr_global_int32_base_atomics); + RETURN_IF_ENUM(s, OclExtensions::has_cl_khr_global_int32_extended_atomics); + RETURN_IF_ENUM(s, OclExtensions::has_cl_khr_local_int32_base_atomics); + RETURN_IF_ENUM(s, OclExtensions::has_cl_khr_local_int32_extended_atomics); + RETURN_IF_ENUM(s, OclExtensions::has_cl_khr_byte_addressable_store); + RETURN_IF_ENUM(s, 
OclExtensions::has_cles_khr_int64); + RETURN_IF_ENUM(s, OclExtensions::has_cles_khr_2d_image_array_writes); // Unknown KHR string. return OclExtensions::empty(); } std::string OclExtensions::toString() { - - #define APPEND_STR_IF_SUPPORTS( STR, E) \ - if ( this->supports(E) ) \ - { \ - std::string ext_str( #E ); \ - std::string prefix = "OclExtensions::"; \ - size_t pos = ext_str.find( prefix ); \ - if ( pos != std::string::npos ) \ - { \ - ext_str.replace( pos, prefix.length(), ""); \ - } \ - STR += ext_str; \ - } +#define APPEND_STR_IF_SUPPORTS(STR, E) \ + if (this->supports(E)) \ + { \ + std::string ext_str(#E); \ + std::string prefix = "OclExtensions::has_"; \ + size_t pos = ext_str.find(prefix); \ + if (pos != std::string::npos) \ + { \ + ext_str.replace(pos, prefix.length(), ""); \ + } \ + STR += ext_str; \ + STR += " "; \ + } std::string s = ""; - APPEND_STR_IF_SUPPORTS( s, OclExtensions::cl_khr_int64_base_atomics ); - APPEND_STR_IF_SUPPORTS( s, OclExtensions::cl_khr_int64_extended_atomics ); - APPEND_STR_IF_SUPPORTS( s, OclExtensions::cl_khr_3d_image_writes ); - APPEND_STR_IF_SUPPORTS( s, OclExtensions::cl_khr_fp16 ); - APPEND_STR_IF_SUPPORTS( s, OclExtensions::cl_khr_gl_sharing ); - APPEND_STR_IF_SUPPORTS( s, OclExtensions::cl_khr_gl_event ); - APPEND_STR_IF_SUPPORTS( s, OclExtensions::cl_khr_d3d10_sharing ); - APPEND_STR_IF_SUPPORTS( s, OclExtensions::cl_khr_dx9_media_sharing ); - APPEND_STR_IF_SUPPORTS( s, OclExtensions::cl_khr_d3d11_sharing ); - APPEND_STR_IF_SUPPORTS( s, OclExtensions::cl_khr_depth_images ); - APPEND_STR_IF_SUPPORTS( s, OclExtensions::cl_khr_gl_depth_images ); - APPEND_STR_IF_SUPPORTS( s, OclExtensions::cl_khr_gl_msaa_sharing ); - APPEND_STR_IF_SUPPORTS( s, OclExtensions::cl_khr_image2d_from_buffer ); - APPEND_STR_IF_SUPPORTS( s, OclExtensions::cl_khr_initialize_memory ); - APPEND_STR_IF_SUPPORTS( s, OclExtensions::cl_khr_spir ); - APPEND_STR_IF_SUPPORTS( s, OclExtensions::cl_khr_fp64 ); - APPEND_STR_IF_SUPPORTS( s, 
OclExtensions::cl_khr_global_int32_base_atomics ); - APPEND_STR_IF_SUPPORTS( s, OclExtensions::cl_khr_global_int32_extended_atomics ); - APPEND_STR_IF_SUPPORTS( s, OclExtensions::cl_khr_local_int32_base_atomics ); - APPEND_STR_IF_SUPPORTS( s, OclExtensions::cl_khr_local_int32_extended_atomics ); - APPEND_STR_IF_SUPPORTS( s, OclExtensions::cl_khr_byte_addressable_store ); - APPEND_STR_IF_SUPPORTS( s, OclExtensions::cles_khr_int64 ); - APPEND_STR_IF_SUPPORTS( s, OclExtensions::cles_khr_2d_image_array_writes ); + APPEND_STR_IF_SUPPORTS(s, OclExtensions::has_cl_khr_int64_base_atomics); + APPEND_STR_IF_SUPPORTS(s, OclExtensions::has_cl_khr_int64_extended_atomics); + APPEND_STR_IF_SUPPORTS(s, OclExtensions::has_cl_khr_3d_image_writes); + APPEND_STR_IF_SUPPORTS(s, OclExtensions::has_cl_khr_fp16); + APPEND_STR_IF_SUPPORTS(s, OclExtensions::has_cl_khr_gl_sharing); + APPEND_STR_IF_SUPPORTS(s, OclExtensions::has_cl_khr_gl_event); + APPEND_STR_IF_SUPPORTS(s, OclExtensions::has_cl_khr_d3d10_sharing); + APPEND_STR_IF_SUPPORTS(s, OclExtensions::has_cl_khr_dx9_media_sharing); + APPEND_STR_IF_SUPPORTS(s, OclExtensions::has_cl_khr_d3d11_sharing); + APPEND_STR_IF_SUPPORTS(s, OclExtensions::has_cl_khr_depth_images); + APPEND_STR_IF_SUPPORTS(s, OclExtensions::has_cl_khr_gl_depth_images); + APPEND_STR_IF_SUPPORTS(s, OclExtensions::has_cl_khr_gl_msaa_sharing); + APPEND_STR_IF_SUPPORTS(s, OclExtensions::has_cl_khr_image2d_from_buffer); + APPEND_STR_IF_SUPPORTS(s, OclExtensions::has_cl_khr_initialize_memory); + APPEND_STR_IF_SUPPORTS(s, OclExtensions::has_cl_khr_spir); + APPEND_STR_IF_SUPPORTS(s, OclExtensions::has_cl_khr_fp64); + APPEND_STR_IF_SUPPORTS(s, + OclExtensions::has_cl_khr_global_int32_base_atomics); + APPEND_STR_IF_SUPPORTS( + s, OclExtensions::has_cl_khr_global_int32_extended_atomics); + APPEND_STR_IF_SUPPORTS(s, + OclExtensions::has_cl_khr_local_int32_base_atomics); + APPEND_STR_IF_SUPPORTS( + s, OclExtensions::has_cl_khr_local_int32_extended_atomics); + 
APPEND_STR_IF_SUPPORTS(s, OclExtensions::has_cl_khr_byte_addressable_store); + APPEND_STR_IF_SUPPORTS(s, OclExtensions::has_cles_khr_int64); + APPEND_STR_IF_SUPPORTS(s, + OclExtensions::has_cles_khr_2d_image_array_writes); return s; } diff --git a/test_conformance/spir/run_services.h b/test_conformance/spir/run_services.h index 6bac4c91..10f0d05e 100644 --- a/test_conformance/spir/run_services.h +++ b/test_conformance/spir/run_services.h @@ -113,42 +113,33 @@ private: OclExtensions(size_t ext) : m_extVector(ext) {} -// Fix a compilation error, since cl_khr_gl_sharing is defined as a macro. -#ifdef cl_khr_gl_sharing -#undef cl_khr_gl_sharing -#endif//cl_khr_gl_sharing - -#ifdef cl_khr_icd -#undef cl_khr_icd -#endif//cl_khr_icd - enum ClKhrs { - no_extensions = KhrValue<0>::Mask, - cl_khr_int64_base_atomics = KhrValue<1>::Mask, - cl_khr_int64_extended_atomics = KhrValue<2>::Mask, - cl_khr_3d_image_writes = KhrValue<3>::Mask, - cl_khr_fp16 = KhrValue<4>::Mask, - cl_khr_gl_sharing = KhrValue<5>::Mask, - cl_khr_gl_event = KhrValue<6>::Mask, - cl_khr_d3d10_sharing = KhrValue<7>::Mask, - cl_khr_dx9_media_sharing = KhrValue<8>::Mask, - cl_khr_d3d11_sharing = KhrValue<9>::Mask, - cl_khr_depth_images = KhrValue<10>::Mask, - cl_khr_gl_depth_images = KhrValue<11>::Mask, - cl_khr_gl_msaa_sharing = KhrValue<12>::Mask, - cl_khr_image2d_from_buffer = KhrValue<13>::Mask, - cl_khr_initialize_memory = KhrValue<14>::Mask, - cl_khr_context_abort = KhrValue<15>::Mask, - cl_khr_spir = KhrValue<16>::Mask, - cl_khr_fp64 = KhrValue<17>::Mask, - cl_khr_global_int32_base_atomics = KhrValue<18>::Mask, - cl_khr_global_int32_extended_atomics = KhrValue<19>::Mask, - cl_khr_local_int32_base_atomics = KhrValue<20>::Mask, - cl_khr_local_int32_extended_atomics = KhrValue<21>::Mask, - cl_khr_byte_addressable_store = KhrValue<22>::Mask, - cles_khr_int64 = KhrValue<23>::Mask, - cles_khr_2d_image_array_writes = KhrValue<24>::Mask, + no_extensions = KhrValue<0>::Mask, + has_cl_khr_int64_base_atomics = 
KhrValue<1>::Mask, + has_cl_khr_int64_extended_atomics = KhrValue<2>::Mask, + has_cl_khr_3d_image_writes = KhrValue<3>::Mask, + has_cl_khr_fp16 = KhrValue<4>::Mask, + has_cl_khr_gl_sharing = KhrValue<5>::Mask, + has_cl_khr_gl_event = KhrValue<6>::Mask, + has_cl_khr_d3d10_sharing = KhrValue<7>::Mask, + has_cl_khr_dx9_media_sharing = KhrValue<8>::Mask, + has_cl_khr_d3d11_sharing = KhrValue<9>::Mask, + has_cl_khr_depth_images = KhrValue<10>::Mask, + has_cl_khr_gl_depth_images = KhrValue<11>::Mask, + has_cl_khr_gl_msaa_sharing = KhrValue<12>::Mask, + has_cl_khr_image2d_from_buffer = KhrValue<13>::Mask, + has_cl_khr_initialize_memory = KhrValue<14>::Mask, + has_cl_khr_context_abort = KhrValue<15>::Mask, + has_cl_khr_spir = KhrValue<16>::Mask, + has_cl_khr_fp64 = KhrValue<17>::Mask, + has_cl_khr_global_int32_base_atomics = KhrValue<18>::Mask, + has_cl_khr_global_int32_extended_atomics = KhrValue<19>::Mask, + has_cl_khr_local_int32_base_atomics = KhrValue<20>::Mask, + has_cl_khr_local_int32_extended_atomics = KhrValue<21>::Mask, + has_cl_khr_byte_addressable_store = KhrValue<22>::Mask, + has_cles_khr_int64 = KhrValue<23>::Mask, + has_cles_khr_2d_image_array_writes = KhrValue<24>::Mask, }; size_t m_extVector; diff --git a/test_conformance/spir/sampler_enumeration.zip b/test_conformance/spir/sampler_enumeration.zip Binary files differindex 5f8a7a06..ab9c9a56 100644 --- a/test_conformance/spir/sampler_enumeration.zip +++ b/test_conformance/spir/sampler_enumeration.zip diff --git a/test_conformance/spirv_new/main.cpp b/test_conformance/spirv_new/main.cpp index 5a8664b6..41566837 100644 --- a/test_conformance/spirv_new/main.cpp +++ b/test_conformance/spirv_new/main.cpp @@ -203,7 +203,6 @@ int get_program_with_il(clProgramWrapper &prog, const cl_device_id deviceID, test_status InitCL(cl_device_id id) { test_status spirv_status; - bool force = true; spirv_status = check_spirv_compilation_readiness(id); if (spirv_status != TEST_PASS) { diff --git 
a/test_conformance/spirv_new/test_cl_khr_spirv_no_integer_wrap_decoration.cpp b/test_conformance/spirv_new/test_cl_khr_spirv_no_integer_wrap_decoration.cpp index 9e1789c2..0728ea03 100644 --- a/test_conformance/spirv_new/test_cl_khr_spirv_no_integer_wrap_decoration.cpp +++ b/test_conformance/spirv_new/test_cl_khr_spirv_no_integer_wrap_decoration.cpp @@ -1,219 +1,218 @@ -/******************************************************************
-Copyright (c) 2018 The Khronos Group Inc. All Rights Reserved.
-
-This code is protected by copyright laws and contains material proprietary to the Khronos Group, Inc.
-This is UNPUBLISHED PROPRIETARY SOURCE CODE that may not be disclosed in whole or in part to
-third parties, and may not be reproduced, republished, distributed, transmitted, displayed,
-broadcast or otherwise exploited in any manner without the express prior written permission
-of Khronos Group. The receipt or possession of this code does not convey any rights to reproduce,
-disclose, or distribute its contents, or to manufacture, use, or sell anything that it may describe,
-in whole or in part other than under the terms of the Khronos Adopters Agreement
-or Khronos Conformance Test Source License Agreement as executed between Khronos and the recipient.
-******************************************************************/
-
-#include "testBase.h"
-#include "types.hpp"
-
-#include <sstream>
-#include <string>
-#include <type_traits>
-
-
-template<typename T>
-int test_ext_cl_khr_spirv_no_integer_wrap_decoration(cl_device_id deviceID,
- cl_context context,
- cl_command_queue queue,
- const char *spvName,
- const char *funcName,
- const char *Tname)
-{
-
- cl_int err = CL_SUCCESS;
- const int num = 10;
- std::vector<T> h_lhs(num);
- std::vector<T> h_rhs(num);
- std::vector<T> expected_results(num);
- std::vector<T> h_ref(num);
- if (!is_extension_available(deviceID, "cl_khr_spirv_no_integer_wrap_decoration")) {
- log_info("Extension cl_khr_spirv_no_integer_wrap_decoration not supported; skipping tests.\n");
- return 0;
- }
-
- /*Test with some values that do not cause overflow*/
- if (std::is_signed<T>::value == true) {
- h_lhs.push_back((T)-25000);
- h_lhs.push_back((T)-3333);
- h_lhs.push_back((T)-7);
- h_lhs.push_back((T)-1);
- h_lhs.push_back(0);
- h_lhs.push_back(1);
- h_lhs.push_back(1024);
- h_lhs.push_back(2048);
- h_lhs.push_back(4094);
- h_lhs.push_back(10000);
- } else {
- h_lhs.push_back(0);
- h_lhs.push_back(1);
- h_lhs.push_back(3);
- h_lhs.push_back(5);
- h_lhs.push_back(10);
- h_lhs.push_back(100);
- h_lhs.push_back(1024);
- h_lhs.push_back(2048);
- h_lhs.push_back(4094);
- h_lhs.push_back(52888);
- }
-
- h_rhs.push_back(0);
- h_rhs.push_back(1);
- h_rhs.push_back(2);
- h_rhs.push_back(3);
- h_rhs.push_back(4);
- h_rhs.push_back(5);
- h_rhs.push_back(6);
- h_rhs.push_back(7);
- h_rhs.push_back(8);
- h_rhs.push_back(9);
- size_t bytes = num * sizeof(T);
-
- clMemWrapper lhs = clCreateBuffer(context, CL_MEM_READ_ONLY, bytes, NULL, &err);
- SPIRV_CHECK_ERROR(err, "Failed to create lhs buffer");
-
- err = clEnqueueWriteBuffer(queue, lhs, CL_TRUE, 0, bytes, &h_lhs[0], 0, NULL, NULL);
- SPIRV_CHECK_ERROR(err, "Failed to copy to lhs buffer");
-
- clMemWrapper rhs = clCreateBuffer(context, CL_MEM_READ_ONLY, bytes, NULL, &err);
- SPIRV_CHECK_ERROR(err, "Failed to create rhs buffer");
-
- err = clEnqueueWriteBuffer(queue, rhs, CL_TRUE, 0, bytes, &h_rhs[0], 0, NULL, NULL);
- SPIRV_CHECK_ERROR(err, "Failed to copy to rhs buffer");
-
- std::string kernelStr;
-
- {
- std::stringstream kernelStream;
- kernelStream << "#define spirv_fadd(a, b) (a) + (b) \n";
- kernelStream << "#define spirv_fsub(a, b) (a) - (b) \n";
- kernelStream << "#define spirv_fmul(a, b) (a) * (b) \n";
- kernelStream << "#define spirv_fshiftleft(a, b) (a) << (b) \n";
- kernelStream << "#define spirv_fnegate(a, b) (-a) \n";
-
- kernelStream << "#define T " << Tname << "\n";
- kernelStream << "#define FUNC spirv_" << funcName << "\n";
- kernelStream << "__kernel void fmath_cl(__global T *out, \n";
- kernelStream << "const __global T *lhs, const __global T *rhs) \n";
- kernelStream << "{ \n";
- kernelStream << " int id = get_global_id(0); \n";
- kernelStream << " out[id] = FUNC(lhs[id], rhs[id]); \n";
- kernelStream << "} \n";
- kernelStr = kernelStream.str();
- }
-
- size_t kernelLen = kernelStr.size();
- const char *kernelBuf = kernelStr.c_str();
-
- for (int i = 0; i < num; i++) {
- if (std::string(funcName) == std::string("fadd")) {
- expected_results[i] = h_lhs[i] + h_rhs[i];
- } else if (std::string(funcName) == std::string("fsub")) {
- expected_results[i] = h_lhs[i] - h_rhs[i];
- } else if (std::string(funcName) == std::string("fmul")) {
- expected_results[i] = h_lhs[i] * h_rhs[i];
- } else if (std::string(funcName) == std::string("fshiftleft")) {
- expected_results[i] = h_lhs[i] << h_rhs[i];
- } else if (std::string(funcName) == std::string("fnegate")) {
- expected_results[i] = 0 - h_lhs[i];
- }
- }
-
- {
- // Run the cl kernel for reference results
- clProgramWrapper prog;
- clKernelWrapper kernel;
- err = create_single_kernel_helper(context, &prog, &kernel, 1,
- &kernelBuf, "fmath_cl");
- SPIRV_CHECK_ERROR(err, "Failed to create cl kernel");
-
- clMemWrapper ref = clCreateBuffer(context, CL_MEM_READ_WRITE, bytes, NULL, &err);
- SPIRV_CHECK_ERROR(err, "Failed to create ref buffer");
-
- err = clSetKernelArg(kernel, 0, sizeof(cl_mem), &ref);
- SPIRV_CHECK_ERROR(err, "Failed to set arg 0");
-
- err = clSetKernelArg(kernel, 1, sizeof(cl_mem), &lhs);
- SPIRV_CHECK_ERROR(err, "Failed to set arg 1");
-
- err = clSetKernelArg(kernel, 2, sizeof(cl_mem), &rhs);
- SPIRV_CHECK_ERROR(err, "Failed to set arg 2");
-
- size_t global = num;
- err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global, NULL, 0, NULL, NULL);
- SPIRV_CHECK_ERROR(err, "Failed to enqueue cl kernel");
-
- err = clEnqueueReadBuffer(queue, ref, CL_TRUE, 0, bytes, &h_ref[0], 0, NULL, NULL);
- SPIRV_CHECK_ERROR(err, "Failed to read from ref");
- }
-
- for (int i = 0; i < num; i++) {
- if (expected_results[i] != h_ref[i]) {
- log_error("Values do not match at index %d expected = %d got = %d\n", i, expected_results[i], h_ref[i]);
- return -1;
- }
- }
-
- clProgramWrapper prog;
- err = get_program_with_il(prog, deviceID, context, spvName);
- SPIRV_CHECK_ERROR(err, "Failed to build program");
-
- clKernelWrapper kernel = clCreateKernel(prog, "fmath_cl", &err);
- SPIRV_CHECK_ERROR(err, "Failed to create spv kernel");
-
- clMemWrapper res = clCreateBuffer(context, CL_MEM_READ_WRITE, bytes, NULL, &err);
- SPIRV_CHECK_ERROR(err, "Failed to create res buffer");
-
- err = clSetKernelArg(kernel, 0, sizeof(cl_mem), &res);
- SPIRV_CHECK_ERROR(err, "Failed to set arg 0");
-
- err = clSetKernelArg(kernel, 1, sizeof(cl_mem), &lhs);
- SPIRV_CHECK_ERROR(err, "Failed to set arg 1");
-
- err = clSetKernelArg(kernel, 2, sizeof(cl_mem), &rhs);
- SPIRV_CHECK_ERROR(err, "Failed to set arg 2");
-
- size_t global = num;
- err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global, NULL, 0, NULL, NULL);
- SPIRV_CHECK_ERROR(err, "Failed to enqueue cl kernel");
-
- std::vector<T> h_res(num);
- err = clEnqueueReadBuffer(queue, res, CL_TRUE, 0, bytes, &h_res[0], 0, NULL, NULL);
- SPIRV_CHECK_ERROR(err, "Failed to read from ref");
-
- for (int i = 0; i < num; i++) {
- if (expected_results[i] != h_res[i]) {
- log_error("Values do not match at location %d expected = %d got = %d\n", i, expected_results[i], h_res[i]);
- return -1;
- }
- }
-
- return 0;
-}
-
-#define TEST_FMATH_FUNC(TYPE, FUNC) \
- TEST_SPIRV_FUNC(ext_cl_khr_spirv_no_integer_wrap_decoration_##FUNC##_##TYPE) \
- { \
- return test_ext_cl_khr_spirv_no_integer_wrap_decoration<cl_##TYPE>(deviceID, context, queue, \
- "ext_cl_khr_spirv_no_integer_wrap_decoration_"#FUNC"_"#TYPE, \
- #FUNC, \
- #TYPE \
- ); \
- }
-
-TEST_FMATH_FUNC(int, fadd)
-TEST_FMATH_FUNC(int, fsub)
-TEST_FMATH_FUNC(int, fmul)
-TEST_FMATH_FUNC(int, fshiftleft)
-TEST_FMATH_FUNC(int, fnegate)
-TEST_FMATH_FUNC(uint, fadd)
-TEST_FMATH_FUNC(uint, fsub)
-TEST_FMATH_FUNC(uint, fmul)
-TEST_FMATH_FUNC(uint, fshiftleft)
\ No newline at end of file +/****************************************************************** +Copyright (c) 2018 The Khronos Group Inc. All Rights Reserved. + +This code is protected by copyright laws and contains material proprietary to the Khronos Group, Inc. +This is UNPUBLISHED PROPRIETARY SOURCE CODE that may not be disclosed in whole or in part to +third parties, and may not be reproduced, republished, distributed, transmitted, displayed, +broadcast or otherwise exploited in any manner without the express prior written permission +of Khronos Group. The receipt or possession of this code does not convey any rights to reproduce, +disclose, or distribute its contents, or to manufacture, use, or sell anything that it may describe, +in whole or in part other than under the terms of the Khronos Adopters Agreement +or Khronos Conformance Test Source License Agreement as executed between Khronos and the recipient. +******************************************************************/ + +#include "testBase.h" +#include "types.hpp" + +#include <sstream> +#include <string> +#include <type_traits> + + +template<typename T> +int test_ext_cl_khr_spirv_no_integer_wrap_decoration(cl_device_id deviceID, + cl_context context, + cl_command_queue queue, + const char *spvName, + const char *funcName, + const char *Tname) +{ + + cl_int err = CL_SUCCESS; + const int num = 10; + std::vector<T> h_lhs(num); + std::vector<T> h_rhs(num); + std::vector<T> expected_results(num); + std::vector<T> h_ref(num); + if (!is_extension_available(deviceID, "cl_khr_spirv_no_integer_wrap_decoration")) { + log_info("Extension cl_khr_spirv_no_integer_wrap_decoration not supported; skipping tests.\n"); + return 0; + } + + /*Test with some values that do not cause overflow*/ + if (std::is_signed<T>::value == true) { + h_lhs.push_back((T)-25000); + h_lhs.push_back((T)-3333); + h_lhs.push_back((T)-7); + h_lhs.push_back((T)-1); + h_lhs.push_back(0); + h_lhs.push_back(1); + h_lhs.push_back(1024); + 
h_lhs.push_back(2048); + h_lhs.push_back(4094); + h_lhs.push_back(10000); + } else { + h_lhs.push_back(0); + h_lhs.push_back(1); + h_lhs.push_back(3); + h_lhs.push_back(5); + h_lhs.push_back(10); + h_lhs.push_back(100); + h_lhs.push_back(1024); + h_lhs.push_back(2048); + h_lhs.push_back(4094); + h_lhs.push_back(52888); + } + + h_rhs.push_back(0); + h_rhs.push_back(1); + h_rhs.push_back(2); + h_rhs.push_back(3); + h_rhs.push_back(4); + h_rhs.push_back(5); + h_rhs.push_back(6); + h_rhs.push_back(7); + h_rhs.push_back(8); + h_rhs.push_back(9); + size_t bytes = num * sizeof(T); + + clMemWrapper lhs = clCreateBuffer(context, CL_MEM_READ_ONLY, bytes, NULL, &err); + SPIRV_CHECK_ERROR(err, "Failed to create lhs buffer"); + + err = clEnqueueWriteBuffer(queue, lhs, CL_TRUE, 0, bytes, &h_lhs[0], 0, NULL, NULL); + SPIRV_CHECK_ERROR(err, "Failed to copy to lhs buffer"); + + clMemWrapper rhs = clCreateBuffer(context, CL_MEM_READ_ONLY, bytes, NULL, &err); + SPIRV_CHECK_ERROR(err, "Failed to create rhs buffer"); + + err = clEnqueueWriteBuffer(queue, rhs, CL_TRUE, 0, bytes, &h_rhs[0], 0, NULL, NULL); + SPIRV_CHECK_ERROR(err, "Failed to copy to rhs buffer"); + + std::string kernelStr; + + { + std::stringstream kernelStream; + kernelStream << "#define spirv_fadd(a, b) (a) + (b) \n"; + kernelStream << "#define spirv_fsub(a, b) (a) - (b) \n"; + kernelStream << "#define spirv_fmul(a, b) (a) * (b) \n"; + kernelStream << "#define spirv_fshiftleft(a, b) (a) << (b) \n"; + kernelStream << "#define spirv_fnegate(a, b) (-a) \n"; + + kernelStream << "#define T " << Tname << "\n"; + kernelStream << "#define FUNC spirv_" << funcName << "\n"; + kernelStream << "__kernel void fmath_cl(__global T *out, \n"; + kernelStream << "const __global T *lhs, const __global T *rhs) \n"; + kernelStream << "{ \n"; + kernelStream << " int id = get_global_id(0); \n"; + kernelStream << " out[id] = FUNC(lhs[id], rhs[id]); \n"; + kernelStream << "} \n"; + kernelStr = kernelStream.str(); + } + + const char *kernelBuf 
= kernelStr.c_str(); + + for (int i = 0; i < num; i++) { + if (std::string(funcName) == std::string("fadd")) { + expected_results[i] = h_lhs[i] + h_rhs[i]; + } else if (std::string(funcName) == std::string("fsub")) { + expected_results[i] = h_lhs[i] - h_rhs[i]; + } else if (std::string(funcName) == std::string("fmul")) { + expected_results[i] = h_lhs[i] * h_rhs[i]; + } else if (std::string(funcName) == std::string("fshiftleft")) { + expected_results[i] = h_lhs[i] << h_rhs[i]; + } else if (std::string(funcName) == std::string("fnegate")) { + expected_results[i] = 0 - h_lhs[i]; + } + } + + { + // Run the cl kernel for reference results + clProgramWrapper prog; + clKernelWrapper kernel; + err = create_single_kernel_helper(context, &prog, &kernel, 1, + &kernelBuf, "fmath_cl"); + SPIRV_CHECK_ERROR(err, "Failed to create cl kernel"); + + clMemWrapper ref = clCreateBuffer(context, CL_MEM_READ_WRITE, bytes, NULL, &err); + SPIRV_CHECK_ERROR(err, "Failed to create ref buffer"); + + err = clSetKernelArg(kernel, 0, sizeof(cl_mem), &ref); + SPIRV_CHECK_ERROR(err, "Failed to set arg 0"); + + err = clSetKernelArg(kernel, 1, sizeof(cl_mem), &lhs); + SPIRV_CHECK_ERROR(err, "Failed to set arg 1"); + + err = clSetKernelArg(kernel, 2, sizeof(cl_mem), &rhs); + SPIRV_CHECK_ERROR(err, "Failed to set arg 2"); + + size_t global = num; + err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global, NULL, 0, NULL, NULL); + SPIRV_CHECK_ERROR(err, "Failed to enqueue cl kernel"); + + err = clEnqueueReadBuffer(queue, ref, CL_TRUE, 0, bytes, &h_ref[0], 0, NULL, NULL); + SPIRV_CHECK_ERROR(err, "Failed to read from ref"); + } + + for (int i = 0; i < num; i++) { + if (expected_results[i] != h_ref[i]) { + log_error("Values do not match at index %d expected = %d got = %d\n", i, expected_results[i], h_ref[i]); + return -1; + } + } + + clProgramWrapper prog; + err = get_program_with_il(prog, deviceID, context, spvName); + SPIRV_CHECK_ERROR(err, "Failed to build program"); + + clKernelWrapper kernel = 
clCreateKernel(prog, "fmath_cl", &err); + SPIRV_CHECK_ERROR(err, "Failed to create spv kernel"); + + clMemWrapper res = clCreateBuffer(context, CL_MEM_READ_WRITE, bytes, NULL, &err); + SPIRV_CHECK_ERROR(err, "Failed to create res buffer"); + + err = clSetKernelArg(kernel, 0, sizeof(cl_mem), &res); + SPIRV_CHECK_ERROR(err, "Failed to set arg 0"); + + err = clSetKernelArg(kernel, 1, sizeof(cl_mem), &lhs); + SPIRV_CHECK_ERROR(err, "Failed to set arg 1"); + + err = clSetKernelArg(kernel, 2, sizeof(cl_mem), &rhs); + SPIRV_CHECK_ERROR(err, "Failed to set arg 2"); + + size_t global = num; + err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global, NULL, 0, NULL, NULL); + SPIRV_CHECK_ERROR(err, "Failed to enqueue cl kernel"); + + std::vector<T> h_res(num); + err = clEnqueueReadBuffer(queue, res, CL_TRUE, 0, bytes, &h_res[0], 0, NULL, NULL); + SPIRV_CHECK_ERROR(err, "Failed to read from ref"); + + for (int i = 0; i < num; i++) { + if (expected_results[i] != h_res[i]) { + log_error("Values do not match at location %d expected = %d got = %d\n", i, expected_results[i], h_res[i]); + return -1; + } + } + + return 0; +} + +#define TEST_FMATH_FUNC(TYPE, FUNC) \ + TEST_SPIRV_FUNC(ext_cl_khr_spirv_no_integer_wrap_decoration_##FUNC##_##TYPE) \ + { \ + return test_ext_cl_khr_spirv_no_integer_wrap_decoration<cl_##TYPE>(deviceID, context, queue, \ + "ext_cl_khr_spirv_no_integer_wrap_decoration_"#FUNC"_"#TYPE, \ + #FUNC, \ + #TYPE \ + ); \ + } + +TEST_FMATH_FUNC(int, fadd) +TEST_FMATH_FUNC(int, fsub) +TEST_FMATH_FUNC(int, fmul) +TEST_FMATH_FUNC(int, fshiftleft) +TEST_FMATH_FUNC(int, fnegate) +TEST_FMATH_FUNC(uint, fadd) +TEST_FMATH_FUNC(uint, fsub) +TEST_FMATH_FUNC(uint, fmul) +TEST_FMATH_FUNC(uint, fshiftleft) diff --git a/test_conformance/spirv_new/test_op_fmath.cpp b/test_conformance/spirv_new/test_op_fmath.cpp index bec0667c..61e2864d 100644 --- a/test_conformance/spirv_new/test_op_fmath.cpp +++ b/test_conformance/spirv_new/test_op_fmath.cpp @@ -79,11 +79,8 @@ int 
test_fmath(cl_device_id deviceID, kernelStr = kernelStream.str(); } - size_t kernelLen = kernelStr.size(); const char *kernelBuf = kernelStr.c_str(); - const char *options = fast_math ? "-cl-fast-relaxed-math" : NULL; - std::vector<T> h_ref(num); { diff --git a/test_conformance/spirv_new/test_op_function.cpp b/test_conformance/spirv_new/test_op_function.cpp index caa3e0d3..16183e80 100644 --- a/test_conformance/spirv_new/test_op_function.cpp +++ b/test_conformance/spirv_new/test_op_function.cpp @@ -33,7 +33,6 @@ int test_function(cl_device_id deviceID, err = clEnqueueWriteBuffer(queue, in, CL_TRUE, 0, bytes, &h_in[0], 0, NULL, NULL); SPIRV_CHECK_ERROR(err, "Failed to copy to in buffer"); - cl_uint bits = sizeof(void *) * 8; std::string spvStr = std::string("op_function") + "_" + std::string(funcType); const char *spvName = spvStr.c_str(); diff --git a/test_conformance/spirv_new/test_op_negate.cpp b/test_conformance/spirv_new/test_op_negate.cpp index 1891c9bb..e3dc1f34 100644 --- a/test_conformance/spirv_new/test_op_negate.cpp +++ b/test_conformance/spirv_new/test_op_negate.cpp @@ -43,7 +43,6 @@ int test_negation(cl_device_id deviceID, err = clEnqueueWriteBuffer(queue, in, CL_TRUE, 0, bytes, &h_in[0], 0, NULL, NULL); SPIRV_CHECK_ERROR(err, "Failed to copy to in buffer"); - cl_uint bits = sizeof(void *) * 8; std::string spvStr = std::string(funcName) + "_" + std::string(Tname); const char *spvName = spvStr.c_str(); diff --git a/test_conformance/spirv_new/test_op_opaque.cpp b/test_conformance/spirv_new/test_op_opaque.cpp index 067d9e4e..e6216061 100644 --- a/test_conformance/spirv_new/test_op_opaque.cpp +++ b/test_conformance/spirv_new/test_op_opaque.cpp @@ -17,7 +17,6 @@ or Khronos Conformance Test Source License Agreement as executed between Khronos TEST_SPIRV_FUNC(op_type_opaque_simple) { const char *name = "opaque"; - int num = (int)(1 << 10); cl_int err = CL_SUCCESS; std::vector<unsigned char> buffer_vec = readSPIRV(name); diff --git 
a/test_conformance/spirv_new/test_op_vector_times_scalar.cpp b/test_conformance/spirv_new/test_op_vector_times_scalar.cpp index 0a604bcf..0859668c 100644 --- a/test_conformance/spirv_new/test_op_vector_times_scalar.cpp +++ b/test_conformance/spirv_new/test_op_vector_times_scalar.cpp @@ -75,7 +75,6 @@ int test_vector_times_scalar(cl_device_id deviceID, kernelStr = kernelStream.str(); } - size_t kernelLen = kernelStr.size(); const char *kernelBuf = kernelStr.c_str(); std::vector<Tv> h_ref(num); @@ -107,7 +106,6 @@ int test_vector_times_scalar(cl_device_id deviceID, SPIRV_CHECK_ERROR(err, "Failed to read from ref"); } - cl_uint bits = sizeof(void *) * 8; std::string ref = "vector_times_scalar_"; ref += Tname; const char *spvName = ref.c_str(); diff --git a/test_conformance/subgroups/CMakeLists.txt b/test_conformance/subgroups/CMakeLists.txt index d48af9cc..1ff249cf 100644 --- a/test_conformance/subgroups/CMakeLists.txt +++ b/test_conformance/subgroups/CMakeLists.txt @@ -15,6 +15,7 @@ set(${MODULE_NAME}_SOURCES test_subgroup_clustered_reduce.cpp test_subgroup_shuffle.cpp test_subgroup_shuffle_relative.cpp + test_subgroup_rotate.cpp ) include(../CMakeCommon.txt) diff --git a/test_conformance/subgroups/main.cpp b/test_conformance/subgroups/main.cpp index 44416dd7..a3ae910d 100644 --- a/test_conformance/subgroups/main.cpp +++ b/test_conformance/subgroups/main.cpp @@ -19,8 +19,10 @@ #include <string.h> #include "procs.h" #include "harness/testHarness.h" +#include "CL/cl_half.h" MTdata gMTdata; +cl_half_rounding_mode g_rounding_mode; test_definition test_list[] = { ADD_TEST_VERSION(sub_group_info_ext, Version(2, 0)), @@ -39,7 +41,8 @@ test_definition test_list[] = { ADD_TEST(subgroup_functions_ballot), ADD_TEST(subgroup_functions_clustered_reduce), ADD_TEST(subgroup_functions_shuffle), - ADD_TEST(subgroup_functions_shuffle_relative) + ADD_TEST(subgroup_functions_shuffle_relative), + ADD_TEST(subgroup_functions_rotate), }; const int test_num = ARRAY_SIZE(test_list); @@ -66,6 
+69,22 @@ static test_status InitCL(cl_device_id device) ret = TEST_SKIP; } } + // Determine the rounding mode to be used in float to half conversions in + // init and reference code + const cl_device_fp_config fpConfig = get_default_rounding_mode(device); + + if (fpConfig == CL_FP_ROUND_TO_NEAREST) + { + g_rounding_mode = CL_HALF_RTE; + } + else if (fpConfig == CL_FP_ROUND_TO_ZERO && gIsEmbedded) + { + g_rounding_mode = CL_HALF_RTZ; + } + else + { + assert(false && "Unreachable"); + } return ret; } diff --git a/test_conformance/subgroups/procs.h b/test_conformance/subgroups/procs.h index d09e8242..d4f51bec 100644 --- a/test_conformance/subgroups/procs.h +++ b/test_conformance/subgroups/procs.h @@ -81,4 +81,8 @@ extern int test_subgroup_functions_shuffle_relative(cl_device_id device, cl_context context, cl_command_queue queue, int num_elements); +extern int test_subgroup_functions_rotate(cl_device_id device, + cl_context context, + cl_command_queue queue, + int num_elements); #endif /*_procs_h*/ diff --git a/test_conformance/subgroups/subgroup_common_kernels.cpp b/test_conformance/subgroups/subgroup_common_kernels.cpp index f8b24450..33a51637 100644 --- a/test_conformance/subgroups/subgroup_common_kernels.cpp +++ b/test_conformance/subgroups/subgroup_common_kernels.cpp @@ -15,92 +15,20 @@ // #include "subgroup_common_kernels.h" -const char* bcast_source = - "__kernel void test_bcast(const __global Type *in, " - "__global int4 *xy, __global Type *out)\n" - "{\n" - " int gid = get_global_id(0);\n" - " XY(xy,gid);\n" - " Type x = in[gid];\n" - " uint which_sub_group_local_id = xy[gid].z;\n" - " out[gid] = sub_group_broadcast(x, which_sub_group_local_id);\n" - "}\n"; - -const char* redadd_source = "__kernel void test_redadd(const __global Type " - "*in, __global int4 *xy, __global Type *out)\n" - "{\n" - " int gid = get_global_id(0);\n" - " XY(xy,gid);\n" - " out[gid] = sub_group_reduce_add(in[gid]);\n" - "}\n"; - -const char* redmax_source = "__kernel void 
test_redmax(const __global Type " - "*in, __global int4 *xy, __global Type *out)\n" - "{\n" - " int gid = get_global_id(0);\n" - " XY(xy,gid);\n" - " out[gid] = sub_group_reduce_max(in[gid]);\n" - "}\n"; - -const char* redmin_source = "__kernel void test_redmin(const __global Type " - "*in, __global int4 *xy, __global Type *out)\n" - "{\n" - " int gid = get_global_id(0);\n" - " XY(xy,gid);\n" - " out[gid] = sub_group_reduce_min(in[gid]);\n" - "}\n"; - -const char* scinadd_source = - "__kernel void test_scinadd(const __global Type *in, __global int4 *xy, " - "__global Type *out)\n" - "{\n" - " int gid = get_global_id(0);\n" - " XY(xy,gid);\n" - " out[gid] = sub_group_scan_inclusive_add(in[gid]);\n" - "}\n"; - -const char* scinmax_source = - "__kernel void test_scinmax(const __global Type *in, __global int4 *xy, " - "__global Type *out)\n" - "{\n" - " int gid = get_global_id(0);\n" - " XY(xy,gid);\n" - " out[gid] = sub_group_scan_inclusive_max(in[gid]);\n" - "}\n"; - -const char* scinmin_source = - "__kernel void test_scinmin(const __global Type *in, __global int4 *xy, " - "__global Type *out)\n" - "{\n" - " int gid = get_global_id(0);\n" - " XY(xy,gid);\n" - " out[gid] = sub_group_scan_inclusive_min(in[gid]);\n" - "}\n"; - -const char* scexadd_source = - "__kernel void test_scexadd(const __global Type *in, __global int4 *xy, " - "__global Type *out)\n" - "{\n" - " int gid = get_global_id(0);\n" - " XY(xy,gid);\n" - " out[gid] = sub_group_scan_exclusive_add(in[gid]);\n" - "}\n"; - -const char* scexmax_source = - "__kernel void test_scexmax(const __global Type *in, __global int4 *xy, " - "__global Type *out)\n" - "{\n" - " int gid = get_global_id(0);\n" - " XY(xy,gid);\n" - " out[gid] = sub_group_scan_exclusive_max(in[gid]);\n" - "}\n"; - -const char* scexmin_source = - "__kernel void test_scexmin(const __global Type *in, __global int4 *xy, " - "__global Type *out)\n" - "{\n" - " int gid = get_global_id(0);\n" - " XY(xy,gid);\n" - " out[gid] = 
sub_group_scan_exclusive_min(in[gid]);\n" - "}\n"; +std::string sub_group_reduction_scan_source = R"( + __kernel void test_%s(const __global Type *in, __global int4 *xy, __global Type *out) { + int gid = get_global_id(0); + XY(xy,gid); + out[gid] = %s(in[gid]); + } +)"; + +std::string sub_group_generic_source = R"( + __kernel void test_%s(const __global Type *in, __global int4 *xy, __global Type *out) { + int gid = get_global_id(0); + XY(xy,gid); + Type x = in[gid]; + out[gid] = %s(x, xy[gid].z); + } +)";
\ No newline at end of file diff --git a/test_conformance/subgroups/subgroup_common_kernels.h b/test_conformance/subgroups/subgroup_common_kernels.h index 8ae97d9a..bf2210ef 100644 --- a/test_conformance/subgroups/subgroup_common_kernels.h +++ b/test_conformance/subgroups/subgroup_common_kernels.h @@ -18,15 +18,7 @@ #include "subhelpers.h" -extern const char* bcast_source; -extern const char* redadd_source; -extern const char* redmax_source; -extern const char* redmin_source; -extern const char* scinadd_source; -extern const char* scinmax_source; -extern const char* scinmin_source; -extern const char* scexadd_source; -extern const char* scexmax_source; -extern const char* scexmin_source; +extern std::string sub_group_reduction_scan_source; +extern std::string sub_group_generic_source; #endif diff --git a/test_conformance/subgroups/subgroup_common_templates.h b/test_conformance/subgroups/subgroup_common_templates.h index b30c416b..b2648c30 100644 --- a/test_conformance/subgroups/subgroup_common_templates.h +++ b/test_conformance/subgroups/subgroup_common_templates.h @@ -17,13 +17,12 @@ #define SUBGROUPCOMMONTEMPLATES_H #include "typeWrappers.h" -#include <bitset> #include "CL/cl_half.h" #include "subhelpers.h" - #include <set> +#include <algorithm> +#include <random> -typedef std::bitset<128> bs128; static cl_uint4 generate_bit_mask(cl_uint subgroup_local_id, const std::string &mask_type, cl_uint max_sub_group_size) @@ -66,6 +65,13 @@ static cl_uint4 generate_bit_mask(cl_uint subgroup_local_id, // only 4 work_items from subgroup enter the code (are active) template <typename Ty, SubgroupsBroadcastOp operation> struct BC { + static void log_test(const WorkGroupParams &test_params, + const char *extra_text) + { + log_info(" sub_group_%s(%s)...%s\n", operation_names(operation), + TypeManager<Ty>::name(), extra_text); + } + static void gen(Ty *x, Ty *t, cl_int *m, const WorkGroupParams &test_params) { int i, ii, j, k, n; @@ -79,11 +85,8 @@ template <typename Ty, 
SubgroupsBroadcastOp operation> struct BC int last_subgroup_size = 0; ii = 0; - log_info(" sub_group_%s(%s)...\n", operation_names(operation), - TypeManager<Ty>::name()); if (non_uniform_size) { - log_info(" non uniform work group size mode ON\n"); ng++; } for (k = 0; k < ng; ++k) @@ -172,8 +175,8 @@ template <typename Ty, SubgroupsBroadcastOp operation> struct BC } } - static int chk(Ty *x, Ty *y, Ty *mx, Ty *my, cl_int *m, - const WorkGroupParams &test_params) + static test_status chk(Ty *x, Ty *y, Ty *mx, Ty *my, cl_int *m, + const WorkGroupParams &test_params) { int ii, i, j, k, l, n; int ng = test_params.global_workgroup_size; @@ -290,8 +293,6 @@ template <typename Ty, SubgroupsBroadcastOp operation> struct BC y += nw; m += 4 * nw; } - log_info(" sub_group_%s(%s)... passed\n", operation_names(operation), - TypeManager<Ty>::name()); return TEST_PASS; } }; @@ -301,7 +302,7 @@ static float to_float(subgroups::cl_half x) { return cl_half_to_float(x.data); } static subgroups::cl_half to_half(float x) { subgroups::cl_half value; - value.data = cl_half_from_float(x, CL_HALF_RTE); + value.data = cl_half_from_float(x, g_rounding_mode); return value; } @@ -320,7 +321,7 @@ template <typename Ty> inline Ty calculate(Ty a, Ty b, ArithmeticOp operation) case ArithmeticOp::logical_and: return a && b; case ArithmeticOp::logical_or: return a || b; case ArithmeticOp::logical_xor: return !a ^ !b; - default: log_error("Unknown operation request"); break; + default: log_error("Unknown operation request\n"); break; } return 0; } @@ -342,7 +343,7 @@ inline cl_double calculate(cl_double a, cl_double b, ArithmeticOp operation) case ArithmeticOp::mul_: { return a * b; } - default: log_error("Unknown operation request"); break; + default: log_error("Unknown operation request\n"); break; } return 0; } @@ -364,7 +365,7 @@ inline cl_float calculate(cl_float a, cl_float b, ArithmeticOp operation) case ArithmeticOp::mul_: { return a * b; } - default: log_error("Unknown operation request"); 
break; + default: log_error("Unknown operation request\n"); break; } return 0; } @@ -381,7 +382,7 @@ inline subgroups::cl_half calculate(subgroups::cl_half a, subgroups::cl_half b, case ArithmeticOp::min_: return to_float(a) < to_float(b) || is_half_nan(b.data) ? a : b; case ArithmeticOp::mul_: return to_half(to_float(a) * to_float(b)); - default: log_error("Unknown operation request"); break; + default: log_error("Unknown operation request\n"); break; } return to_half(0); } @@ -392,11 +393,44 @@ template <typename Ty> bool is_floating_point() || std::is_same<Ty, subgroups::cl_half>::value; } +// limit possible input values to avoid arithmetic rounding/overflow issues. +// for each subgroup values defined different values +// for rest of workitems set 1 +// shuffle values +static void fill_and_shuffle_safe_values(std::vector<cl_ulong> &safe_values, + int sb_size) +{ + // max product is 720, cl_half has enough precision for it + const std::vector<cl_ulong> non_one_values{ 2, 3, 4, 5, 6 }; + + if (sb_size <= non_one_values.size()) + { + safe_values.assign(non_one_values.begin(), + non_one_values.begin() + sb_size); + } + else + { + safe_values.assign(sb_size, 1); + std::copy(non_one_values.begin(), non_one_values.end(), + safe_values.begin()); + } + + std::mt19937 mersenne_twister_engine(10000); + std::shuffle(safe_values.begin(), safe_values.end(), + mersenne_twister_engine); +}; + template <typename Ty, ArithmeticOp operation> -void genrand(Ty *x, Ty *t, cl_int *m, int ns, int nw, int ng) +void generate_inputs(Ty *x, Ty *t, cl_int *m, int ns, int nw, int ng) { int nj = (nw + ns - 1) / ns; + std::vector<cl_ulong> safe_values; + if (operation == ArithmeticOp::mul_ || operation == ArithmeticOp::add_) + { + fill_and_shuffle_safe_values(safe_values, ns); + } + for (int k = 0; k < ng; ++k) { for (int j = 0; j < nj; ++j) @@ -407,13 +441,10 @@ void genrand(Ty *x, Ty *t, cl_int *m, int ns, int nw, int ng) for (int i = 0; i < n; ++i) { cl_ulong out_value; - double y; if 
(operation == ArithmeticOp::mul_ || operation == ArithmeticOp::add_) { - // work around to avoid overflow, do not use 0 for - // multiplication - out_value = (genrand_int32(gMTdata) % 4) + 1; + out_value = safe_values[i]; } else { @@ -441,18 +472,23 @@ void genrand(Ty *x, Ty *t, cl_int *m, int ns, int nw, int ng) template <typename Ty, ShuffleOp operation> struct SHF { + static void log_test(const WorkGroupParams &test_params, + const char *extra_text) + { + log_info(" sub_group_%s(%s)...%s\n", operation_names(operation), + TypeManager<Ty>::name(), extra_text); + } + static void gen(Ty *x, Ty *t, cl_int *m, const WorkGroupParams &test_params) { - int i, ii, j, k, l, n, delta; + int i, ii, j, k, n; + cl_uint l; int nw = test_params.local_workgroup_size; int ns = test_params.subgroup_size; int ng = test_params.global_workgroup_size; int nj = (nw + ns - 1) / ns; - int d = ns > 100 ? 100 : ns; ii = 0; ng = ng / nw; - log_info(" sub_group_%s(%s)...\n", operation_names(operation), - TypeManager<Ty>::name()); for (k = 0; k < ng; ++k) { // for each work_group for (j = 0; j < nj; ++j) @@ -462,30 +498,31 @@ template <typename Ty, ShuffleOp operation> struct SHF for (i = 0; i < n; ++i) { int midx = 4 * ii + 4 * i + 2; - l = (int)(genrand_int32(gMTdata) & 0x7fffffff) - % (d > n ? n : d); + l = (((cl_uint)(genrand_int32(gMTdata) & 0x7fffffff) + 1) + % (ns * 2 + 1)) + - 1; switch (operation) { case ShuffleOp::shuffle: case ShuffleOp::shuffle_xor: - // storing information about shuffle index + case ShuffleOp::shuffle_up: + case ShuffleOp::shuffle_down: + // storing information about shuffle index/delta m[midx] = (cl_int)l; break; - case ShuffleOp::shuffle_up: - delta = l; // calculate delta for shuffle up - if (i - delta < 0) + case ShuffleOp::rotate: + case ShuffleOp::clustered_rotate: + // Storing information about rotate delta. + // The delta must be the same for each thread in + // the subgroup. 
+ if (i == 0) { - delta = i; + m[midx] = (cl_int)l; } - m[midx] = (cl_int)delta; - break; - case ShuffleOp::shuffle_down: - delta = l; // calculate delta for shuffle down - if (i + delta >= n) + else { - delta = n - 1 - i; + m[midx] = m[midx - 4]; } - m[midx] = (cl_int)delta; break; default: break; } @@ -503,10 +540,11 @@ template <typename Ty, ShuffleOp operation> struct SHF } } - static int chk(Ty *x, Ty *y, Ty *mx, Ty *my, cl_int *m, - const WorkGroupParams &test_params) + static test_status chk(Ty *x, Ty *y, Ty *mx, Ty *my, cl_int *m, + const WorkGroupParams &test_params) { - int ii, i, j, k, l, n; + int ii, i, j, k, n; + cl_uint l; int nw = test_params.local_workgroup_size; int ns = test_params.subgroup_size; int ng = test_params.global_workgroup_size; @@ -531,32 +569,51 @@ template <typename Ty, ShuffleOp operation> struct SHF { // inside the subgroup // shuffle index storage int midx = 4 * ii + 4 * i + 2; - l = (int)m[midx]; + l = m[midx]; rr = my[ii + i]; + cl_uint tr_idx; + bool skip = false; switch (operation) { // shuffle basic - treat l as index - case ShuffleOp::shuffle: tr = mx[ii + l]; break; - // shuffle up - treat l as delta - case ShuffleOp::shuffle_up: tr = mx[ii + i - l]; break; + case ShuffleOp::shuffle: tr_idx = l; break; + // shuffle xor - treat l as mask + case ShuffleOp::shuffle_xor: tr_idx = i ^ l; break; // shuffle up - treat l as delta + case ShuffleOp::shuffle_up: + if (l >= ns) skip = true; + tr_idx = i - l; + break; + // shuffle down - treat l as delta case ShuffleOp::shuffle_down: - tr = mx[ii + i + l]; + if (l >= ns) skip = true; + tr_idx = i + l; break; - // shuffle xor - treat l as mask - case ShuffleOp::shuffle_xor: - tr = mx[ii + (i ^ l)]; + // rotate - treat l as delta + case ShuffleOp::rotate: + tr_idx = (i + l) % test_params.subgroup_size; + break; + case ShuffleOp::clustered_rotate: { + tr_idx = ((i & ~(test_params.cluster_size - 1)) + + ((i + l) % test_params.cluster_size)); break; + } default: break; } - if (!compare(rr, 
tr)) + if (!skip && tr_idx < n) { - log_error("ERROR: sub_group_%s(%s) mismatch for " - "local id %d in sub group %d in group %d\n", - operation_names(operation), - TypeManager<Ty>::name(), i, j, k); - return TEST_FAIL; + tr = mx[ii + tr_idx]; + + if (!compare(rr, tr)) + { + log_error("ERROR: sub_group_%s(%s) mismatch for " + "local id %d in sub group %d in group " + "%d\n", + operation_names(operation), + TypeManager<Ty>::name(), i, j, k); + return TEST_FAIL; + } } } } @@ -564,51 +621,53 @@ template <typename Ty, ShuffleOp operation> struct SHF y += nw; m += 4 * nw; } - log_info(" sub_group_%s(%s)... passed\n", operation_names(operation), - TypeManager<Ty>::name()); return TEST_PASS; } }; template <typename Ty, ArithmeticOp operation> struct SCEX_NU { + static void log_test(const WorkGroupParams &test_params, + const char *extra_text) + { + std::string func_name = (test_params.all_work_item_masks.size() > 0 + ? "sub_group_non_uniform_scan_exclusive" + : "sub_group_scan_exclusive"); + log_info(" %s_%s(%s)...%s\n", func_name.c_str(), + operation_names(operation), TypeManager<Ty>::name(), + extra_text); + } + static void gen(Ty *x, Ty *t, cl_int *m, const WorkGroupParams &test_params) { int nw = test_params.local_workgroup_size; int ns = test_params.subgroup_size; int ng = test_params.global_workgroup_size; - uint32_t work_items_mask = test_params.work_items_mask; ng = ng / nw; - std::string func_name; - work_items_mask ? 
func_name = "sub_group_non_uniform_scan_exclusive" - : func_name = "sub_group_scan_exclusive"; - log_info(" %s_%s(%s)...\n", func_name.c_str(), - operation_names(operation), TypeManager<Ty>::name()); - log_info(" test params: global size = %d local size = %d subgroups " - "size = %d work item mask = 0x%x \n", - test_params.global_workgroup_size, nw, ns, work_items_mask); - genrand<Ty, operation>(x, t, m, ns, nw, ng); + generate_inputs<Ty, operation>(x, t, m, ns, nw, ng); } - static int chk(Ty *x, Ty *y, Ty *mx, Ty *my, cl_int *m, - const WorkGroupParams &test_params) + static test_status chk(Ty *x, Ty *y, Ty *mx, Ty *my, cl_int *m, + const WorkGroupParams &test_params) { int ii, i, j, k, n; int nw = test_params.local_workgroup_size; int ns = test_params.subgroup_size; int ng = test_params.global_workgroup_size; - uint32_t work_items_mask = test_params.work_items_mask; + bs128 work_items_mask = test_params.work_items_mask; int nj = (nw + ns - 1) / ns; Ty tr, rr; ng = ng / nw; - std::string func_name; - work_items_mask ? func_name = "sub_group_non_uniform_scan_exclusive" - : func_name = "sub_group_scan_exclusive"; + std::string func_name = (test_params.all_work_item_masks.size() > 0 + ? "sub_group_non_uniform_scan_exclusive" + : "sub_group_scan_exclusive"); - uint32_t use_work_items_mask; // for uniform case take into consideration all workitems - use_work_items_mask = !work_items_mask ? 
0xFFFFFFFF : work_items_mask; + if (!work_items_mask.any()) + { + work_items_mask.set(); + } for (k = 0; k < ng; ++k) { // for each work_group // Map to array indexed to array indexed by local ID and sub group @@ -624,35 +683,21 @@ template <typename Ty, ArithmeticOp operation> struct SCEX_NU std::set<int> active_work_items; for (i = 0; i < n; ++i) { - uint32_t check_work_item = 1 << (i % 32); - if (use_work_items_mask & check_work_item) + if (work_items_mask.test(i)) { active_work_items.insert(i); } } if (active_work_items.empty()) { - log_info(" No acitve workitems in workgroup id = %d " - "subgroup id = %d - no calculation\n", - k, j); - continue; - } - else if (active_work_items.size() == 1) - { - log_info(" One active workitem in workgroup id = %d " - "subgroup id = %d - no calculation\n", - k, j); continue; } else { tr = TypeManager<Ty>::identify_limits(operation); - int idx = 0; for (const int &active_work_item : active_work_items) { rr = my[ii + active_work_item]; - if (idx == 0) continue; - if (!compare_ordered(rr, tr)) { log_error( @@ -665,7 +710,6 @@ template <typename Ty, ArithmeticOp operation> struct SCEX_NU } tr = calculate<Ty>(tr, mx[ii + active_work_item], operation); - idx++; } } } @@ -674,8 +718,6 @@ template <typename Ty, ArithmeticOp operation> struct SCEX_NU m += 4 * nw; } - log_info(" %s_%s(%s)... passed\n", func_name.c_str(), - operation_names(operation), TypeManager<Ty>::name()); return TEST_PASS; } }; @@ -683,44 +725,48 @@ template <typename Ty, ArithmeticOp operation> struct SCEX_NU // Test for scan inclusive non uniform functions template <typename Ty, ArithmeticOp operation> struct SCIN_NU { + static void log_test(const WorkGroupParams &test_params, + const char *extra_text) + { + std::string func_name = (test_params.all_work_item_masks.size() > 0 + ? 
"sub_group_non_uniform_scan_inclusive" + : "sub_group_scan_inclusive"); + log_info(" %s_%s(%s)...%s\n", func_name.c_str(), + operation_names(operation), TypeManager<Ty>::name(), + extra_text); + } + static void gen(Ty *x, Ty *t, cl_int *m, const WorkGroupParams &test_params) { int nw = test_params.local_workgroup_size; int ns = test_params.subgroup_size; int ng = test_params.global_workgroup_size; - uint32_t work_items_mask = test_params.work_items_mask; ng = ng / nw; - std::string func_name; - work_items_mask ? func_name = "sub_group_non_uniform_scan_inclusive" - : func_name = "sub_group_scan_inclusive"; - - genrand<Ty, operation>(x, t, m, ns, nw, ng); - log_info(" %s_%s(%s)...\n", func_name.c_str(), - operation_names(operation), TypeManager<Ty>::name()); - log_info(" test params: global size = %d local size = %d subgroups " - "size = %d work item mask = 0x%x \n", - test_params.global_workgroup_size, nw, ns, work_items_mask); + generate_inputs<Ty, operation>(x, t, m, ns, nw, ng); } - static int chk(Ty *x, Ty *y, Ty *mx, Ty *my, cl_int *m, - const WorkGroupParams &test_params) + static test_status chk(Ty *x, Ty *y, Ty *mx, Ty *my, cl_int *m, + const WorkGroupParams &test_params) { int ii, i, j, k, n; int nw = test_params.local_workgroup_size; int ns = test_params.subgroup_size; int ng = test_params.global_workgroup_size; - uint32_t work_items_mask = test_params.work_items_mask; + bs128 work_items_mask = test_params.work_items_mask; + int nj = (nw + ns - 1) / ns; Ty tr, rr; ng = ng / nw; - std::string func_name; - work_items_mask ? func_name = "sub_group_non_uniform_scan_inclusive" - : func_name = "sub_group_scan_inclusive"; + std::string func_name = (test_params.all_work_item_masks.size() > 0 + ? "sub_group_non_uniform_scan_inclusive" + : "sub_group_scan_inclusive"); - uint32_t use_work_items_mask; // for uniform case take into consideration all workitems - use_work_items_mask = !work_items_mask ? 
0xFFFFFFFF : work_items_mask; + if (!work_items_mask.any()) + { + work_items_mask.set(); + } // std::bitset<32> mask32(use_work_items_mask); // for (int k) mask32.count(); for (k = 0; k < ng; ++k) @@ -740,8 +786,7 @@ template <typename Ty, ArithmeticOp operation> struct SCIN_NU for (i = 0; i < n; ++i) { - uint32_t check_work_item = 1 << (i % 32); - if (use_work_items_mask & check_work_item) + if (work_items_mask.test(i)) { if (catch_frist_active == -1) { @@ -752,9 +797,6 @@ template <typename Ty, ArithmeticOp operation> struct SCIN_NU } if (active_work_items.empty()) { - log_info(" No acitve workitems in workgroup id = %d " - "subgroup id = %d - no calculation\n", - k, j); continue; } else @@ -792,8 +834,6 @@ template <typename Ty, ArithmeticOp operation> struct SCIN_NU m += 4 * nw; } - log_info(" %s_%s(%s)... passed\n", func_name.c_str(), - operation_names(operation), TypeManager<Ty>::name()); return TEST_PASS; } }; @@ -801,41 +841,41 @@ template <typename Ty, ArithmeticOp operation> struct SCIN_NU // Test for reduce non uniform functions template <typename Ty, ArithmeticOp operation> struct RED_NU { + static void log_test(const WorkGroupParams &test_params, + const char *extra_text) + { + std::string func_name = (test_params.all_work_item_masks.size() > 0 + ? "sub_group_non_uniform_reduce" + : "sub_group_reduce"); + log_info(" %s_%s(%s)...%s\n", func_name.c_str(), + operation_names(operation), TypeManager<Ty>::name(), + extra_text); + } static void gen(Ty *x, Ty *t, cl_int *m, const WorkGroupParams &test_params) { int nw = test_params.local_workgroup_size; int ns = test_params.subgroup_size; int ng = test_params.global_workgroup_size; - uint32_t work_items_mask = test_params.work_items_mask; ng = ng / nw; - std::string func_name; - - work_items_mask ? 
func_name = "sub_group_non_uniform_reduce" - : func_name = "sub_group_reduce"; - log_info(" %s_%s(%s)...\n", func_name.c_str(), - operation_names(operation), TypeManager<Ty>::name()); - log_info(" test params: global size = %d local size = %d subgroups " - "size = %d work item mask = 0x%x \n", - test_params.global_workgroup_size, nw, ns, work_items_mask); - genrand<Ty, operation>(x, t, m, ns, nw, ng); + generate_inputs<Ty, operation>(x, t, m, ns, nw, ng); } - static int chk(Ty *x, Ty *y, Ty *mx, Ty *my, cl_int *m, - const WorkGroupParams &test_params) + static test_status chk(Ty *x, Ty *y, Ty *mx, Ty *my, cl_int *m, + const WorkGroupParams &test_params) { int ii, i, j, k, n; int nw = test_params.local_workgroup_size; int ns = test_params.subgroup_size; int ng = test_params.global_workgroup_size; - uint32_t work_items_mask = test_params.work_items_mask; + bs128 work_items_mask = test_params.work_items_mask; int nj = (nw + ns - 1) / ns; ng = ng / nw; Ty tr, rr; - std::string func_name; - work_items_mask ? func_name = "sub_group_non_uniform_reduce" - : func_name = "sub_group_reduce"; + std::string func_name = (test_params.all_work_item_masks.size() > 0 + ? "sub_group_non_uniform_reduce" + : "sub_group_reduce"); for (k = 0; k < ng; ++k) { @@ -847,9 +887,10 @@ template <typename Ty, ArithmeticOp operation> struct RED_NU my[j] = y[j]; } - uint32_t use_work_items_mask; - use_work_items_mask = - !work_items_mask ? 
0xFFFFFFFF : work_items_mask; + if (!work_items_mask.any()) + { + work_items_mask.set(); + } for (j = 0; j < nj; ++j) { @@ -859,8 +900,7 @@ template <typename Ty, ArithmeticOp operation> struct RED_NU int catch_frist_active = -1; for (i = 0; i < n; ++i) { - uint32_t check_work_item = 1 << (i % 32); - if (use_work_items_mask & check_work_item) + if (work_items_mask.test(i)) { if (catch_frist_active == -1) { @@ -876,9 +916,6 @@ template <typename Ty, ArithmeticOp operation> struct RED_NU if (active_work_items.empty()) { - log_info(" No acitve workitems in workgroup id = %d " - "subgroup id = %d - no calculation\n", - k, j); continue; } @@ -902,8 +939,6 @@ template <typename Ty, ArithmeticOp operation> struct RED_NU m += 4 * nw; } - log_info(" %s_%s(%s)... passed\n", func_name.c_str(), - operation_names(operation), TypeManager<Ty>::name()); return TEST_PASS; } }; diff --git a/test_conformance/subgroups/subhelpers.h b/test_conformance/subgroups/subhelpers.h index 93673b35..0a2c3903 100644 --- a/test_conformance/subgroups/subhelpers.h +++ b/test_conformance/subgroups/subhelpers.h @@ -24,32 +24,194 @@ #include <limits> #include <vector> #include <type_traits> +#include <bitset> +#include <regex> +#include <map> #define NR_OF_ACTIVE_WORK_ITEMS 4 extern MTdata gMTdata; +typedef std::bitset<128> bs128; +extern cl_half_rounding_mode g_rounding_mode; + +static bs128 cl_uint4_to_bs128(cl_uint4 v) +{ + return bs128(v.s0) | (bs128(v.s1) << 32) | (bs128(v.s2) << 64) + | (bs128(v.s3) << 96); +} + +static cl_uint4 bs128_to_cl_uint4(bs128 v) +{ + bs128 bs128_ffffffff = 0xffffffffU; + + cl_uint4 r; + r.s0 = ((v >> 0) & bs128_ffffffff).to_ulong(); + r.s1 = ((v >> 32) & bs128_ffffffff).to_ulong(); + r.s2 = ((v >> 64) & bs128_ffffffff).to_ulong(); + r.s3 = ((v >> 96) & bs128_ffffffff).to_ulong(); + + return r; +} struct WorkGroupParams { - WorkGroupParams(size_t gws, size_t lws, - const std::vector<std::string> &req_ext = {}, - const std::vector<uint32_t> &all_wim = {}) + + 
WorkGroupParams(size_t gws, size_t lws, int dm_arg = -1, int cs_arg = -1) : global_workgroup_size(gws), local_workgroup_size(lws), - required_extensions(req_ext), all_work_item_masks(all_wim) + divergence_mask_arg(dm_arg), cluster_size_arg(cs_arg) { subgroup_size = 0; + cluster_size = 0; work_items_mask = 0; use_core_subgroups = true; dynsc = 0; + load_masks(); } size_t global_workgroup_size; size_t local_workgroup_size; size_t subgroup_size; - uint32_t work_items_mask; - int dynsc; + cl_uint cluster_size; + bs128 work_items_mask; + size_t dynsc; bool use_core_subgroups; - std::vector<std::string> required_extensions; - std::vector<uint32_t> all_work_item_masks; + std::vector<bs128> all_work_item_masks; + int divergence_mask_arg; + int cluster_size_arg; + void save_kernel_source(const std::string &source, std::string name = "") + { + if (name == "") + { + name = "default"; + } + if (kernel_function_name.find(name) != kernel_function_name.end()) + { + log_info("Kernel definition duplication. Source will be " + "overwritten for function name %s\n", + name.c_str()); + } + kernel_function_name[name] = source; + }; + // return specific defined kernel or default. 
+ std::string get_kernel_source(std::string name) + { + if (kernel_function_name.find(name) == kernel_function_name.end()) + { + return kernel_function_name["default"]; + } + return kernel_function_name[name]; + } + + +private: + std::map<std::string, std::string> kernel_function_name; + void load_masks() + { + if (divergence_mask_arg != -1) + { + // 1 in string will be set 1, 0 will be set 0 + bs128 mask_0xf0f0f0f0("11110000111100001111000011110000" + "11110000111100001111000011110000" + "11110000111100001111000011110000" + "11110000111100001111000011110000", + 128, '0', '1'); + all_work_item_masks.push_back(mask_0xf0f0f0f0); + // 1 in string will be set 0, 0 will be set 1 + bs128 mask_0x0f0f0f0f("11110000111100001111000011110000" + "11110000111100001111000011110000" + "11110000111100001111000011110000" + "11110000111100001111000011110000", + 128, '1', '0'); + all_work_item_masks.push_back(mask_0x0f0f0f0f); + bs128 mask_0x5555aaaa("10101010101010101010101010101010" + "10101010101010101010101010101010" + "10101010101010101010101010101010" + "10101010101010101010101010101010", + 128, '0', '1'); + all_work_item_masks.push_back(mask_0x5555aaaa); + bs128 mask_0xaaaa5555("10101010101010101010101010101010" + "10101010101010101010101010101010" + "10101010101010101010101010101010" + "10101010101010101010101010101010", + 128, '1', '0'); + all_work_item_masks.push_back(mask_0xaaaa5555); + // 0x0f0ff0f0 + bs128 mask_0x0f0ff0f0("00001111000011111111000011110000" + "00001111000011111111000011110000" + "00001111000011111111000011110000" + "00001111000011111111000011110000", + 128, '0', '1'); + all_work_item_masks.push_back(mask_0x0f0ff0f0); + // 0xff0000ff + bs128 mask_0xff0000ff("11111111000000000000000011111111" + "11111111000000000000000011111111" + "11111111000000000000000011111111" + "11111111000000000000000011111111", + 128, '0', '1'); + all_work_item_masks.push_back(mask_0xff0000ff); + // 0xff00ff00 + bs128 mask_0xff00ff00("11111111000000001111111100000000" + 
"11111111000000001111111100000000" + "11111111000000001111111100000000" + "11111111000000001111111100000000", + 128, '0', '1'); + all_work_item_masks.push_back(mask_0xff00ff00); + // 0x00ffff00 + bs128 mask_0x00ffff00("00000000111111111111111100000000" + "00000000111111111111111100000000" + "00000000111111111111111100000000" + "00000000111111111111111100000000", + 128, '0', '1'); + all_work_item_masks.push_back(mask_0x00ffff00); + // 0x80 1 workitem highest id for 8 subgroup size + bs128 mask_0x80808080("10000000100000001000000010000000" + "10000000100000001000000010000000" + "10000000100000001000000010000000" + "10000000100000001000000010000000", + 128, '0', '1'); + + all_work_item_masks.push_back(mask_0x80808080); + // 0x8000 1 workitem highest id for 16 subgroup size + bs128 mask_0x80008000("10000000000000001000000000000000" + "10000000000000001000000000000000" + "10000000000000001000000000000000" + "10000000000000001000000000000000", + 128, '0', '1'); + all_work_item_masks.push_back(mask_0x80008000); + // 0x80000000 1 workitem highest id for 32 subgroup size + bs128 mask_0x80000000("10000000000000000000000000000000" + "10000000000000000000000000000000" + "10000000000000000000000000000000" + "10000000000000000000000000000000", + 128, '0', '1'); + all_work_item_masks.push_back(mask_0x80000000); + // 0x80000000 00000000 1 workitem highest id for 64 subgroup size + // 0x80000000 1 workitem highest id for 32 subgroup size + bs128 mask_0x8000000000000000("10000000000000000000000000000000" + "00000000000000000000000000000000" + "10000000000000000000000000000000" + "00000000000000000000000000000000", + 128, '0', '1'); + + all_work_item_masks.push_back(mask_0x8000000000000000); + // 0x80000000 00000000 00000000 00000000 1 workitem highest id for + // 128 subgroup size + bs128 mask_0x80000000000000000000000000000000( + "10000000000000000000000000000000" + "00000000000000000000000000000000" + "00000000000000000000000000000000" + "00000000000000000000000000000000", + 128, 
'0', '1'); + all_work_item_masks.push_back( + mask_0x80000000000000000000000000000000); + + bs128 mask_0xffffffff("11111111111111111111111111111111" + "11111111111111111111111111111111" + "11111111111111111111111111111111" + "11111111111111111111111111111111", + 128, '0', '1'); + all_work_item_masks.push_back(mask_0xffffffff); + } + } }; enum class SubgroupsBroadcastOp @@ -89,7 +251,9 @@ enum class ShuffleOp shuffle, shuffle_up, shuffle_down, - shuffle_xor + shuffle_xor, + rotate, + clustered_rotate, }; enum class ArithmeticOp @@ -120,7 +284,7 @@ static const char *const operation_names(ArithmeticOp operation) case ArithmeticOp::logical_and: return "logical_and"; case ArithmeticOp::logical_or: return "logical_or"; case ArithmeticOp::logical_xor: return "logical_xor"; - default: log_error("Unknown operation request"); break; + default: log_error("Unknown operation request\n"); break; } return ""; } @@ -142,7 +306,7 @@ static const char *const operation_names(BallotOp operation) case BallotOp::gt_mask: return "gt"; case BallotOp::le_mask: return "le"; case BallotOp::lt_mask: return "lt"; - default: log_error("Unknown operation request"); break; + default: log_error("Unknown operation request\n"); break; } return ""; } @@ -155,7 +319,9 @@ static const char *const operation_names(ShuffleOp operation) case ShuffleOp::shuffle_up: return "shuffle_up"; case ShuffleOp::shuffle_down: return "shuffle_down"; case ShuffleOp::shuffle_xor: return "shuffle_xor"; - default: log_error("Unknown operation request"); break; + case ShuffleOp::rotate: return "rotate"; + case ShuffleOp::clustered_rotate: return "clustered_rotate"; + default: log_error("Unknown operation request\n"); break; } return ""; } @@ -168,7 +334,7 @@ static const char *const operation_names(NonUniformVoteOp operation) case NonUniformVoteOp::all_equal: return "all_equal"; case NonUniformVoteOp::any: return "any"; case NonUniformVoteOp::elect: return "elect"; - default: log_error("Unknown operation request"); break; 
+ default: log_error("Unknown operation request\n"); break; } return ""; } @@ -181,7 +347,7 @@ static const char *const operation_names(SubgroupsBroadcastOp operation) case SubgroupsBroadcastOp::broadcast_first: return "broadcast_first"; case SubgroupsBroadcastOp::non_uniform_broadcast: return "non_uniform_broadcast"; - default: log_error("Unknown operation request"); break; + default: log_error("Unknown operation request\n"); break; } return ""; } @@ -358,7 +524,7 @@ template <typename Ty> struct CommonTypeManager case ArithmeticOp::and_: return (Ty)~0; case ArithmeticOp::or_: return (Ty)0; case ArithmeticOp::xor_: return (Ty)0; - default: log_error("Unknown operation request"); break; + default: log_error("Unknown operation request\n"); break; } return 0; } @@ -386,7 +552,7 @@ template <> struct TypeManager<cl_int> : public CommonTypeManager<cl_int> case ArithmeticOp::logical_and: return (cl_int)1; case ArithmeticOp::logical_or: return (cl_int)0; case ArithmeticOp::logical_xor: return (cl_int)0; - default: log_error("Unknown operation request"); break; + default: log_error("Unknown operation request\n"); break; } return 0; } @@ -800,7 +966,7 @@ template <> struct TypeManager<cl_float> : public CommonTypeManager<cl_float> case ArithmeticOp::min_: return std::numeric_limits<float>::infinity(); case ArithmeticOp::mul_: return (cl_float)1; - default: log_error("Unknown operation request"); break; + default: log_error("Unknown operation request\n"); break; } return 0; } @@ -859,7 +1025,7 @@ template <> struct TypeManager<cl_double> : public CommonTypeManager<cl_double> case ArithmeticOp::min_: return std::numeric_limits<double>::infinity(); case ArithmeticOp::mul_: return (cl_double)1; - default: log_error("Unknown operation request"); break; + default: log_error("Unknown operation request\n"); break; } return 0; } @@ -946,7 +1112,7 @@ struct TypeManager<subgroups::cl_half> case ArithmeticOp::max_: return { 0xfc00 }; case ArithmeticOp::min_: return { 0x7c00 }; case 
ArithmeticOp::mul_: return { 0x3c00 }; - default: log_error("Unknown operation request"); break; + default: log_error("Unknown operation request\n"); break; } return { 0 }; } @@ -1080,7 +1246,7 @@ template <typename Ty> typename std::enable_if<TypeManager<Ty>::is_sb_scalar_type::value>::type set_value(Ty &lhs, const cl_ulong &rhs) { - lhs.data = rhs; + lhs.data = cl_half_from_float(static_cast<cl_float>(rhs), g_rounding_mode); } // compare for common vectors @@ -1164,98 +1330,172 @@ inline bool compare_ordered(const subgroups::cl_half &lhs, const int &rhs) return cl_half_to_float(lhs.data) == rhs; } -// Run a test kernel to compute the result of a built-in on an input -static int run_kernel(cl_context context, cl_command_queue queue, - cl_kernel kernel, size_t global, size_t local, - void *idata, size_t isize, void *mdata, size_t msize, - void *odata, size_t osize, size_t tsize = 0) -{ - clMemWrapper in; - clMemWrapper xy; - clMemWrapper out; - clMemWrapper tmp; - int error; +template <typename Ty, typename Fns> class KernelExecutor { +public: + KernelExecutor(cl_context c, cl_command_queue q, cl_kernel k, size_t g, + size_t l, Ty *id, size_t is, Ty *mid, Ty *mod, cl_int *md, + size_t ms, Ty *od, size_t os, size_t ts = 0) + : context(c), queue(q), kernel(k), global(g), local(l), idata(id), + isize(is), mapin_data(mid), mapout_data(mod), mdata(md), msize(ms), + odata(od), osize(os), tsize(ts) + { + has_status = false; + run_failed = false; + } + cl_context context; + cl_command_queue queue; + cl_kernel kernel; + size_t global; + size_t local; + Ty *idata; + size_t isize; + Ty *mapin_data; + Ty *mapout_data; + cl_int *mdata; + size_t msize; + Ty *odata; + size_t osize; + size_t tsize; + bool run_failed; - in = clCreateBuffer(context, CL_MEM_READ_ONLY, isize, NULL, &error); - test_error(error, "clCreateBuffer failed"); +private: + bool has_status; + test_status status; - xy = clCreateBuffer(context, CL_MEM_WRITE_ONLY, msize, NULL, &error); - test_error(error, 
"clCreateBuffer failed"); +public: + // Run a test kernel to compute the result of a built-in on an input + int run() + { + clMemWrapper in; + clMemWrapper xy; + clMemWrapper out; + clMemWrapper tmp; + int error; + + in = clCreateBuffer(context, CL_MEM_READ_ONLY, isize, NULL, &error); + test_error(error, "clCreateBuffer failed"); - out = clCreateBuffer(context, CL_MEM_WRITE_ONLY, osize, NULL, &error); - test_error(error, "clCreateBuffer failed"); + xy = clCreateBuffer(context, CL_MEM_WRITE_ONLY, msize, NULL, &error); + test_error(error, "clCreateBuffer failed"); - if (tsize) - { - tmp = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_HOST_NO_ACCESS, - tsize, NULL, &error); + out = clCreateBuffer(context, CL_MEM_WRITE_ONLY, osize, NULL, &error); test_error(error, "clCreateBuffer failed"); - } - error = clSetKernelArg(kernel, 0, sizeof(in), (void *)&in); - test_error(error, "clSetKernelArg failed"); + if (tsize) + { + tmp = clCreateBuffer(context, + CL_MEM_READ_WRITE | CL_MEM_HOST_NO_ACCESS, + tsize, NULL, &error); + test_error(error, "clCreateBuffer failed"); + } - error = clSetKernelArg(kernel, 1, sizeof(xy), (void *)&xy); - test_error(error, "clSetKernelArg failed"); + error = clSetKernelArg(kernel, 0, sizeof(in), (void *)&in); + test_error(error, "clSetKernelArg failed"); - error = clSetKernelArg(kernel, 2, sizeof(out), (void *)&out); - test_error(error, "clSetKernelArg failed"); + error = clSetKernelArg(kernel, 1, sizeof(xy), (void *)&xy); + test_error(error, "clSetKernelArg failed"); - if (tsize) - { - error = clSetKernelArg(kernel, 3, sizeof(tmp), (void *)&tmp); + error = clSetKernelArg(kernel, 2, sizeof(out), (void *)&out); test_error(error, "clSetKernelArg failed"); + + if (tsize) + { + error = clSetKernelArg(kernel, 3, sizeof(tmp), (void *)&tmp); + test_error(error, "clSetKernelArg failed"); + } + + error = clEnqueueWriteBuffer(queue, in, CL_FALSE, 0, isize, idata, 0, + NULL, NULL); + test_error(error, "clEnqueueWriteBuffer failed"); + + error = 
clEnqueueWriteBuffer(queue, xy, CL_FALSE, 0, msize, mdata, 0, + NULL, NULL); + test_error(error, "clEnqueueWriteBuffer failed"); + error = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global, &local, + 0, NULL, NULL); + test_error(error, "clEnqueueNDRangeKernel failed"); + + error = clEnqueueReadBuffer(queue, xy, CL_FALSE, 0, msize, mdata, 0, + NULL, NULL); + test_error(error, "clEnqueueReadBuffer failed"); + + error = clEnqueueReadBuffer(queue, out, CL_FALSE, 0, osize, odata, 0, + NULL, NULL); + test_error(error, "clEnqueueReadBuffer failed"); + + error = clFinish(queue); + test_error(error, "clFinish failed"); + + return error; } - error = clEnqueueWriteBuffer(queue, in, CL_FALSE, 0, isize, idata, 0, NULL, - NULL); - test_error(error, "clEnqueueWriteBuffer failed"); +private: + test_status + run_and_check_with_cluster_size(const WorkGroupParams &test_params) + { + cl_int error = run(); + if (error != CL_SUCCESS) + { + print_error(error, "Failed to run subgroup test kernel"); + status = TEST_FAIL; + run_failed = true; + return status; + } + + test_status tmp_status = + Fns::chk(idata, odata, mapin_data, mapout_data, mdata, test_params); + + if (!has_status || tmp_status == TEST_FAIL + || (tmp_status == TEST_PASS && status != TEST_FAIL)) + { + status = tmp_status; + has_status = true; + } + + return status; + } - error = clEnqueueWriteBuffer(queue, xy, CL_FALSE, 0, msize, mdata, 0, NULL, - NULL); - test_error(error, "clEnqueueWriteBuffer failed"); - error = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global, &local, 0, - NULL, NULL); - test_error(error, "clEnqueueNDRangeKernel failed"); +public: + test_status run_and_check(WorkGroupParams &test_params) + { + test_status tmp_status = TEST_SKIPPED_ITSELF; - error = clEnqueueReadBuffer(queue, xy, CL_FALSE, 0, msize, mdata, 0, NULL, - NULL); - test_error(error, "clEnqueueReadBuffer failed"); + if (test_params.cluster_size_arg != -1) + { + for (cl_uint cluster_size = 1; + cluster_size <= 
test_params.subgroup_size; cluster_size *= 2) + { + test_params.cluster_size = cluster_size; + cl_int error = + clSetKernelArg(kernel, test_params.cluster_size_arg, + sizeof(cl_uint), &cluster_size); + test_error_fail(error, "Unable to set cluster size"); - error = clEnqueueReadBuffer(queue, out, CL_FALSE, 0, osize, odata, 0, NULL, - NULL); - test_error(error, "clEnqueueReadBuffer failed"); + tmp_status = run_and_check_with_cluster_size(test_params); - error = clFinish(queue); - test_error(error, "clFinish failed"); + if (tmp_status == TEST_FAIL) break; + } + } + else + { + tmp_status = run_and_check_with_cluster_size(test_params); + } - return error; -} + return tmp_status; + } +}; // Driver for testing a single built in function template <typename Ty, typename Fns, size_t TSIZE = 0> struct test { - static int mrun(cl_device_id device, cl_context context, - cl_command_queue queue, int num_elements, const char *kname, - const char *src, WorkGroupParams test_params) - { - int error = TEST_PASS; - for (auto &mask : test_params.all_work_item_masks) - { - test_params.work_items_mask = mask; - error |= run(device, context, queue, num_elements, kname, src, - test_params); - } - return error; - }; - static int run(cl_device_id device, cl_context context, - cl_command_queue queue, int num_elements, const char *kname, - const char *src, WorkGroupParams test_params) + static test_status run(cl_device_id device, cl_context context, + cl_command_queue queue, int num_elements, + const char *kname, const char *src, + WorkGroupParams test_params) { size_t tmp; - int error; - int subgroup_size, num_subgroups; - size_t realSize; + cl_int error; + size_t subgroup_size, num_subgroups; size_t global = test_params.global_workgroup_size; size_t local = test_params.local_workgroup_size; clProgramWrapper program; @@ -1268,13 +1508,8 @@ template <typename Ty, typename Fns, size_t TSIZE = 0> struct test std::vector<Ty> mapout; mapout.resize(local); std::stringstream kernel_sstr; - if 
(test_params.work_items_mask != 0) - { - kernel_sstr << "#define WORK_ITEMS_MASK "; - kernel_sstr << "0x" << std::hex << test_params.work_items_mask - << "\n"; - } + Fns::log_test(test_params, ""); kernel_sstr << "#define NR_OF_ACTIVE_WORK_ITEMS "; kernel_sstr << NR_OF_ACTIVE_WORK_ITEMS << "\n"; @@ -1282,36 +1517,21 @@ template <typename Ty, typename Fns, size_t TSIZE = 0> struct test if (!TypeManager<Ty>::type_supported(device)) { log_info("Data type not supported : %s\n", TypeManager<Ty>::name()); - return 0; + return TEST_SKIPPED_ITSELF; } - else + + if (strstr(TypeManager<Ty>::name(), "double")) { - if (strstr(TypeManager<Ty>::name(), "double")) - { - kernel_sstr << "#pragma OPENCL EXTENSION cl_khr_fp64: enable\n"; - } - else if (strstr(TypeManager<Ty>::name(), "half")) - { - kernel_sstr << "#pragma OPENCL EXTENSION cl_khr_fp16: enable\n"; - } + kernel_sstr << "#pragma OPENCL EXTENSION cl_khr_fp64: enable\n"; } - - for (std::string extension : test_params.required_extensions) + else if (strstr(TypeManager<Ty>::name(), "half")) { - if (!is_extension_available(device, extension.c_str())) - { - log_info("The extension %s not supported on this device. 
SKIP " - "testing - kernel %s data type %s\n", - extension.c_str(), kname, TypeManager<Ty>::name()); - return TEST_PASS; - } - kernel_sstr << "#pragma OPENCL EXTENSION " + extension - + ": enable\n"; + kernel_sstr << "#pragma OPENCL EXTENSION cl_khr_fp16: enable\n"; } error = clGetDeviceInfo(device, CL_DEVICE_PLATFORM, sizeof(platform), (void *)&platform, NULL); - test_error(error, "clGetDeviceInfo failed for CL_DEVICE_PLATFORM"); + test_error_fail(error, "clGetDeviceInfo failed for CL_DEVICE_PLATFORM"); if (test_params.use_core_subgroups) { kernel_sstr @@ -1326,12 +1546,12 @@ template <typename Ty, typename Fns, size_t TSIZE = 0> struct test error = create_single_kernel_helper(context, &program, &kernel, 1, &kernel_src, kname); - if (error != 0) return error; + if (error != CL_SUCCESS) return TEST_FAIL; // Determine some local dimensions to use for the test. error = get_max_common_work_group_size( context, kernel, test_params.global_workgroup_size, &local); - test_error(error, "get_max_common_work_group_size failed"); + test_error_fail(error, "get_max_common_work_group_size failed"); // Limit it a bit so we have muliple work groups // Ideally this will still be large enough to give us multiple @@ -1345,7 +1565,7 @@ template <typename Ty, typename Fns, size_t TSIZE = 0> struct test subgroupsApiSet.clGetKernelSubGroupInfo_ptr(); if (clGetKernelSubGroupInfo_ptr == NULL) { - log_error("ERROR: %s function not available", + log_error("ERROR: %s function not available\n", subgroupsApiSet.clGetKernelSubGroupInfo_name); return TEST_FAIL; } @@ -1355,12 +1575,12 @@ template <typename Ty, typename Fns, size_t TSIZE = 0> struct test if (error != CL_SUCCESS) { log_error("ERROR: %s function error for " - "CL_KERNEL_MAX_SUB_GROUP_SIZE_FOR_NDRANGE", + "CL_KERNEL_MAX_SUB_GROUP_SIZE_FOR_NDRANGE\n", subgroupsApiSet.clGetKernelSubGroupInfo_name); return TEST_FAIL; } - subgroup_size = (int)tmp; + subgroup_size = tmp; error = clGetKernelSubGroupInfo_ptr( kernel, device, 
CL_KERNEL_SUB_GROUP_COUNT_FOR_NDRANGE, @@ -1368,16 +1588,16 @@ template <typename Ty, typename Fns, size_t TSIZE = 0> struct test if (error != CL_SUCCESS) { log_error("ERROR: %s function error for " - "CL_KERNEL_SUB_GROUP_COUNT_FOR_NDRANGE", + "CL_KERNEL_SUB_GROUP_COUNT_FOR_NDRANGE\n", subgroupsApiSet.clGetKernelSubGroupInfo_name); return TEST_FAIL; } - num_subgroups = (int)tmp; + num_subgroups = tmp; // Make sure the number of sub groups is what we expect if (num_subgroups != (local + subgroup_size - 1) / subgroup_size) { - log_error("ERROR: unexpected number of subgroups (%d) returned\n", + log_error("ERROR: unexpected number of subgroups (%zu) returned\n", num_subgroups); return TEST_FAIL; } @@ -1386,41 +1606,83 @@ template <typename Ty, typename Fns, size_t TSIZE = 0> struct test std::vector<Ty> odata; size_t input_array_size = global; size_t output_array_size = global; - int dynscl = test_params.dynsc; + size_t dynscl = test_params.dynsc; if (dynscl != 0) { - input_array_size = - (int)global / (int)local * num_subgroups * dynscl; - output_array_size = (int)global / (int)local * dynscl; + input_array_size = global / local * num_subgroups * dynscl; + output_array_size = global / local * dynscl; } idata.resize(input_array_size); odata.resize(output_array_size); + if (test_params.divergence_mask_arg != -1) + { + cl_uint4 mask_vector; + mask_vector.x = 0xffffffffU; + mask_vector.y = 0xffffffffU; + mask_vector.z = 0xffffffffU; + mask_vector.w = 0xffffffffU; + error = clSetKernelArg(kernel, test_params.divergence_mask_arg, + sizeof(cl_uint4), &mask_vector); + test_error_fail(error, "Unable to set divergence mask argument"); + } + + if (test_params.cluster_size_arg != -1) + { + cl_uint dummy_cluster_size = 1; + error = clSetKernelArg(kernel, test_params.cluster_size_arg, + sizeof(cl_uint), &dummy_cluster_size); + test_error_fail(error, "Unable to set dummy cluster size"); + } + + KernelExecutor<Ty, Fns> executor( + context, queue, kernel, global, local, idata.data(), 
+ input_array_size * sizeof(Ty), mapin.data(), mapout.data(), + sgmap.data(), global * sizeof(cl_int4), odata.data(), + output_array_size * sizeof(Ty), TSIZE * sizeof(Ty)); + // Run the kernel once on zeroes to get the map memset(idata.data(), 0, input_array_size * sizeof(Ty)); - error = run_kernel(context, queue, kernel, global, local, idata.data(), - input_array_size * sizeof(Ty), sgmap.data(), - global * sizeof(cl_int4), odata.data(), - output_array_size * sizeof(Ty), TSIZE * sizeof(Ty)); - test_error(error, "Running kernel first time failed"); + error = executor.run(); + test_error_fail(error, "Running kernel first time failed"); // Generate the desired input for the kernel - test_params.subgroup_size = subgroup_size; Fns::gen(idata.data(), mapin.data(), sgmap.data(), test_params); - error = run_kernel(context, queue, kernel, global, local, idata.data(), - input_array_size * sizeof(Ty), sgmap.data(), - global * sizeof(cl_int4), odata.data(), - output_array_size * sizeof(Ty), TSIZE * sizeof(Ty)); - test_error(error, "Running kernel second time failed"); - - // Check the result - error = Fns::chk(idata.data(), odata.data(), mapin.data(), - mapout.data(), sgmap.data(), test_params); - test_error(error, "Data verification failed"); - return TEST_PASS; + + test_status status; + + if (test_params.divergence_mask_arg != -1) + { + for (auto &mask : test_params.all_work_item_masks) + { + test_params.work_items_mask = mask; + cl_uint4 mask_vector = bs128_to_cl_uint4(mask); + clSetKernelArg(kernel, test_params.divergence_mask_arg, + sizeof(cl_uint4), &mask_vector); + + status = executor.run_and_check(test_params); + + if (status == TEST_FAIL) break; + } + } + else + { + status = executor.run_and_check(test_params); + } + // Detailed failure and skip messages should be logged by + // run_and_check. 
+ if (status == TEST_PASS) + { + Fns::log_test(test_params, " passed"); + } + else if (!executor.run_failed && status == TEST_FAIL) + { + test_fail("Data verification failed\n"); + } + return status; } }; @@ -1466,21 +1728,21 @@ struct RunTestForType num_elements_(num_elements), test_params_(test_params) {} template <typename T, typename U> - int run_impl(const char *kernel_name, const char *source) + int run_impl(const std::string &function_name) { int error = TEST_PASS; - if (test_params_.all_work_item_masks.size() > 0) - { - error = test<T, U>::mrun(device_, context_, queue_, num_elements_, - kernel_name, source, test_params_); - } - else - { - error = test<T, U>::run(device_, context_, queue_, num_elements_, - kernel_name, source, test_params_); - } - - return error; + std::string source = + std::regex_replace(test_params_.get_kernel_source(function_name), + std::regex("\\%s"), function_name); + std::string kernel_name = "test_" + function_name; + error = + test<T, U>::run(device_, context_, queue_, num_elements_, + kernel_name.c_str(), source.c_str(), test_params_); + + // If we return TEST_SKIPPED_ITSELF here, then an entire suite may be + // reported as having been skipped even if some tests within it + // passed, as the status codes are erroneously ORed together: + return error == TEST_FAIL ? 
TEST_FAIL : TEST_PASS; } private: diff --git a/test_conformance/subgroups/test_barrier.cpp b/test_conformance/subgroups/test_barrier.cpp index 47e42f65..fb93ddb1 100644 --- a/test_conformance/subgroups/test_barrier.cpp +++ b/test_conformance/subgroups/test_barrier.cpp @@ -59,6 +59,17 @@ static const char *gbar_source = // barrier test functions template <int Which> struct BAR { + static void log_test(const WorkGroupParams &test_params, + const char *extra_text) + { + if (Which == 0) + log_info(" sub_group_barrier(CLK_LOCAL_MEM_FENCE)...%s\n", + extra_text); + else + log_info(" sub_group_barrier(CLK_GLOBAL_MEM_FENCE)...%s\n", + extra_text); + } + static void gen(cl_int *x, cl_int *t, cl_int *m, const WorkGroupParams &test_params) { @@ -68,7 +79,6 @@ template <int Which> struct BAR int ng = test_params.global_workgroup_size; int nj = (nw + ns - 1) / ns; ng = ng / nw; - int e; ii = 0; for (k = 0; k < ng; ++k) @@ -92,8 +102,8 @@ template <int Which> struct BAR } } - static int chk(cl_int *x, cl_int *y, cl_int *mx, cl_int *my, cl_int *m, - const WorkGroupParams &test_params) + static test_status chk(cl_int *x, cl_int *y, cl_int *mx, cl_int *my, + cl_int *m, const WorkGroupParams &test_params) { int ii, i, j, k, n; int nw = test_params.local_workgroup_size; @@ -103,11 +113,6 @@ template <int Which> struct BAR ng = ng / nw; cl_int tr, rr; - if (Which == 0) - log_info(" sub_group_barrier(CLK_LOCAL_MEM_FENCE)...\n"); - else - log_info(" sub_group_barrier(CLK_GLOBAL_MEM_FENCE)...\n"); - for (k = 0; k < ng; ++k) { // Map to array indexed to array indexed by local ID and sub group @@ -133,7 +138,7 @@ template <int Which> struct BAR "id %d in sub group %d in group %d expected " "%d got %d\n", i, j, k, tr, rr); - return -1; + return TEST_FAIL; } } } @@ -143,7 +148,7 @@ template <int Which> struct BAR m += 2 * nw; } - return 0; + return TEST_PASS; } }; @@ -187,4 +192,4 @@ int test_barrier_functions_ext(cl_device_id device, cl_context context, } return 
test_barrier_functions(device, context, queue, num_elements, false); -}
\ No newline at end of file +} diff --git a/test_conformance/subgroups/test_ifp.cpp b/test_conformance/subgroups/test_ifp.cpp index 428f2cdc..f2bd5b92 100644 --- a/test_conformance/subgroups/test_ifp.cpp +++ b/test_conformance/subgroups/test_ifp.cpp @@ -225,6 +225,12 @@ void run_insts(cl_int *x, cl_int *p, int n) struct IFP { + static void log_test(const WorkGroupParams &test_params, + const char *extra_text) + { + log_info(" independent forward progress...%s\n", extra_text); + } + static void gen(cl_int *x, cl_int *t, cl_int *, const WorkGroupParams &test_params) { @@ -245,8 +251,8 @@ struct IFP } } - static int chk(cl_int *x, cl_int *y, cl_int *t, cl_int *, cl_int *, - const WorkGroupParams &test_params) + static test_status chk(cl_int *x, cl_int *y, cl_int *t, cl_int *, cl_int *, + const WorkGroupParams &test_params) { int i, k; int nw = test_params.local_workgroup_size; @@ -255,10 +261,8 @@ struct IFP int nj = (nw + ns - 1) / ns; ng = ng / nw; - // We need at least 2 sub groups per group for this tes - if (nj == 1) return 0; - - log_info(" independent forward progress...\n"); + // We need at least 2 sub groups per group for this test + if (nj == 1) return TEST_SKIPPED_ITSELF; for (k = 0; k < ng; ++k) { @@ -270,14 +274,14 @@ struct IFP log_error( "ERROR: mismatch at element %d in work group %d\n", i, k); - return -1; + return TEST_FAIL; } } x += nj * (NUM_LOC + 1); y += NUM_LOC; } - return 0; + return TEST_PASS; } }; @@ -360,17 +364,21 @@ int test_ifp_ext(cl_device_id device, cl_context context, } // ifp only in subgroup functions tests: test_status error; - error = checkIFPSupport(device, ifpSupport); - if (error != TEST_PASS) - { - return error; - } - if (ifpSupport == false) + auto device_cl_version = get_device_cl_version(device); + if (device_cl_version >= Version(2, 1)) { - log_info( - "Error reason: the extension cl_khr_subgroups requires that " - "Independed forward progress has to be supported by device.\n"); - return TEST_FAIL; + error = 
checkIFPSupport(device, ifpSupport); + if (error != TEST_PASS) + { + return error; + } + if (ifpSupport == false) + { + log_info( + "Error reason: the extension cl_khr_subgroups requires that " + "Independed forward progress has to be supported by device.\n"); + return TEST_FAIL; + } } return test_ifp(device, context, queue, num_elements, false); -}
\ No newline at end of file +} diff --git a/test_conformance/subgroups/test_queries.cpp b/test_conformance/subgroups/test_queries.cpp index 761ca7a6..6b940935 100644 --- a/test_conformance/subgroups/test_queries.cpp +++ b/test_conformance/subgroups/test_queries.cpp @@ -100,7 +100,7 @@ int test_sub_group_info(cl_device_id device, cl_context context, subgroupsApiSet.clGetKernelSubGroupInfo_ptr(); if (clGetKernelSubGroupInfo_ptr == NULL) { - log_error("ERROR: %s function not available", + log_error("ERROR: %s function not available\n", subgroupsApiSet.clGetKernelSubGroupInfo_name); return TEST_FAIL; } @@ -112,7 +112,7 @@ int test_sub_group_info(cl_device_id device, cl_context context, if (error != CL_SUCCESS) { log_error("ERROR: %s function error for " - "CL_KERNEL_MAX_SUB_GROUP_SIZE_FOR_NDRANGE", + "CL_KERNEL_MAX_SUB_GROUP_SIZE_FOR_NDRANGE\n", subgroupsApiSet.clGetKernelSubGroupInfo_name); return TEST_FAIL; } @@ -133,7 +133,7 @@ int test_sub_group_info(cl_device_id device, cl_context context, if (error != CL_SUCCESS) { log_error("ERROR: %s function error " - "for CL_KERNEL_SUB_GROUP_COUNT_FOR_NDRANGE", + "for CL_KERNEL_SUB_GROUP_COUNT_FOR_NDRANGE\n", subgroupsApiSet.clGetKernelSubGroupInfo_name); return TEST_FAIL; } @@ -209,4 +209,4 @@ int test_sub_group_info_ext(cl_device_id device, cl_context context, } return test_sub_group_info(device, context, queue, num_elements, false); -}
\ No newline at end of file +} diff --git a/test_conformance/subgroups/test_subgroup.cpp b/test_conformance/subgroups/test_subgroup.cpp index c0e49524..75e9d4ae 100644 --- a/test_conformance/subgroups/test_subgroup.cpp +++ b/test_conformance/subgroups/test_subgroup.cpp @@ -24,6 +24,13 @@ namespace { // Any/All test functions template <NonUniformVoteOp operation> struct AA { + static void log_test(const WorkGroupParams &test_params, + const char *extra_text) + { + log_info(" sub_group_%s...%s\n", operation_names(operation), + extra_text); + } + static void gen(cl_int *x, cl_int *t, cl_int *m, const WorkGroupParams &test_params) { @@ -35,7 +42,6 @@ template <NonUniformVoteOp operation> struct AA int e; ng = ng / nw; ii = 0; - log_info(" sub_group_%s...\n", operation_names(operation)); for (k = 0; k < ng; ++k) { for (j = 0; j < nj; ++j) @@ -68,8 +74,8 @@ template <NonUniformVoteOp operation> struct AA } } - static int chk(cl_int *x, cl_int *y, cl_int *mx, cl_int *my, cl_int *m, - const WorkGroupParams &test_params) + static test_status chk(cl_int *x, cl_int *y, cl_int *mx, cl_int *my, + cl_int *m, const WorkGroupParams &test_params) { int ii, i, j, k, n; int ng = test_params.global_workgroup_size; @@ -124,51 +130,33 @@ template <NonUniformVoteOp operation> struct AA y += nw; m += 4 * nw; } - log_info(" sub_group_%s... 
passed\n", operation_names(operation)); return TEST_PASS; } }; -static const char *any_source = "__kernel void test_any(const __global Type " - "*in, __global int4 *xy, __global Type *out)\n" - "{\n" - " int gid = get_global_id(0);\n" - " XY(xy,gid);\n" - " out[gid] = sub_group_any(in[gid]);\n" - "}\n"; - -static const char *all_source = "__kernel void test_all(const __global Type " - "*in, __global int4 *xy, __global Type *out)\n" - "{\n" - " int gid = get_global_id(0);\n" - " XY(xy,gid);\n" - " out[gid] = sub_group_all(in[gid]);\n" - "}\n"; - - template <typename T> int run_broadcast_scan_reduction_for_type(RunTestForType rft) { int error = rft.run_impl<T, BC<T, SubgroupsBroadcastOp::broadcast>>( - "test_bcast", bcast_source); - error |= rft.run_impl<T, RED_NU<T, ArithmeticOp::add_>>("test_redadd", - redadd_source); - error |= rft.run_impl<T, RED_NU<T, ArithmeticOp::max_>>("test_redmax", - redmax_source); - error |= rft.run_impl<T, RED_NU<T, ArithmeticOp::min_>>("test_redmin", - redmin_source); - error |= rft.run_impl<T, SCIN_NU<T, ArithmeticOp::add_>>("test_scinadd", - scinadd_source); - error |= rft.run_impl<T, SCIN_NU<T, ArithmeticOp::max_>>("test_scinmax", - scinmax_source); - error |= rft.run_impl<T, SCIN_NU<T, ArithmeticOp::min_>>("test_scinmin", - scinmin_source); - error |= rft.run_impl<T, SCEX_NU<T, ArithmeticOp::add_>>("test_scexadd", - scexadd_source); - error |= rft.run_impl<T, SCEX_NU<T, ArithmeticOp::max_>>("test_scexmax", - scexmax_source); - error |= rft.run_impl<T, SCEX_NU<T, ArithmeticOp::min_>>("test_scexmin", - scexmin_source); + "sub_group_broadcast"); + error |= + rft.run_impl<T, RED_NU<T, ArithmeticOp::add_>>("sub_group_reduce_add"); + error |= + rft.run_impl<T, RED_NU<T, ArithmeticOp::max_>>("sub_group_reduce_max"); + error |= + rft.run_impl<T, RED_NU<T, ArithmeticOp::min_>>("sub_group_reduce_min"); + error |= rft.run_impl<T, SCIN_NU<T, ArithmeticOp::add_>>( + "sub_group_scan_inclusive_add"); + error |= rft.run_impl<T, SCIN_NU<T, 
ArithmeticOp::max_>>( + "sub_group_scan_inclusive_max"); + error |= rft.run_impl<T, SCIN_NU<T, ArithmeticOp::min_>>( + "sub_group_scan_inclusive_min"); + error |= rft.run_impl<T, SCEX_NU<T, ArithmeticOp::add_>>( + "sub_group_scan_exclusive_add"); + error |= rft.run_impl<T, SCEX_NU<T, ArithmeticOp::max_>>( + "sub_group_scan_exclusive_max"); + error |= rft.run_impl<T, SCEX_NU<T, ArithmeticOp::min_>>( + "sub_group_scan_exclusive_min"); return error; } @@ -181,11 +169,14 @@ int test_subgroup_functions(cl_device_id device, cl_context context, constexpr size_t global_work_size = 2000; constexpr size_t local_work_size = 200; WorkGroupParams test_params(global_work_size, local_work_size); + test_params.save_kernel_source(sub_group_reduction_scan_source); + test_params.save_kernel_source(sub_group_generic_source, + "sub_group_broadcast"); + RunTestForType rft(device, context, queue, num_elements, test_params); int error = - rft.run_impl<cl_int, AA<NonUniformVoteOp::any>>("test_any", any_source); - error |= - rft.run_impl<cl_int, AA<NonUniformVoteOp::all>>("test_all", all_source); + rft.run_impl<cl_int, AA<NonUniformVoteOp::any>>("sub_group_any"); + error |= rft.run_impl<cl_int, AA<NonUniformVoteOp::all>>("sub_group_all"); error |= run_broadcast_scan_reduction_for_type<cl_int>(rft); error |= run_broadcast_scan_reduction_for_type<cl_uint>(rft); error |= run_broadcast_scan_reduction_for_type<cl_long>(rft); diff --git a/test_conformance/subgroups/test_subgroup_ballot.cpp b/test_conformance/subgroups/test_subgroup_ballot.cpp index f2e4060b..3882311d 100644 --- a/test_conformance/subgroups/test_subgroup_ballot.cpp +++ b/test_conformance/subgroups/test_subgroup_ballot.cpp @@ -23,52 +23,101 @@ namespace { // Test for ballot functions template <typename Ty> struct BALLOT { + static void log_test(const WorkGroupParams &test_params, + const char *extra_text) + { + log_info(" sub_group_ballot...%s\n", extra_text); + } + static void gen(Ty *x, Ty *t, cl_int *m, const WorkGroupParams 
&test_params) { - // no work here int gws = test_params.global_workgroup_size; int lws = test_params.local_workgroup_size; int sbs = test_params.subgroup_size; + int sb_number = (lws + sbs - 1) / sbs; int non_uniform_size = gws % lws; - log_info(" sub_group_ballot...\n"); - if (non_uniform_size) - { - log_info(" non uniform work group size mode ON\n"); + int wg_number = gws / lws; + wg_number = non_uniform_size ? wg_number + 1 : wg_number; + int last_subgroup_size = 0; + + for (int wg_id = 0; wg_id < wg_number; ++wg_id) + { // for each work_group + if (non_uniform_size && wg_id == wg_number - 1) + { + set_last_workgroup_params(non_uniform_size, sb_number, sbs, lws, + last_subgroup_size); + } + for (int sb_id = 0; sb_id < sb_number; ++sb_id) + { // for each subgroup + int wg_offset = sb_id * sbs; + int current_sbs; + if (last_subgroup_size && sb_id == sb_number - 1) + { + current_sbs = last_subgroup_size; + } + else + { + current_sbs = wg_offset + sbs > lws ? lws - wg_offset : sbs; + } + + for (int wi_id = 0; wi_id < current_sbs; wi_id++) + { + cl_uint v; + if (genrand_bool(gMTdata)) + { + v = genrand_bool(gMTdata); + } + else if (genrand_bool(gMTdata)) + { + v = 1U << ((genrand_int32(gMTdata) % 31) + 1); + } + else + { + v = genrand_int32(gMTdata); + } + cl_uint4 v4 = { v, 0, 0, 0 }; + t[wi_id + wg_offset] = v4; + } + } + // Now map into work group using map from device + for (int wi_id = 0; wi_id < lws; ++wi_id) + { + x[wi_id] = t[wi_id]; + } + x += lws; + m += 4 * lws; } } - static int chk(Ty *x, Ty *y, Ty *mx, Ty *my, cl_int *m, - const WorkGroupParams &test_params) + static test_status chk(Ty *x, Ty *y, Ty *mx, Ty *my, cl_int *m, + const WorkGroupParams &test_params) { - int wi_id, wg_id, sb_id; int gws = test_params.global_workgroup_size; int lws = test_params.local_workgroup_size; int sbs = test_params.subgroup_size; int sb_number = (lws + sbs - 1) / sbs; - int current_sbs = 0; - cl_uint expected_result, device_result; int non_uniform_size = gws % lws; int 
wg_number = gws / lws; wg_number = non_uniform_size ? wg_number + 1 : wg_number; int last_subgroup_size = 0; - for (wg_id = 0; wg_id < wg_number; ++wg_id) + for (int wg_id = 0; wg_id < wg_number; ++wg_id) { // for each work_group if (non_uniform_size && wg_id == wg_number - 1) { set_last_workgroup_params(non_uniform_size, sb_number, sbs, lws, last_subgroup_size); } - - for (wi_id = 0; wi_id < lws; ++wi_id) + for (int wi_id = 0; wi_id < lws; ++wi_id) { // inside the work_group - // read device outputs for work_group - my[wi_id] = y[wi_id]; + mx[wi_id] = x[wi_id]; // read host inputs for work_group + my[wi_id] = y[wi_id]; // read device outputs for work_group } - for (sb_id = 0; sb_id < sb_number; ++sb_id) + for (int sb_id = 0; sb_id < sb_number; ++sb_id) { // for each subgroup int wg_offset = sb_id * sbs; + int current_sbs; if (last_subgroup_size && sb_id == sb_number - 1) { current_sbs = last_subgroup_size; @@ -77,26 +126,54 @@ template <typename Ty> struct BALLOT { current_sbs = wg_offset + sbs > lws ? 
lws - wg_offset : sbs; } - for (wi_id = 0; wi_id < current_sbs; ++wi_id) + + bs128 expected_result_bs = 0; + + std::set<int> active_work_items; + for (int wi_id = 0; wi_id < current_sbs; ++wi_id) { - device_result = my[wg_offset + wi_id]; - expected_result = 1; - if (!compare(device_result, expected_result)) + if (test_params.work_items_mask.test(wi_id)) + { + bool predicate = (mx[wg_offset + wi_id].s0 != 0); + expected_result_bs |= (bs128(predicate) << wi_id); + active_work_items.insert(wi_id); + } + } + if (active_work_items.empty()) + { + continue; + } + + cl_uint4 expected_result = + bs128_to_cl_uint4(expected_result_bs); + for (const int &active_work_item : active_work_items) + { + int wi_id = active_work_item; + + cl_uint4 device_result = my[wg_offset + wi_id]; + bs128 device_result_bs = cl_uint4_to_bs128(device_result); + + if (device_result_bs != expected_result_bs) { log_error( "ERROR: sub_group_ballot mismatch for local id " - "%d in sub group %d in group %d obtained {%d}, " - "expected {%d} \n", - wi_id, sb_id, wg_id, device_result, - expected_result); + "%d in sub group %d in group %d obtained {%d, %d, " + "%d, %d}, expected {%d, %d, %d, %d}\n", + wi_id, sb_id, wg_id, device_result.s0, + device_result.s1, device_result.s2, + device_result.s3, expected_result.s0, + expected_result.s1, expected_result.s2, + expected_result.s3); return TEST_FAIL; } } } + + x += lws; y += lws; m += 4 * lws; } - log_info(" sub_group_ballot... 
passed\n"); + return TEST_PASS; } }; @@ -104,23 +181,22 @@ template <typename Ty> struct BALLOT // Test for bit extract ballot functions template <typename Ty, BallotOp operation> struct BALLOT_BIT_EXTRACT { + static void log_test(const WorkGroupParams &test_params, + const char *extra_text) + { + log_info(" sub_group_ballot_%s(%s)...%s\n", operation_names(operation), + TypeManager<Ty>::name(), extra_text); + } + static void gen(Ty *x, Ty *t, cl_int *m, const WorkGroupParams &test_params) { - int wi_id, sb_id, wg_id, l; + int wi_id, sb_id, wg_id; int gws = test_params.global_workgroup_size; int lws = test_params.local_workgroup_size; int sbs = test_params.subgroup_size; int sb_number = (lws + sbs - 1) / sbs; int wg_number = gws / lws; int limit_sbs = sbs > 100 ? 100 : sbs; - int non_uniform_size = gws % lws; - log_info(" sub_group_%s(%s)...\n", operation_names(operation), - TypeManager<Ty>::name()); - - if (non_uniform_size) - { - log_info(" non uniform work group size mode ON\n"); - } for (wg_id = 0; wg_id < wg_number; ++wg_id) { // for each work_group @@ -155,10 +231,10 @@ template <typename Ty, BallotOp operation> struct BALLOT_BIT_EXTRACT } } - static int chk(Ty *x, Ty *y, Ty *mx, Ty *my, cl_int *m, - const WorkGroupParams &test_params) + static test_status chk(Ty *x, Ty *y, Ty *mx, Ty *my, cl_int *m, + const WorkGroupParams &test_params) { - int wi_id, wg_id, l, sb_id; + int wi_id, wg_id, sb_id; int gws = test_params.global_workgroup_size; int lws = test_params.local_workgroup_size; int sbs = test_params.subgroup_size; @@ -260,30 +336,25 @@ template <typename Ty, BallotOp operation> struct BALLOT_BIT_EXTRACT y += lws; m += 4 * lws; } - log_info(" sub_group_%s(%s)... 
passed\n", operation_names(operation), - TypeManager<Ty>::name()); return TEST_PASS; } }; template <typename Ty, BallotOp operation> struct BALLOT_INVERSE { + static void log_test(const WorkGroupParams &test_params, + const char *extra_text) + { + log_info(" sub_group_inverse_ballot...%s\n", extra_text); + } + static void gen(Ty *x, Ty *t, cl_int *m, const WorkGroupParams &test_params) { - int gws = test_params.global_workgroup_size; - int lws = test_params.local_workgroup_size; - int sbs = test_params.subgroup_size; - int non_uniform_size = gws % lws; - log_info(" sub_group_inverse_ballot...\n"); - if (non_uniform_size) - { - log_info(" non uniform work group size mode ON\n"); - } // no work here } - static int chk(Ty *x, Ty *y, Ty *mx, Ty *my, cl_int *m, - const WorkGroupParams &test_params) + static test_status chk(Ty *x, Ty *y, Ty *mx, Ty *my, cl_int *m, + const WorkGroupParams &test_params) { int wi_id, wg_id, sb_id; int gws = test_params.global_workgroup_size; @@ -322,9 +393,6 @@ template <typename Ty, BallotOp operation> struct BALLOT_INVERSE { current_sbs = wg_offset + sbs > lws ? lws - wg_offset : sbs; } - // take index of array where info which work_item will - // be broadcast its value is stored - int midx = 4 * wg_offset + 2; // take subgroup local id of this work_item // Check result for (wi_id = 0; wi_id < current_sbs; ++wi_id) @@ -354,7 +422,6 @@ template <typename Ty, BallotOp operation> struct BALLOT_INVERSE m += 4 * lws; } - log_info(" sub_group_inverse_ballot... 
passed\n"); return TEST_PASS; } }; @@ -363,6 +430,13 @@ template <typename Ty, BallotOp operation> struct BALLOT_INVERSE // Test for bit count/inclusive and exclusive scan/ find lsb msb ballot function template <typename Ty, BallotOp operation> struct BALLOT_COUNT_SCAN_FIND { + static void log_test(const WorkGroupParams &test_params, + const char *extra_text) + { + log_info(" sub_group_%s(%s)...%s\n", operation_names(operation), + TypeManager<Ty>::name(), extra_text); + } + static void gen(Ty *x, Ty *t, cl_int *m, const WorkGroupParams &test_params) { int wi_id, wg_id, sb_id; @@ -375,14 +449,10 @@ template <typename Ty, BallotOp operation> struct BALLOT_COUNT_SCAN_FIND int last_subgroup_size = 0; int current_sbs = 0; - log_info(" sub_group_%s(%s)...\n", operation_names(operation), - TypeManager<Ty>::name()); if (non_uniform_size) { - log_info(" non uniform work group size mode ON\n"); wg_number++; } - int e; for (wg_id = 0; wg_id < wg_number; ++wg_id) { // for each work_group if (non_uniform_size && wg_id == wg_number - 1) @@ -423,7 +493,7 @@ template <typename Ty, BallotOp operation> struct BALLOT_COUNT_SCAN_FIND } else { - log_error("Unknown operation..."); + log_error("Unknown operation...\n"); } } @@ -451,15 +521,15 @@ template <typename Ty, BallotOp operation> struct BALLOT_COUNT_SCAN_FIND else if (operation == BallotOp::ballot_inclusive_scan || operation == BallotOp::ballot_exclusive_scan) { - for (cl_uint i = 0; i <= sub_group_local_id; ++i) mask.set(i); - if (operation == BallotOp::ballot_exclusive_scan) - mask.reset(sub_group_local_id); + for (cl_uint i = 0; i < sub_group_local_id; ++i) mask.set(i); + if (operation == BallotOp::ballot_inclusive_scan) + mask.set(sub_group_local_id); } return mask; } - static int chk(Ty *x, Ty *y, Ty *mx, Ty *my, cl_int *m, - const WorkGroupParams &test_params) + static test_status chk(Ty *x, Ty *y, Ty *mx, Ty *my, cl_int *m, + const WorkGroupParams &test_params) { int wi_id, wg_id, sb_id; int gws = 
test_params.global_workgroup_size; @@ -469,7 +539,7 @@ template <typename Ty, BallotOp operation> struct BALLOT_COUNT_SCAN_FIND int non_uniform_size = gws % lws; int wg_number = gws / lws; wg_number = non_uniform_size ? wg_number + 1 : wg_number; - cl_uint4 expected_result, device_result; + cl_uint expected_result, device_result; int last_subgroup_size = 0; int current_sbs = 0; @@ -501,7 +571,7 @@ template <typename Ty, BallotOp operation> struct BALLOT_COUNT_SCAN_FIND current_sbs = wg_offset + sbs > lws ? lws - wg_offset : sbs; } // Check result - expected_result = { 0, 0, 0, 0 }; + expected_result = 0; for (wi_id = 0; wi_id < current_sbs; ++wi_id) { // for subgroup element bs128 bs; @@ -510,34 +580,37 @@ template <typename Ty, BallotOp operation> struct BALLOT_COUNT_SCAN_FIND | (bs128(mx[wg_offset + wi_id].s1) << 32) | (bs128(mx[wg_offset + wi_id].s2) << 64) | (bs128(mx[wg_offset + wi_id].s3) << 96); - bs &= getImportantBits(wi_id, current_sbs); - device_result = my[wg_offset + wi_id]; + bs &= getImportantBits(wi_id, sbs); + device_result = my[wg_offset + wi_id].s0; if (operation == BallotOp::ballot_inclusive_scan || operation == BallotOp::ballot_exclusive_scan || operation == BallotOp::ballot_bit_count) { - expected_result.s0 = bs.count(); + expected_result = bs.count(); if (!compare(device_result, expected_result)) { log_error("ERROR: sub_group_%s " "mismatch for local id %d in sub group " - "%d in group %d obtained {%d, %d, %d, " - "%d}, expected {%d, %d, %d, %d}\n", + "%d in group %d obtained %d, " + "expected %d\n", operation_names(operation), wi_id, sb_id, - wg_id, device_result.s0, device_result.s1, - device_result.s2, device_result.s3, - expected_result.s0, expected_result.s1, - expected_result.s2, expected_result.s3); + wg_id, device_result, expected_result); return TEST_FAIL; } } else if (operation == BallotOp::ballot_find_lsb) { - for (int id = 0; id < current_sbs; ++id) + if (bs.none()) + { + // Return value is undefined when no bits are set, + // so 
skip validation: + continue; + } + for (int id = 0; id < sbs; ++id) { if (bs.test(id)) { - expected_result.s0 = id; + expected_result = id; break; } } @@ -545,23 +618,26 @@ template <typename Ty, BallotOp operation> struct BALLOT_COUNT_SCAN_FIND { log_error("ERROR: sub_group_ballot_find_lsb " "mismatch for local id %d in sub group " - "%d in group %d obtained {%d, %d, %d, " - "%d}, expected {%d, %d, %d, %d}\n", - wi_id, sb_id, wg_id, device_result.s0, - device_result.s1, device_result.s2, - device_result.s3, expected_result.s0, - expected_result.s1, expected_result.s2, - expected_result.s3); + "%d in group %d obtained %d, " + "expected %d\n", + wi_id, sb_id, wg_id, device_result, + expected_result); return TEST_FAIL; } } else if (operation == BallotOp::ballot_find_msb) { - for (int id = current_sbs - 1; id >= 0; --id) + if (bs.none()) + { + // Return value is undefined when no bits are set, + // so skip validation: + continue; + } + for (int id = sbs - 1; id >= 0; --id) { if (bs.test(id)) { - expected_result.s0 = id; + expected_result = id; break; } } @@ -569,13 +645,10 @@ template <typename Ty, BallotOp operation> struct BALLOT_COUNT_SCAN_FIND { log_error("ERROR: sub_group_ballot_find_msb " "mismatch for local id %d in sub group " - "%d in group %d obtained {%d, %d, %d, " - "%d}, expected {%d, %d, %d, %d}\n", - wi_id, sb_id, wg_id, device_result.s0, - device_result.s1, device_result.s2, - device_result.s3, expected_result.s0, - expected_result.s1, expected_result.s2, - expected_result.s3); + "%d in group %d obtained %d, " + "expected %d\n", + wi_id, sb_id, wg_id, device_result, + expected_result); return TEST_FAIL; } } @@ -585,8 +658,6 @@ template <typename Ty, BallotOp operation> struct BALLOT_COUNT_SCAN_FIND y += lws; m += 4 * lws; } - log_info(" sub_group_ballot_%s(%s)... 
passed\n", - operation_names(operation), TypeManager<Ty>::name()); return TEST_PASS; } }; @@ -594,15 +665,21 @@ template <typename Ty, BallotOp operation> struct BALLOT_COUNT_SCAN_FIND // test mask functions template <typename Ty, BallotOp operation> struct SMASK { + static void log_test(const WorkGroupParams &test_params, + const char *extra_text) + { + log_info(" get_sub_group_%s_mask...%s\n", operation_names(operation), + extra_text); + } + static void gen(Ty *x, Ty *t, cl_int *m, const WorkGroupParams &test_params) { - int wi_id, wg_id, l, sb_id; + int wi_id, wg_id, sb_id; int gws = test_params.global_workgroup_size; int lws = test_params.local_workgroup_size; int sbs = test_params.subgroup_size; int sb_number = (lws + sbs - 1) / sbs; int wg_number = gws / lws; - log_info(" get_sub_group_%s_mask...\n", operation_names(operation)); for (wg_id = 0; wg_id < wg_number; ++wg_id) { // for each work_group for (sb_id = 0; sb_id < sb_number; ++sb_id) @@ -631,8 +708,8 @@ template <typename Ty, BallotOp operation> struct SMASK } } - static int chk(Ty *x, Ty *y, Ty *mx, Ty *my, cl_int *m, - const WorkGroupParams &test_params) + static test_status chk(Ty *x, Ty *y, Ty *mx, Ty *my, cl_int *m, + const WorkGroupParams &test_params) { int wi_id, wg_id, sb_id; int gws = test_params.global_workgroup_size; @@ -678,245 +755,130 @@ template <typename Ty, BallotOp operation> struct SMASK y += lws; m += 4 * lws; } - log_info(" get_sub_group_%s_mask... 
passed\n", - operation_names(operation)); return TEST_PASS; } }; -static const char *bcast_non_uniform_source = - "__kernel void test_bcast_non_uniform(const __global Type *in, __global " - "int4 *xy, __global Type *out)\n" - "{\n" - " int gid = get_global_id(0);\n" - " XY(xy,gid);\n" - " Type x = in[gid];\n" - " if (xy[gid].x < NR_OF_ACTIVE_WORK_ITEMS) {\n" - " out[gid] = sub_group_non_uniform_broadcast(x, xy[gid].z);\n" - " } else {\n" - " out[gid] = sub_group_non_uniform_broadcast(x, xy[gid].w);\n" - " }\n" - "}\n"; - -static const char *bcast_first_source = - "__kernel void test_bcast_first(const __global Type *in, __global int4 " - "*xy, __global Type *out)\n" - "{\n" - " int gid = get_global_id(0);\n" - " XY(xy,gid);\n" - " Type x = in[gid];\n" - " if (xy[gid].x < NR_OF_ACTIVE_WORK_ITEMS) {\n" - " out[gid] = sub_group_broadcast_first(x);\n" - " } else {\n" - " out[gid] = sub_group_broadcast_first(x);\n" - " }\n" - "}\n"; - -static const char *ballot_bit_count_source = - "__kernel void test_sub_group_ballot_bit_count(const __global Type *in, " - "__global int4 *xy, __global Type *out)\n" - "{\n" - " int gid = get_global_id(0);\n" - " XY(xy,gid);\n" - " Type x = in[gid];\n" - " uint4 value = (uint4)(0,0,0,0);\n" - " value = (uint4)(sub_group_ballot_bit_count(x),0,0,0);\n" - " out[gid] = value;\n" - "}\n"; - -static const char *ballot_inclusive_scan_source = - "__kernel void test_sub_group_ballot_inclusive_scan(const __global Type " - "*in, __global int4 *xy, __global Type *out)\n" - "{\n" - " int gid = get_global_id(0);\n" - " XY(xy,gid);\n" - " Type x = in[gid];\n" - " uint4 value = (uint4)(0,0,0,0);\n" - " value = (uint4)(sub_group_ballot_inclusive_scan(x),0,0,0);\n" - " out[gid] = value;\n" - "}\n"; - -static const char *ballot_exclusive_scan_source = - "__kernel void test_sub_group_ballot_exclusive_scan(const __global Type " - "*in, __global int4 *xy, __global Type *out)\n" - "{\n" - " int gid = get_global_id(0);\n" - " XY(xy,gid);\n" - " Type x = 
in[gid];\n" - " uint4 value = (uint4)(0,0,0,0);\n" - " value = (uint4)(sub_group_ballot_exclusive_scan(x),0,0,0);\n" - " out[gid] = value;\n" - "}\n"; - -static const char *ballot_find_lsb_source = - "__kernel void test_sub_group_ballot_find_lsb(const __global Type *in, " - "__global int4 *xy, __global Type *out)\n" - "{\n" - " int gid = get_global_id(0);\n" - " XY(xy,gid);\n" - " Type x = in[gid];\n" - " uint4 value = (uint4)(0,0,0,0);\n" - " value = (uint4)(sub_group_ballot_find_lsb(x),0,0,0);\n" - " out[gid] = value;\n" - "}\n"; - -static const char *ballot_find_msb_source = - "__kernel void test_sub_group_ballot_find_msb(const __global Type *in, " - "__global int4 *xy, __global Type *out)\n" - "{\n" - " int gid = get_global_id(0);\n" - " XY(xy,gid);\n" - " Type x = in[gid];\n" - " uint4 value = (uint4)(0,0,0,0);" - " value = (uint4)(sub_group_ballot_find_msb(x),0,0,0);" - " out[gid] = value ;" - "}\n"; - -static const char *get_subgroup_ge_mask_source = - "__kernel void test_get_sub_group_ge_mask(const __global Type *in, " - "__global int4 *xy, __global Type *out)\n" - "{\n" - " int gid = get_global_id(0);\n" - " XY(xy,gid);\n" - " xy[gid].z = get_max_sub_group_size();\n" - " Type x = in[gid];\n" - " uint4 mask = get_sub_group_ge_mask();" - " out[gid] = mask;\n" - "}\n"; - -static const char *get_subgroup_gt_mask_source = - "__kernel void test_get_sub_group_gt_mask(const __global Type *in, " - "__global int4 *xy, __global Type *out)\n" - "{\n" - " int gid = get_global_id(0);\n" - " XY(xy,gid);\n" - " xy[gid].z = get_max_sub_group_size();\n" - " Type x = in[gid];\n" - " uint4 mask = get_sub_group_gt_mask();" - " out[gid] = mask;\n" - "}\n"; - -static const char *get_subgroup_le_mask_source = - "__kernel void test_get_sub_group_le_mask(const __global Type *in, " - "__global int4 *xy, __global Type *out)\n" - "{\n" - " int gid = get_global_id(0);\n" - " XY(xy,gid);\n" - " xy[gid].z = get_max_sub_group_size();\n" - " Type x = in[gid];\n" - " uint4 mask = 
get_sub_group_le_mask();" - " out[gid] = mask;\n" - "}\n"; - -static const char *get_subgroup_lt_mask_source = - "__kernel void test_get_sub_group_lt_mask(const __global Type *in, " - "__global int4 *xy, __global Type *out)\n" - "{\n" - " int gid = get_global_id(0);\n" - " XY(xy,gid);\n" - " xy[gid].z = get_max_sub_group_size();\n" - " Type x = in[gid];\n" - " uint4 mask = get_sub_group_lt_mask();" - " out[gid] = mask;\n" - "}\n"; - -static const char *get_subgroup_eq_mask_source = - "__kernel void test_get_sub_group_eq_mask(const __global Type *in, " - "__global int4 *xy, __global Type *out)\n" - "{\n" - " int gid = get_global_id(0);\n" - " XY(xy,gid);\n" - " xy[gid].z = get_max_sub_group_size();\n" - " Type x = in[gid];\n" - " uint4 mask = get_sub_group_eq_mask();" - " out[gid] = mask;\n" - "}\n"; - -static const char *ballot_source = - "__kernel void test_sub_group_ballot(const __global Type *in, " - "__global int4 *xy, __global Type *out)\n" - "{\n" - "uint4 full_ballot = sub_group_ballot(1);\n" - "uint divergence_mask;\n" - "uint4 partial_ballot;\n" - "uint gid = get_global_id(0);" - "XY(xy,gid);\n" - "if (get_sub_group_local_id() & 1) {\n" - " divergence_mask = 0xaaaaaaaa;\n" - " partial_ballot = sub_group_ballot(1);\n" - "} else {\n" - " divergence_mask = 0x55555555;\n" - " partial_ballot = sub_group_ballot(1);\n" - "}\n" - " size_t lws = get_local_size(0);\n" - "uint4 masked_ballot = full_ballot;\n" - "masked_ballot.x &= divergence_mask;\n" - "masked_ballot.y &= divergence_mask;\n" - "masked_ballot.z &= divergence_mask;\n" - "masked_ballot.w &= divergence_mask;\n" - "out[gid] = all(masked_ballot == partial_ballot);\n" - - "} \n"; - -static const char *ballot_source_inverse = - "__kernel void test_sub_group_ballot_inverse(const __global " - "Type *in, " - "__global int4 *xy, __global Type *out)\n" - "{\n" - " int gid = get_global_id(0);\n" - " XY(xy,gid);\n" - " Type x = in[gid];\n" - " uint4 value = (uint4)(10,0,0,0);\n" - " if (get_sub_group_local_id() & 
1) {" - " uint4 partial_ballot_mask = " - "(uint4)(0xAAAAAAAA,0xAAAAAAAA,0xAAAAAAAA,0xAAAAAAAA);" - " if (sub_group_inverse_ballot(partial_ballot_mask)) {\n" - " value = (uint4)(1,0,0,1);\n" - " } else {\n" - " value = (uint4)(0,0,0,1);\n" - " }\n" - " } else {\n" - " uint4 partial_ballot_mask = " - "(uint4)(0x55555555,0x55555555,0x55555555,0x55555555);" - " if (sub_group_inverse_ballot(partial_ballot_mask)) {\n" - " value = (uint4)(1,0,0,2);\n" - " } else {\n" - " value = (uint4)(0,0,0,2);\n" - " }\n" - " }\n" - " out[gid] = value;\n" - "}\n"; - -static const char *ballot_bit_extract_source = - "__kernel void test_sub_group_ballot_bit_extract(const __global Type *in, " - "__global int4 *xy, __global Type *out)\n" - "{\n" - " int gid = get_global_id(0);\n" - " XY(xy,gid);\n" - " Type x = in[gid];\n" - " uint index = xy[gid].z;\n" - " uint4 value = (uint4)(10,0,0,0);\n" - " if (get_sub_group_local_id() & 1) {" - " if (sub_group_ballot_bit_extract(x, xy[gid].z)) {\n" - " value = (uint4)(1,0,0,1);\n" - " } else {\n" - " value = (uint4)(0,0,0,1);\n" - " }\n" - " } else {\n" - " if (sub_group_ballot_bit_extract(x, xy[gid].w)) {\n" - " value = (uint4)(1,0,0,2);\n" - " } else {\n" - " value = (uint4)(0,0,0,2);\n" - " }\n" - " }\n" - " out[gid] = value;\n" - "}\n"; +std::string sub_group_non_uniform_broadcast_source = R"( +__kernel void test_sub_group_non_uniform_broadcast(const __global Type *in, __global int4 *xy, __global Type *out) { + int gid = get_global_id(0); + XY(xy,gid); + Type x = in[gid]; + if (xy[gid].x < NR_OF_ACTIVE_WORK_ITEMS) { + out[gid] = sub_group_non_uniform_broadcast(x, xy[gid].z); + } else { + out[gid] = sub_group_non_uniform_broadcast(x, xy[gid].w); + } +} +)"; +std::string sub_group_broadcast_first_source = R"( +__kernel void test_sub_group_broadcast_first(const __global Type *in, __global int4 *xy, __global Type *out) { + int gid = get_global_id(0); + XY(xy,gid); + Type x = in[gid]; + if (xy[gid].x < NR_OF_ACTIVE_WORK_ITEMS) { + out[gid] = 
sub_group_broadcast_first(x);; + } else { + out[gid] = sub_group_broadcast_first(x);; + } +} +)"; +std::string sub_group_ballot_bit_scan_find_source = R"( +__kernel void test_%s(const __global Type *in, __global int4 *xy, __global Type *out) { + int gid = get_global_id(0); + XY(xy,gid); + Type x = in[gid]; + uint4 value = (uint4)(0,0,0,0); + value = (uint4)(%s(x),0,0,0); + out[gid] = value; +} +)"; +std::string sub_group_ballot_mask_source = R"( +__kernel void test_%s(const __global Type *in, __global int4 *xy, __global Type *out) { + int gid = get_global_id(0); + XY(xy,gid); + xy[gid].z = get_max_sub_group_size(); + Type x = in[gid]; + uint4 mask = %s(); + out[gid] = mask; +} +)"; +std::string sub_group_ballot_source = R"( +__kernel void test_sub_group_ballot(const __global Type *in, __global int4 *xy, __global Type *out, uint4 work_item_mask_vector) { + uint gid = get_global_id(0); + XY(xy,gid); + uint subgroup_local_id = get_sub_group_local_id(); + uint elect_work_item = 1 << (subgroup_local_id % 32); + uint work_item_mask; + if (subgroup_local_id < 32) { + work_item_mask = work_item_mask_vector.x; + } else if(subgroup_local_id < 64) { + work_item_mask = work_item_mask_vector.y; + } else if(subgroup_local_id < 96) { + work_item_mask = work_item_mask_vector.z; + } else if(subgroup_local_id < 128) { + work_item_mask = work_item_mask_vector.w; + } + uint4 value = (uint4)(0, 0, 0, 0); + if (elect_work_item & work_item_mask) { + value = sub_group_ballot(in[gid].s0); + } + out[gid] = value; +} +)"; +std::string sub_group_inverse_ballot_source = R"( +__kernel void test_sub_group_inverse_ballot(const __global Type *in, __global int4 *xy, __global Type *out) { + int gid = get_global_id(0); + XY(xy,gid); + Type x = in[gid]; + uint4 value = (uint4)(10,0,0,0); + if (get_sub_group_local_id() & 1) { + uint4 partial_ballot_mask = (uint4)(0xAAAAAAAA,0xAAAAAAAA,0xAAAAAAAA,0xAAAAAAAA); + if (sub_group_inverse_ballot(partial_ballot_mask)) { + value = (uint4)(1,0,0,1); + } else { + 
value = (uint4)(0,0,0,1); + } + } else { + uint4 partial_ballot_mask = (uint4)(0x55555555,0x55555555,0x55555555,0x55555555); + if (sub_group_inverse_ballot(partial_ballot_mask)) { + value = (uint4)(1,0,0,2); + } else { + value = (uint4)(0,0,0,2); + } + } + out[gid] = value; +} +)"; +std::string sub_group_ballot_bit_extract_source = R"( + __kernel void test_sub_group_ballot_bit_extract(const __global Type *in, __global int4 *xy, __global Type *out) { + int gid = get_global_id(0); + XY(xy,gid); + Type x = in[gid]; + uint index = xy[gid].z; + uint4 value = (uint4)(10,0,0,0); + if (get_sub_group_local_id() & 1) { + if (sub_group_ballot_bit_extract(x, xy[gid].z)) { + value = (uint4)(1,0,0,1); + } else { + value = (uint4)(0,0,0,1); + } + } else { + if (sub_group_ballot_bit_extract(x, xy[gid].w)) { + value = (uint4)(1,0,0,2); + } else { + value = (uint4)(0,0,0,2); + } + } + out[gid] = value; +} +)"; template <typename T> int run_non_uniform_broadcast_for_type(RunTestForType rft) { int error = rft.run_impl<T, BC<T, SubgroupsBroadcastOp::non_uniform_broadcast>>( - "test_bcast_non_uniform", bcast_non_uniform_source); + "sub_group_non_uniform_broadcast"); return error; } @@ -926,11 +888,21 @@ template <typename T> int run_non_uniform_broadcast_for_type(RunTestForType rft) int test_subgroup_functions_ballot(cl_device_id device, cl_context context, cl_command_queue queue, int num_elements) { - std::vector<std::string> required_extensions = { "cl_khr_subgroup_ballot" }; + if (!is_extension_available(device, "cl_khr_subgroup_ballot")) + { + log_info("cl_khr_subgroup_ballot is not supported on this device, " + "skipping test.\n"); + return TEST_SKIPPED_ITSELF; + } + constexpr size_t global_work_size = 170; constexpr size_t local_work_size = 64; - WorkGroupParams test_params(global_work_size, local_work_size, - required_extensions); + WorkGroupParams test_params(global_work_size, local_work_size); + test_params.save_kernel_source(sub_group_ballot_mask_source); + 
test_params.save_kernel_source(sub_group_non_uniform_broadcast_source, + "sub_group_non_uniform_broadcast"); + test_params.save_kernel_source(sub_group_broadcast_first_source, + "sub_group_broadcast_first"); RunTestForType rft(device, context, queue, num_elements, test_params); // non uniform broadcast functions @@ -1014,76 +986,92 @@ int test_subgroup_functions_ballot(cl_device_id device, cl_context context, // broadcast first functions error |= rft.run_impl<cl_int, BC<cl_int, SubgroupsBroadcastOp::broadcast_first>>( - "test_bcast_first", bcast_first_source); + "sub_group_broadcast_first"); error |= rft.run_impl<cl_uint, BC<cl_uint, SubgroupsBroadcastOp::broadcast_first>>( - "test_bcast_first", bcast_first_source); + "sub_group_broadcast_first"); error |= rft.run_impl<cl_long, BC<cl_long, SubgroupsBroadcastOp::broadcast_first>>( - "test_bcast_first", bcast_first_source); + "sub_group_broadcast_first"); error |= rft.run_impl<cl_ulong, BC<cl_ulong, SubgroupsBroadcastOp::broadcast_first>>( - "test_bcast_first", bcast_first_source); + "sub_group_broadcast_first"); error |= rft.run_impl<cl_short, BC<cl_short, SubgroupsBroadcastOp::broadcast_first>>( - "test_bcast_first", bcast_first_source); + "sub_group_broadcast_first"); error |= rft.run_impl<cl_ushort, BC<cl_ushort, SubgroupsBroadcastOp::broadcast_first>>( - "test_bcast_first", bcast_first_source); + "sub_group_broadcast_first"); error |= rft.run_impl<cl_char, BC<cl_char, SubgroupsBroadcastOp::broadcast_first>>( - "test_bcast_first", bcast_first_source); + "sub_group_broadcast_first"); error |= rft.run_impl<cl_uchar, BC<cl_uchar, SubgroupsBroadcastOp::broadcast_first>>( - "test_bcast_first", bcast_first_source); + "sub_group_broadcast_first"); error |= rft.run_impl<cl_float, BC<cl_float, SubgroupsBroadcastOp::broadcast_first>>( - "test_bcast_first", bcast_first_source); + "sub_group_broadcast_first"); error |= rft.run_impl<cl_double, BC<cl_double, SubgroupsBroadcastOp::broadcast_first>>( - "test_bcast_first", 
bcast_first_source); + "sub_group_broadcast_first"); error |= rft.run_impl< subgroups::cl_half, BC<subgroups::cl_half, SubgroupsBroadcastOp::broadcast_first>>( - "test_bcast_first", bcast_first_source); + "sub_group_broadcast_first"); // mask functions error |= rft.run_impl<cl_uint4, SMASK<cl_uint4, BallotOp::eq_mask>>( - "test_get_sub_group_eq_mask", get_subgroup_eq_mask_source); + "get_sub_group_eq_mask"); error |= rft.run_impl<cl_uint4, SMASK<cl_uint4, BallotOp::ge_mask>>( - "test_get_sub_group_ge_mask", get_subgroup_ge_mask_source); + "get_sub_group_ge_mask"); error |= rft.run_impl<cl_uint4, SMASK<cl_uint4, BallotOp::gt_mask>>( - "test_get_sub_group_gt_mask", get_subgroup_gt_mask_source); + "get_sub_group_gt_mask"); error |= rft.run_impl<cl_uint4, SMASK<cl_uint4, BallotOp::le_mask>>( - "test_get_sub_group_le_mask", get_subgroup_le_mask_source); + "get_sub_group_le_mask"); error |= rft.run_impl<cl_uint4, SMASK<cl_uint4, BallotOp::lt_mask>>( - "test_get_sub_group_lt_mask", get_subgroup_lt_mask_source); - - // ballot functions - error |= rft.run_impl<cl_uint, BALLOT<cl_uint>>("test_sub_group_ballot", - ballot_source); - error |= rft.run_impl<cl_uint4, - BALLOT_INVERSE<cl_uint4, BallotOp::inverse_ballot>>( - "test_sub_group_ballot_inverse", ballot_source_inverse); - error |= rft.run_impl< + "get_sub_group_lt_mask"); + + // sub_group_ballot function + WorkGroupParams test_params_ballot(global_work_size, local_work_size, 3); + test_params_ballot.save_kernel_source(sub_group_ballot_source); + RunTestForType rft_ballot(device, context, queue, num_elements, + test_params_ballot); + error |= + rft_ballot.run_impl<cl_uint4, BALLOT<cl_uint4>>("sub_group_ballot"); + + // ballot arithmetic functions + WorkGroupParams test_params_arith(global_work_size, local_work_size); + test_params_arith.save_kernel_source(sub_group_ballot_bit_scan_find_source); + test_params_arith.save_kernel_source(sub_group_inverse_ballot_source, + "sub_group_inverse_ballot"); + 
test_params_arith.save_kernel_source(sub_group_ballot_bit_extract_source, + "sub_group_ballot_bit_extract"); + RunTestForType rft_arith(device, context, queue, num_elements, + test_params_arith); + error |= + rft_arith.run_impl<cl_uint4, + BALLOT_INVERSE<cl_uint4, BallotOp::inverse_ballot>>( + "sub_group_inverse_ballot"); + error |= rft_arith.run_impl< cl_uint4, BALLOT_BIT_EXTRACT<cl_uint4, BallotOp::ballot_bit_extract>>( - "test_sub_group_ballot_bit_extract", ballot_bit_extract_source); - error |= rft.run_impl< + "sub_group_ballot_bit_extract"); + error |= rft_arith.run_impl< cl_uint4, BALLOT_COUNT_SCAN_FIND<cl_uint4, BallotOp::ballot_bit_count>>( - "test_sub_group_ballot_bit_count", ballot_bit_count_source); - error |= rft.run_impl< + "sub_group_ballot_bit_count"); + error |= rft_arith.run_impl< cl_uint4, BALLOT_COUNT_SCAN_FIND<cl_uint4, BallotOp::ballot_inclusive_scan>>( - "test_sub_group_ballot_inclusive_scan", ballot_inclusive_scan_source); - error |= rft.run_impl< + "sub_group_ballot_inclusive_scan"); + error |= rft_arith.run_impl< cl_uint4, BALLOT_COUNT_SCAN_FIND<cl_uint4, BallotOp::ballot_exclusive_scan>>( - "test_sub_group_ballot_exclusive_scan", ballot_exclusive_scan_source); - error |= rft.run_impl< + "sub_group_ballot_exclusive_scan"); + error |= rft_arith.run_impl< cl_uint4, BALLOT_COUNT_SCAN_FIND<cl_uint4, BallotOp::ballot_find_lsb>>( - "test_sub_group_ballot_find_lsb", ballot_find_lsb_source); - error |= rft.run_impl< + "sub_group_ballot_find_lsb"); + error |= rft_arith.run_impl< cl_uint4, BALLOT_COUNT_SCAN_FIND<cl_uint4, BallotOp::ballot_find_msb>>( - "test_sub_group_ballot_find_msb", ballot_find_msb_source); + "sub_group_ballot_find_msb"); + return error; } diff --git a/test_conformance/subgroups/test_subgroup_clustered_reduce.cpp b/test_conformance/subgroups/test_subgroup_clustered_reduce.cpp index 588e9cee..38652d51 100644 --- a/test_conformance/subgroups/test_subgroup_clustered_reduce.cpp +++ 
b/test_conformance/subgroups/test_subgroup_clustered_reduce.cpp @@ -18,172 +18,55 @@ #include "subgroup_common_templates.h" #include "harness/typeWrappers.h" -#define CLUSTER_SIZE 4 -#define CLUSTER_SIZE_STR "4" - namespace { -static const char *redadd_clustered_source = - "__kernel void test_redadd_clustered(const __global Type *in, __global " - "int4 *xy, __global Type *out)\n" - "{\n" - " int gid = get_global_id(0);\n" - " XY(xy,gid);\n" - " xy[gid].w = 0;\n" - " if (sizeof(in[gid]) == " - "sizeof(sub_group_clustered_reduce_add(in[gid], " CLUSTER_SIZE_STR ")))\n" - " {xy[gid].w = sizeof(in[gid]);}\n" - " out[gid] = sub_group_clustered_reduce_add(in[gid], " CLUSTER_SIZE_STR - ");\n" - "}\n"; - -static const char *redmax_clustered_source = - "__kernel void test_redmax_clustered(const __global Type *in, __global " - "int4 *xy, __global Type *out)\n" - "{\n" - " int gid = get_global_id(0);\n" - " XY(xy,gid);\n" - " xy[gid].w = 0;\n" - " if (sizeof(in[gid]) == " - "sizeof(sub_group_clustered_reduce_max(in[gid], " CLUSTER_SIZE_STR ")))\n" - " {xy[gid].w = sizeof(in[gid]);}\n" - " out[gid] = sub_group_clustered_reduce_max(in[gid], " CLUSTER_SIZE_STR - ");\n" - "}\n"; - -static const char *redmin_clustered_source = - "__kernel void test_redmin_clustered(const __global Type *in, __global " - "int4 *xy, __global Type *out)\n" - "{\n" - " int gid = get_global_id(0);\n" - " XY(xy,gid);\n" - " xy[gid].w = 0;\n" - " if (sizeof(in[gid]) == " - "sizeof(sub_group_clustered_reduce_min(in[gid], " CLUSTER_SIZE_STR ")))\n" - " {xy[gid].w = sizeof(in[gid]);}\n" - " out[gid] = sub_group_clustered_reduce_min(in[gid], " CLUSTER_SIZE_STR - ");\n" - "}\n"; - -static const char *redmul_clustered_source = - "__kernel void test_redmul_clustered(const __global Type *in, __global " - "int4 *xy, __global Type *out)\n" - "{\n" - " int gid = get_global_id(0);\n" - " XY(xy,gid);\n" - " xy[gid].w = 0;\n" - " if (sizeof(in[gid]) == " - "sizeof(sub_group_clustered_reduce_mul(in[gid], " 
CLUSTER_SIZE_STR ")))\n" - " {xy[gid].w = sizeof(in[gid]);}\n" - " out[gid] = sub_group_clustered_reduce_mul(in[gid], " CLUSTER_SIZE_STR - ");\n" - "}\n"; - -static const char *redand_clustered_source = - "__kernel void test_redand_clustered(const __global Type *in, __global " - "int4 *xy, __global Type *out)\n" - "{\n" - " int gid = get_global_id(0);\n" - " XY(xy,gid);\n" - " xy[gid].w = 0;\n" - " if (sizeof(in[gid]) == " - "sizeof(sub_group_clustered_reduce_and(in[gid], " CLUSTER_SIZE_STR ")))\n" - " {xy[gid].w = sizeof(in[gid]);}\n" - " out[gid] = sub_group_clustered_reduce_and(in[gid], " CLUSTER_SIZE_STR - ");\n" - "}\n"; - -static const char *redor_clustered_source = - "__kernel void test_redor_clustered(const __global Type *in, __global int4 " - "*xy, __global Type *out)\n" - "{\n" - " int gid = get_global_id(0);\n" - " XY(xy,gid);\n" - " xy[gid].w = 0;\n" - " if (sizeof(in[gid]) == " - "sizeof(sub_group_clustered_reduce_or(in[gid], " CLUSTER_SIZE_STR ")))\n" - " {xy[gid].w = sizeof(in[gid]);}\n" - " out[gid] = sub_group_clustered_reduce_or(in[gid], " CLUSTER_SIZE_STR - ");\n" - "}\n"; - -static const char *redxor_clustered_source = - "__kernel void test_redxor_clustered(const __global Type *in, __global " - "int4 *xy, __global Type *out)\n" - "{\n" - " int gid = get_global_id(0);\n" - " XY(xy,gid);\n" - " xy[gid].w = 0;\n" - " if (sizeof(in[gid]) == " - "sizeof(sub_group_clustered_reduce_xor(in[gid], " CLUSTER_SIZE_STR ")))\n" - " {xy[gid].w = sizeof(in[gid]);}\n" - " out[gid] = sub_group_clustered_reduce_xor(in[gid], " CLUSTER_SIZE_STR - ");\n" - "}\n"; - -static const char *redand_clustered_logical_source = - "__kernel void test_redand_clustered_logical(const __global Type *in, " - "__global int4 *xy, __global Type *out)\n" - "{\n" - " int gid = get_global_id(0);\n" - " XY(xy,gid);\n" - " xy[gid].w = 0;\n" - " if (sizeof(in[gid]) == " - "sizeof(sub_group_clustered_reduce_logical_and(in[gid], " CLUSTER_SIZE_STR - ")))\n" - " {xy[gid].w = 
sizeof(in[gid]);}\n" - " out[gid] = " - "sub_group_clustered_reduce_logical_and(in[gid], " CLUSTER_SIZE_STR ");\n" - "}\n"; - -static const char *redor_clustered_logical_source = - "__kernel void test_redor_clustered_logical(const __global Type *in, " - "__global int4 *xy, __global Type *out)\n" - "{\n" - " int gid = get_global_id(0);\n" - " XY(xy,gid);\n" - " xy[gid].w = 0;\n" - " if (sizeof(in[gid]) == " - "sizeof(sub_group_clustered_reduce_logical_or(in[gid], " CLUSTER_SIZE_STR - ")))\n" - " {xy[gid].w = sizeof(in[gid]);}\n" - " out[gid] = " - "sub_group_clustered_reduce_logical_or(in[gid], " CLUSTER_SIZE_STR ");\n" - "}\n"; - -static const char *redxor_clustered_logical_source = - "__kernel void test_redxor_clustered_logical(const __global Type *in, " - "__global int4 *xy, __global Type *out)\n" - "{\n" - " int gid = get_global_id(0);\n" - " XY(xy,gid);\n" - " xy[gid].w = 0;\n" - " if ( sizeof(in[gid]) == " - "sizeof(sub_group_clustered_reduce_logical_xor(in[gid], " CLUSTER_SIZE_STR - ")))\n" - " {xy[gid].w = sizeof(in[gid]);}\n" - " out[gid] = " - "sub_group_clustered_reduce_logical_xor(in[gid], " CLUSTER_SIZE_STR ");\n" - "}\n"; - +std::string sub_group_clustered_reduce_source = R"( +__kernel void test_%s(const __global Type *in, __global int4 *xy, __global Type *out, + uint cluster_size) { + Type r; + int gid = get_global_id(0); + XY(xy,gid); + xy[gid].w = 0; + Type v = in[gid]; + if (sizeof(in[gid]) == sizeof(%s(v, 1))) { + xy[gid].w = sizeof(in[gid]); + } + switch (cluster_size) { + case 1: r = %s(v, 1); break; + case 2: r = %s(v, 2); break; + case 4: r = %s(v, 4); break; + case 8: r = %s(v, 8); break; + case 16: r = %s(v, 16); break; + case 32: r = %s(v, 32); break; + case 64: r = %s(v, 64); break; + case 128: r = %s(v, 128); break; + } + out[gid] = r; +} +)"; // DESCRIPTION: // Test for reduce cluster functions template <typename Ty, ArithmeticOp operation> struct RED_CLU { + static void log_test(const WorkGroupParams &test_params, + const char 
*extra_text) + { + log_info(" sub_group_clustered_reduce_%s(%s, %d bytes) ...%s\n", + operation_names(operation), TypeManager<Ty>::name(), + sizeof(Ty), extra_text); + } + static void gen(Ty *x, Ty *t, cl_int *m, const WorkGroupParams &test_params) { int nw = test_params.local_workgroup_size; int ns = test_params.subgroup_size; int ng = test_params.global_workgroup_size; ng = ng / nw; - log_info(" sub_group_clustered_reduce_%s(%s, %d bytes) ...\n", - operation_names(operation), TypeManager<Ty>::name(), - sizeof(Ty)); - genrand<Ty, operation>(x, t, m, ns, nw, ng); + generate_inputs<Ty, operation>(x, t, m, ns, nw, ng); } - static int chk(Ty *x, Ty *y, Ty *mx, Ty *my, cl_int *m, - const WorkGroupParams &test_params) + static test_status chk(Ty *x, Ty *y, Ty *mx, Ty *my, cl_int *m, + const WorkGroupParams &test_params) { int nw = test_params.local_workgroup_size; int ns = test_params.subgroup_size; @@ -219,34 +102,34 @@ template <typename Ty, ArithmeticOp operation> struct RED_CLU { int ii = j * ns; int n = ii + ns > nw ? 
nw - ii : ns; - int midx = 4 * ii + 2; std::vector<Ty> clusters_results; - int clusters_counter = ns / CLUSTER_SIZE; + int clusters_counter = ns / test_params.cluster_size; clusters_results.resize(clusters_counter); // Compute target Ty tr = mx[ii]; for (int i = 0; i < n; ++i) { - if (i % CLUSTER_SIZE == 0) + if (i % test_params.cluster_size == 0) tr = mx[ii + i]; else tr = calculate<Ty>(tr, mx[ii + i], operation); - clusters_results[i / CLUSTER_SIZE] = tr; + clusters_results[i / test_params.cluster_size] = tr; } // Check result for (int i = 0; i < n; ++i) { Ty rr = my[ii + i]; - tr = clusters_results[i / CLUSTER_SIZE]; + tr = clusters_results[i / test_params.cluster_size]; if (!compare(rr, tr)) { log_error( - "ERROR: sub_group_clustered_reduce_%s(%s) mismatch " - "for local id %d in sub group %d in group %d\n", + "ERROR: sub_group_clustered_reduce_%s(%s, %u) " + "mismatch for local id %d in sub group %d in group " + "%d\n", operation_names(operation), TypeManager<Ty>::name(), - i, j, k); + test_params.cluster_size, i, j, k); return TEST_FAIL; } } @@ -256,9 +139,6 @@ template <typename Ty, ArithmeticOp operation> struct RED_CLU y += nw; m += 4 * nw; } - log_info(" sub_group_clustered_reduce_%s(%s, %d bytes) ... 
passed\n", - operation_names(operation), TypeManager<Ty>::name(), - sizeof(Ty)); return TEST_PASS; } }; @@ -267,34 +147,34 @@ template <typename T> int run_cluster_red_add_max_min_mul_for_type(RunTestForType rft) { int error = rft.run_impl<T, RED_CLU<T, ArithmeticOp::add_>>( - "test_redadd_clustered", redadd_clustered_source); + "sub_group_clustered_reduce_add"); error |= rft.run_impl<T, RED_CLU<T, ArithmeticOp::max_>>( - "test_redmax_clustered", redmax_clustered_source); + "sub_group_clustered_reduce_max"); error |= rft.run_impl<T, RED_CLU<T, ArithmeticOp::min_>>( - "test_redmin_clustered", redmin_clustered_source); + "sub_group_clustered_reduce_min"); error |= rft.run_impl<T, RED_CLU<T, ArithmeticOp::mul_>>( - "test_redmul_clustered", redmul_clustered_source); + "sub_group_clustered_reduce_mul"); return error; } template <typename T> int run_cluster_and_or_xor_for_type(RunTestForType rft) { int error = rft.run_impl<T, RED_CLU<T, ArithmeticOp::and_>>( - "test_redand_clustered", redand_clustered_source); + "sub_group_clustered_reduce_and"); error |= rft.run_impl<T, RED_CLU<T, ArithmeticOp::or_>>( - "test_redor_clustered", redor_clustered_source); + "sub_group_clustered_reduce_or"); error |= rft.run_impl<T, RED_CLU<T, ArithmeticOp::xor_>>( - "test_redxor_clustered", redxor_clustered_source); + "sub_group_clustered_reduce_xor"); return error; } template <typename T> int run_cluster_logical_and_or_xor_for_type(RunTestForType rft) { int error = rft.run_impl<T, RED_CLU<T, ArithmeticOp::logical_and>>( - "test_redand_clustered_logical", redand_clustered_logical_source); + "sub_group_clustered_reduce_logical_and"); error |= rft.run_impl<T, RED_CLU<T, ArithmeticOp::logical_or>>( - "test_redor_clustered_logical", redor_clustered_logical_source); + "sub_group_clustered_reduce_logical_or"); error |= rft.run_impl<T, RED_CLU<T, ArithmeticOp::logical_xor>>( - "test_redxor_clustered_logical", redxor_clustered_logical_source); + "sub_group_clustered_reduce_logical_xor"); return 
error; } @@ -305,13 +185,17 @@ int test_subgroup_functions_clustered_reduce(cl_device_id device, cl_command_queue queue, int num_elements) { - std::vector<std::string> required_extensions = { - "cl_khr_subgroup_clustered_reduce" - }; + if (!is_extension_available(device, "cl_khr_subgroup_clustered_reduce")) + { + log_info("cl_khr_subgroup_clustered_reduce is not supported on this " + "device, skipping test.\n"); + return TEST_SKIPPED_ITSELF; + } + constexpr size_t global_work_size = 2000; constexpr size_t local_work_size = 200; - WorkGroupParams test_params(global_work_size, local_work_size, - required_extensions); + WorkGroupParams test_params(global_work_size, local_work_size, -1, 3); + test_params.save_kernel_source(sub_group_clustered_reduce_source); RunTestForType rft(device, context, queue, num_elements, test_params); int error = run_cluster_red_add_max_min_mul_for_type<cl_int>(rft); diff --git a/test_conformance/subgroups/test_subgroup_extended_types.cpp b/test_conformance/subgroups/test_subgroup_extended_types.cpp index 98401b8e..c9e6bb61 100644 --- a/test_conformance/subgroups/test_subgroup_extended_types.cpp +++ b/test_conformance/subgroups/test_subgroup_extended_types.cpp @@ -24,30 +24,30 @@ namespace { template <typename T> int run_broadcast_for_extended_type(RunTestForType rft) { int error = rft.run_impl<T, BC<T, SubgroupsBroadcastOp::broadcast>>( - "test_bcast", bcast_source); + "sub_group_broadcast"); return error; } template <typename T> int run_scan_reduction_for_type(RunTestForType rft) { - int error = rft.run_impl<T, RED_NU<T, ArithmeticOp::add_>>("test_redadd", - redadd_source); - error |= rft.run_impl<T, RED_NU<T, ArithmeticOp::max_>>("test_redmax", - redmax_source); - error |= rft.run_impl<T, RED_NU<T, ArithmeticOp::min_>>("test_redmin", - redmin_source); - error |= rft.run_impl<T, SCIN_NU<T, ArithmeticOp::add_>>("test_scinadd", - scinadd_source); - error |= rft.run_impl<T, SCIN_NU<T, ArithmeticOp::max_>>("test_scinmax", - scinmax_source); - 
error |= rft.run_impl<T, SCIN_NU<T, ArithmeticOp::min_>>("test_scinmin", - scinmin_source); - error |= rft.run_impl<T, SCEX_NU<T, ArithmeticOp::add_>>("test_scexadd", - scexadd_source); - error |= rft.run_impl<T, SCEX_NU<T, ArithmeticOp::max_>>("test_scexmax", - scexmax_source); - error |= rft.run_impl<T, SCEX_NU<T, ArithmeticOp::min_>>("test_scexmin", - scexmin_source); + int error = + rft.run_impl<T, RED_NU<T, ArithmeticOp::add_>>("sub_group_reduce_add"); + error |= + rft.run_impl<T, RED_NU<T, ArithmeticOp::max_>>("sub_group_reduce_max"); + error |= + rft.run_impl<T, RED_NU<T, ArithmeticOp::min_>>("sub_group_reduce_min"); + error |= rft.run_impl<T, SCIN_NU<T, ArithmeticOp::add_>>( + "sub_group_scan_inclusive_add"); + error |= rft.run_impl<T, SCIN_NU<T, ArithmeticOp::max_>>( + "sub_group_scan_inclusive_max"); + error |= rft.run_impl<T, SCIN_NU<T, ArithmeticOp::min_>>( + "sub_group_scan_inclusive_min"); + error |= rft.run_impl<T, SCEX_NU<T, ArithmeticOp::add_>>( + "sub_group_scan_exclusive_add"); + error |= rft.run_impl<T, SCEX_NU<T, ArithmeticOp::max_>>( + "sub_group_scan_exclusive_max"); + error |= rft.run_impl<T, SCEX_NU<T, ArithmeticOp::min_>>( + "sub_group_scan_exclusive_min"); return error; } @@ -59,15 +59,21 @@ int test_subgroup_functions_extended_types(cl_device_id device, cl_command_queue queue, int num_elements) { - std::vector<std::string> required_extensions = { - "cl_khr_subgroup_extended_types" - }; + if (!is_extension_available(device, "cl_khr_subgroup_extended_types")) + { + log_info("cl_khr_subgroup_extended_types is not supported on this " + "device, skipping test.\n"); + return TEST_SKIPPED_ITSELF; + } + constexpr size_t global_work_size = 2000; constexpr size_t local_work_size = 200; - WorkGroupParams test_params(global_work_size, local_work_size, - required_extensions); - RunTestForType rft(device, context, queue, num_elements, test_params); + WorkGroupParams test_params(global_work_size, local_work_size); + 
test_params.save_kernel_source(sub_group_reduction_scan_source); + test_params.save_kernel_source(sub_group_generic_source, + "sub_group_broadcast"); + RunTestForType rft(device, context, queue, num_elements, test_params); int error = run_broadcast_for_extended_type<cl_uint2>(rft); error |= run_broadcast_for_extended_type<subgroups::cl_uint3>(rft); error |= run_broadcast_for_extended_type<cl_uint4>(rft); @@ -102,22 +108,26 @@ int test_subgroup_functions_extended_types(cl_device_id device, error |= run_broadcast_for_extended_type<cl_double8>(rft); error |= run_broadcast_for_extended_type<cl_double16>(rft); + error |= run_broadcast_for_extended_type<cl_ushort>(rft); error |= run_broadcast_for_extended_type<cl_ushort2>(rft); error |= run_broadcast_for_extended_type<subgroups::cl_ushort3>(rft); error |= run_broadcast_for_extended_type<cl_ushort4>(rft); error |= run_broadcast_for_extended_type<cl_ushort8>(rft); error |= run_broadcast_for_extended_type<cl_ushort16>(rft); + error |= run_broadcast_for_extended_type<cl_short>(rft); error |= run_broadcast_for_extended_type<cl_short2>(rft); error |= run_broadcast_for_extended_type<subgroups::cl_short3>(rft); error |= run_broadcast_for_extended_type<cl_short4>(rft); error |= run_broadcast_for_extended_type<cl_short8>(rft); error |= run_broadcast_for_extended_type<cl_short16>(rft); + error |= run_broadcast_for_extended_type<cl_uchar>(rft); error |= run_broadcast_for_extended_type<cl_uchar2>(rft); error |= run_broadcast_for_extended_type<subgroups::cl_uchar3>(rft); error |= run_broadcast_for_extended_type<cl_uchar4>(rft); error |= run_broadcast_for_extended_type<cl_uchar8>(rft); error |= run_broadcast_for_extended_type<cl_uchar16>(rft); + error |= run_broadcast_for_extended_type<cl_char>(rft); error |= run_broadcast_for_extended_type<cl_char2>(rft); error |= run_broadcast_for_extended_type<subgroups::cl_char3>(rft); error |= run_broadcast_for_extended_type<cl_char4>(rft); diff --git 
a/test_conformance/subgroups/test_subgroup_non_uniform_arithmetic.cpp b/test_conformance/subgroups/test_subgroup_non_uniform_arithmetic.cpp index eb46ff09..02fc507b 100644 --- a/test_conformance/subgroups/test_subgroup_non_uniform_arithmetic.cpp +++ b/test_conformance/subgroups/test_subgroup_non_uniform_arithmetic.cpp @@ -20,333 +20,25 @@ namespace { -static const char *scinadd_non_uniform_source = R"( - __kernel void test_scinadd_non_uniform(const __global Type *in, __global int4 *xy, __global Type *out) { +std::string sub_group_non_uniform_arithmetic_source = R"( + __kernel void test_%s(const __global Type *in, __global int4 *xy, __global Type *out, uint4 work_item_mask_vector) { int gid = get_global_id(0); XY(xy,gid); - int elect_work_item = 1 << (get_sub_group_local_id() % 32); - if (elect_work_item & WORK_ITEMS_MASK){ - out[gid] = sub_group_non_uniform_scan_inclusive_add(in[gid]); - } - } -)"; - -static const char *scinmax_non_uniform_source = R"( - __kernel void test_scinmax_non_uniform(const __global Type *in, __global int4 *xy, __global Type *out) { - int gid = get_global_id(0); - XY(xy,gid); - int elect_work_item = 1 << (get_sub_group_local_id() % 32); - if (elect_work_item & WORK_ITEMS_MASK){ - out[gid] = sub_group_non_uniform_scan_inclusive_max(in[gid]); - } - } -)"; - -static const char *scinmin_non_uniform_source = R"( - __kernel void test_scinmin_non_uniform(const __global Type *in, __global int4 *xy, __global Type *out) { - int gid = get_global_id(0); - XY(xy,gid); - int elect_work_item = 1 << (get_sub_group_local_id() % 32); - if (elect_work_item & WORK_ITEMS_MASK){ - out[gid] = sub_group_non_uniform_scan_inclusive_min(in[gid]); - } - } -)"; - -static const char *scinmul_non_uniform_source = R"( - __kernel void test_scinmul_non_uniform(const __global Type *in, __global int4 *xy, __global Type *out) { - int gid = get_global_id(0); - XY(xy,gid); - int elect_work_item = 1 << (get_sub_group_local_id() % 32); - if (elect_work_item & WORK_ITEMS_MASK){ - 
out[gid] = sub_group_non_uniform_scan_inclusive_mul(in[gid]); - } - } -)"; - -static const char *scinand_non_uniform_source = R"( - __kernel void test_scinand_non_uniform(const __global Type *in, __global int4 *xy, __global Type *out) { - int gid = get_global_id(0); - XY(xy,gid); - int elect_work_item = 1 << (get_sub_group_local_id() % 32); - if (elect_work_item & WORK_ITEMS_MASK){ - out[gid] = sub_group_non_uniform_scan_inclusive_and(in[gid]); - } - } -)"; - -static const char *scinor_non_uniform_source = R"( - __kernel void test_scinor_non_uniform(const __global Type *in, __global int4 *xy, __global Type *out) { - int gid = get_global_id(0); - XY(xy,gid); - int elect_work_item = 1 << (get_sub_group_local_id() % 32); - if (elect_work_item & WORK_ITEMS_MASK){ - out[gid] = sub_group_non_uniform_scan_inclusive_or(in[gid]); - } - } -)"; - -static const char *scinxor_non_uniform_source = R"( - __kernel void test_scinxor_non_uniform(const __global Type *in, __global int4 *xy, __global Type *out) { - int gid = get_global_id(0); - XY(xy,gid); - int elect_work_item = 1 << (get_sub_group_local_id() % 32); - if (elect_work_item & WORK_ITEMS_MASK){ - out[gid] = sub_group_non_uniform_scan_inclusive_xor(in[gid]); - } - } -)"; - -static const char *scinand_non_uniform_logical_source = R"( - __kernel void test_scinand_non_uniform_logical(const __global Type *in, __global int4 *xy, __global Type *out) { - int gid = get_global_id(0); - XY(xy,gid); - int elect_work_item = 1 << (get_sub_group_local_id() % 32); - if (elect_work_item & WORK_ITEMS_MASK){ - out[gid] = sub_group_non_uniform_scan_inclusive_logical_and(in[gid]); - } - } -)"; - -static const char *scinor_non_uniform_logical_source = R"( - __kernel void test_scinor_non_uniform_logical(const __global Type *in, __global int4 *xy, __global Type *out) { - int gid = get_global_id(0); - XY(xy,gid); - int elect_work_item = 1 << (get_sub_group_local_id() % 32); - if (elect_work_item & WORK_ITEMS_MASK){ - out[gid] = 
sub_group_non_uniform_scan_inclusive_logical_or(in[gid]); - } - } -)"; - -static const char *scinxor_non_uniform_logical_source = R"( - __kernel void test_scinxor_non_uniform_logical(const __global Type *in, __global int4 *xy, __global Type *out) { - int gid = get_global_id(0); - XY(xy,gid); - int elect_work_item = 1 << (get_sub_group_local_id() % 32); - if (elect_work_item & WORK_ITEMS_MASK){ - out[gid] = sub_group_non_uniform_scan_inclusive_logical_xor(in[gid]); - } - } -)"; - -static const char *scexadd_non_uniform_source = R"( - __kernel void test_scexadd_non_uniform(const __global Type *in, __global int4 *xy, __global Type *out) { - int gid = get_global_id(0); - XY(xy,gid); - int elect_work_item = 1 << (get_sub_group_local_id() % 32); - if (elect_work_item & WORK_ITEMS_MASK){ - out[gid] = sub_group_non_uniform_scan_exclusive_add(in[gid]); - } - } -)"; - -static const char *scexmax_non_uniform_source = R"( - __kernel void test_scexmax_non_uniform(const __global Type *in, __global int4 *xy, __global Type *out) { - int gid = get_global_id(0); - XY(xy,gid); - int elect_work_item = 1 << (get_sub_group_local_id() % 32); - if (elect_work_item & WORK_ITEMS_MASK){ - out[gid] = sub_group_non_uniform_scan_exclusive_max(in[gid]); - } - } -)"; - -static const char *scexmin_non_uniform_source = R"( - __kernel void test_scexmin_non_uniform(const __global Type *in, __global int4 *xy, __global Type *out) { - int gid = get_global_id(0); - XY(xy,gid); - int elect_work_item = 1 << (get_sub_group_local_id() % 32); - if (elect_work_item & WORK_ITEMS_MASK){ - out[gid] = sub_group_non_uniform_scan_exclusive_min(in[gid]); - } - } -)"; - -static const char *scexmul_non_uniform_source = R"( - __kernel void test_scexmul_non_uniform(const __global Type *in, __global int4 *xy, __global Type *out) { - int gid = get_global_id(0); - XY(xy,gid); - int elect_work_item = 1 << (get_sub_group_local_id() % 32); - if (elect_work_item & WORK_ITEMS_MASK){ - out[gid] = 
sub_group_non_uniform_scan_exclusive_mul(in[gid]); - } - } -)"; - -static const char *scexand_non_uniform_source = R"( - __kernel void test_scexand_non_uniform(const __global Type *in, __global int4 *xy, __global Type *out) { - int gid = get_global_id(0); - XY(xy,gid); - int elect_work_item = 1 << (get_sub_group_local_id() % 32); - if (elect_work_item & WORK_ITEMS_MASK){ - out[gid] = sub_group_non_uniform_scan_exclusive_and(in[gid]); - } - } -)"; - -static const char *scexor_non_uniform_source = R"( - __kernel void test_scexor_non_uniform(const __global Type *in, __global int4 *xy, __global Type *out) { - int gid = get_global_id(0); - XY(xy,gid); - int elect_work_item = 1 << (get_sub_group_local_id() % 32); - if (elect_work_item & WORK_ITEMS_MASK){ - out[gid] = sub_group_non_uniform_scan_exclusive_or(in[gid]); - } - } -)"; - -static const char *scexxor_non_uniform_source = R"( - __kernel void test_scexxor_non_uniform(const __global Type *in, __global int4 *xy, __global Type *out) { - int gid = get_global_id(0); - XY(xy,gid); - int elect_work_item = 1 << (get_sub_group_local_id() % 32); - if (elect_work_item & WORK_ITEMS_MASK){ - out[gid] = sub_group_non_uniform_scan_exclusive_xor(in[gid]); - } - } -)"; - -static const char *scexand_non_uniform_logical_source = R"( - __kernel void test_scexand_non_uniform_logical(const __global Type *in, __global int4 *xy, __global Type *out) { - int gid = get_global_id(0); - XY(xy,gid); - int elect_work_item = 1 << (get_sub_group_local_id() % 32); - if (elect_work_item & WORK_ITEMS_MASK){ - out[gid] = sub_group_non_uniform_scan_exclusive_logical_and(in[gid]); - } - } -)"; - -static const char *scexor_non_uniform_logical_source = R"( - __kernel void test_scexor_non_uniform_logical(const __global Type *in, __global int4 *xy, __global Type *out) { - int gid = get_global_id(0); - XY(xy,gid); - int elect_work_item = 1 << (get_sub_group_local_id() % 32); - if (elect_work_item & WORK_ITEMS_MASK){ - out[gid] = 
sub_group_non_uniform_scan_exclusive_logical_or(in[gid]); - } - } -)"; - -static const char *scexxor_non_uniform_logical_source = R"( - __kernel void test_scexxor_non_uniform_logical(const __global Type *in, __global int4 *xy, __global Type *out) { - int gid = get_global_id(0); - XY(xy,gid); - int elect_work_item = 1 << (get_sub_group_local_id() % 32); - if (elect_work_item & WORK_ITEMS_MASK){ - out[gid] = sub_group_non_uniform_scan_exclusive_logical_xor(in[gid]); - } - } -)"; - -static const char *redadd_non_uniform_source = R"( - __kernel void test_redadd_non_uniform(const __global Type *in, __global int4 *xy, __global Type *out) { - int gid = get_global_id(0); - XY(xy,gid); - int elect_work_item = 1 << (get_sub_group_local_id() % 32); - if (elect_work_item & WORK_ITEMS_MASK){ - out[gid] = sub_group_non_uniform_reduce_add(in[gid]); - } - } -)"; - -static const char *redmax_non_uniform_source = R"( - __kernel void test_redmax_non_uniform(const __global Type *in, __global int4 *xy, __global Type *out) { - int gid = get_global_id(0); - XY(xy,gid); - int elect_work_item = 1 << (get_sub_group_local_id() % 32); - if (elect_work_item & WORK_ITEMS_MASK){ - out[gid] = sub_group_non_uniform_reduce_max(in[gid]); - } - } -)"; - -static const char *redmin_non_uniform_source = R"( - __kernel void test_redmin_non_uniform(const __global Type *in, __global int4 *xy, __global Type *out) { - int gid = get_global_id(0); - XY(xy,gid); - int elect_work_item = 1 << (get_sub_group_local_id() % 32); - if (elect_work_item & WORK_ITEMS_MASK){ - out[gid] = sub_group_non_uniform_reduce_min(in[gid]); - } - } -)"; - -static const char *redmul_non_uniform_source = R"( - __kernel void test_redmul_non_uniform(const __global Type *in, __global int4 *xy, __global Type *out) { - int gid = get_global_id(0); - XY(xy,gid); - int elect_work_item = 1 << (get_sub_group_local_id() % 32); - if (elect_work_item & WORK_ITEMS_MASK){ - out[gid] = sub_group_non_uniform_reduce_mul(in[gid]); - } - } -)"; - -static 
const char *redand_non_uniform_source = R"( - __kernel void test_redand_non_uniform(const __global Type *in, __global int4 *xy, __global Type *out) { - int gid = get_global_id(0); - XY(xy,gid); - int elect_work_item = 1 << (get_sub_group_local_id() % 32); - if (elect_work_item & WORK_ITEMS_MASK){ - out[gid] = sub_group_non_uniform_reduce_and(in[gid]); - } - } -)"; - -static const char *redor_non_uniform_source = R"( - __kernel void test_redor_non_uniform(const __global Type *in, __global int4 *xy, __global Type *out) { - int gid = get_global_id(0); - XY(xy,gid); - int elect_work_item = 1 << (get_sub_group_local_id() % 32); - if (elect_work_item & WORK_ITEMS_MASK){ - out[gid] = sub_group_non_uniform_reduce_or(in[gid]); - } - } -)"; - -static const char *redxor_non_uniform_source = R"( - __kernel void test_redxor_non_uniform(const __global Type *in, __global int4 *xy, __global Type *out) { - int gid = get_global_id(0); - XY(xy,gid); - int elect_work_item = 1 << (get_sub_group_local_id() % 32); - if (elect_work_item & WORK_ITEMS_MASK){ - out[gid] = sub_group_non_uniform_reduce_xor(in[gid]); - } - } -)"; - -static const char *redand_non_uniform_logical_source = R"( - __kernel void test_redand_non_uniform_logical(const __global Type *in, __global int4 *xy, __global Type *out) { - int gid = get_global_id(0); - XY(xy,gid); - int elect_work_item = 1 << (get_sub_group_local_id() % 32); - if (elect_work_item & WORK_ITEMS_MASK){ - out[gid] = sub_group_non_uniform_reduce_logical_and(in[gid]); - } - } -)"; - -static const char *redor_non_uniform_logical_source = R"( - __kernel void test_redor_non_uniform_logical(const __global Type *in, __global int4 *xy, __global Type *out) { - int gid = get_global_id(0); - XY(xy,gid); - int elect_work_item = 1 << (get_sub_group_local_id() % 32); - if (elect_work_item & WORK_ITEMS_MASK){ - out[gid] = sub_group_non_uniform_reduce_logical_or(in[gid]); - } - } -)"; - -static const char *redxor_non_uniform_logical_source = R"( - __kernel void 
test_redxor_non_uniform_logical(const __global Type *in, __global int4 *xy, __global Type *out) { - int gid = get_global_id(0); - XY(xy,gid); - int elect_work_item = 1 << (get_sub_group_local_id() % 32); - if (elect_work_item & WORK_ITEMS_MASK){ - out[gid] = sub_group_non_uniform_reduce_logical_xor(in[gid]); - } + uint subgroup_local_id = get_sub_group_local_id(); + uint elect_work_item = 1 << (subgroup_local_id % 32); + uint work_item_mask; + if(subgroup_local_id < 32) { + work_item_mask = work_item_mask_vector.x; + } else if(subgroup_local_id < 64) { + work_item_mask = work_item_mask_vector.y; + } else if(subgroup_local_id < 96) { + work_item_mask = work_item_mask_vector.z; + } else if(subgroup_local_id < 128) { + work_item_mask = work_item_mask_vector.w; + } + if (elect_work_item & work_item_mask){ + out[gid] = %s(in[gid]); + } } )"; @@ -354,52 +46,52 @@ template <typename T> int run_functions_add_mul_max_min_for_type(RunTestForType rft) { int error = rft.run_impl<T, SCIN_NU<T, ArithmeticOp::add_>>( - "test_scinadd_non_uniform", scinadd_non_uniform_source); + "sub_group_non_uniform_scan_inclusive_add"); error |= rft.run_impl<T, SCIN_NU<T, ArithmeticOp::mul_>>( - "test_scinmul_non_uniform", scinmul_non_uniform_source); + "sub_group_non_uniform_scan_inclusive_mul"); error |= rft.run_impl<T, SCIN_NU<T, ArithmeticOp::max_>>( - "test_scinmax_non_uniform", scinmax_non_uniform_source); + "sub_group_non_uniform_scan_inclusive_max"); error |= rft.run_impl<T, SCIN_NU<T, ArithmeticOp::min_>>( - "test_scinmin_non_uniform", scinmin_non_uniform_source); + "sub_group_non_uniform_scan_inclusive_min"); error |= rft.run_impl<T, SCEX_NU<T, ArithmeticOp::add_>>( - "test_scexadd_non_uniform", scexadd_non_uniform_source); + "sub_group_non_uniform_scan_exclusive_add"); error |= rft.run_impl<T, SCEX_NU<T, ArithmeticOp::mul_>>( - "test_scexmul_non_uniform", scexmul_non_uniform_source); + "sub_group_non_uniform_scan_exclusive_mul"); error |= rft.run_impl<T, SCEX_NU<T, 
ArithmeticOp::max_>>( - "test_scexmax_non_uniform", scexmax_non_uniform_source); + "sub_group_non_uniform_scan_exclusive_max"); error |= rft.run_impl<T, SCEX_NU<T, ArithmeticOp::min_>>( - "test_scexmin_non_uniform", scexmin_non_uniform_source); + "sub_group_non_uniform_scan_exclusive_min"); error |= rft.run_impl<T, RED_NU<T, ArithmeticOp::add_>>( - "test_redadd_non_uniform", redadd_non_uniform_source); + "sub_group_non_uniform_reduce_add"); error |= rft.run_impl<T, RED_NU<T, ArithmeticOp::mul_>>( - "test_redmul_non_uniform", redmul_non_uniform_source); + "sub_group_non_uniform_reduce_mul"); error |= rft.run_impl<T, RED_NU<T, ArithmeticOp::max_>>( - "test_redmax_non_uniform", redmax_non_uniform_source); + "sub_group_non_uniform_reduce_max"); error |= rft.run_impl<T, RED_NU<T, ArithmeticOp::min_>>( - "test_redmin_non_uniform", redmin_non_uniform_source); + "sub_group_non_uniform_reduce_min"); return error; } template <typename T> int run_functions_and_or_xor_for_type(RunTestForType rft) { int error = rft.run_impl<T, SCIN_NU<T, ArithmeticOp::and_>>( - "test_scinand_non_uniform", scinand_non_uniform_source); + "sub_group_non_uniform_scan_inclusive_and"); error |= rft.run_impl<T, SCIN_NU<T, ArithmeticOp::or_>>( - "test_scinor_non_uniform", scinor_non_uniform_source); + "sub_group_non_uniform_scan_inclusive_or"); error |= rft.run_impl<T, SCIN_NU<T, ArithmeticOp::xor_>>( - "test_scinxor_non_uniform", scinxor_non_uniform_source); + "sub_group_non_uniform_scan_inclusive_xor"); error |= rft.run_impl<T, SCEX_NU<T, ArithmeticOp::and_>>( - "test_scexand_non_uniform", scexand_non_uniform_source); + "sub_group_non_uniform_scan_exclusive_and"); error |= rft.run_impl<T, SCEX_NU<T, ArithmeticOp::or_>>( - "test_scexor_non_uniform", scexor_non_uniform_source); + "sub_group_non_uniform_scan_exclusive_or"); error |= rft.run_impl<T, SCEX_NU<T, ArithmeticOp::xor_>>( - "test_scexxor_non_uniform", scexxor_non_uniform_source); + "sub_group_non_uniform_scan_exclusive_xor"); error |= 
rft.run_impl<T, RED_NU<T, ArithmeticOp::and_>>( - "test_redand_non_uniform", redand_non_uniform_source); + "sub_group_non_uniform_reduce_and"); error |= rft.run_impl<T, RED_NU<T, ArithmeticOp::or_>>( - "test_redor_non_uniform", redor_non_uniform_source); + "sub_group_non_uniform_reduce_or"); error |= rft.run_impl<T, RED_NU<T, ArithmeticOp::xor_>>( - "test_redxor_non_uniform", redxor_non_uniform_source); + "sub_group_non_uniform_reduce_xor"); return error; } @@ -407,23 +99,23 @@ template <typename T> int run_functions_logical_and_or_xor_for_type(RunTestForType rft) { int error = rft.run_impl<T, SCIN_NU<T, ArithmeticOp::logical_and>>( - "test_scinand_non_uniform_logical", scinand_non_uniform_logical_source); + "sub_group_non_uniform_scan_inclusive_logical_and"); error |= rft.run_impl<T, SCIN_NU<T, ArithmeticOp::logical_or>>( - "test_scinor_non_uniform_logical", scinor_non_uniform_logical_source); + "sub_group_non_uniform_scan_inclusive_logical_or"); error |= rft.run_impl<T, SCIN_NU<T, ArithmeticOp::logical_xor>>( - "test_scinxor_non_uniform_logical", scinxor_non_uniform_logical_source); + "sub_group_non_uniform_scan_inclusive_logical_xor"); error |= rft.run_impl<T, SCEX_NU<T, ArithmeticOp::logical_and>>( - "test_scexand_non_uniform_logical", scexand_non_uniform_logical_source); + "sub_group_non_uniform_scan_exclusive_logical_and"); error |= rft.run_impl<T, SCEX_NU<T, ArithmeticOp::logical_or>>( - "test_scexor_non_uniform_logical", scexor_non_uniform_logical_source); + "sub_group_non_uniform_scan_exclusive_logical_or"); error |= rft.run_impl<T, SCEX_NU<T, ArithmeticOp::logical_xor>>( - "test_scexxor_non_uniform_logical", scexxor_non_uniform_logical_source); + "sub_group_non_uniform_scan_exclusive_logical_xor"); error |= rft.run_impl<T, RED_NU<T, ArithmeticOp::logical_and>>( - "test_redand_non_uniform_logical", redand_non_uniform_logical_source); + "sub_group_non_uniform_reduce_logical_and"); error |= rft.run_impl<T, RED_NU<T, ArithmeticOp::logical_or>>( - 
"test_redor_non_uniform_logical", redor_non_uniform_logical_source); + "sub_group_non_uniform_reduce_logical_or"); error |= rft.run_impl<T, RED_NU<T, ArithmeticOp::logical_xor>>( - "test_redxor_non_uniform_logical", redxor_non_uniform_logical_source); + "sub_group_non_uniform_reduce_logical_xor"); return error; } @@ -434,17 +126,18 @@ int test_subgroup_functions_non_uniform_arithmetic(cl_device_id device, cl_command_queue queue, int num_elements) { - std::vector<std::string> required_extensions = { - "cl_khr_subgroup_non_uniform_arithmetic" - }; - std::vector<uint32_t> masks{ 0xffffffff, 0x55aaaa55, 0x5555aaaa, 0xaaaa5555, - 0x0f0ff0f0, 0x0f0f0f0f, 0xff0000ff, 0xff00ff00, - 0x00ffff00, 0x80000000, 0xaaaaaaaa }; + if (!is_extension_available(device, + "cl_khr_subgroup_non_uniform_arithmetic")) + { + log_info("cl_khr_subgroup_non_uniform_arithmetic is not supported on " + "this device, skipping test.\n"); + return TEST_SKIPPED_ITSELF; + } constexpr size_t global_work_size = 2000; constexpr size_t local_work_size = 200; - WorkGroupParams test_params(global_work_size, local_work_size, - required_extensions, masks); + WorkGroupParams test_params(global_work_size, local_work_size, 3); + test_params.save_kernel_source(sub_group_non_uniform_arithmetic_source); RunTestForType rft(device, context, queue, num_elements, test_params); int error = run_functions_add_mul_max_min_for_type<cl_int>(rft); @@ -470,4 +163,4 @@ int test_subgroup_functions_non_uniform_arithmetic(cl_device_id device, error |= run_functions_logical_and_or_xor_for_type<cl_int>(rft); return error; -}
\ No newline at end of file +} diff --git a/test_conformance/subgroups/test_subgroup_non_uniform_vote.cpp b/test_conformance/subgroups/test_subgroup_non_uniform_vote.cpp index 2b00b4dd..3be1ba30 100644 --- a/test_conformance/subgroups/test_subgroup_non_uniform_vote.cpp +++ b/test_conformance/subgroups/test_subgroup_non_uniform_vote.cpp @@ -22,31 +22,27 @@ namespace { template <typename T, NonUniformVoteOp operation> struct VOTE { + static void log_test(const WorkGroupParams &test_params, + const char *extra_text) + { + log_info(" sub_group_%s%s(%s)...%s\n", + (operation == NonUniformVoteOp::elect) ? "" : "non_uniform_", + operation_names(operation), TypeManager<T>::name(), + extra_text); + } + static void gen(T *x, T *t, cl_int *m, const WorkGroupParams &test_params) { int i, ii, j, k, n; int nw = test_params.local_workgroup_size; int ns = test_params.subgroup_size; int ng = test_params.global_workgroup_size; - uint32_t work_items_mask = test_params.work_items_mask; int nj = (nw + ns - 1) / ns; int non_uniform_size = ng % nw; ng = ng / nw; int last_subgroup_size = 0; ii = 0; - log_info(" sub_group_%s%s... \n", - (operation == NonUniformVoteOp::elect) ? 
"" : "non_uniform_", - operation_names(operation)); - - log_info(" test params: global size = %d local size = %d subgroups " - "size = %d work item mask = 0x%x data type (%s)\n", - test_params.global_workgroup_size, nw, ns, work_items_mask, - TypeManager<T>::name()); - if (non_uniform_size) - { - log_info(" non uniform work group size mode ON\n"); - } if (operation == NonUniformVoteOp::elect) return; for (k = 0; k < ng; ++k) @@ -92,14 +88,13 @@ template <typename T, NonUniformVoteOp operation> struct VOTE } } - static int chk(T *x, T *y, T *mx, T *my, cl_int *m, - const WorkGroupParams &test_params) + static test_status chk(T *x, T *y, T *mx, T *my, cl_int *m, + const WorkGroupParams &test_params) { int ii, i, j, k, n; int nw = test_params.local_workgroup_size; int ns = test_params.subgroup_size; int ng = test_params.global_workgroup_size; - uint32_t work_items_mask = test_params.work_items_mask; int nj = (nw + ns - 1) / ns; cl_int tr, rr; int non_uniform_size = ng % nw; @@ -141,8 +136,7 @@ template <typename T, NonUniformVoteOp operation> struct VOTE std::set<int> active_work_items; for (i = 0; i < n; ++i) { - uint32_t check_work_item = 1 << (i % 32); - if (work_items_mask & check_work_item) + if (test_params.work_items_mask.test(i)) { active_work_items.insert(i); switch (operation) @@ -172,34 +166,28 @@ template <typename T, NonUniformVoteOp operation> struct VOTE } if (active_work_items.empty()) { - log_info(" no one workitem acitve... in workgroup id = %d " - "subgroup id = %d\n", - k, j); + continue; } - else + auto lowest_active = active_work_items.begin(); + for (const int &active_work_item : active_work_items) { - auto lowest_active = active_work_items.begin(); - for (const int &active_work_item : active_work_items) + i = active_work_item; + if (operation == NonUniformVoteOp::elect) { - i = active_work_item; - if (operation == NonUniformVoteOp::elect) - { - i == *lowest_active ? tr = 1 : tr = 0; - } + i == *lowest_active ? 
tr = 1 : tr = 0; + } - // normalize device values on host, non zero set 1. - rr = compare_ordered<T>(my[ii + i], 0) ? 0 : 1; + // normalize device values on host, non zero set 1. + rr = compare_ordered<T>(my[ii + i], 0) ? 0 : 1; - if (rr != tr) - { - log_error("ERROR: sub_group_%s() \n", - operation_names(operation)); - log_error( - "mismatch for work item %d sub group %d in " - "work group %d. Expected: %d Obtained: %d\n", - i, j, k, tr, rr); - return TEST_FAIL; - } + if (rr != tr) + { + log_error("ERROR: sub_group_%s() \n", + operation_names(operation)); + log_error("mismatch for work item %d sub group %d in " + "work group %d. Expected: %d Obtained: %d\n", + i, j, k, tr, rr); + return TEST_FAIL; } } } @@ -209,52 +197,50 @@ template <typename T, NonUniformVoteOp operation> struct VOTE m += 4 * nw; } - log_info(" sub_group_%s%s... passed\n", - (operation == NonUniformVoteOp::elect) ? "" : "non_uniform_", - operation_names(operation)); return TEST_PASS; } }; -static const char *elect_source = R"( - __kernel void test_elect(const __global Type *in, __global int4 *xy, __global Type *out) { - int gid = get_global_id(0); - XY(xy,gid); - uint elect_work_item = 1 << (get_sub_group_local_id() % 32); - if (elect_work_item & WORK_ITEMS_MASK){ - out[gid] = sub_group_elect(); - } - } -)"; - -static const char *non_uniform_any_source = R"( - __kernel void test_non_uniform_any(const __global Type *in, __global int4 *xy, __global Type *out) { - int gid = get_global_id(0); - XY(xy,gid); - uint elect_work_item = 1 << (get_sub_group_local_id() % 32); - if (elect_work_item & WORK_ITEMS_MASK){ - out[gid] = sub_group_non_uniform_any(in[gid]); - } - } -)"; -static const char *non_uniform_all_source = R"( - __kernel void test_non_uniform_all(const __global Type *in, __global int4 *xy, __global Type *out) { +std::string sub_group_elect_source = R"( + __kernel void test_sub_group_elect(const __global Type *in, __global int4 *xy, __global Type *out, uint4 work_item_mask_vector) { int gid = 
get_global_id(0); XY(xy,gid); - uint elect_work_item = 1 << (get_sub_group_local_id() % 32); - if (elect_work_item & WORK_ITEMS_MASK){ - out[gid] = sub_group_non_uniform_all(in[gid]); - } + uint subgroup_local_id = get_sub_group_local_id(); + uint elect_work_item = 1 << (subgroup_local_id % 32); + uint work_item_mask; + if(subgroup_local_id < 32) { + work_item_mask = work_item_mask_vector.x; + } else if(subgroup_local_id < 64) { + work_item_mask = work_item_mask_vector.y; + } else if(subgroup_local_id < 96) { + work_item_mask = work_item_mask_vector.z; + } else if(subgroup_local_id < 128) { + work_item_mask = work_item_mask_vector.w; + } + if (elect_work_item & work_item_mask){ + out[gid] = sub_group_elect(); + } } )"; -static const char *non_uniform_all_equal_source = R"( - __kernel void test_non_uniform_all_equal(const __global Type *in, __global int4 *xy, __global Type *out) { +std::string sub_group_non_uniform_any_all_all_equal_source = R"( + __kernel void test_%s(const __global Type *in, __global int4 *xy, __global Type *out, uint4 work_item_mask_vector) { int gid = get_global_id(0); XY(xy,gid); - uint elect_work_item = 1 << (get_sub_group_local_id() % 32); - if (elect_work_item & WORK_ITEMS_MASK){ - out[gid] = sub_group_non_uniform_all_equal(in[gid]); + uint subgroup_local_id = get_sub_group_local_id(); + uint elect_work_item = 1 << (subgroup_local_id % 32); + uint work_item_mask; + if(subgroup_local_id < 32) { + work_item_mask = work_item_mask_vector.x; + } else if(subgroup_local_id < 64) { + work_item_mask = work_item_mask_vector.y; + } else if(subgroup_local_id < 96) { + work_item_mask = work_item_mask_vector.z; + } else if(subgroup_local_id < 128) { + work_item_mask = work_item_mask_vector.w; + } + if (elect_work_item & work_item_mask){ + out[gid] = %s(in[gid]); } } )"; @@ -262,7 +248,7 @@ static const char *non_uniform_all_equal_source = R"( template <typename T> int run_vote_all_equal_for_type(RunTestForType rft) { int error = rft.run_impl<T, VOTE<T, 
NonUniformVoteOp::all_equal>>( - "test_non_uniform_all_equal", non_uniform_all_equal_source); + "sub_group_non_uniform_all_equal"); return error; } } @@ -272,17 +258,19 @@ int test_subgroup_functions_non_uniform_vote(cl_device_id device, cl_command_queue queue, int num_elements) { - std::vector<std::string> required_extensions = { - "cl_khr_subgroup_non_uniform_vote" - }; + if (!is_extension_available(device, "cl_khr_subgroup_non_uniform_vote")) + { + log_info("cl_khr_subgroup_non_uniform_vote is not supported on this " + "device, skipping test.\n"); + return TEST_SKIPPED_ITSELF; + } - std::vector<uint32_t> masks{ 0xffffffff, 0x55aaaa55, 0x5555aaaa, 0xaaaa5555, - 0x0f0ff0f0, 0x0f0f0f0f, 0xff0000ff, 0xff00ff00, - 0x00ffff00, 0x80000000 }; constexpr size_t global_work_size = 170; constexpr size_t local_work_size = 64; - WorkGroupParams test_params(global_work_size, local_work_size, - required_extensions, masks); + WorkGroupParams test_params(global_work_size, local_work_size, 3); + test_params.save_kernel_source( + sub_group_non_uniform_any_all_all_equal_source); + test_params.save_kernel_source(sub_group_elect_source, "sub_group_elect"); RunTestForType rft(device, context, queue, num_elements, test_params); int error = run_vote_all_equal_for_type<cl_int>(rft); @@ -294,10 +282,10 @@ int test_subgroup_functions_non_uniform_vote(cl_device_id device, error |= run_vote_all_equal_for_type<subgroups::cl_half>(rft); error |= rft.run_impl<cl_int, VOTE<cl_int, NonUniformVoteOp::all>>( - "test_non_uniform_all", non_uniform_all_source); + "sub_group_non_uniform_all"); error |= rft.run_impl<cl_int, VOTE<cl_int, NonUniformVoteOp::elect>>( - "test_elect", elect_source); + "sub_group_elect"); error |= rft.run_impl<cl_int, VOTE<cl_int, NonUniformVoteOp::any>>( - "test_non_uniform_any", non_uniform_any_source); + "sub_group_non_uniform_any"); return error; } diff --git a/test_conformance/subgroups/test_subgroup_rotate.cpp b/test_conformance/subgroups/test_subgroup_rotate.cpp new file 
mode 100644 index 00000000..db0f48eb --- /dev/null +++ b/test_conformance/subgroups/test_subgroup_rotate.cpp @@ -0,0 +1,109 @@ +// +// Copyright (c) 2022 The Khronos Group Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +#include "procs.h" +#include "subhelpers.h" +#include "subgroup_common_kernels.h" +#include "subgroup_common_templates.h" +#include "harness/conversions.h" +#include "harness/typeWrappers.h" + +namespace { + +template <typename T> int run_rotate_for_type(RunTestForType rft) +{ + int error = rft.run_impl<T, SHF<T, ShuffleOp::rotate>>("sub_group_rotate"); + return error; +} + +std::string sub_group_clustered_rotate_source = R"( + __kernel void test_%s(const __global Type *in, __global int4 *xy, __global Type *out, + uint cluster_size) { + Type r; + int gid = get_global_id(0); + XY(xy,gid); + Type x = in[gid]; + int delta = xy[gid].z; + switch (cluster_size) { + case 1: r = %s(x, delta, 1); break; + case 2: r = %s(x, delta, 2); break; + case 4: r = %s(x, delta, 4); break; + case 8: r = %s(x, delta, 8); break; + case 16: r = %s(x, delta, 16); break; + case 32: r = %s(x, delta, 32); break; + case 64: r = %s(x, delta, 64); break; + case 128: r = %s(x, delta, 128); break; + } + out[gid] = r; + } +)"; + +template <typename T> int run_clustered_rotate_for_type(RunTestForType rft) +{ + int error = rft.run_impl<T, SHF<T, ShuffleOp::clustered_rotate>>( + "sub_group_clustered_rotate"); + return error; +} + +} + +int 
test_subgroup_functions_rotate(cl_device_id device, cl_context context, + cl_command_queue queue, int num_elements) +{ + if (!is_extension_available(device, "cl_khr_subgroup_rotate")) + { + log_info("cl_khr_subgroup_rotate is not supported on this device, " + "skipping test.\n"); + return TEST_SKIPPED_ITSELF; + } + + constexpr size_t global_work_size = 2000; + constexpr size_t local_work_size = 200; + WorkGroupParams test_params(global_work_size, local_work_size); + test_params.save_kernel_source(sub_group_generic_source); + RunTestForType rft(device, context, queue, num_elements, test_params); + + int error = run_rotate_for_type<cl_int>(rft); + error |= run_rotate_for_type<cl_uint>(rft); + error |= run_rotate_for_type<cl_long>(rft); + error |= run_rotate_for_type<cl_ulong>(rft); + error |= run_rotate_for_type<cl_short>(rft); + error |= run_rotate_for_type<cl_ushort>(rft); + error |= run_rotate_for_type<cl_char>(rft); + error |= run_rotate_for_type<cl_uchar>(rft); + error |= run_rotate_for_type<cl_float>(rft); + error |= run_rotate_for_type<cl_double>(rft); + error |= run_rotate_for_type<subgroups::cl_half>(rft); + + WorkGroupParams test_params_clustered(global_work_size, local_work_size, -1, + 3); + test_params_clustered.save_kernel_source(sub_group_clustered_rotate_source); + RunTestForType rft_clustered(device, context, queue, num_elements, + test_params_clustered); + + error |= run_clustered_rotate_for_type<cl_int>(rft_clustered); + error |= run_clustered_rotate_for_type<cl_uint>(rft_clustered); + error |= run_clustered_rotate_for_type<cl_long>(rft_clustered); + error |= run_clustered_rotate_for_type<cl_ulong>(rft_clustered); + error |= run_clustered_rotate_for_type<cl_short>(rft_clustered); + error |= run_clustered_rotate_for_type<cl_ushort>(rft_clustered); + error |= run_clustered_rotate_for_type<cl_char>(rft_clustered); + error |= run_clustered_rotate_for_type<cl_uchar>(rft_clustered); + error |= run_clustered_rotate_for_type<cl_float>(rft_clustered); + 
error |= run_clustered_rotate_for_type<cl_double>(rft_clustered); + error |= run_clustered_rotate_for_type<subgroups::cl_half>(rft_clustered); + + return error; +} diff --git a/test_conformance/subgroups/test_subgroup_shuffle.cpp b/test_conformance/subgroups/test_subgroup_shuffle.cpp index 049f0982..56231cbf 100644 --- a/test_conformance/subgroups/test_subgroup_shuffle.cpp +++ b/test_conformance/subgroups/test_subgroup_shuffle.cpp @@ -15,38 +15,19 @@ // #include "procs.h" #include "subhelpers.h" +#include "subgroup_common_kernels.h" #include "subgroup_common_templates.h" #include "harness/typeWrappers.h" #include <bitset> namespace { -static const char* shuffle_xor_source = - "__kernel void test_sub_group_shuffle_xor(const __global Type *in, " - "__global int4 *xy, __global Type *out)\n" - "{\n" - " int gid = get_global_id(0);\n" - " XY(xy,gid);\n" - " Type x = in[gid];\n" - " out[gid] = sub_group_shuffle_xor(x, xy[gid].z);" - "}\n"; - -static const char* shuffle_source = - "__kernel void test_sub_group_shuffle(const __global Type *in, __global " - "int4 *xy, __global Type *out)\n" - "{\n" - " int gid = get_global_id(0);\n" - " XY(xy,gid);\n" - " Type x = in[gid];\n" - " out[gid] = sub_group_shuffle(x, xy[gid].z);" - "}\n"; - template <typename T> int run_shuffle_for_type(RunTestForType rft) { - int error = rft.run_impl<T, SHF<T, ShuffleOp::shuffle>>( - "test_sub_group_shuffle", shuffle_source); + int error = + rft.run_impl<T, SHF<T, ShuffleOp::shuffle>>("sub_group_shuffle"); error |= rft.run_impl<T, SHF<T, ShuffleOp::shuffle_xor>>( - "test_sub_group_shuffle_xor", shuffle_xor_source); + "sub_group_shuffle_xor"); return error; } @@ -55,11 +36,17 @@ template <typename T> int run_shuffle_for_type(RunTestForType rft) int test_subgroup_functions_shuffle(cl_device_id device, cl_context context, cl_command_queue queue, int num_elements) { - std::vector<std::string> required_extensions{ "cl_khr_subgroup_shuffle" }; + if (!is_extension_available(device, 
"cl_khr_subgroup_shuffle")) + { + log_info("cl_khr_subgroup_shuffle is not supported on this device, " + "skipping test.\n"); + return TEST_SKIPPED_ITSELF; + } + constexpr size_t global_work_size = 2000; constexpr size_t local_work_size = 200; - WorkGroupParams test_params(global_work_size, local_work_size, - required_extensions); + WorkGroupParams test_params(global_work_size, local_work_size); + test_params.save_kernel_source(sub_group_generic_source); RunTestForType rft(device, context, queue, num_elements, test_params); int error = run_shuffle_for_type<cl_int>(rft); diff --git a/test_conformance/subgroups/test_subgroup_shuffle_relative.cpp b/test_conformance/subgroups/test_subgroup_shuffle_relative.cpp index 6000c970..caa1dccc 100644 --- a/test_conformance/subgroups/test_subgroup_shuffle_relative.cpp +++ b/test_conformance/subgroups/test_subgroup_shuffle_relative.cpp @@ -15,37 +15,19 @@ // #include "procs.h" #include "subhelpers.h" +#include "subgroup_common_kernels.h" #include "subgroup_common_templates.h" #include "harness/conversions.h" #include "harness/typeWrappers.h" namespace { -static const char* shuffle_down_source = - "__kernel void test_sub_group_shuffle_down(const __global Type *in, " - "__global int4 *xy, __global Type *out)\n" - "{\n" - " int gid = get_global_id(0);\n" - " XY(xy,gid);\n" - " Type x = in[gid];\n" - " out[gid] = sub_group_shuffle_down(x, xy[gid].z);" - "}\n"; -static const char* shuffle_up_source = - "__kernel void test_sub_group_shuffle_up(const __global Type *in, __global " - "int4 *xy, __global Type *out)\n" - "{\n" - " int gid = get_global_id(0);\n" - " XY(xy,gid);\n" - " Type x = in[gid];\n" - " out[gid] = sub_group_shuffle_up(x, xy[gid].z);" - "}\n"; - template <typename T> int run_shuffle_relative_for_type(RunTestForType rft) { - int error = rft.run_impl<T, SHF<T, ShuffleOp::shuffle_up>>( - "test_sub_group_shuffle_up", shuffle_up_source); + int error = + rft.run_impl<T, SHF<T, ShuffleOp::shuffle_up>>("sub_group_shuffle_up"); 
error |= rft.run_impl<T, SHF<T, ShuffleOp::shuffle_down>>( - "test_sub_group_shuffle_down", shuffle_down_source); + "sub_group_shuffle_down"); return error; } @@ -56,13 +38,17 @@ int test_subgroup_functions_shuffle_relative(cl_device_id device, cl_command_queue queue, int num_elements) { - std::vector<std::string> required_extensions = { - "cl_khr_subgroup_shuffle_relative" - }; + if (!is_extension_available(device, "cl_khr_subgroup_shuffle_relative")) + { + log_info("cl_khr_subgroup_shuffle_relative is not supported on this " + "device, skipping test.\n"); + return TEST_SKIPPED_ITSELF; + } + constexpr size_t global_work_size = 2000; constexpr size_t local_work_size = 200; - WorkGroupParams test_params(global_work_size, local_work_size, - required_extensions); + WorkGroupParams test_params(global_work_size, local_work_size); + test_params.save_kernel_source(sub_group_generic_source); RunTestForType rft(device, context, queue, num_elements, test_params); int error = run_shuffle_relative_for_type<cl_int>(rft); diff --git a/test_conformance/subgroups/test_workitem.cpp b/test_conformance/subgroups/test_workitem.cpp index 7ffa6a7c..b69f3138 100644 --- a/test_conformance/subgroups/test_workitem.cpp +++ b/test_conformance/subgroups/test_workitem.cpp @@ -16,6 +16,7 @@ #include "procs.h" #include "harness/conversions.h" #include "harness/typeWrappers.h" +#include <CL/cl.h> struct get_test_data { @@ -251,8 +252,21 @@ int test_work_item_functions(cl_device_id device, cl_context context, global = local * 5; - // Make sure we have a flexible range - global += 3 * local / 4; + // Non-uniform work-groups are an optional feature from 3.0 onward. 
+ cl_bool device_supports_non_uniform_wg = CL_TRUE; + if (get_device_cl_version(device) >= Version(3, 0)) + { + error = clGetDeviceInfo( + device, CL_DEVICE_NON_UNIFORM_WORK_GROUP_SUPPORT, sizeof(cl_bool), + &device_supports_non_uniform_wg, nullptr); + test_error(error, "clGetDeviceInfo failed"); + } + + if (device_supports_non_uniform_wg) + { + // Make sure we have a flexible range + global += 3 * local / 4; + } // Collect the data memset((void *)&result, 0xf0, sizeof(result)); @@ -327,4 +341,4 @@ int test_work_item_functions_ext(cl_device_id device, cl_context context, return test_work_item_functions(device, context, queue, num_elements, false); -}
\ No newline at end of file +} diff --git a/test_conformance/submission_details_template.txt b/test_conformance/submission_details_template.txt index 9d276a62..ff624837 100644 --- a/test_conformance/submission_details_template.txt +++ b/test_conformance/submission_details_template.txt @@ -81,6 +81,12 @@ Platform Version: # Tests version: +# Commit SHAs (7-digit) of any cherry-picked patches subsequent to tagged +# version. Any patches included must apply without conflicts to the tagged +# version in the order listed. +# +Patches: + # Implementations that support cl_khr_icd are required to use a loader to run # the tests and document the loader that was used. # diff --git a/test_conformance/vectors/test_step.cpp b/test_conformance/vectors/test_step.cpp index 2f6ad187..089bad2f 100644 --- a/test_conformance/vectors/test_step.cpp +++ b/test_conformance/vectors/test_step.cpp @@ -172,6 +172,8 @@ int test_step_internal(cl_device_id deviceID, cl_context context, destroyClState(pClState); return -1; } + + clStateDestroyProgramAndKernel(pClState); } } diff --git a/test_conformance/vulkan/CMakeLists.txt b/test_conformance/vulkan/CMakeLists.txt new file mode 100644 index 00000000..4f43172a --- /dev/null +++ b/test_conformance/vulkan/CMakeLists.txt @@ -0,0 +1,50 @@ +set (MODULE_NAME VULKAN) + +if(WIN32) + list(APPEND CLConform_LIBRARIES vulkan-1) +else(WIN32) + list(APPEND CLConform_LIBRARIES vulkan dl) +endif(WIN32) +set(CMAKE_CXX_FLAGS "-fpermissive") +if(WIN32) + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DVK_USE_PLATFORM_WIN32_KHR") +endif(WIN32) + +set (CLConform_VULKAN_LIBRARIES_DIR "${VULKAN_LIB_DIR}") + +link_directories(${CLConform_VULKAN_LIBRARIES_DIR}) + +list(APPEND CLConform_INCLUDE_DIR ${VULKAN_INCLUDE_DIR}) + +include_directories(${CMAKE_CURRENT_SOURCE_DIR}) + +include_directories (${CLConform_INCLUDE_DIR}) + +set (${MODULE_NAME}_SOURCES + main.cpp + test_vulkan_interop_buffer.cpp + test_vulkan_interop_image.cpp + test_vulkan_api_consistency.cpp + 
test_vulkan_platform_device_info.cpp + vulkan_interop_common/vulkan_wrapper.cpp + vulkan_interop_common/vulkan_interop_common.cpp + vulkan_interop_common/opencl_vulkan_wrapper.cpp + vulkan_interop_common/vulkan_utility.cpp + vulkan_interop_common/vulkan_list_map.cpp + ../../test_common/harness/genericThread.cpp + ../../test_common/harness/errorHelpers.cpp + ../../test_common/harness/testHarness.cpp + ../../test_common/harness/kernelHelpers.cpp + ../../test_common/harness/mt19937.cpp + ../../test_common/harness/msvc9.c + ../../test_common/harness/parseParameters.cpp + ../../test_common/harness/deviceInfo.cpp + ../../test_common/harness/crc32.cpp + ) + +set_source_files_properties( + ${${MODULE_NAME}_SOURCES} + PROPERTIES LANGUAGE CXX) +include_directories("./vulkan_interop_common/") + +include(../CMakeCommon.txt) diff --git a/test_conformance/vulkan/main.cpp b/test_conformance/vulkan/main.cpp new file mode 100644 index 00000000..2eeb0c36 --- /dev/null +++ b/test_conformance/vulkan/main.cpp @@ -0,0 +1,346 @@ +// +// Copyright (c) 2022 The Khronos Group Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+// + +#include <stdio.h> +#include <stdlib.h> + +#if !defined(_WIN32) +#include <stdbool.h> +#endif + +#include <math.h> +#include <string.h> + +#if !defined(__APPLE__) +#include <CL/cl.h> +#else +#include <OpenCL/cl.h> +#endif + + +#include "procs.h" +#include "harness/testHarness.h" +#include "harness/parseParameters.h" +#include "harness/deviceInfo.h" + +#if !defined(_WIN32) +#include <unistd.h> +#endif +#include <vulkan_interop_common.hpp> +#include <vulkan_wrapper.hpp> + +#define BUFFERSIZE 3000 + +static void params_reset() +{ + numCQ = 1; + multiImport = false; + multiCtx = false; +} + +extern int test_buffer_common(cl_device_id device_, cl_context context_, + cl_command_queue queue_, int numElements_); +extern int test_image_common(cl_device_id device_, cl_context context_, + cl_command_queue queue_, int numElements_); + +int test_buffer_single_queue(cl_device_id device_, cl_context context_, + cl_command_queue queue_, int numElements_) +{ + params_reset(); + log_info("RUNNING TEST WITH ONE QUEUE...... \n\n"); + return test_buffer_common(device_, context_, queue_, numElements_); +} +int test_buffer_multiple_queue(cl_device_id device_, cl_context context_, + cl_command_queue queue_, int numElements_) +{ + params_reset(); + numCQ = 2; + log_info("RUNNING TEST WITH TWO QUEUE...... \n\n"); + return test_buffer_common(device_, context_, queue_, numElements_); +} +int test_buffer_multiImport_sameCtx(cl_device_id device_, cl_context context_, + cl_command_queue queue_, int numElements_) +{ + params_reset(); + multiImport = true; + log_info("RUNNING TEST WITH MULTIPLE DEVICE MEMORY IMPORT " + "IN SAME CONTEXT...... 
\n\n"); + return test_buffer_common(device_, context_, queue_, numElements_); +} +int test_buffer_multiImport_diffCtx(cl_device_id device_, cl_context context_, + cl_command_queue queue_, int numElements_) +{ + params_reset(); + multiImport = true; + multiCtx = true; + log_info("RUNNING TEST WITH MULTIPLE DEVICE MEMORY IMPORT " + "IN DIFFERENT CONTEXT...... \n\n"); + return test_buffer_common(device_, context_, queue_, numElements_); +} +int test_image_single_queue(cl_device_id device_, cl_context context_, + cl_command_queue queue_, int numElements_) +{ + params_reset(); + log_info("RUNNING TEST WITH ONE QUEUE...... \n\n"); + return test_image_common(device_, context_, queue_, numElements_); +} +int test_image_multiple_queue(cl_device_id device_, cl_context context_, + cl_command_queue queue_, int numElements_) +{ + params_reset(); + numCQ = 2; + log_info("RUNNING TEST WITH TWO QUEUE...... \n\n"); + return test_image_common(device_, context_, queue_, numElements_); +} + +test_definition test_list[] = { ADD_TEST(buffer_single_queue), + ADD_TEST(buffer_multiple_queue), + ADD_TEST(buffer_multiImport_sameCtx), + ADD_TEST(buffer_multiImport_diffCtx), + ADD_TEST(image_single_queue), + ADD_TEST(image_multiple_queue), + ADD_TEST(consistency_external_buffer), + ADD_TEST(consistency_external_image), + ADD_TEST(consistency_external_semaphore), + ADD_TEST(platform_info), + ADD_TEST(device_info) }; + +const int test_num = ARRAY_SIZE(test_list); + +cl_device_type gDeviceType = CL_DEVICE_TYPE_DEFAULT; +char *choosen_platform_name = NULL; +cl_platform_id platform = NULL; +cl_int choosen_platform_index = -1; +char platform_name[1024] = ""; +cl_platform_id select_platform = NULL; +char *extensions = NULL; +size_t extensionSize = 0; +cl_uint num_devices = 0; +cl_uint device_no = 0; +cl_device_id *devices; +const size_t bufsize = BUFFERSIZE; +char buf[BUFFERSIZE]; +cl_uchar uuid[CL_UUID_SIZE_KHR]; +unsigned int numCQ; +bool multiImport; +bool multiCtx; +bool debug_trace = false; 
+bool useSingleImageKernel = false; +bool useDeviceLocal = false; +bool disableNTHandleType = false; +bool enableOffset = false; +bool non_dedicated = false; + +static void printUsage(const char *execName) +{ + const char *p = strrchr(execName, '/'); + if (p != NULL) execName = p + 1; + + log_info("Usage: %s [test_names] [options]\n", execName); + log_info("Test names:\n"); + for (int i = 0; i < test_num; i++) + { + log_info("\t%s\n", test_list[i].name); + } + log_info("\n"); + log_info("Options:\n"); + log_info("\t--debug_trace - Enables additional debug info logging\n"); + log_info("\t--non_dedicated - Choose dedicated Vs. non_dedicated \n"); +} + +size_t parseParams(int argc, const char *argv[], const char **argList) +{ + size_t argCount = 1; + for (int i = 1; i < argc; i++) + { + if (argv[i] == NULL) break; + if (argv[i][0] == '-') + { + if (!strcmp(argv[i], "--debug_trace")) + { + debug_trace = true; + } + if (!strcmp(argv[i], "--useSingleImageKernel")) + { + useSingleImageKernel = true; + } + if (!strcmp(argv[i], "--useDeviceLocal")) + { + useDeviceLocal = true; + } + if (!strcmp(argv[i], "--disableNTHandleType")) + { + disableNTHandleType = true; + } + if (!strcmp(argv[i], "--enableOffset")) + { + enableOffset = true; + } + if (!strcmp(argv[i], "--non_dedicated")) + { + non_dedicated = true; + } + if (strcmp(argv[i], "-h") == 0) + { + printUsage(argv[0]); + argCount = 0; // Returning argCount=0 to assert error in main() + break; + } + } + else + { + argList[argCount] = argv[i]; + argCount++; + } + } + return argCount; +} + +int main(int argc, const char *argv[]) +{ + int errNum = 0; + + test_start(); + params_reset(); + + if (!checkVkSupport()) + { + log_info("Vulkan supported GPU not found \n"); + log_info("TEST SKIPPED \n"); + return 0; + } + + VulkanDevice vkDevice; + + cl_device_type requestedDeviceType = CL_DEVICE_TYPE_GPU; + char *force_cpu = getenv("CL_DEVICE_TYPE"); + if (force_cpu != NULL) + { + if (strcmp(force_cpu, "gpu") == 0 + || 
strcmp(force_cpu, "CL_DEVICE_TYPE_GPU") == 0) + requestedDeviceType = CL_DEVICE_TYPE_GPU; + else if (strcmp(force_cpu, "cpu") == 0 + || strcmp(force_cpu, "CL_DEVICE_TYPE_CPU") == 0) + requestedDeviceType = CL_DEVICE_TYPE_CPU; + else if (strcmp(force_cpu, "accelerator") == 0 + || strcmp(force_cpu, "CL_DEVICE_TYPE_ACCELERATOR") == 0) + requestedDeviceType = CL_DEVICE_TYPE_ACCELERATOR; + else if (strcmp(force_cpu, "CL_DEVICE_TYPE_DEFAULT") == 0) + requestedDeviceType = CL_DEVICE_TYPE_DEFAULT; + } + + if (requestedDeviceType != CL_DEVICE_TYPE_GPU) + { + log_info("Vulkan tests can only run on a GPU device.\n"); + return 0; + } + gDeviceType = CL_DEVICE_TYPE_GPU; + + const char **argList = (const char **)calloc(argc, sizeof(char *)); + size_t argCount = parseParams(argc, argv, argList); + if (argCount == 0) return 0; + // get the platform ID + errNum = clGetPlatformIDs(1, &platform, NULL); + if (errNum != CL_SUCCESS) + { + print_error(errNum, "Error: Failed to get platform\n"); + return errNum; + } + + errNum = + clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 0, NULL, &num_devices); + if (CL_SUCCESS != errNum) + { + print_error(errNum, "clGetDeviceIDs failed in returning of devices\n"); + return errNum; + } + devices = (cl_device_id *)malloc(num_devices * sizeof(cl_device_id)); + if (NULL == devices) + { + print_error(errNum, "Unable to allocate memory for devices\n"); + return CL_OUT_OF_HOST_MEMORY; + } + errNum = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, num_devices, devices, + NULL); + if (CL_SUCCESS != errNum) + { + print_error(errNum, "Failed to get deviceID.\n"); + return errNum; + } + for (device_no = 0; device_no < num_devices; device_no++) + { + errNum = clGetDeviceInfo(devices[device_no], CL_DEVICE_EXTENSIONS, 0, + NULL, &extensionSize); + if (CL_SUCCESS != errNum) + { + log_error("Error in clGetDeviceInfo for getting " + "device_extension size....\n"); + return errNum; + } + extensions = (char *)malloc(extensionSize); + if (NULL == extensions) + { + 
log_error("Unable to allocate memory for extensions\n"); + return CL_OUT_OF_HOST_MEMORY; + } + errNum = + clGetDeviceInfo(devices[device_no], CL_DEVICE_EXTENSIONS, + extensionSize, extensions, NULL /*&extensionSize*/); + if (CL_SUCCESS != errNum) + { + print_error(errNum, + "Error in clGetDeviceInfo for getting " + "device_extension\n"); + return errNum; + } + errNum = clGetDeviceInfo(devices[device_no], CL_DEVICE_UUID_KHR, + CL_UUID_SIZE_KHR, uuid, &extensionSize); + if (CL_SUCCESS != errNum) + { + print_error(errNum, "clGetDeviceInfo failed with error\n "); + return errNum; + } + errNum = + memcmp(uuid, vkDevice.getPhysicalDevice().getUUID(), VK_UUID_SIZE); + if (errNum == 0) + { + break; + } + } + if (device_no >= num_devices) + { + fprintf(stderr, + "OpenCL error: " + "No Vulkan-OpenCL Interop capable GPU found.\n"); + } + if (!(is_extension_available(devices[device_no], "cl_khr_external_memory") + && is_extension_available(devices[device_no], + "cl_khr_external_semaphore"))) + { + log_info("Device does not support cl_khr_external_memory " + "or cl_khr_external_semaphore\n"); + log_info(" TEST SKIPPED\n"); + return CL_SUCCESS; + } + init_cl_vk_ext(platform); + + // Execute tests. + // Note: don't use the entire harness, because we have a different way of + // obtaining the device (via the context) + errNum = parseAndCallCommandLineTests(argCount, argList, devices[device_no], + test_num, test_list, true, 0, 1024); + return errNum; +} diff --git a/test_conformance/vulkan/procs.h b/test_conformance/vulkan/procs.h new file mode 100644 index 00000000..37bf7869 --- /dev/null +++ b/test_conformance/vulkan/procs.h @@ -0,0 +1,38 @@ +// +// Copyright (c) 2022 The Khronos Group Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// + +#include "harness/mt19937.h" + +extern int test_vulkan_interop_buffer(cl_device_id device, cl_context context, + cl_command_queue queue, int num_elements); +extern int test_vulkan_interop_image(cl_device_id device, cl_context context, + cl_command_queue queue, int num_elements); +extern int test_consistency_external_buffer(cl_device_id device, + cl_context context, + cl_command_queue queue, + int num_elements); +extern int test_consistency_external_image(cl_device_id device, + cl_context context, + cl_command_queue queue, + int num_elements); +extern int test_consistency_external_semaphore(cl_device_id device, + cl_context context, + cl_command_queue queue, + int num_elements); +extern int test_platform_info(cl_device_id device, cl_context context, + cl_command_queue queue, int num_elements); +extern int test_device_info(cl_device_id device, cl_context context, + cl_command_queue queue, int num_elements); diff --git a/test_conformance/vulkan/shaders/buffer.comp b/test_conformance/vulkan/shaders/buffer.comp new file mode 100644 index 00000000..d8756f92 --- /dev/null +++ b/test_conformance/vulkan/shaders/buffer.comp @@ -0,0 +1,28 @@ +#version 450 +#extension GL_ARB_separate_shader_objects : enable +#extension GL_EXT_shader_explicit_arithmetic_types_int8 : enable +#extension GL_EXT_shader_explicit_arithmetic_types_int32 : enable + +#define MAX_BUFFERS 5 + +layout(binding = 0) buffer Params +{ + uint32_t numBuffers; + uint32_t bufferSize; + uint32_t interBufferOffset; +}; +layout(binding = 1) buffer Buffer +{ + uint8_t ptr[]; +} 
bufferPtrList[MAX_BUFFERS]; +layout(local_size_x = 512) in; +void main() { + for (uint32_t bufIdx = 0; bufIdx < numBuffers; bufIdx++) { + uint32_t ptrIdx = gl_GlobalInvocationID.x; + uint32_t limit = bufferSize; + while (ptrIdx < limit) { + bufferPtrList[bufIdx].ptr[ptrIdx]++; + ptrIdx += (gl_NumWorkGroups.x * gl_WorkGroupSize.x); + } + } +}
\ No newline at end of file diff --git a/test_conformance/vulkan/shaders/buffer.spv b/test_conformance/vulkan/shaders/buffer.spv Binary files differnew file mode 100644 index 00000000..685523ba --- /dev/null +++ b/test_conformance/vulkan/shaders/buffer.spv diff --git a/test_conformance/vulkan/shaders/image2D.comp b/test_conformance/vulkan/shaders/image2D.comp new file mode 100644 index 00000000..42fa2f73 --- /dev/null +++ b/test_conformance/vulkan/shaders/image2D.comp @@ -0,0 +1,31 @@ +#version 450 +#extension GL_ARB_separate_shader_objects : enable +#extension GL_EXT_shader_explicit_arithmetic_types_int32 : enable + +#define MAX_2D_IMAGES 5 +#define MAX_2D_IMAGE_MIP_LEVELS 11 +#define MAX_2D_IMAGE_DESCRIPTORS MAX_2D_IMAGES * MAX_2D_IMAGE_MIP_LEVELS + +layout(binding = 0) buffer Params +{ + uint32_t numImage2DDescriptors; +}; +layout(binding = 1, rgba32f ) uniform image2D image2DList[ MAX_2D_IMAGE_DESCRIPTORS ]; +layout(local_size_x = 32, local_size_y = 32) in; +void main() { + uvec3 numThreads = gl_NumWorkGroups * gl_WorkGroupSize; + for (uint32_t image2DIdx = 0; image2DIdx < numImage2DDescriptors; image2DIdx++) { + ivec2 imageDim = imageSize(image2DList[image2DIdx]); + uint32_t heightBy2 = imageDim.y / 2; + for (uint32_t row = gl_GlobalInvocationID.y; row < heightBy2; row += numThreads.y) { + for (uint32_t col = gl_GlobalInvocationID.x; col < imageDim.x; col += numThreads.x) { + ivec2 coordsA = ivec2(col, row); + ivec2 coordsB = ivec2(col, imageDim.y - row - 1); + vec4 dataA = imageLoad(image2DList[image2DIdx], coordsA); + vec4 dataB = imageLoad(image2DList[image2DIdx], coordsB); + imageStore(image2DList[image2DIdx], coordsA, dataB); + imageStore(image2DList[image2DIdx], coordsB, dataA); + } + } + } +}
\ No newline at end of file diff --git a/test_conformance/vulkan/shaders/image2D_r16i.spv b/test_conformance/vulkan/shaders/image2D_r16i.spv Binary files differnew file mode 100644 index 00000000..00c5c283 --- /dev/null +++ b/test_conformance/vulkan/shaders/image2D_r16i.spv diff --git a/test_conformance/vulkan/shaders/image2D_r16ui.spv b/test_conformance/vulkan/shaders/image2D_r16ui.spv Binary files differnew file mode 100644 index 00000000..87514d9f --- /dev/null +++ b/test_conformance/vulkan/shaders/image2D_r16ui.spv diff --git a/test_conformance/vulkan/shaders/image2D_r32f.spv b/test_conformance/vulkan/shaders/image2D_r32f.spv Binary files differnew file mode 100644 index 00000000..e82c9c19 --- /dev/null +++ b/test_conformance/vulkan/shaders/image2D_r32f.spv diff --git a/test_conformance/vulkan/shaders/image2D_r32i.spv b/test_conformance/vulkan/shaders/image2D_r32i.spv Binary files differnew file mode 100644 index 00000000..7ea8d26f --- /dev/null +++ b/test_conformance/vulkan/shaders/image2D_r32i.spv diff --git a/test_conformance/vulkan/shaders/image2D_r32ui.spv b/test_conformance/vulkan/shaders/image2D_r32ui.spv Binary files differnew file mode 100644 index 00000000..dbcdbc5f --- /dev/null +++ b/test_conformance/vulkan/shaders/image2D_r32ui.spv diff --git a/test_conformance/vulkan/shaders/image2D_r8i.spv b/test_conformance/vulkan/shaders/image2D_r8i.spv Binary files differnew file mode 100644 index 00000000..1a641475 --- /dev/null +++ b/test_conformance/vulkan/shaders/image2D_r8i.spv diff --git a/test_conformance/vulkan/shaders/image2D_r8ui.spv b/test_conformance/vulkan/shaders/image2D_r8ui.spv Binary files differnew file mode 100644 index 00000000..a90ccf98 --- /dev/null +++ b/test_conformance/vulkan/shaders/image2D_r8ui.spv diff --git a/test_conformance/vulkan/shaders/image2D_rg16i.spv b/test_conformance/vulkan/shaders/image2D_rg16i.spv Binary files differnew file mode 100644 index 00000000..07996173 --- /dev/null +++ 
b/test_conformance/vulkan/shaders/image2D_rg16i.spv diff --git a/test_conformance/vulkan/shaders/image2D_rg16ui.spv b/test_conformance/vulkan/shaders/image2D_rg16ui.spv Binary files differnew file mode 100644 index 00000000..f73e096b --- /dev/null +++ b/test_conformance/vulkan/shaders/image2D_rg16ui.spv diff --git a/test_conformance/vulkan/shaders/image2D_rg32f.spv b/test_conformance/vulkan/shaders/image2D_rg32f.spv Binary files differnew file mode 100644 index 00000000..1489660e --- /dev/null +++ b/test_conformance/vulkan/shaders/image2D_rg32f.spv diff --git a/test_conformance/vulkan/shaders/image2D_rg32i.spv b/test_conformance/vulkan/shaders/image2D_rg32i.spv Binary files differnew file mode 100644 index 00000000..b7d302f4 --- /dev/null +++ b/test_conformance/vulkan/shaders/image2D_rg32i.spv diff --git a/test_conformance/vulkan/shaders/image2D_rg32ui.spv b/test_conformance/vulkan/shaders/image2D_rg32ui.spv Binary files differnew file mode 100644 index 00000000..6cf2f1b8 --- /dev/null +++ b/test_conformance/vulkan/shaders/image2D_rg32ui.spv diff --git a/test_conformance/vulkan/shaders/image2D_rg8i.spv b/test_conformance/vulkan/shaders/image2D_rg8i.spv Binary files differnew file mode 100644 index 00000000..a71b9bf0 --- /dev/null +++ b/test_conformance/vulkan/shaders/image2D_rg8i.spv diff --git a/test_conformance/vulkan/shaders/image2D_rg8ui.spv b/test_conformance/vulkan/shaders/image2D_rg8ui.spv Binary files differnew file mode 100644 index 00000000..2aca9290 --- /dev/null +++ b/test_conformance/vulkan/shaders/image2D_rg8ui.spv diff --git a/test_conformance/vulkan/shaders/image2D_rgba16i.spv b/test_conformance/vulkan/shaders/image2D_rgba16i.spv Binary files differnew file mode 100644 index 00000000..0cb95dfd --- /dev/null +++ b/test_conformance/vulkan/shaders/image2D_rgba16i.spv diff --git a/test_conformance/vulkan/shaders/image2D_rgba16ui.spv b/test_conformance/vulkan/shaders/image2D_rgba16ui.spv Binary files differnew file mode 100644 index 00000000..84c3d3db 
--- /dev/null +++ b/test_conformance/vulkan/shaders/image2D_rgba16ui.spv diff --git a/test_conformance/vulkan/shaders/image2D_rgba32f.spv b/test_conformance/vulkan/shaders/image2D_rgba32f.spv Binary files differnew file mode 100644 index 00000000..35136c58 --- /dev/null +++ b/test_conformance/vulkan/shaders/image2D_rgba32f.spv diff --git a/test_conformance/vulkan/shaders/image2D_rgba32i.spv b/test_conformance/vulkan/shaders/image2D_rgba32i.spv Binary files differnew file mode 100644 index 00000000..4d1ae581 --- /dev/null +++ b/test_conformance/vulkan/shaders/image2D_rgba32i.spv diff --git a/test_conformance/vulkan/shaders/image2D_rgba32ui.spv b/test_conformance/vulkan/shaders/image2D_rgba32ui.spv Binary files differnew file mode 100644 index 00000000..bed86f0c --- /dev/null +++ b/test_conformance/vulkan/shaders/image2D_rgba32ui.spv diff --git a/test_conformance/vulkan/shaders/image2D_rgba8i.spv b/test_conformance/vulkan/shaders/image2D_rgba8i.spv Binary files differnew file mode 100644 index 00000000..edf8c58c --- /dev/null +++ b/test_conformance/vulkan/shaders/image2D_rgba8i.spv diff --git a/test_conformance/vulkan/shaders/image2D_rgba8ui.spv b/test_conformance/vulkan/shaders/image2D_rgba8ui.spv Binary files differnew file mode 100644 index 00000000..bb9a770c --- /dev/null +++ b/test_conformance/vulkan/shaders/image2D_rgba8ui.spv diff --git a/test_conformance/vulkan/test_vulkan_api_consistency.cpp b/test_conformance/vulkan/test_vulkan_api_consistency.cpp new file mode 100644 index 00000000..f22ac319 --- /dev/null +++ b/test_conformance/vulkan/test_vulkan_api_consistency.cpp @@ -0,0 +1,568 @@ +// +// Copyright (c) 2022 The Khronos Group Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// + +#include <vulkan_interop_common.hpp> +#include <opencl_vulkan_wrapper.hpp> +#include <vulkan_wrapper.hpp> +#if !defined(__APPLE__) +#include <CL/cl.h> +#include <CL/cl_ext.h> +#else +#include <OpenCL/cl.h> +#include <OpenCL/cl_ext.h> +#endif + +#include <assert.h> +#include <vector> +#include <iostream> +#include <string.h> +#include "harness/testHarness.h" +#include "harness/typeWrappers.h" +#include "harness/deviceInfo.h" + +int test_consistency_external_buffer(cl_device_id deviceID, cl_context _context, + cl_command_queue _queue, int num_elements) +{ + cl_int errNum; + VulkanDevice vkDevice; + // Context and command queue creation + cl_platform_id platform = NULL; + cl_context context = NULL; + cl_command_queue cmd_queue = NULL; + + cl_context_properties contextProperties[] = { CL_CONTEXT_PLATFORM, 0, 0 }; + errNum = clGetPlatformIDs(1, &platform, NULL); + test_error(errNum, "Failed to get platform Id"); + + contextProperties[1] = (cl_context_properties)platform; + + context = clCreateContextFromType(contextProperties, CL_DEVICE_TYPE_GPU, + NULL, NULL, &errNum); + test_error(errNum, "Unable to create context with properties"); + + cmd_queue = clCreateCommandQueue(context, deviceID, 0, &errNum); + test_error(errNum, "Unable to create command queue"); + + uint32_t bufferSize = 32; + cl_device_id devList[] = { deviceID, NULL }; + +#ifdef _WIN32 + if (!is_extension_available(devList[0], "cl_khr_external_memory_win32")) + { + throw std::runtime_error("Device does not support " + "cl_khr_external_memory_win32 extension \n"); + } +#else + if 
(!is_extension_available(devList[0], "cl_khr_external_memory_opaque_fd")) + { + throw std::runtime_error( + "Device does not support " + "cl_khr_external_memory_opaque_fd extension \n"); + } +#endif + + VulkanExternalMemoryHandleType vkExternalMemoryHandleType = + getSupportedVulkanExternalMemoryHandleTypeList()[0]; + + VulkanBuffer vkDummyBuffer(vkDevice, 4 * 1024, vkExternalMemoryHandleType); + const VulkanMemoryTypeList& memoryTypeList = + vkDummyBuffer.getMemoryTypeList(); + + VulkanDeviceMemory* vkDeviceMem = new VulkanDeviceMemory( + vkDevice, bufferSize, memoryTypeList[0], vkExternalMemoryHandleType); + VulkanBufferList vkBufferList(1, vkDevice, bufferSize, + vkExternalMemoryHandleType); + + vkDeviceMem->bindBuffer(vkBufferList[0], 0); + + void* handle = NULL; + int fd; + + std::vector<cl_mem_properties> extMemProperties{ + (cl_mem_properties)CL_DEVICE_HANDLE_LIST_KHR, + (cl_mem_properties)devList[0], + (cl_mem_properties)CL_DEVICE_HANDLE_LIST_END_KHR, + }; + cl_external_memory_handle_type_khr type; + switch (vkExternalMemoryHandleType) + { +#ifdef _WIN32 + case VULKAN_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_WIN32_NT: + handle = vkDeviceMem->getHandle(vkExternalMemoryHandleType); + type = CL_EXTERNAL_MEMORY_HANDLE_OPAQUE_WIN32_KHR; + errNum = check_external_memory_handle_type(devList[0], type); + extMemProperties.push_back((cl_mem_properties)type); + extMemProperties.push_back((cl_mem_properties)handle); + break; + case VULKAN_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_WIN32_KMT: + handle = vkDeviceMem->getHandle(vkExternalMemoryHandleType); + type = CL_EXTERNAL_MEMORY_HANDLE_OPAQUE_WIN32_KMT_KHR; + errNum = check_external_memory_handle_type(devList[0], type); + extMemProperties.push_back((cl_mem_properties)type); + extMemProperties.push_back((cl_mem_properties)handle); + break; +#else + case VULKAN_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_FD: + fd = (int)vkDeviceMem->getHandle(vkExternalMemoryHandleType); + type = CL_EXTERNAL_MEMORY_HANDLE_OPAQUE_FD_KHR; + errNum = 
check_external_memory_handle_type(devList[0], type); + extMemProperties.push_back((cl_mem_properties)type); + extMemProperties.push_back((cl_mem_properties)fd); + break; +#endif + default: + errNum = TEST_FAIL; + log_error("Unsupported external memory handle type \n"); + break; + } + if (errNum != CL_SUCCESS) + { + log_error("Checks failed for " + "CL_DEVICE_EXTERNAL_MEMORY_IMPORT_HANDLE_TYPES_KHR\n"); + return TEST_FAIL; + } + extMemProperties.push_back(0); + + clMemWrapper buffer; + + // Passing NULL properties and a valid extMem_desc size + buffer = clCreateBufferWithProperties(context, NULL, 1, bufferSize, NULL, + &errNum); + test_error(errNum, "Unable to create buffer with NULL properties"); + + buffer.reset(); + + // Passing valid extMemProperties and buffersize + buffer = clCreateBufferWithProperties(context, extMemProperties.data(), 1, + bufferSize, NULL, &errNum); + test_error(errNum, "Unable to create buffer with Properties"); + + buffer.reset(); + + // Not passing external memory handle + std::vector<cl_mem_properties> extMemProperties2{ +#ifdef _WIN32 + (cl_mem_properties)type, + NULL, // Passing NULL handle +#else + (cl_mem_properties)type, + (cl_mem_properties)-64, // Passing random invalid fd +#endif + (cl_mem_properties)CL_DEVICE_HANDLE_LIST_KHR, + (cl_mem_properties)devList[0], + (cl_mem_properties)CL_DEVICE_HANDLE_LIST_END_KHR, + 0 + }; + buffer = clCreateBufferWithProperties(context, extMemProperties2.data(), 1, + bufferSize, NULL, &errNum); + test_failure_error(errNum, CL_INVALID_VALUE, + "Should return CL_INVALID_VALUE "); + + buffer.reset(); + + // Passing extMem_desc size = 0 but valid memProperties, CL_INVALID_SIZE + // should be returned. 
+ buffer = clCreateBufferWithProperties(context, extMemProperties.data(), 1, + 0, NULL, &errNum); + test_failure_error(errNum, CL_INVALID_BUFFER_SIZE, + "Should return CL_INVALID_BUFFER_SIZE"); + + return TEST_PASS; +} + +int test_consistency_external_image(cl_device_id deviceID, cl_context _context, + cl_command_queue _queue, int num_elements) +{ + cl_int errNum; + VulkanDevice vkDevice; + + // Context and command queue creation + cl_platform_id platform = NULL; + cl_context context = NULL; + cl_command_queue cmd_queue = NULL; + + cl_context_properties contextProperties[] = { CL_CONTEXT_PLATFORM, 0, 0 }; + errNum = clGetPlatformIDs(1, &platform, NULL); + test_error(errNum, "Failed to get platform id"); + + contextProperties[1] = (cl_context_properties)platform; + + context = clCreateContextFromType(contextProperties, CL_DEVICE_TYPE_GPU, + NULL, NULL, &errNum); + test_error(errNum, "Unable to create context with properties"); + + cmd_queue = clCreateCommandQueue(context, deviceID, 0, &errNum); + test_error(errNum, "Unable to create command queue"); + + cl_device_id devList[] = { deviceID, NULL }; + +#ifdef _WIN32 + if (!is_extension_available(devList[0], "cl_khr_external_memory_win32")) + { + throw std::runtime_error("Device does not support" + "cl_khr_external_memory_win32 extension \n"); + } +#else + if (!is_extension_available(devList[0], "cl_khr_external_memory_opaque_fd")) + { + throw std::runtime_error( + "Device does not support cl_khr_external_memory_opaque_fd " + "extension \n"); + } +#endif + uint32_t width = 256; + uint32_t height = 16; + cl_image_desc image_desc; + memset(&image_desc, 0x0, sizeof(cl_image_desc)); + cl_image_format img_format = { 0 }; + + VulkanExternalMemoryHandleType vkExternalMemoryHandleType = + getSupportedVulkanExternalMemoryHandleTypeList()[0]; + VulkanImage2D* vkImage2D = + new VulkanImage2D(vkDevice, VULKAN_FORMAT_R8G8B8A8_UNORM, width, height, + 1, vkExternalMemoryHandleType); + + const VulkanMemoryTypeList& memoryTypeList = 
vkImage2D->getMemoryTypeList(); + uint64_t totalImageMemSize = vkImage2D->getSize(); + + log_info("Memory type index: %lu\n", (uint32_t)memoryTypeList[0]); + log_info("Memory type property: %d\n", + memoryTypeList[0].getMemoryTypeProperty()); + log_info("Image size : %d\n", totalImageMemSize); + + VulkanDeviceMemory* vkDeviceMem = + new VulkanDeviceMemory(vkDevice, totalImageMemSize, memoryTypeList[0], + vkExternalMemoryHandleType); + vkDeviceMem->bindImage(*vkImage2D, 0); + + void* handle = NULL; + int fd; + std::vector<cl_mem_properties> extMemProperties{ + (cl_mem_properties)CL_DEVICE_HANDLE_LIST_KHR, + (cl_mem_properties)devList[0], + (cl_mem_properties)CL_DEVICE_HANDLE_LIST_END_KHR, + }; + switch (vkExternalMemoryHandleType) + { +#ifdef _WIN32 + case VULKAN_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_WIN32_NT: + handle = vkDeviceMem->getHandle(vkExternalMemoryHandleType); + errNum = check_external_memory_handle_type( + devList[0], CL_EXTERNAL_MEMORY_HANDLE_OPAQUE_WIN32_KHR); + extMemProperties.push_back( + (cl_mem_properties)CL_EXTERNAL_MEMORY_HANDLE_OPAQUE_WIN32_KHR); + extMemProperties.push_back((cl_mem_properties)handle); + break; + case VULKAN_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_WIN32_KMT: + handle = vkDeviceMem->getHandle(vkExternalMemoryHandleType); + errNum = check_external_memory_handle_type( + devList[0], CL_EXTERNAL_MEMORY_HANDLE_OPAQUE_WIN32_KMT_KHR); + extMemProperties.push_back( + (cl_mem_properties) + CL_EXTERNAL_MEMORY_HANDLE_OPAQUE_WIN32_KMT_KHR); + extMemProperties.push_back((cl_mem_properties)handle); + break; +#else + case VULKAN_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_FD: + fd = (int)vkDeviceMem->getHandle(vkExternalMemoryHandleType); + errNum = check_external_memory_handle_type( + devList[0], CL_EXTERNAL_MEMORY_HANDLE_OPAQUE_FD_KHR); + extMemProperties.push_back( + (cl_mem_properties)CL_EXTERNAL_MEMORY_HANDLE_OPAQUE_FD_KHR); + extMemProperties.push_back((cl_mem_properties)fd); + break; +#endif + default: + errNum = TEST_FAIL; + log_error("Unsupported 
external memory handle type \n"); + break; + } + if (errNum != CL_SUCCESS) + { + log_error("Checks failed for " + "CL_DEVICE_EXTERNAL_MEMORY_IMPORT_HANDLE_TYPES_KHR\n"); + return TEST_FAIL; + } + extMemProperties.push_back(0); + + const VkImageCreateInfo VulkanImageCreateInfo = + vkImage2D->getVkImageCreateInfo(); + + errNum = getCLImageInfoFromVkImageInfo( + &VulkanImageCreateInfo, totalImageMemSize, &img_format, &image_desc); + if (errNum != CL_SUCCESS) + { + log_error("getCLImageInfoFromVkImageInfo failed!!!"); + return TEST_FAIL; + } + + clMemWrapper image; + + // Pass valid properties, image_desc and image_format + image = clCreateImageWithProperties( + context, extMemProperties.data(), CL_MEM_READ_WRITE, &img_format, + &image_desc, NULL /* host_ptr */, &errNum); + test_error(errNum, "Unable to create Image with Properties"); + image.reset(); + + // Passing properties, image_desc and image_format all as NULL + image = clCreateImageWithProperties(context, NULL, CL_MEM_READ_WRITE, NULL, + NULL, NULL, &errNum); + test_failure_error( + errNum, CL_INVALID_IMAGE_DESCRIPTOR, + "Image creation must fail with CL_INVALID_IMAGE_DESCRIPTOR " + "when all are passed as NULL"); + + image.reset(); + + // Passing NULL properties and a valid image_format and image_desc + image = + clCreateImageWithProperties(context, NULL, CL_MEM_READ_WRITE, + &img_format, &image_desc, NULL, &errNum); + test_error(errNum, + "Unable to create image with NULL properties " + "with valid image format and image desc"); + + image.reset(); + + // Passing image_format as NULL + image = clCreateImageWithProperties(context, extMemProperties.data(), + CL_MEM_READ_WRITE, NULL, &image_desc, + NULL, &errNum); + test_failure_error(errNum, CL_INVALID_IMAGE_FORMAT_DESCRIPTOR, + "Image creation must fail with " + "CL_INVALID_IMAGE_FORMAT_DESCRIPTOR" + "when image desc passed as NULL"); + + image.reset(); + + // Passing image_desc as NULL + image = clCreateImageWithProperties(context, extMemProperties.data(), + 
CL_MEM_READ_WRITE, &img_format, NULL, + NULL, &errNum); + test_failure_error(errNum, CL_INVALID_IMAGE_DESCRIPTOR, + "Image creation must fail with " + "CL_INVALID_IMAGE_DESCRIPTOR " + "when image desc passed as NULL"); + image.reset(); + + return TEST_PASS; +} + +int test_consistency_external_semaphore(cl_device_id deviceID, + cl_context _context, + cl_command_queue _queue, + int num_elements) +{ + cl_int errNum; + VulkanDevice vkDevice; + // Context and command queue creation + cl_platform_id platform = NULL; + cl_context context = NULL; + cl_command_queue cmd_queue = NULL; + + errNum = clGetPlatformIDs(1, &platform, NULL); + test_error(errNum, "Failed to get platform Id"); + + cl_context_properties contextProperties[] = { CL_CONTEXT_PLATFORM, 0, 0 }; + + contextProperties[1] = (cl_context_properties)platform; + + context = clCreateContextFromType(contextProperties, CL_DEVICE_TYPE_GPU, + NULL, NULL, &errNum); + test_error(errNum, "Unable to create context with properties"); + + cmd_queue = clCreateCommandQueue(context, deviceID, 0, &errNum); + test_error(errNum, "Unable to create command queue"); + + cl_device_id devList[] = { deviceID, NULL }; + +#ifdef _WIN32 + if (!is_extension_available(devList[0], "cl_khr_external_semaphore_win32")) + { + throw std::runtime_error( + "Device does not support cl_khr_external_semaphore_win32 " + "extension \n"); + } +#else + if (!is_extension_available(devList[0], + "cl_khr_external_semaphore_opaque_fd")) + { + throw std::runtime_error( + "Device does not support " + "cl_khr_external_semaphore_opaque_fd extension \n"); + } +#endif + VulkanExternalSemaphoreHandleType vkExternalSemaphoreHandleType = + getSupportedVulkanExternalSemaphoreHandleTypeList()[0]; + VulkanSemaphore vkVk2Clsemaphore(vkDevice, vkExternalSemaphoreHandleType); + VulkanSemaphore vkCl2Vksemaphore(vkDevice, vkExternalSemaphoreHandleType); + cl_semaphore_khr clCl2Vksemaphore; + cl_semaphore_khr clVk2Clsemaphore; + + void* handle1 = NULL; + void* handle2 = NULL; + 
int fd1, fd2; + std::vector<cl_semaphore_properties_khr> sema_props1{ + (cl_semaphore_properties_khr)CL_SEMAPHORE_TYPE_KHR, + (cl_semaphore_properties_khr)CL_SEMAPHORE_TYPE_BINARY_KHR, + }; + std::vector<cl_semaphore_properties_khr> sema_props2{ + (cl_semaphore_properties_khr)CL_SEMAPHORE_TYPE_KHR, + (cl_semaphore_properties_khr)CL_SEMAPHORE_TYPE_BINARY_KHR, + }; + switch (vkExternalSemaphoreHandleType) + { +#ifdef _WIN32 + case VULKAN_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32_NT: + log_info(" Opaque NT handles are only supported on Windows\n"); + handle1 = vkVk2Clsemaphore.getHandle(vkExternalSemaphoreHandleType); + handle2 = vkCl2Vksemaphore.getHandle(vkExternalSemaphoreHandleType); + errNum = check_external_semaphore_handle_type( + devList[0], CL_SEMAPHORE_HANDLE_OPAQUE_WIN32_KHR); + sema_props1.push_back((cl_semaphore_properties_khr) + CL_SEMAPHORE_HANDLE_OPAQUE_WIN32_KHR); + sema_props1.push_back((cl_semaphore_properties_khr)handle1); + sema_props2.push_back((cl_semaphore_properties_khr) + CL_SEMAPHORE_HANDLE_OPAQUE_WIN32_KHR); + sema_props2.push_back((cl_semaphore_properties_khr)handle2); + break; + case VULKAN_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32_KMT: + log_info(" Opaque D3DKMT handles are only supported on Windows\n"); + handle1 = vkVk2Clsemaphore.getHandle(vkExternalSemaphoreHandleType); + handle2 = vkCl2Vksemaphore.getHandle(vkExternalSemaphoreHandleType); + errNum = check_external_semaphore_handle_type( + devList[0], CL_SEMAPHORE_HANDLE_OPAQUE_WIN32_KMT_KHR); + sema_props1.push_back((cl_semaphore_properties_khr) + CL_SEMAPHORE_HANDLE_OPAQUE_WIN32_KMT_KHR); + sema_props1.push_back((cl_semaphore_properties_khr)handle1); + sema_props2.push_back((cl_semaphore_properties_khr) + CL_SEMAPHORE_HANDLE_OPAQUE_WIN32_KMT_KHR); + sema_props2.push_back((cl_semaphore_properties_khr)handle2); + break; +#else + case VULKAN_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_FD: + log_info(" Opaque file descriptors are not supported on Windows\n"); + fd1 = + 
(int)vkVk2Clsemaphore.getHandle(vkExternalSemaphoreHandleType); + fd2 = + (int)vkCl2Vksemaphore.getHandle(vkExternalSemaphoreHandleType); + errNum = check_external_semaphore_handle_type( + devList[0], CL_SEMAPHORE_HANDLE_OPAQUE_FD_KHR); + sema_props1.push_back( + (cl_semaphore_properties_khr)CL_SEMAPHORE_HANDLE_OPAQUE_FD_KHR); + sema_props1.push_back((cl_semaphore_properties_khr)fd1); + sema_props2.push_back( + (cl_semaphore_properties_khr)CL_SEMAPHORE_HANDLE_OPAQUE_FD_KHR); + sema_props2.push_back((cl_semaphore_properties_khr)fd2); + break; +#endif + default: log_error("Unsupported external memory handle type\n"); break; + } + if (CL_SUCCESS != errNum) + { + throw std::runtime_error( + "Unsupported external sempahore handle type\n "); + } + sema_props1.push_back( + (cl_semaphore_properties_khr)CL_DEVICE_HANDLE_LIST_KHR); + sema_props1.push_back((cl_semaphore_properties_khr)devList[0]); + sema_props1.push_back( + (cl_semaphore_properties_khr)CL_DEVICE_HANDLE_LIST_END_KHR); + sema_props2.push_back( + (cl_semaphore_properties_khr)CL_DEVICE_HANDLE_LIST_KHR); + sema_props2.push_back((cl_semaphore_properties_khr)devList[0]); + sema_props2.push_back( + (cl_semaphore_properties_khr)CL_DEVICE_HANDLE_LIST_END_KHR); + sema_props1.push_back(0); + sema_props2.push_back(0); + + // Pass NULL properties + cl_semaphore_khr cl_ext_semaphore = + clCreateSemaphoreWithPropertiesKHRptr(context, NULL, &errNum); + test_failure_error(errNum, CL_INVALID_VALUE, + "Semaphore creation must fail with CL_INVALID_VALUE " + " when properties are passed as NULL"); + + + // Pass invalid semaphore object to wait + errNum = + clEnqueueWaitSemaphoresKHRptr(cmd_queue, 1, NULL, NULL, 0, NULL, NULL); + test_failure_error(errNum, CL_INVALID_VALUE, + "clEnqueueWaitSemaphoresKHR fails with CL_INVALID_VALUE " + "when invalid semaphore object is passed"); + + + // Pass invalid semaphore object to signal + errNum = clEnqueueSignalSemaphoresKHRptr(cmd_queue, 1, NULL, NULL, 0, NULL, + NULL); + 
test_failure_error( + errNum, CL_INVALID_VALUE, + "clEnqueueSignalSemaphoresKHR fails with CL_INVALID_VALUE" + "when invalid semaphore object is passed"); + + + // Create two semaphore objects + clVk2Clsemaphore = clCreateSemaphoreWithPropertiesKHRptr( + context, sema_props1.data(), &errNum); + test_error(errNum, + "Unable to create semaphore with valid semaphore properties"); + + clCl2Vksemaphore = clCreateSemaphoreWithPropertiesKHRptr( + context, sema_props2.data(), &errNum); + test_error(errNum, + "Unable to create semaphore with valid semaphore properties"); + + + // Call Signal twice consecutively + errNum = clEnqueueSignalSemaphoresKHRptr(cmd_queue, 1, &clVk2Clsemaphore, + NULL, 0, NULL, NULL); + test_error(errNum, "clEnqueueSignalSemaphoresKHRptr failed"); + + errNum = clEnqueueSignalSemaphoresKHRptr(cmd_queue, 1, &clCl2Vksemaphore, + NULL, 0, NULL, NULL); + test_error(errNum, + "clEnqueueSignalSemaphoresKHRptr failed for two " + "consecutive wait events"); + + + // Call Wait twice consecutively + errNum = clEnqueueWaitSemaphoresKHRptr(cmd_queue, 1, &clVk2Clsemaphore, + NULL, 0, NULL, NULL); + test_error(errNum, "clEnqueueWaitSemaphoresKHRptr failed"); + + errNum = clEnqueueWaitSemaphoresKHRptr(cmd_queue, 1, &clCl2Vksemaphore, + NULL, 0, NULL, NULL); + test_error(errNum, + "clEnqueueWaitSemaphoresKHRptr failed for two " + " consecutive wait events"); + + + // Pass invalid object to release call + errNum = clReleaseSemaphoreKHRptr(NULL); + test_failure_error(errNum, CL_INVALID_VALUE, + "clReleaseSemaphoreKHRptr fails with " + "CL_INVALID_VALUE when NULL semaphore object is passed"); + + // Release both semaphore objects + errNum = clReleaseSemaphoreKHRptr(clVk2Clsemaphore); + test_error(errNum, "clReleaseSemaphoreKHRptr failed"); + + errNum = clReleaseSemaphoreKHRptr(clCl2Vksemaphore); + test_error(errNum, "clReleaseSemaphoreKHRptr failed"); + + return TEST_PASS; +} diff --git a/test_conformance/vulkan/test_vulkan_interop_buffer.cpp 
b/test_conformance/vulkan/test_vulkan_interop_buffer.cpp new file mode 100644 index 00000000..9b0bc9de --- /dev/null +++ b/test_conformance/vulkan/test_vulkan_interop_buffer.cpp @@ -0,0 +1,1786 @@ +// +// Copyright (c) 2022 The Khronos Group Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// + +#include <vulkan_interop_common.hpp> +#include <vulkan_wrapper.hpp> +#include <CL/cl.h> +#include <CL/cl_ext.h> +#include <assert.h> +#include <vector> +#include <iostream> +#include <string.h> +#include "harness/errorHelpers.h" + +#define MAX_BUFFERS 5 +#define MAX_IMPORTS 5 +#define BUFFERSIZE 3000 +static cl_uchar uuid[CL_UUID_SIZE_KHR]; +static cl_device_id deviceId = NULL; + +namespace { +struct Params +{ + uint32_t numBuffers; + uint32_t bufferSize; + uint32_t interBufferOffset; +}; +} + +const char *kernel_text_numbuffer_1 = " \ +__kernel void clUpdateBuffer(int bufferSize, __global unsigned char *a) { \n\ + int gid = get_global_id(0); \n\ + if (gid < bufferSize) { \n\ + a[gid]++; \n\ + } \n\ +}"; + +const char *kernel_text_numbuffer_2 = " \ +__kernel void clUpdateBuffer(int bufferSize, __global unsigned char *a, __global unsigned char *b) { \n\ + int gid = get_global_id(0); \n\ + if (gid < bufferSize) { \n\ + a[gid]++; \n\ + b[gid]++;\n\ + } \n\ +}"; + +const char *kernel_text_numbuffer_4 = " \ +__kernel void clUpdateBuffer(int bufferSize, __global unsigned char *a, __global unsigned char *b, __global unsigned char *c, __global unsigned char *d) { \n\ + 
int gid = get_global_id(0); \n\ + if (gid < bufferSize) { \n\ + a[gid]++;\n\ + b[gid]++; \n\ + c[gid]++; \n\ + d[gid]++; \n\ + } \n\ +}"; + + +const char *kernel_text_verify = " \ +__kernel void checkKernel(__global unsigned char *ptr, int size, int expVal, __global unsigned char *err) \n\ +{ \n\ + int idx = get_global_id(0); \n\ + if ((idx < size) && (*err == 0)) { \n\ + if (ptr[idx] != expVal){ \n\ + *err = 1; \n\ + } \n\ + } \n\ +}"; + +int run_test_with_two_queue(cl_context &context, cl_command_queue &cmd_queue1, + cl_command_queue &cmd_queue2, cl_kernel *kernel, + cl_kernel &verify_kernel, VulkanDevice &vkDevice, + uint32_t numBuffers, uint32_t bufferSize) +{ + int err = CL_SUCCESS; + size_t global_work_size[1]; + uint8_t *error_2; + cl_mem error_1; + cl_kernel update_buffer_kernel; + cl_kernel kernel_cq; + clExternalSemaphore *clVk2CLExternalSemaphore = NULL; + clExternalSemaphore *clCl2VkExternalSemaphore = NULL; + const char *program_source_const = kernel_text_numbuffer_2; + size_t program_source_length = strlen(program_source_const); + cl_program program = clCreateProgramWithSource( + context, 1, &program_source_const, &program_source_length, &err); + err = clBuildProgram(program, 0, NULL, NULL, NULL, NULL); + if (err != CL_SUCCESS) + { + print_error(err, "Error: Failed to build program \n"); + return err; + } + // create the kernel + kernel_cq = clCreateKernel(program, "clUpdateBuffer", &err); + if (err != CL_SUCCESS) + { + print_error(err, "clCreateKernel failed \n"); + return err; + } + + const std::vector<VulkanExternalMemoryHandleType> + vkExternalMemoryHandleTypeList = + getSupportedVulkanExternalMemoryHandleTypeList(); + VulkanExternalSemaphoreHandleType vkExternalSemaphoreHandleType = + getSupportedVulkanExternalSemaphoreHandleTypeList()[0]; + VulkanSemaphore vkVk2CLSemaphore(vkDevice, vkExternalSemaphoreHandleType); + VulkanSemaphore vkCl2VkSemaphore(vkDevice, vkExternalSemaphoreHandleType); + + VulkanQueue &vkQueue = vkDevice.getQueue(); + + 
std::vector<char> vkBufferShader = readFile("buffer.spv"); + + VulkanShaderModule vkBufferShaderModule(vkDevice, vkBufferShader); + VulkanDescriptorSetLayoutBindingList vkDescriptorSetLayoutBindingList( + MAX_BUFFERS + 1, VULKAN_DESCRIPTOR_TYPE_STORAGE_BUFFER); + VulkanDescriptorSetLayout vkDescriptorSetLayout( + vkDevice, vkDescriptorSetLayoutBindingList); + VulkanPipelineLayout vkPipelineLayout(vkDevice, vkDescriptorSetLayout); + VulkanComputePipeline vkComputePipeline(vkDevice, vkPipelineLayout, + vkBufferShaderModule); + + VulkanDescriptorPool vkDescriptorPool(vkDevice, + vkDescriptorSetLayoutBindingList); + VulkanDescriptorSet vkDescriptorSet(vkDevice, vkDescriptorPool, + vkDescriptorSetLayout); + + clVk2CLExternalSemaphore = new clExternalSemaphore( + vkVk2CLSemaphore, context, vkExternalSemaphoreHandleType, deviceId); + clCl2VkExternalSemaphore = new clExternalSemaphore( + vkCl2VkSemaphore, context, vkExternalSemaphoreHandleType, deviceId); + + const uint32_t maxIter = innerIterations; + VulkanCommandPool vkCommandPool(vkDevice); + VulkanCommandBuffer vkCommandBuffer(vkDevice, vkCommandPool); + + VulkanBuffer vkParamsBuffer(vkDevice, sizeof(Params)); + VulkanDeviceMemory vkParamsDeviceMemory( + vkDevice, vkParamsBuffer.getSize(), + getVulkanMemoryType(vkDevice, + VULKAN_MEMORY_TYPE_PROPERTY_HOST_VISIBLE_COHERENT)); + vkParamsDeviceMemory.bindBuffer(vkParamsBuffer); + std::vector<VulkanDeviceMemory *> vkBufferListDeviceMemory; + std::vector<clExternalMemory *> externalMemory; + for (size_t emhtIdx = 0; emhtIdx < vkExternalMemoryHandleTypeList.size(); + emhtIdx++) + { + VulkanExternalMemoryHandleType vkExternalMemoryHandleType = + vkExternalMemoryHandleTypeList[emhtIdx]; + log_info("External memory handle type: %d\n", + vkExternalMemoryHandleType); + + VulkanBuffer vkDummyBuffer(vkDevice, 4 * 1024, + vkExternalMemoryHandleType); + const VulkanMemoryTypeList &memoryTypeList = + vkDummyBuffer.getMemoryTypeList(); + + for (size_t mtIdx = 0; mtIdx < 
memoryTypeList.size(); mtIdx++) + { + const VulkanMemoryType &memoryType = memoryTypeList[mtIdx]; + + log_info("Memory type index: %d\n", (uint32_t)memoryType); + log_info("Memory type property: %d\n", + memoryType.getMemoryTypeProperty()); + + VulkanBufferList vkBufferList(numBuffers, vkDevice, bufferSize, + vkExternalMemoryHandleType); + + for (size_t bIdx = 0; bIdx < numBuffers; bIdx++) + { + vkBufferListDeviceMemory.push_back( + new VulkanDeviceMemory(vkDevice, bufferSize, memoryType, + vkExternalMemoryHandleType)); + externalMemory.push_back(new clExternalMemory( + vkBufferListDeviceMemory[bIdx], vkExternalMemoryHandleType, + 0, bufferSize, context, deviceId)); + } + cl_mem buffers[MAX_BUFFERS]; + clFinish(cmd_queue1); + Params *params = (Params *)vkParamsDeviceMemory.map(); + params->numBuffers = numBuffers; + params->bufferSize = bufferSize; + params->interBufferOffset = 0; + vkParamsDeviceMemory.unmap(); + vkDescriptorSet.update(0, vkParamsBuffer); + for (size_t bIdx = 0; bIdx < vkBufferList.size(); bIdx++) + { + size_t buffer_size = vkBufferList[bIdx].getSize(); + vkBufferListDeviceMemory[bIdx]->bindBuffer(vkBufferList[bIdx], + 0); + buffers[bIdx] = externalMemory[bIdx]->getExternalMemoryBuffer(); + vkDescriptorSet.update((uint32_t)bIdx + 1, vkBufferList[bIdx]); + } + vkCommandBuffer.begin(); + vkCommandBuffer.bindPipeline(vkComputePipeline); + vkCommandBuffer.bindDescriptorSets( + vkComputePipeline, vkPipelineLayout, vkDescriptorSet); + vkCommandBuffer.dispatch(512, 1, 1); + vkCommandBuffer.end(); + + if (vkBufferList.size() == 2) + { + update_buffer_kernel = kernel[0]; + } + else if (vkBufferList.size() == 3) + { + update_buffer_kernel = kernel[1]; + } + else if (vkBufferList.size() == 5) + { + update_buffer_kernel = kernel[2]; + } + // global work size should be less than or equal to + // bufferSizeList[i] + global_work_size[0] = bufferSize; + for (uint32_t iter = 0; iter < maxIter; iter++) + { + + if (iter == 0) + { + vkQueue.submit(vkCommandBuffer, 
vkVk2CLSemaphore); + } + else + { + vkQueue.submit(vkCl2VkSemaphore, vkCommandBuffer, + vkVk2CLSemaphore); + } + clVk2CLExternalSemaphore->wait(cmd_queue1); + + err = clSetKernelArg(update_buffer_kernel, 0, sizeof(uint32_t), + (void *)&bufferSize); + err |= clSetKernelArg(kernel_cq, 0, sizeof(uint32_t), + (void *)&bufferSize); + err |= clSetKernelArg(kernel_cq, 1, sizeof(cl_mem), + (void *)&(buffers[0])); + + for (int i = 0; i < vkBufferList.size() - 1; i++) + { + err |= + clSetKernelArg(update_buffer_kernel, i + 1, + sizeof(cl_mem), (void *)&(buffers[i])); + } + + err |= + clSetKernelArg(kernel_cq, 2, sizeof(cl_mem), + (void *)&(buffers[vkBufferList.size() - 1])); + + if (err != CL_SUCCESS) + { + print_error(err, + "Error: Failed to set arg values for kernel\n"); + goto CLEANUP; + } + cl_event first_launch; + + err = clEnqueueNDRangeKernel(cmd_queue1, update_buffer_kernel, + 1, NULL, global_work_size, NULL, 0, + NULL, &first_launch); + if (err != CL_SUCCESS) + { + print_error(err, + "Error: Failed to launch update_buffer_kernel," + "error\n"); + goto CLEANUP; + } + + err = clEnqueueNDRangeKernel(cmd_queue2, kernel_cq, 1, NULL, + global_work_size, NULL, 1, + &first_launch, NULL); + if (err != CL_SUCCESS) + { + print_error(err, + "Error: Failed to launch update_buffer_kernel," + "error\n"); + goto CLEANUP; + } + + if (iter != (maxIter - 1)) + { + clCl2VkExternalSemaphore->signal(cmd_queue2); + } + } + error_2 = (uint8_t *)malloc(sizeof(uint8_t)); + if (NULL == error_2) + { + log_error("Not able to allocate memory\n"); + goto CLEANUP; + } + clFinish(cmd_queue2); + error_1 = clCreateBuffer(context, CL_MEM_WRITE_ONLY, + sizeof(uint8_t), NULL, &err); + if (CL_SUCCESS != err) + { + print_error(err, "Error: clCreateBuffer \n"); + goto CLEANUP; + } + uint8_t val = 0; + err = clEnqueueWriteBuffer(cmd_queue1, error_1, CL_TRUE, 0, + sizeof(uint8_t), &val, 0, NULL, NULL); + if (err != CL_SUCCESS) + { + print_error(err, "Error: Failed read output, error\n"); + goto CLEANUP; + } 
+ + int calc_max_iter; + for (int i = 0; i < vkBufferList.size(); i++) + { + if (i == 0) + calc_max_iter = (maxIter * 3); + else + calc_max_iter = (maxIter * 2); + err = clSetKernelArg(verify_kernel, 0, sizeof(cl_mem), + (void *)&(buffers[i])); + err |= + clSetKernelArg(verify_kernel, 1, sizeof(int), &bufferSize); + err |= clSetKernelArg(verify_kernel, 2, sizeof(int), + &calc_max_iter); + err |= clSetKernelArg(verify_kernel, 3, sizeof(cl_mem), + (void *)&error_1); + if (err != CL_SUCCESS) + { + print_error(err, + "Error: Failed to set arg values for " + "verify_kernel \n"); + goto CLEANUP; + } + err = clEnqueueNDRangeKernel(cmd_queue1, verify_kernel, 1, NULL, + global_work_size, NULL, 0, NULL, + NULL); + + if (err != CL_SUCCESS) + { + print_error(err, + "Error: Failed to launch verify_kernel," + "error \n"); + goto CLEANUP; + } + err = clEnqueueReadBuffer(cmd_queue1, error_1, CL_TRUE, 0, + sizeof(uint8_t), error_2, 0, NULL, + NULL); + if (err != CL_SUCCESS) + { + print_error(err, "Error: Failed read output, error \n "); + goto CLEANUP; + } + if (*error_2 == 1) + { + log_error("&&&& vulkan_opencl_buffer test FAILED\n"); + goto CLEANUP; + } + } + for (size_t i = 0; i < vkBufferList.size(); i++) + { + delete vkBufferListDeviceMemory[i]; + delete externalMemory[i]; + } + vkBufferListDeviceMemory.erase(vkBufferListDeviceMemory.begin(), + vkBufferListDeviceMemory.begin() + + numBuffers); + externalMemory.erase(externalMemory.begin(), + externalMemory.begin() + numBuffers); + } + } +CLEANUP: + for (size_t i = 0; i < vkBufferListDeviceMemory.size(); i++) + { + if (vkBufferListDeviceMemory[i]) + { + delete vkBufferListDeviceMemory[i]; + } + if (externalMemory[i]) + { + delete externalMemory[i]; + } + } + if (program) clReleaseProgram(program); + if (kernel_cq) clReleaseKernel(kernel_cq); + if (clVk2CLExternalSemaphore) delete clVk2CLExternalSemaphore; + if (clCl2VkExternalSemaphore) delete clCl2VkExternalSemaphore; + if (error_2) free(error_2); + if (error_1) 
clReleaseMemObject(error_1); + + return err; +} + +int run_test_with_one_queue(cl_context &context, cl_command_queue &cmd_queue1, + cl_kernel *kernel, cl_kernel &verify_kernel, + VulkanDevice &vkDevice, uint32_t numBuffers, + uint32_t bufferSize) +{ + log_info("RUNNING TEST WITH ONE QUEUE...... \n\n"); + size_t global_work_size[1]; + uint8_t *error_2; + cl_mem error_1; + cl_kernel update_buffer_kernel; + clExternalSemaphore *clVk2CLExternalSemaphore = NULL; + clExternalSemaphore *clCl2VkExternalSemaphore = NULL; + int err = CL_SUCCESS; + + const std::vector<VulkanExternalMemoryHandleType> + vkExternalMemoryHandleTypeList = + getSupportedVulkanExternalMemoryHandleTypeList(); + VulkanExternalSemaphoreHandleType vkExternalSemaphoreHandleType = + getSupportedVulkanExternalSemaphoreHandleTypeList()[0]; + VulkanSemaphore vkVk2CLSemaphore(vkDevice, vkExternalSemaphoreHandleType); + VulkanSemaphore vkCl2VkSemaphore(vkDevice, vkExternalSemaphoreHandleType); + + VulkanQueue &vkQueue = vkDevice.getQueue(); + + std::vector<char> vkBufferShader = readFile("buffer.spv"); + VulkanShaderModule vkBufferShaderModule(vkDevice, vkBufferShader); + VulkanDescriptorSetLayoutBindingList vkDescriptorSetLayoutBindingList( + MAX_BUFFERS + 1, VULKAN_DESCRIPTOR_TYPE_STORAGE_BUFFER); + VulkanDescriptorSetLayout vkDescriptorSetLayout( + vkDevice, vkDescriptorSetLayoutBindingList); + VulkanPipelineLayout vkPipelineLayout(vkDevice, vkDescriptorSetLayout); + VulkanComputePipeline vkComputePipeline(vkDevice, vkPipelineLayout, + vkBufferShaderModule); + + VulkanDescriptorPool vkDescriptorPool(vkDevice, + vkDescriptorSetLayoutBindingList); + VulkanDescriptorSet vkDescriptorSet(vkDevice, vkDescriptorPool, + vkDescriptorSetLayout); + + clVk2CLExternalSemaphore = new clExternalSemaphore( + vkVk2CLSemaphore, context, vkExternalSemaphoreHandleType, deviceId); + clCl2VkExternalSemaphore = new clExternalSemaphore( + vkCl2VkSemaphore, context, vkExternalSemaphoreHandleType, deviceId); + const uint32_t maxIter 
= innerIterations; + VulkanCommandPool vkCommandPool(vkDevice); + VulkanCommandBuffer vkCommandBuffer(vkDevice, vkCommandPool); + + VulkanBuffer vkParamsBuffer(vkDevice, sizeof(Params)); + VulkanDeviceMemory vkParamsDeviceMemory( + vkDevice, vkParamsBuffer.getSize(), + getVulkanMemoryType(vkDevice, + VULKAN_MEMORY_TYPE_PROPERTY_HOST_VISIBLE_COHERENT)); + vkParamsDeviceMemory.bindBuffer(vkParamsBuffer); + std::vector<VulkanDeviceMemory *> vkBufferListDeviceMemory; + std::vector<clExternalMemory *> externalMemory; + + for (size_t emhtIdx = 0; emhtIdx < vkExternalMemoryHandleTypeList.size(); + emhtIdx++) + { + VulkanExternalMemoryHandleType vkExternalMemoryHandleType = + vkExternalMemoryHandleTypeList[emhtIdx]; + log_info("External memory handle type: %d\n", + vkExternalMemoryHandleType); + + VulkanBuffer vkDummyBuffer(vkDevice, 4 * 1024, + vkExternalMemoryHandleType); + const VulkanMemoryTypeList &memoryTypeList = + vkDummyBuffer.getMemoryTypeList(); + + for (size_t mtIdx = 0; mtIdx < memoryTypeList.size(); mtIdx++) + { + const VulkanMemoryType &memoryType = memoryTypeList[mtIdx]; + + log_info("Memory type index: %d\n", (uint32_t)memoryType); + log_info("Memory type property: %d\n", + memoryType.getMemoryTypeProperty()); + + VulkanBufferList vkBufferList(numBuffers, vkDevice, bufferSize, + vkExternalMemoryHandleType); + + for (size_t bIdx = 0; bIdx < numBuffers; bIdx++) + { + vkBufferListDeviceMemory.push_back( + new VulkanDeviceMemory(vkDevice, bufferSize, memoryType, + vkExternalMemoryHandleType)); + externalMemory.push_back(new clExternalMemory( + vkBufferListDeviceMemory[bIdx], vkExternalMemoryHandleType, + 0, bufferSize, context, deviceId)); + } + cl_mem buffers[4]; + clFinish(cmd_queue1); + Params *params = (Params *)vkParamsDeviceMemory.map(); + params->numBuffers = numBuffers; + params->bufferSize = bufferSize; + params->interBufferOffset = 0; + vkParamsDeviceMemory.unmap(); + vkDescriptorSet.update(0, vkParamsBuffer); + for (size_t bIdx = 0; bIdx < 
vkBufferList.size(); bIdx++) + { + size_t buffer_size = vkBufferList[bIdx].getSize(); + vkBufferListDeviceMemory[bIdx]->bindBuffer(vkBufferList[bIdx], + 0); + buffers[bIdx] = externalMemory[bIdx]->getExternalMemoryBuffer(); + vkDescriptorSet.update((uint32_t)bIdx + 1, vkBufferList[bIdx]); + } + vkCommandBuffer.begin(); + vkCommandBuffer.bindPipeline(vkComputePipeline); + vkCommandBuffer.bindDescriptorSets( + vkComputePipeline, vkPipelineLayout, vkDescriptorSet); + vkCommandBuffer.dispatch(512, 1, 1); + vkCommandBuffer.end(); + + if (vkBufferList.size() == 1) + { + update_buffer_kernel = kernel[0]; + } + else if (vkBufferList.size() == 2) + { + update_buffer_kernel = kernel[1]; + } + else if (vkBufferList.size() == 4) + { + update_buffer_kernel = kernel[2]; + } + + // global work size should be less than or equal to + // bufferSizeList[i] + global_work_size[0] = bufferSize; + + for (uint32_t iter = 0; iter < maxIter; iter++) + { + if (iter == 0) + { + vkQueue.submit(vkCommandBuffer, vkVk2CLSemaphore); + } + else + { + vkQueue.submit(vkCl2VkSemaphore, vkCommandBuffer, + vkVk2CLSemaphore); + } + clVk2CLExternalSemaphore->wait(cmd_queue1); + + err = clSetKernelArg(update_buffer_kernel, 0, sizeof(uint32_t), + (void *)&bufferSize); + for (int i = 0; i < vkBufferList.size(); i++) + { + err |= + clSetKernelArg(update_buffer_kernel, i + 1, + sizeof(cl_mem), (void *)&(buffers[i])); + } + + if (err != CL_SUCCESS) + { + print_error(err, + "Error: Failed to set arg values for kernel\n"); + goto CLEANUP; + } + err = clEnqueueNDRangeKernel(cmd_queue1, update_buffer_kernel, + 1, NULL, global_work_size, NULL, 0, + NULL, NULL); + if (err != CL_SUCCESS) + { + print_error(err, + "Error: Failed to launch update_buffer_kernel," + " error\n"); + goto CLEANUP; + } + if (iter != (maxIter - 1)) + { + clCl2VkExternalSemaphore->signal(cmd_queue1); + } + } + error_2 = (uint8_t *)malloc(sizeof(uint8_t)); + if (NULL == error_2) + { + log_error("Not able to allocate memory\n"); + goto CLEANUP; + 
} + + error_1 = clCreateBuffer(context, CL_MEM_WRITE_ONLY, + sizeof(uint8_t), NULL, &err); + if (CL_SUCCESS != err) + { + print_error(err, "Error: clCreateBuffer \n"); + goto CLEANUP; + } + uint8_t val = 0; + err = clEnqueueWriteBuffer(cmd_queue1, error_1, CL_TRUE, 0, + sizeof(uint8_t), &val, 0, NULL, NULL); + if (CL_SUCCESS != err) + { + print_error(err, "Error: clEnqueueWriteBuffer \n"); + goto CLEANUP; + } + + int calc_max_iter = (maxIter * 2); + for (int i = 0; i < vkBufferList.size(); i++) + { + err = clSetKernelArg(verify_kernel, 0, sizeof(cl_mem), + (void *)&(buffers[i])); + err |= + clSetKernelArg(verify_kernel, 1, sizeof(int), &bufferSize); + err |= clSetKernelArg(verify_kernel, 2, sizeof(int), + &calc_max_iter); + err |= clSetKernelArg(verify_kernel, 3, sizeof(cl_mem), + (void *)&error_1); + if (err != CL_SUCCESS) + { + print_error( + err, + "Error: Failed to set arg values for verify_kernel \n"); + goto CLEANUP; + } + err = clEnqueueNDRangeKernel(cmd_queue1, verify_kernel, 1, NULL, + global_work_size, NULL, 0, NULL, + NULL); + if (err != CL_SUCCESS) + { + print_error( + err, "Error: Failed to launch verify_kernel, error\n"); + goto CLEANUP; + } + + err = clEnqueueReadBuffer(cmd_queue1, error_1, CL_TRUE, 0, + sizeof(uint8_t), error_2, 0, NULL, + NULL); + if (err != CL_SUCCESS) + { + print_error(err, "Error: Failed read output, error \n"); + goto CLEANUP; + } + if (*error_2 == 1) + { + log_error("&&&& vulkan_opencl_buffer test FAILED\n"); + goto CLEANUP; + } + } + for (size_t i = 0; i < vkBufferList.size(); i++) + { + delete vkBufferListDeviceMemory[i]; + delete externalMemory[i]; + } + vkBufferListDeviceMemory.erase(vkBufferListDeviceMemory.begin(), + vkBufferListDeviceMemory.begin() + + numBuffers); + externalMemory.erase(externalMemory.begin(), + externalMemory.begin() + numBuffers); + } + } +CLEANUP: + for (size_t i = 0; i < vkBufferListDeviceMemory.size(); i++) + { + if (vkBufferListDeviceMemory[i]) + { + delete vkBufferListDeviceMemory[i]; + } + if 
(externalMemory[i]) + { + delete externalMemory[i]; + } + } + if (clVk2CLExternalSemaphore) delete clVk2CLExternalSemaphore; + if (clCl2VkExternalSemaphore) delete clCl2VkExternalSemaphore; + if (error_2) free(error_2); + if (error_1) clReleaseMemObject(error_1); + return err; +} + +int run_test_with_multi_import_same_ctx( + cl_context &context, cl_command_queue &cmd_queue1, cl_kernel *kernel, + cl_kernel &verify_kernel, VulkanDevice &vkDevice, uint32_t numBuffers, + uint32_t bufferSize, uint32_t bufferSizeForOffset) +{ + size_t global_work_size[1]; + uint8_t *error_2; + cl_mem error_1; + int numImports = numBuffers; + cl_kernel update_buffer_kernel[MAX_IMPORTS]; + clExternalSemaphore *clVk2CLExternalSemaphore = NULL; + clExternalSemaphore *clCl2VkExternalSemaphore = NULL; + int err = CL_SUCCESS; + int calc_max_iter; + bool withOffset; + uint32_t pBufferSize; + + const std::vector<VulkanExternalMemoryHandleType> + vkExternalMemoryHandleTypeList = + getSupportedVulkanExternalMemoryHandleTypeList(); + VulkanExternalSemaphoreHandleType vkExternalSemaphoreHandleType = + getSupportedVulkanExternalSemaphoreHandleTypeList()[0]; + VulkanSemaphore vkVk2CLSemaphore(vkDevice, vkExternalSemaphoreHandleType); + VulkanSemaphore vkCl2VkSemaphore(vkDevice, vkExternalSemaphoreHandleType); + + VulkanQueue &vkQueue = vkDevice.getQueue(); + + std::vector<char> vkBufferShader = readFile("buffer.spv"); + + VulkanShaderModule vkBufferShaderModule(vkDevice, vkBufferShader); + VulkanDescriptorSetLayoutBindingList vkDescriptorSetLayoutBindingList( + MAX_BUFFERS + 1, VULKAN_DESCRIPTOR_TYPE_STORAGE_BUFFER); + VulkanDescriptorSetLayout vkDescriptorSetLayout( + vkDevice, vkDescriptorSetLayoutBindingList); + VulkanPipelineLayout vkPipelineLayout(vkDevice, vkDescriptorSetLayout); + VulkanComputePipeline vkComputePipeline(vkDevice, vkPipelineLayout, + vkBufferShaderModule); + + VulkanDescriptorPool vkDescriptorPool(vkDevice, + vkDescriptorSetLayoutBindingList); + VulkanDescriptorSet 
vkDescriptorSet(vkDevice, vkDescriptorPool, + vkDescriptorSetLayout); + + clVk2CLExternalSemaphore = new clExternalSemaphore( + vkVk2CLSemaphore, context, vkExternalSemaphoreHandleType, deviceId); + clCl2VkExternalSemaphore = new clExternalSemaphore( + vkCl2VkSemaphore, context, vkExternalSemaphoreHandleType, deviceId); + const uint32_t maxIter = innerIterations; + VulkanCommandPool vkCommandPool(vkDevice); + VulkanCommandBuffer vkCommandBuffer(vkDevice, vkCommandPool); + + VulkanBuffer vkParamsBuffer(vkDevice, sizeof(Params)); + VulkanDeviceMemory vkParamsDeviceMemory( + vkDevice, vkParamsBuffer.getSize(), + getVulkanMemoryType(vkDevice, + VULKAN_MEMORY_TYPE_PROPERTY_HOST_VISIBLE_COHERENT)); + vkParamsDeviceMemory.bindBuffer(vkParamsBuffer); + std::vector<VulkanDeviceMemory *> vkBufferListDeviceMemory; + std::vector<std::vector<clExternalMemory *>> externalMemory; + + + for (size_t emhtIdx = 0; emhtIdx < vkExternalMemoryHandleTypeList.size(); + emhtIdx++) + { + VulkanExternalMemoryHandleType vkExternalMemoryHandleType = + vkExternalMemoryHandleTypeList[emhtIdx]; + log_info("External memory handle type: %d\n", + vkExternalMemoryHandleType); + + VulkanBuffer vkDummyBuffer(vkDevice, 4 * 1024, + vkExternalMemoryHandleType); + const VulkanMemoryTypeList &memoryTypeList = + vkDummyBuffer.getMemoryTypeList(); + + for (size_t mtIdx = 0; mtIdx < memoryTypeList.size(); mtIdx++) + { + const VulkanMemoryType &memoryType = memoryTypeList[mtIdx]; + + log_info("Memory type index: %d\n", (uint32_t)memoryType); + log_info("Memory type property: %d\n", + memoryType.getMemoryTypeProperty()); + for (unsigned int withOffset = 0; + withOffset <= (unsigned int)enableOffset; withOffset++) + { + log_info("Running withOffset case %d\n", (uint32_t)withOffset); + if (withOffset) + { + pBufferSize = bufferSizeForOffset; + } + else + { + pBufferSize = bufferSize; + } + cl_mem buffers[MAX_BUFFERS][MAX_IMPORTS]; + VulkanBufferList vkBufferList(numBuffers, vkDevice, pBufferSize, + 
vkExternalMemoryHandleType); + uint32_t interBufferOffset = + (uint32_t)(vkBufferList[0].getSize()); + + for (size_t bIdx = 0; bIdx < numBuffers; bIdx++) + { + if (withOffset == 0) + { + vkBufferListDeviceMemory.push_back( + new VulkanDeviceMemory(vkDevice, pBufferSize, + memoryType, + vkExternalMemoryHandleType)); + } + if (withOffset == 1) + { + uint32_t totalSize = + (uint32_t)(vkBufferList.size() * interBufferOffset); + vkBufferListDeviceMemory.push_back( + new VulkanDeviceMemory(vkDevice, totalSize, + memoryType, + vkExternalMemoryHandleType)); + } + std::vector<clExternalMemory *> pExternalMemory; + for (size_t cl_bIdx = 0; cl_bIdx < numImports; cl_bIdx++) + { + pExternalMemory.push_back(new clExternalMemory( + vkBufferListDeviceMemory[bIdx], + vkExternalMemoryHandleType, + withOffset * bIdx * interBufferOffset, pBufferSize, + context, deviceId)); + } + externalMemory.push_back(pExternalMemory); + } + + clFinish(cmd_queue1); + Params *params = (Params *)vkParamsDeviceMemory.map(); + params->numBuffers = numBuffers; + params->bufferSize = pBufferSize; + params->interBufferOffset = interBufferOffset * withOffset; + vkParamsDeviceMemory.unmap(); + vkDescriptorSet.update(0, vkParamsBuffer); + for (size_t bIdx = 0; bIdx < vkBufferList.size(); bIdx++) + { + size_t buffer_size = vkBufferList[bIdx].getSize(); + vkBufferListDeviceMemory[bIdx]->bindBuffer( + vkBufferList[bIdx], + bIdx * interBufferOffset * withOffset); + for (size_t cl_bIdx = 0; cl_bIdx < numImports; cl_bIdx++) + { + buffers[bIdx][cl_bIdx] = + externalMemory[bIdx][cl_bIdx] + ->getExternalMemoryBuffer(); + } + vkDescriptorSet.update((uint32_t)bIdx + 1, + vkBufferList[bIdx]); + } + vkCommandBuffer.begin(); + vkCommandBuffer.bindPipeline(vkComputePipeline); + vkCommandBuffer.bindDescriptorSets( + vkComputePipeline, vkPipelineLayout, vkDescriptorSet); + vkCommandBuffer.dispatch(512, 1, 1); + vkCommandBuffer.end(); + for (int i = 0; i < numImports; i++) + { + update_buffer_kernel[i] = (numBuffers == 1) + ? 
kernel[0] + : ((numBuffers == 2) ? kernel[1] : kernel[2]); + } + // global work size should be less than or equal to + // bufferSizeList[i] + global_work_size[0] = pBufferSize; + + for (uint32_t iter = 0; iter < maxIter; iter++) + { + if (iter == 0) + { + vkQueue.submit(vkCommandBuffer, vkVk2CLSemaphore); + } + else + { + vkQueue.submit(vkCl2VkSemaphore, vkCommandBuffer, + vkVk2CLSemaphore); + } + clVk2CLExternalSemaphore->wait(cmd_queue1); + for (uint8_t launchIter = 0; launchIter < numImports; + launchIter++) + { + err = clSetKernelArg(update_buffer_kernel[launchIter], + 0, sizeof(uint32_t), + (void *)&pBufferSize); + for (int i = 0; i < numBuffers; i++) + { + err |= clSetKernelArg( + update_buffer_kernel[launchIter], i + 1, + sizeof(cl_mem), + (void *)&(buffers[i][launchIter])); + } + + if (err != CL_SUCCESS) + { + print_error(err, + "Error: Failed to set arg values for " + "kernel\n "); + goto CLEANUP; + } + err = clEnqueueNDRangeKernel( + cmd_queue1, update_buffer_kernel[launchIter], 1, + NULL, global_work_size, NULL, 0, NULL, NULL); + if (err != CL_SUCCESS) + { + print_error(err, + "Error: Failed to launch " + "update_buffer_kernel, error\n "); + goto CLEANUP; + } + } + if (iter != (maxIter - 1)) + { + clCl2VkExternalSemaphore->signal(cmd_queue1); + } + } + error_2 = (uint8_t *)malloc(sizeof(uint8_t)); + if (NULL == error_2) + { + log_error("Not able to allocate memory\n"); + goto CLEANUP; + } + + error_1 = clCreateBuffer(context, CL_MEM_WRITE_ONLY, + sizeof(uint8_t), NULL, &err); + if (CL_SUCCESS != err) + { + print_error(err, "Error: clCreateBuffer \n"); + goto CLEANUP; + } + uint8_t val = 0; + err = + clEnqueueWriteBuffer(cmd_queue1, error_1, CL_TRUE, 0, + sizeof(uint8_t), &val, 0, NULL, NULL); + if (CL_SUCCESS != err) + { + print_error(err, "Error: clEnqueueWriteBuffer \n"); + goto CLEANUP; + } + calc_max_iter = maxIter * (numBuffers + 1); + + for (int i = 0; i < vkBufferList.size(); i++) + { + err = clSetKernelArg(verify_kernel, 0, sizeof(cl_mem), + 
(void *)&(buffers[i][0])); + err |= clSetKernelArg(verify_kernel, 1, sizeof(int), + &pBufferSize); + err |= clSetKernelArg(verify_kernel, 2, sizeof(int), + &calc_max_iter); + err |= clSetKernelArg(verify_kernel, 3, sizeof(cl_mem), + (void *)&error_1); + if (err != CL_SUCCESS) + { + print_error(err, + "Error: Failed to set arg values for " + "verify_kernel \n"); + goto CLEANUP; + } + err = clEnqueueNDRangeKernel(cmd_queue1, verify_kernel, 1, + NULL, global_work_size, NULL, + 0, NULL, NULL); + if (err != CL_SUCCESS) + { + print_error( + err, + "Error: Failed to launch verify_kernel, error\n"); + goto CLEANUP; + } + + err = clEnqueueReadBuffer(cmd_queue1, error_1, CL_TRUE, 0, + sizeof(uint8_t), error_2, 0, NULL, + NULL); + if (err != CL_SUCCESS) + { + print_error(err, "Error: Failed read output, error \n"); + goto CLEANUP; + } + if (*error_2 == 1) + { + log_error("&&&& vulkan_opencl_buffer test FAILED\n"); + goto CLEANUP; + } + } + for (size_t i = 0; i < vkBufferList.size(); i++) + { + for (size_t j = 0; j < numImports; j++) + { + delete externalMemory[i][j]; + } + } + for (size_t i = 0; i < vkBufferListDeviceMemory.size(); i++) + { + delete vkBufferListDeviceMemory[i]; + } + vkBufferListDeviceMemory.erase(vkBufferListDeviceMemory.begin(), + vkBufferListDeviceMemory.end()); + for (size_t i = 0; i < externalMemory.size(); i++) + { + externalMemory[i].erase(externalMemory[i].begin(), + externalMemory[i].begin() + + numBuffers); + } + externalMemory.clear(); + } + } + } +CLEANUP: + for (size_t i = 0; i < vkBufferListDeviceMemory.size(); i++) + { + if (vkBufferListDeviceMemory[i]) + { + delete vkBufferListDeviceMemory[i]; + } + } + for (size_t i = 0; i < externalMemory.size(); i++) + { + for (size_t j = 0; j < externalMemory[i].size(); j++) + { + if (externalMemory[i][j]) + { + delete externalMemory[i][j]; + } + } + } + if (clVk2CLExternalSemaphore) delete clVk2CLExternalSemaphore; + if (clCl2VkExternalSemaphore) delete clCl2VkExternalSemaphore; + if (error_2) 
free(error_2); + if (error_1) clReleaseMemObject(error_1); + return err; +} + +int run_test_with_multi_import_diff_ctx( + cl_context &context, cl_context &context2, cl_command_queue &cmd_queue1, + cl_command_queue &cmd_queue2, cl_kernel *kernel1, cl_kernel *kernel2, + cl_kernel &verify_kernel, cl_kernel verify_kernel2, VulkanDevice &vkDevice, + uint32_t numBuffers, uint32_t bufferSize, uint32_t bufferSizeForOffset) +{ + size_t global_work_size[1]; + uint8_t *error_3; + cl_mem error_1; + cl_mem error_2; + int numImports = numBuffers; + cl_kernel update_buffer_kernel1[MAX_IMPORTS]; + cl_kernel update_buffer_kernel2[MAX_IMPORTS]; + clExternalSemaphore *clVk2CLExternalSemaphore = NULL; + clExternalSemaphore *clCl2VkExternalSemaphore = NULL; + clExternalSemaphore *clVk2CLExternalSemaphore2 = NULL; + clExternalSemaphore *clCl2VkExternalSemaphore2 = NULL; + int err = CL_SUCCESS; + int calc_max_iter; + bool withOffset; + uint32_t pBufferSize; + + const std::vector<VulkanExternalMemoryHandleType> + vkExternalMemoryHandleTypeList = + getSupportedVulkanExternalMemoryHandleTypeList(); + VulkanExternalSemaphoreHandleType vkExternalSemaphoreHandleType = + getSupportedVulkanExternalSemaphoreHandleTypeList()[0]; + VulkanSemaphore vkVk2CLSemaphore(vkDevice, vkExternalSemaphoreHandleType); + VulkanSemaphore vkCl2VkSemaphore(vkDevice, vkExternalSemaphoreHandleType); + + VulkanQueue &vkQueue = vkDevice.getQueue(); + + std::vector<char> vkBufferShader = readFile("buffer.spv"); + + VulkanShaderModule vkBufferShaderModule(vkDevice, vkBufferShader); + VulkanDescriptorSetLayoutBindingList vkDescriptorSetLayoutBindingList( + MAX_BUFFERS + 1, VULKAN_DESCRIPTOR_TYPE_STORAGE_BUFFER); + VulkanDescriptorSetLayout vkDescriptorSetLayout( + vkDevice, vkDescriptorSetLayoutBindingList); + VulkanPipelineLayout vkPipelineLayout(vkDevice, vkDescriptorSetLayout); + VulkanComputePipeline vkComputePipeline(vkDevice, vkPipelineLayout, + vkBufferShaderModule); + + VulkanDescriptorPool 
vkDescriptorPool(vkDevice, + vkDescriptorSetLayoutBindingList); + VulkanDescriptorSet vkDescriptorSet(vkDevice, vkDescriptorPool, + vkDescriptorSetLayout); + + clVk2CLExternalSemaphore = new clExternalSemaphore( + vkVk2CLSemaphore, context, vkExternalSemaphoreHandleType, deviceId); + clCl2VkExternalSemaphore = new clExternalSemaphore( + vkCl2VkSemaphore, context, vkExternalSemaphoreHandleType, deviceId); + + clVk2CLExternalSemaphore2 = new clExternalSemaphore( + vkVk2CLSemaphore, context2, vkExternalSemaphoreHandleType, deviceId); + clCl2VkExternalSemaphore2 = new clExternalSemaphore( + vkCl2VkSemaphore, context2, vkExternalSemaphoreHandleType, deviceId); + + const uint32_t maxIter = innerIterations; + VulkanCommandPool vkCommandPool(vkDevice); + VulkanCommandBuffer vkCommandBuffer(vkDevice, vkCommandPool); + + VulkanBuffer vkParamsBuffer(vkDevice, sizeof(Params)); + VulkanDeviceMemory vkParamsDeviceMemory( + vkDevice, vkParamsBuffer.getSize(), + getVulkanMemoryType(vkDevice, + VULKAN_MEMORY_TYPE_PROPERTY_HOST_VISIBLE_COHERENT)); + vkParamsDeviceMemory.bindBuffer(vkParamsBuffer); + std::vector<VulkanDeviceMemory *> vkBufferListDeviceMemory; + std::vector<std::vector<clExternalMemory *>> externalMemory1; + std::vector<std::vector<clExternalMemory *>> externalMemory2; + + for (size_t emhtIdx = 0; emhtIdx < vkExternalMemoryHandleTypeList.size(); + emhtIdx++) + { + VulkanExternalMemoryHandleType vkExternalMemoryHandleType = + vkExternalMemoryHandleTypeList[emhtIdx]; + log_info("External memory handle type:%d\n", + vkExternalMemoryHandleType); + + VulkanBuffer vkDummyBuffer(vkDevice, 4 * 1024, + vkExternalMemoryHandleType); + const VulkanMemoryTypeList &memoryTypeList = + vkDummyBuffer.getMemoryTypeList(); + + for (size_t mtIdx = 0; mtIdx < memoryTypeList.size(); mtIdx++) + { + const VulkanMemoryType &memoryType = memoryTypeList[mtIdx]; + + log_info("Memory type index: %d\n", (uint32_t)memoryType); + log_info("Memory type property: %d\n", + 
memoryType.getMemoryTypeProperty()); + + for (unsigned int withOffset = 0; + withOffset <= (unsigned int)enableOffset; withOffset++) + { + log_info("Running withOffset case %d\n", (uint32_t)withOffset); + cl_mem buffers1[MAX_BUFFERS][MAX_IMPORTS]; + cl_mem buffers2[MAX_BUFFERS][MAX_IMPORTS]; + if (withOffset) + { + pBufferSize = bufferSizeForOffset; + } + else + { + pBufferSize = bufferSize; + } + VulkanBufferList vkBufferList(numBuffers, vkDevice, pBufferSize, + vkExternalMemoryHandleType); + uint32_t interBufferOffset = + (uint32_t)(vkBufferList[0].getSize()); + + for (size_t bIdx = 0; bIdx < numBuffers; bIdx++) + { + if (withOffset == 0) + { + vkBufferListDeviceMemory.push_back( + new VulkanDeviceMemory(vkDevice, pBufferSize, + memoryType, + vkExternalMemoryHandleType)); + } + if (withOffset == 1) + { + uint32_t totalSize = + (uint32_t)(vkBufferList.size() * interBufferOffset); + vkBufferListDeviceMemory.push_back( + new VulkanDeviceMemory(vkDevice, totalSize, + memoryType, + vkExternalMemoryHandleType)); + } + std::vector<clExternalMemory *> pExternalMemory1; + std::vector<clExternalMemory *> pExternalMemory2; + for (size_t cl_bIdx = 0; cl_bIdx < numImports; cl_bIdx++) + { + pExternalMemory1.push_back(new clExternalMemory( + vkBufferListDeviceMemory[bIdx], + vkExternalMemoryHandleType, + withOffset * bIdx * interBufferOffset, pBufferSize, + context, deviceId)); + pExternalMemory2.push_back(new clExternalMemory( + vkBufferListDeviceMemory[bIdx], + vkExternalMemoryHandleType, + withOffset * bIdx * interBufferOffset, pBufferSize, + context2, deviceId)); + } + externalMemory1.push_back(pExternalMemory1); + externalMemory2.push_back(pExternalMemory2); + } + + clFinish(cmd_queue1); + Params *params = (Params *)vkParamsDeviceMemory.map(); + params->numBuffers = numBuffers; + params->bufferSize = pBufferSize; + params->interBufferOffset = interBufferOffset * withOffset; + vkParamsDeviceMemory.unmap(); + vkDescriptorSet.update(0, vkParamsBuffer); + for (size_t bIdx = 0; 
bIdx < vkBufferList.size(); bIdx++) + { + size_t buffer_size = vkBufferList[bIdx].getSize(); + vkBufferListDeviceMemory[bIdx]->bindBuffer( + vkBufferList[bIdx], + bIdx * interBufferOffset * withOffset); + for (size_t cl_bIdx = 0; cl_bIdx < numImports; cl_bIdx++) + { + buffers1[bIdx][cl_bIdx] = + externalMemory1[bIdx][cl_bIdx] + ->getExternalMemoryBuffer(); + buffers2[bIdx][cl_bIdx] = + externalMemory2[bIdx][cl_bIdx] + ->getExternalMemoryBuffer(); + } + vkDescriptorSet.update((uint32_t)bIdx + 1, + vkBufferList[bIdx]); + } + + vkCommandBuffer.begin(); + vkCommandBuffer.bindPipeline(vkComputePipeline); + vkCommandBuffer.bindDescriptorSets( + vkComputePipeline, vkPipelineLayout, vkDescriptorSet); + vkCommandBuffer.dispatch(512, 1, 1); + vkCommandBuffer.end(); + + for (int i = 0; i < numImports; i++) + { + update_buffer_kernel1[i] = (numBuffers == 1) + ? kernel1[0] + : ((numBuffers == 2) ? kernel1[1] : kernel1[2]); + update_buffer_kernel2[i] = (numBuffers == 1) + ? kernel2[0] + : ((numBuffers == 2) ? 
kernel2[1] : kernel2[2]); + } + + // global work size should be less than or equal + // to bufferSizeList[i] + global_work_size[0] = pBufferSize; + + for (uint32_t iter = 0; iter < maxIter; iter++) + { + if (iter == 0) + { + vkQueue.submit(vkCommandBuffer, vkVk2CLSemaphore); + } + else + { + vkQueue.submit(vkCl2VkSemaphore, vkCommandBuffer, + vkVk2CLSemaphore); + } + clVk2CLExternalSemaphore->wait(cmd_queue1); + + for (uint8_t launchIter = 0; launchIter < numImports; + launchIter++) + { + err = clSetKernelArg(update_buffer_kernel1[launchIter], + 0, sizeof(uint32_t), + (void *)&pBufferSize); + for (int i = 0; i < numBuffers; i++) + { + err |= clSetKernelArg( + update_buffer_kernel1[launchIter], i + 1, + sizeof(cl_mem), + (void *)&(buffers1[i][launchIter])); + } + + if (err != CL_SUCCESS) + { + print_error(err, + "Error: Failed to set arg values for " + "kernel\n "); + goto CLEANUP; + } + err = clEnqueueNDRangeKernel( + cmd_queue1, update_buffer_kernel1[launchIter], 1, + NULL, global_work_size, NULL, 0, NULL, NULL); + if (err != CL_SUCCESS) + { + print_error(err, + "Error: Failed to launch " + "update_buffer_kernel, error\n"); + goto CLEANUP; + } + } + if (iter != (maxIter - 1)) + { + clCl2VkExternalSemaphore->signal(cmd_queue1); + } + } + clFinish(cmd_queue1); + for (uint32_t iter = 0; iter < maxIter; iter++) + { + if (iter == 0) + { + vkQueue.submit(vkCommandBuffer, vkVk2CLSemaphore); + } + else + { + vkQueue.submit(vkCl2VkSemaphore, vkCommandBuffer, + vkVk2CLSemaphore); + } + clVk2CLExternalSemaphore2->wait(cmd_queue2); + + for (uint8_t launchIter = 0; launchIter < numImports; + launchIter++) + { + err = clSetKernelArg(update_buffer_kernel2[launchIter], + 0, sizeof(uint32_t), + (void *)&bufferSize); + for (int i = 0; i < numBuffers; i++) + { + err |= clSetKernelArg( + update_buffer_kernel2[launchIter], i + 1, + sizeof(cl_mem), + (void *)&(buffers2[i][launchIter])); + } + + if (err != CL_SUCCESS) + { + print_error(err, + "Error: Failed to set arg values for " + 
"kernel\n "); + goto CLEANUP; + } + err = clEnqueueNDRangeKernel( + cmd_queue2, update_buffer_kernel2[launchIter], 1, + NULL, global_work_size, NULL, 0, NULL, NULL); + if (err != CL_SUCCESS) + { + print_error(err, + "Error: Failed to launch " + "update_buffer_kernel, error\n "); + goto CLEANUP; + } + } + if (iter != (maxIter - 1)) + { + clCl2VkExternalSemaphore2->signal(cmd_queue2); + } + } + clFinish(cmd_queue2); + error_3 = (uint8_t *)malloc(sizeof(uint8_t)); + if (NULL == error_3) + { + log_error("Not able to allocate memory\n"); + goto CLEANUP; + } + + error_1 = clCreateBuffer(context, CL_MEM_WRITE_ONLY, + sizeof(uint8_t), NULL, &err); + if (CL_SUCCESS != err) + { + print_error(err, "Error: clCreateBuffer \n"); + goto CLEANUP; + } + error_2 = clCreateBuffer(context2, CL_MEM_WRITE_ONLY, + sizeof(uint8_t), NULL, &err); + if (CL_SUCCESS != err) + { + print_error(err, "Error: clCreateBuffer \n"); + goto CLEANUP; + } + uint8_t val = 0; + err = + clEnqueueWriteBuffer(cmd_queue1, error_1, CL_TRUE, 0, + sizeof(uint8_t), &val, 0, NULL, NULL); + if (err != CL_SUCCESS) + { + print_error(err, "Error: Failed read output, error \n"); + goto CLEANUP; + } + + err = + clEnqueueWriteBuffer(cmd_queue2, error_2, CL_TRUE, 0, + sizeof(uint8_t), &val, 0, NULL, NULL); + if (err != CL_SUCCESS) + { + print_error(err, "Error: Failed read output, error \n"); + goto CLEANUP; + } + + calc_max_iter = maxIter * 2 * (numBuffers + 1); + for (int i = 0; i < numBuffers; i++) + { + err = clSetKernelArg(verify_kernel, 0, sizeof(cl_mem), + (void *)&(buffers1[i][0])); + err |= clSetKernelArg(verify_kernel, 1, sizeof(int), + &pBufferSize); + err |= clSetKernelArg(verify_kernel, 2, sizeof(int), + &calc_max_iter); + err |= clSetKernelArg(verify_kernel, 3, sizeof(cl_mem), + (void *)&error_1); + if (err != CL_SUCCESS) + { + print_error(err, + "Error: Failed to set arg values for " + "verify_kernel \n"); + goto CLEANUP; + } + err = clEnqueueNDRangeKernel(cmd_queue1, verify_kernel, 1, + NULL, 
global_work_size, NULL, + 0, NULL, NULL); + if (err != CL_SUCCESS) + { + print_error(err, + "Error: Failed to launch verify_kernel," + "error\n"); + goto CLEANUP; + } + + err = clEnqueueReadBuffer(cmd_queue1, error_1, CL_TRUE, 0, + sizeof(uint8_t), error_3, 0, NULL, + NULL); + if (err != CL_SUCCESS) + { + print_error(err, "Error: Failed read output, error\n"); + goto CLEANUP; + } + if (*error_3 == 1) + { + log_error("&&&& vulkan_opencl_buffer test FAILED\n"); + goto CLEANUP; + } + } + *error_3 = 0; + for (int i = 0; i < vkBufferList.size(); i++) + { + err = clSetKernelArg(verify_kernel2, 0, sizeof(cl_mem), + (void *)&(buffers2[i][0])); + err |= clSetKernelArg(verify_kernel2, 1, sizeof(int), + &pBufferSize); + err |= clSetKernelArg(verify_kernel2, 2, sizeof(int), + &calc_max_iter); + err |= clSetKernelArg(verify_kernel2, 3, sizeof(cl_mem), + (void *)&error_2); + if (err != CL_SUCCESS) + { + print_error(err, + "Error: Failed to set arg values for " + "verify_kernel \n"); + goto CLEANUP; + } + err = clEnqueueNDRangeKernel(cmd_queue2, verify_kernel2, 1, + NULL, global_work_size, NULL, + 0, NULL, NULL); + if (err != CL_SUCCESS) + { + print_error(err, + "Error: Failed to launch verify_kernel," + "error\n"); + goto CLEANUP; + } + + err = clEnqueueReadBuffer(cmd_queue2, error_2, CL_TRUE, 0, + sizeof(uint8_t), error_3, 0, NULL, + NULL); + if (err != CL_SUCCESS) + { + print_error(err, "Error: Failed read output, error\n"); + goto CLEANUP; + } + if (*error_3 == 1) + { + log_error("&&&& vulkan_opencl_buffer test FAILED\n"); + goto CLEANUP; + } + } + for (size_t i = 0; i < vkBufferList.size(); i++) + { + for (size_t j = 0; j < numImports; j++) + { + delete externalMemory1[i][j]; + delete externalMemory2[i][j]; + } + } + for (size_t i = 0; i < vkBufferListDeviceMemory.size(); i++) + { + delete vkBufferListDeviceMemory[i]; + } + vkBufferListDeviceMemory.erase(vkBufferListDeviceMemory.begin(), + vkBufferListDeviceMemory.end()); + for (size_t i = 0; i < externalMemory1.size(); i++) 
+ { + externalMemory1[i].erase(externalMemory1[i].begin(), + externalMemory1[i].begin() + + numBuffers); + externalMemory2[i].erase(externalMemory2[i].begin(), + externalMemory2[i].begin() + + numBuffers); + } + externalMemory1.clear(); + externalMemory2.clear(); + } + } + } +CLEANUP: + for (size_t i = 0; i < vkBufferListDeviceMemory.size(); i++) + { + if (vkBufferListDeviceMemory[i]) + { + delete vkBufferListDeviceMemory[i]; + } + } + for (size_t i = 0; i < externalMemory1.size(); i++) + { + for (size_t j = 0; j < externalMemory1[i].size(); j++) + { + if (externalMemory1[i][j]) + { + delete externalMemory1[i][j]; + } + } + } + for (size_t i = 0; i < externalMemory2.size(); i++) + { + for (size_t j = 0; j < externalMemory2[i].size(); j++) + { + if (externalMemory2[i][j]) + { + delete externalMemory2[i][j]; + } + } + } + if (clVk2CLExternalSemaphore) delete clVk2CLExternalSemaphore; + if (clCl2VkExternalSemaphore) delete clCl2VkExternalSemaphore; + if (clVk2CLExternalSemaphore2) delete clVk2CLExternalSemaphore2; + if (clCl2VkExternalSemaphore2) delete clCl2VkExternalSemaphore2; + if (error_3) free(error_3); + if (error_1) clReleaseMemObject(error_1); + if (error_2) clReleaseMemObject(error_2); + return err; +} + +int test_buffer_common(cl_device_id device_, cl_context context_, + cl_command_queue queue_, int numElements_) +{ + + int current_device = 0; + int device_count = 0; + int devices_prohibited = 0; + cl_int errNum = CL_SUCCESS; + cl_platform_id platform = NULL; + size_t extensionSize = 0; + cl_uint num_devices = 0; + cl_uint device_no = 0; + const size_t bufsize = BUFFERSIZE; + char buf[BUFFERSIZE]; + cl_device_id *devices; + char *extensions = NULL; + cl_kernel verify_kernel; + cl_kernel verify_kernel2; + cl_kernel kernel[3] = { NULL, NULL, NULL }; + cl_kernel kernel2[3] = { NULL, NULL, NULL }; + const char *program_source_const[3] = { kernel_text_numbuffer_1, + kernel_text_numbuffer_2, + kernel_text_numbuffer_4 }; + const char *program_source_const_verify; 
+ size_t program_source_length; + cl_command_queue cmd_queue1 = NULL; + cl_command_queue cmd_queue2 = NULL; + cl_command_queue cmd_queue3 = NULL; + cl_context context = NULL; + cl_program program[3] = { NULL, NULL, NULL }; + cl_program program_verify, program_verify2; + cl_context context2 = NULL; + + + VulkanDevice vkDevice; + uint32_t numBuffersList[] = { 1, 2, 4 }; + uint32_t bufferSizeList[] = { 4 * 1024, 64 * 1024, 2 * 1024 * 1024 }; + uint32_t bufferSizeListforOffset[] = { 256, 512, 1024 }; + + cl_context_properties contextProperties[] = { CL_CONTEXT_PLATFORM, 0, 0 }; + errNum = clGetPlatformIDs(1, &platform, NULL); + if (errNum != CL_SUCCESS) + { + print_error(errNum, "Error: Failed to get platform\n"); + goto CLEANUP; + } + + errNum = + clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 0, NULL, &num_devices); + if (CL_SUCCESS != errNum) + { + print_error(errNum, "clGetDeviceIDs failed in returning of devices\n"); + goto CLEANUP; + } + devices = (cl_device_id *)malloc(num_devices * sizeof(cl_device_id)); + if (NULL == devices) + { + errNum = CL_OUT_OF_HOST_MEMORY; + print_error(errNum, "Unable to allocate memory for devices\n"); + goto CLEANUP; + } + errNum = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, num_devices, devices, + NULL); + if (CL_SUCCESS != errNum) + { + print_error(errNum, "Failed to get deviceID.\n"); + goto CLEANUP; + } + contextProperties[1] = (cl_context_properties)platform; + log_info("Assigned contextproperties for platform\n"); + for (device_no = 0; device_no < num_devices; device_no++) + { + errNum = clGetDeviceInfo(devices[device_no], CL_DEVICE_EXTENSIONS, 0, + NULL, &extensionSize); + if (CL_SUCCESS != errNum) + { + print_error(errNum, + "Error in clGetDeviceInfo for getting device_extension " + "size....\n"); + goto CLEANUP; + } + extensions = (char *)malloc(extensionSize); + if (NULL == extensions) + { + print_error(errNum, "Unable to allocate memory for extensions\n"); + errNum = CL_OUT_OF_HOST_MEMORY; + goto CLEANUP; + } + errNum = 
clGetDeviceInfo(devices[device_no], CL_DEVICE_EXTENSIONS, + extensionSize, extensions, NULL); + if (CL_SUCCESS != errNum) + { + print_error(errNum, + "Error in clGetDeviceInfo for device_extension\n"); + goto CLEANUP; + } + errNum = clGetDeviceInfo(devices[device_no], CL_DEVICE_UUID_KHR, + CL_UUID_SIZE_KHR, uuid, &extensionSize); + if (CL_SUCCESS != errNum) + { + print_error(errNum, "clGetDeviceInfo failed\n"); + goto CLEANUP; + } + errNum = + memcmp(uuid, vkDevice.getPhysicalDevice().getUUID(), VK_UUID_SIZE); + if (errNum == 0) + { + break; + } + } + if (device_no >= num_devices) + { + errNum = EXIT_FAILURE; + print_error(errNum, + "OpenCL error: " + "No Vulkan-OpenCL Interop capable GPU found.\n"); + goto CLEANUP; + } + deviceId = devices[device_no]; + context = clCreateContextFromType(contextProperties, CL_DEVICE_TYPE_GPU, + NULL, NULL, &errNum); + if (CL_SUCCESS != errNum) + { + print_error(errNum, "error creating context\n"); + goto CLEANUP; + } + log_info("Successfully created context !!!\n"); + + cmd_queue1 = clCreateCommandQueue(context, devices[device_no], 0, &errNum); + if (CL_SUCCESS != errNum) + { + errNum = CL_INVALID_COMMAND_QUEUE; + print_error(errNum, "Error: Failed to create command queue!\n"); + goto CLEANUP; + } + cmd_queue2 = clCreateCommandQueue(context, devices[device_no], 0, &errNum); + if (CL_SUCCESS != errNum) + { + errNum = CL_INVALID_COMMAND_QUEUE; + print_error(errNum, "Error: Failed to create command queue!\n"); + goto CLEANUP; + } + log_info("clCreateCommandQueue successful\n"); + for (int i = 0; i < 3; i++) + { + program_source_length = strlen(program_source_const[i]); + program[i] = + clCreateProgramWithSource(context, 1, &program_source_const[i], + &program_source_length, &errNum); + errNum = clBuildProgram(program[i], 0, NULL, NULL, NULL, NULL); + if (errNum != CL_SUCCESS) + { + print_error(errNum, "Error: Failed to build program \n"); + return errNum; + } + // create the kernel + kernel[i] = clCreateKernel(program[i], 
"clUpdateBuffer", &errNum); + if (errNum != CL_SUCCESS) + { + print_error(errNum, "clCreateKernel failed \n"); + return errNum; + } + } + + program_source_const_verify = kernel_text_verify; + program_source_length = strlen(program_source_const_verify); + program_verify = + clCreateProgramWithSource(context, 1, &program_source_const_verify, + &program_source_length, &errNum); + errNum = clBuildProgram(program_verify, 0, NULL, NULL, NULL, NULL); + if (errNum != CL_SUCCESS) + { + log_error("Error: Failed to build program2\n"); + return errNum; + } + verify_kernel = clCreateKernel(program_verify, "checkKernel", &errNum); + if (errNum != CL_SUCCESS) + { + print_error(errNum, "clCreateKernel failed \n"); + return errNum; + } + + if (multiCtx) // different context guard + { + context2 = clCreateContextFromType( + contextProperties, CL_DEVICE_TYPE_GPU, NULL, NULL, &errNum); + if (CL_SUCCESS != errNum) + { + print_error(errNum, "error creating context\n"); + goto CLEANUP; + } + cmd_queue3 = + clCreateCommandQueue(context2, devices[device_no], 0, &errNum); + if (CL_SUCCESS != errNum) + { + errNum = CL_INVALID_COMMAND_QUEUE; + print_error(errNum, "Error: Failed to create command queue!\n"); + goto CLEANUP; + } + for (int i = 0; i < 3; i++) + { + program_source_length = strlen(program_source_const[i]); + program[i] = + clCreateProgramWithSource(context2, 1, &program_source_const[i], + &program_source_length, &errNum); + errNum = clBuildProgram(program[i], 0, NULL, NULL, NULL, NULL); + if (errNum != CL_SUCCESS) + { + print_error(errNum, "Error: Failed to build program \n"); + return errNum; + } + // create the kernel + kernel2[i] = clCreateKernel(program[i], "clUpdateBuffer", &errNum); + if (errNum != CL_SUCCESS) + { + print_error(errNum, "clCreateKernel failed \n"); + return errNum; + } + } + program_source_length = strlen(program_source_const_verify); + program_verify = + clCreateProgramWithSource(context2, 1, &program_source_const_verify, + &program_source_length, &errNum); 
+ errNum = clBuildProgram(program_verify, 0, NULL, NULL, NULL, NULL); + if (errNum != CL_SUCCESS) + { + log_error("Error: Failed to build program2\n"); + return errNum; + } + verify_kernel2 = clCreateKernel(program_verify, "checkKernel", &errNum); + if (errNum != CL_SUCCESS) + { + print_error(errNum, "clCreateKernel failed \n"); + return errNum; + } + } + + for (size_t numBuffersIdx = 0; numBuffersIdx < ARRAY_SIZE(numBuffersList); + numBuffersIdx++) + { + uint32_t numBuffers = numBuffersList[numBuffersIdx]; + log_info("Number of buffers: %d\n", numBuffers); + for (size_t sizeIdx = 0; sizeIdx < ARRAY_SIZE(bufferSizeList); + sizeIdx++) + { + uint32_t bufferSize = bufferSizeList[sizeIdx]; + uint32_t bufferSizeForOffset = bufferSizeListforOffset[sizeIdx]; + log_info("&&&& RUNNING vulkan_opencl_buffer test for Buffer size: " + "%d\n", + bufferSize); + if (multiImport && !multiCtx) + { + errNum = run_test_with_multi_import_same_ctx( + context, cmd_queue1, kernel, verify_kernel, vkDevice, + numBuffers, bufferSize, bufferSizeForOffset); + } + else if (multiImport && multiCtx) + { + errNum = run_test_with_multi_import_diff_ctx( + context, context2, cmd_queue1, cmd_queue3, kernel, kernel2, + verify_kernel, verify_kernel2, vkDevice, numBuffers, + bufferSize, bufferSizeForOffset); + } + else if (numCQ == 2) + { + errNum = run_test_with_two_queue( + context, cmd_queue1, cmd_queue2, kernel, verify_kernel, + vkDevice, numBuffers + 1, bufferSize); + } + else + { + errNum = run_test_with_one_queue(context, cmd_queue1, kernel, + verify_kernel, vkDevice, + numBuffers, bufferSize); + } + if (errNum != CL_SUCCESS) + { + print_error(errNum, "func_name failed \n"); + goto CLEANUP; + } + } + } + +CLEANUP: + for (int i = 0; i < 3; i++) + { + if (program[i]) clReleaseProgram(program[i]); + if (kernel[i]) clReleaseKernel(kernel[i]); + } + if (cmd_queue1) clReleaseCommandQueue(cmd_queue1); + if (cmd_queue2) clReleaseCommandQueue(cmd_queue2); + if (cmd_queue3) 
clReleaseCommandQueue(cmd_queue3); + if (context) clReleaseContext(context); + if (context2) clReleaseContext(context2); + + if (devices) free(devices); + if (extensions) free(extensions); + + return errNum; +} diff --git a/test_conformance/vulkan/test_vulkan_interop_image.cpp b/test_conformance/vulkan/test_vulkan_interop_image.cpp new file mode 100644 index 00000000..7577de09 --- /dev/null +++ b/test_conformance/vulkan/test_vulkan_interop_image.cpp @@ -0,0 +1,1596 @@ +// +// Copyright (c) 2022 The Khronos Group Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+// + +#define NOMINMAX +#include <vulkan_interop_common.hpp> +#include <string> +#include "harness/errorHelpers.h" + +#define MAX_2D_IMAGES 5 +#define MAX_2D_IMAGE_WIDTH 1024 +#define MAX_2D_IMAGE_HEIGHT 1024 +#define MAX_2D_IMAGE_ELEMENT_SIZE 16 +#define MAX_2D_IMAGE_MIP_LEVELS 11 +#define MAX_2D_IMAGE_DESCRIPTORS MAX_2D_IMAGES *MAX_2D_IMAGE_MIP_LEVELS +#define NUM_THREADS_PER_GROUP_X 32 +#define NUM_THREADS_PER_GROUP_Y 32 +#define NUM_BLOCKS(size, blockSize) \ + (ROUND_UP((size), (blockSize)) / (blockSize)) + +#define ASSERT(x) \ + if (!(x)) \ + { \ + fprintf(stderr, "Assertion \"%s\" failed at %s:%d\n", #x, __FILE__, \ + __LINE__); \ + exit(1); \ + } + +#define ASSERT_LEQ(x, y) \ + if (x > y) \ + { \ + ASSERT(0); \ + } + +namespace { +struct Params +{ + uint32_t numImage2DDescriptors; +}; +} +static cl_uchar uuid[CL_UUID_SIZE_KHR]; +static cl_device_id deviceId = NULL; +size_t max_width = MAX_2D_IMAGE_WIDTH; +size_t max_height = MAX_2D_IMAGE_HEIGHT; + +const char *kernel_text_numImage_1 = " \ +__constant sampler_t smpImg = CLK_NORMALIZED_COORDS_FALSE|CLK_ADDRESS_NONE|CLK_FILTER_NEAREST;\n\ +__kernel void image2DKernel(read_only image2d_t InputImage, write_only image2d_t OutImage, int num2DImages, int baseWidth, int baseHeight, int numMipLevels)\n\ +{\n\ + int threadIdxX = get_global_id(0);\n\ + int threadIdxY = get_global_id(1);\n\ + int numThreadsX = get_global_size(0); \n\ + int numThreadsY = get_global_size(1);\n\ + if (threadIdxX >= baseWidth || threadIdxY >= baseHeight)\n\ + {\n\ + return;\n\ + }\n\ + %s dataA = read_image%s(InputImage, smpImg, (int2)(threadIdxX, threadIdxY)); \n\ + %s dataB = read_image%s(InputImage, smpImg, (int2)(threadIdxX, baseHeight-threadIdxY-1)); \n\ + write_image%s(OutImage, (int2)(threadIdxX, baseHeight-threadIdxY-1), dataA);\n\ + write_image%s(OutImage, (int2)( threadIdxX, threadIdxY), dataB);\n\ +\n\ +}"; + +const char *kernel_text_numImage_2 = " \ +__constant sampler_t smpImg = 
CLK_NORMALIZED_COORDS_FALSE|CLK_ADDRESS_NONE|CLK_FILTER_NEAREST;\n\ +__kernel void image2DKernel(read_only image2d_t InputImage_1, write_only image2d_t OutImage_1, read_only image2d_t InputImage_2,write_only image2d_t OutImage_2,int num2DImages, int baseWidth, int baseHeight, int numMipLevels) \n\ +{\n\ + int threadIdxX = get_global_id(0);\n\ + int threadIdxY = get_global_id(1);\n\ + int numThreadsX = get_global_size(0);\n\ + int numThreadsY = get_global_size(1);\n\ + if (threadIdxX >= baseWidth || threadIdxY >= baseHeight) \n\ + {\n\ + return;\n\ + }\n\ + %s dataA = read_image%s(InputImage_1, smpImg, (int2)(threadIdxX, threadIdxY)); \n\ + %s dataB = read_image%s(InputImage_1, smpImg, (int2)(threadIdxX, baseHeight-threadIdxY-1)); \n\ + %s dataC = read_image%s(InputImage_2, smpImg, (int2)(threadIdxX, threadIdxY)); \n\ + %s dataD = read_image%s(InputImage_2, smpImg, (int2)(threadIdxX, baseHeight-threadIdxY-1)); \n\ + write_image%s(OutImage_1, (int2)(threadIdxX, baseHeight-threadIdxY-1), dataA);\n\ + write_image%s(OutImage_1, (int2)(threadIdxX, threadIdxY), dataB);\n\ + write_image%s(OutImage_2, (int2)(threadIdxX, baseHeight-threadIdxY-1), dataC);\n\ + write_image%s(OutImage_2, (int2)(threadIdxX, threadIdxY), dataD);\n\ +\n\ +}"; + +const char *kernel_text_numImage_4 = " \ +__constant sampler_t smpImg = CLK_NORMALIZED_COORDS_FALSE|CLK_ADDRESS_NONE|CLK_FILTER_NEAREST;\n\ +__kernel void image2DKernel(read_only image2d_t InputImage_1, write_only image2d_t OutImage_1, read_only image2d_t InputImage_2, write_only image2d_t OutImage_2, read_only image2d_t InputImage_3, write_only image2d_t OutImage_3, read_only image2d_t InputImage_4, write_only image2d_t OutImage_4, int num2DImages, int baseWidth, int baseHeight, int numMipLevels) \n\ +{\n\ + int threadIdxX = get_global_id(0);\n\ + int threadIdxY = get_global_id(1);\n\ + int numThreadsX = get_global_size(0);\n\ + int numThreadsY = get_global_size(1);\n\ + if (threadIdxX >= baseWidth || threadIdxY >= baseHeight) \n\ + {\n\ 
+ return;\n\ + }\n\ + %s dataA = read_image%s(InputImage_1, smpImg, (int2)(threadIdxX, threadIdxY)); \n\ + %s dataB = read_image%s(InputImage_1, smpImg, (int2)(threadIdxX, baseHeight-threadIdxY-1)); \n\ + %s dataC = read_image%s(InputImage_2, smpImg, (int2)(threadIdxX, threadIdxY)); \n\ + %s dataD = read_image%s(InputImage_2, smpImg, (int2)(threadIdxX, baseHeight-threadIdxY-1)); \n\ + %s dataE = read_image%s(InputImage_3, smpImg, (int2)(threadIdxX, threadIdxY)); \n\ + %s dataF = read_image%s(InputImage_3, smpImg, (int2)(threadIdxX, baseHeight-threadIdxY-1)); \n\ + %s dataG = read_image%s(InputImage_4, smpImg, (int2)(threadIdxX, threadIdxY)); \n\ + %s dataH = read_image%s(InputImage_4, smpImg, (int2)(threadIdxX, baseHeight-threadIdxY-1)); \n\ + write_image%s(OutImage_1, (int2)(threadIdxX, baseHeight-threadIdxY-1), dataA);\n\ + write_image%s(OutImage_1, (int2)(threadIdxX, threadIdxY), dataB);\n\ + write_image%s(OutImage_2, (int2)(threadIdxX, baseHeight-threadIdxY-1), dataC);\n\ + write_image%s(OutImage_2, (int2)(threadIdxX, threadIdxY), dataD);\n\ + write_image%s(OutImage_3, (int2)(threadIdxX, baseHeight-threadIdxY-1), dataE);\n\ + write_image%s(OutImage_3, (int2)(threadIdxX, threadIdxY), dataF);\n\ + write_image%s(OutImage_4, (int2)(threadIdxX, baseHeight-threadIdxY-1), dataG);\n\ + write_image%s(OutImage_4, (int2)(threadIdxX, threadIdxY), dataH);\n\ +\n\ +}"; + +const uint32_t num2DImagesList[] = { 1, 2, 4 }; +const uint32_t widthList[] = { 4, 64, 183, 1024 }; +const uint32_t heightList[] = { 4, 64, 365 }; + +const cl_kernel getKernelType(VulkanFormat format, cl_kernel kernel_float, + cl_kernel kernel_signed, + cl_kernel kernel_unsigned) +{ + cl_kernel kernel; + switch (format) + { + case VULKAN_FORMAT_R32G32B32A32_SFLOAT: kernel = kernel_float; break; + + case VULKAN_FORMAT_R32G32B32A32_UINT: kernel = kernel_unsigned; break; + + case VULKAN_FORMAT_R32G32B32A32_SINT: kernel = kernel_signed; break; + + case VULKAN_FORMAT_R16G16B16A16_UINT: kernel = kernel_unsigned; 
break; + + case VULKAN_FORMAT_R16G16B16A16_SINT: kernel = kernel_signed; break; + + case VULKAN_FORMAT_R8G8B8A8_UINT: kernel = kernel_unsigned; break; + + case VULKAN_FORMAT_R8G8B8A8_SINT: kernel = kernel_signed; break; + + case VULKAN_FORMAT_R32G32_SFLOAT: kernel = kernel_float; break; + + case VULKAN_FORMAT_R32G32_UINT: kernel = kernel_unsigned; break; + + case VULKAN_FORMAT_R32G32_SINT: kernel = kernel_signed; break; + + case VULKAN_FORMAT_R16G16_UINT: kernel = kernel_unsigned; break; + + case VULKAN_FORMAT_R16G16_SINT: kernel = kernel_signed; break; + + case VULKAN_FORMAT_R8G8_UINT: kernel = kernel_unsigned; break; + + case VULKAN_FORMAT_R8G8_SINT: kernel = kernel_signed; break; + + case VULKAN_FORMAT_R32_SFLOAT: kernel = kernel_float; break; + + case VULKAN_FORMAT_R32_UINT: kernel = kernel_unsigned; break; + + case VULKAN_FORMAT_R32_SINT: kernel = kernel_signed; break; + + case VULKAN_FORMAT_R16_UINT: kernel = kernel_unsigned; break; + + case VULKAN_FORMAT_R16_SINT: kernel = kernel_signed; break; + + case VULKAN_FORMAT_R8_UINT: kernel = kernel_unsigned; break; + + case VULKAN_FORMAT_R8_SINT: kernel = kernel_signed; break; + + default: + log_error(" Unsupported format"); + ASSERT(0); + break; + } + return kernel; +} + +int run_test_with_two_queue(cl_context &context, cl_command_queue &cmd_queue1, + cl_command_queue &cmd_queue2, + cl_kernel *kernel_unsigned, + cl_kernel *kernel_signed, cl_kernel *kernel_float, + VulkanDevice &vkDevice) +{ + cl_int err = CL_SUCCESS; + size_t origin[3] = { 0, 0, 0 }; + size_t region[3] = { 1, 1, 1 }; + + cl_kernel updateKernelCQ1, updateKernelCQ2; + std::vector<VulkanFormat> vkFormatList = getSupportedVulkanFormatList(); + const std::vector<VulkanExternalMemoryHandleType> + vkExternalMemoryHandleTypeList = + getSupportedVulkanExternalMemoryHandleTypeList(); + char magicValue = 0; + + VulkanBuffer vkParamsBuffer(vkDevice, sizeof(Params)); + VulkanDeviceMemory vkParamsDeviceMemory( + vkDevice, vkParamsBuffer.getSize(), + 
getVulkanMemoryType(vkDevice, + VULKAN_MEMORY_TYPE_PROPERTY_HOST_VISIBLE_COHERENT)); + vkParamsDeviceMemory.bindBuffer(vkParamsBuffer); + + uint64_t maxImage2DSize = + max_width * max_height * MAX_2D_IMAGE_ELEMENT_SIZE * 2; + VulkanBuffer vkSrcBuffer(vkDevice, maxImage2DSize); + VulkanDeviceMemory vkSrcBufferDeviceMemory( + vkDevice, vkSrcBuffer.getSize(), + getVulkanMemoryType(vkDevice, + VULKAN_MEMORY_TYPE_PROPERTY_HOST_VISIBLE_COHERENT)); + vkSrcBufferDeviceMemory.bindBuffer(vkSrcBuffer); + + char *srcBufferPtr, *dstBufferPtr; + srcBufferPtr = (char *)malloc(maxImage2DSize); + dstBufferPtr = (char *)malloc(maxImage2DSize); + + VulkanDescriptorSetLayoutBindingList vkDescriptorSetLayoutBindingList( + VULKAN_DESCRIPTOR_TYPE_STORAGE_BUFFER, 1, + VULKAN_DESCRIPTOR_TYPE_STORAGE_IMAGE, MAX_2D_IMAGE_DESCRIPTORS); + VulkanDescriptorSetLayout vkDescriptorSetLayout( + vkDevice, vkDescriptorSetLayoutBindingList); + VulkanPipelineLayout vkPipelineLayout(vkDevice, vkDescriptorSetLayout); + + VulkanDescriptorPool vkDescriptorPool(vkDevice, + vkDescriptorSetLayoutBindingList); + VulkanDescriptorSet vkDescriptorSet(vkDevice, vkDescriptorPool, + vkDescriptorSetLayout); + + VulkanCommandPool vkCommandPool(vkDevice); + VulkanCommandBuffer vkCopyCommandBuffer(vkDevice, vkCommandPool); + VulkanCommandBuffer vkShaderCommandBuffer(vkDevice, vkCommandPool); + VulkanQueue &vkQueue = vkDevice.getQueue(); + + VulkanExternalSemaphoreHandleType vkExternalSemaphoreHandleType = + getSupportedVulkanExternalSemaphoreHandleTypeList()[0]; + VulkanSemaphore vkVk2CLSemaphore(vkDevice, vkExternalSemaphoreHandleType); + VulkanSemaphore vkCl2VkSemaphore(vkDevice, vkExternalSemaphoreHandleType); + clExternalSemaphore *clVk2CLExternalSemaphore = NULL; + clExternalSemaphore *clCl2VkExternalSemaphore = NULL; + + clVk2CLExternalSemaphore = new clExternalSemaphore( + vkVk2CLSemaphore, context, vkExternalSemaphoreHandleType, deviceId); + clCl2VkExternalSemaphore = new clExternalSemaphore( + vkCl2VkSemaphore, 
context, vkExternalSemaphoreHandleType, deviceId); + + std::vector<VulkanDeviceMemory *> vkNonDedicatedImage2DListDeviceMemory1; + std::vector<VulkanDeviceMemory *> vkNonDedicatedImage2DListDeviceMemory2; + std::vector<clExternalMemoryImage *> nonDedicatedExternalMemory1; + std::vector<clExternalMemoryImage *> nonDedicatedExternalMemory2; + std::vector<char> vkImage2DShader; + + for (size_t fIdx = 0; fIdx < vkFormatList.size(); fIdx++) + { + VulkanFormat vkFormat = vkFormatList[fIdx]; + log_info("Format: %d\n", vkFormat); + uint32_t elementSize = getVulkanFormatElementSize(vkFormat); + ASSERT_LEQ(elementSize, (uint32_t)MAX_2D_IMAGE_ELEMENT_SIZE); + log_info("elementSize= %d\n", elementSize); + + std::string fileName = "image2D_" + + std::string(getVulkanFormatGLSLFormat(vkFormat)) + ".spv"; + log_info("Load %s file", fileName.c_str()); + vkImage2DShader = readFile(fileName); + VulkanShaderModule vkImage2DShaderModule(vkDevice, vkImage2DShader); + + VulkanComputePipeline vkComputePipeline(vkDevice, vkPipelineLayout, + vkImage2DShaderModule); + + for (size_t wIdx = 0; wIdx < ARRAY_SIZE(widthList); wIdx++) + { + uint32_t width = widthList[wIdx]; + log_info("Width: %d\n", width); + if (width > max_width) continue; + region[0] = width; + for (size_t hIdx = 0; hIdx < ARRAY_SIZE(heightList); hIdx++) + { + uint32_t height = heightList[hIdx]; + log_info("Height: %d", height); + if (height > max_height) continue; + region[1] = height; + + uint32_t numMipLevels = 1; + log_info("Number of mipmap levels: %d\n", numMipLevels); + + magicValue++; + char *vkSrcBufferDeviceMemoryPtr = + (char *)vkSrcBufferDeviceMemory.map(); + uint64_t srcBufSize = 0; + memset(vkSrcBufferDeviceMemoryPtr, 0, maxImage2DSize); + memset(srcBufferPtr, 0, maxImage2DSize); + uint32_t mipLevel = 0; + for (uint32_t row = 0; + row < std::max(height >> mipLevel, uint32_t(1)); row++) + { + for (uint32_t col = 0; + col < std::max(width >> mipLevel, uint32_t(1)); col++) + { + for (uint32_t elementByte = 0; + 
elementByte < elementSize; elementByte++) + { + vkSrcBufferDeviceMemoryPtr[srcBufSize] = + (char)(magicValue + mipLevel + row + col); + srcBufferPtr[srcBufSize] = + (char)(magicValue + mipLevel + row + col); + srcBufSize++; + } + } + } + srcBufSize = ROUND_UP( + srcBufSize, + std::max( + elementSize, + (uint32_t)VULKAN_MIN_BUFFER_OFFSET_COPY_ALIGNMENT)); + vkSrcBufferDeviceMemory.unmap(); + + for (size_t niIdx = 0; niIdx < ARRAY_SIZE(num2DImagesList); + niIdx++) + { + uint32_t num2DImages = num2DImagesList[niIdx] + 1; + // added one image for cross-cq case for updateKernelCQ2 + log_info("Number of images: %d\n", num2DImages); + ASSERT_LEQ(num2DImages, (uint32_t)MAX_2D_IMAGES); + uint32_t num_2D_image; + if (useSingleImageKernel) + { + num_2D_image = 1; + } + else + { + num_2D_image = num2DImages; + } + Params *params = (Params *)vkParamsDeviceMemory.map(); + params->numImage2DDescriptors = num_2D_image * numMipLevels; + vkParamsDeviceMemory.unmap(); + vkDescriptorSet.update(0, vkParamsBuffer); + for (size_t emhtIdx = 0; + emhtIdx < vkExternalMemoryHandleTypeList.size(); + emhtIdx++) + { + VulkanExternalMemoryHandleType + vkExternalMemoryHandleType = + vkExternalMemoryHandleTypeList[emhtIdx]; + log_info("External memory handle type: %d \n", + vkExternalMemoryHandleType); + if ((true == disableNTHandleType) + && (VULKAN_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_WIN32_NT + == vkExternalMemoryHandleType)) + { + // Skip running for WIN32 NT handle. 
+ continue; + } + VulkanImage2D vkDummyImage2D( + vkDevice, vkFormatList[0], widthList[0], + heightList[0], 1, vkExternalMemoryHandleType); + const VulkanMemoryTypeList &memoryTypeList = + vkDummyImage2D.getMemoryTypeList(); + + for (size_t mtIdx = 0; mtIdx < memoryTypeList.size(); + mtIdx++) + { + const VulkanMemoryType &memoryType = + memoryTypeList[mtIdx]; + log_info("Memory type index: %d\n", + (uint32_t)memoryType); + log_info("Memory type property: %d\n", + memoryType.getMemoryTypeProperty()); + if (!useDeviceLocal) + { + if (VULKAN_MEMORY_TYPE_PROPERTY_DEVICE_LOCAL + == memoryType.getMemoryTypeProperty()) + { + continue; + } + } + + size_t totalImageMemSize = 0; + uint64_t interImageOffset = 0; + { + VulkanImage2D vkImage2D( + vkDevice, vkFormat, width, height, + numMipLevels, vkExternalMemoryHandleType); + ASSERT_LEQ(vkImage2D.getSize(), maxImage2DSize); + totalImageMemSize = + ROUND_UP(vkImage2D.getSize(), + vkImage2D.getAlignment()); + } + VulkanImage2DList vkNonDedicatedImage2DList( + num2DImages, vkDevice, vkFormat, width, height, + numMipLevels, vkExternalMemoryHandleType); + for (size_t bIdx = 0; bIdx < num2DImages; bIdx++) + { + if (non_dedicated) + { + vkNonDedicatedImage2DListDeviceMemory1 + .push_back(new VulkanDeviceMemory( + vkDevice, totalImageMemSize, + memoryType, + vkExternalMemoryHandleType)); + } + else + { + vkNonDedicatedImage2DListDeviceMemory1 + .push_back(new VulkanDeviceMemory( + vkDevice, + vkNonDedicatedImage2DList[bIdx], + memoryType, + vkExternalMemoryHandleType)); + } + vkNonDedicatedImage2DListDeviceMemory1[bIdx] + ->bindImage(vkNonDedicatedImage2DList[bIdx], + 0); + nonDedicatedExternalMemory1.push_back( + new clExternalMemoryImage( + *vkNonDedicatedImage2DListDeviceMemory1 + [bIdx], + vkExternalMemoryHandleType, context, + totalImageMemSize, width, height, 0, + vkNonDedicatedImage2DList[bIdx], + deviceId)); + } + VulkanImageViewList vkNonDedicatedImage2DViewList( + vkDevice, vkNonDedicatedImage2DList); + VulkanImage2DList 
vkNonDedicatedImage2DList2( + num2DImages, vkDevice, vkFormat, width, height, + numMipLevels, vkExternalMemoryHandleType); + for (size_t bIdx = 0; bIdx < num2DImages; bIdx++) + { + if (non_dedicated) + { + vkNonDedicatedImage2DListDeviceMemory2 + .push_back(new VulkanDeviceMemory( + vkDevice, totalImageMemSize, + memoryType, + vkExternalMemoryHandleType)); + } + else + { + vkNonDedicatedImage2DListDeviceMemory2 + .push_back(new VulkanDeviceMemory( + vkDevice, + vkNonDedicatedImage2DList2[bIdx], + memoryType, + vkExternalMemoryHandleType)); + } + vkNonDedicatedImage2DListDeviceMemory2[bIdx] + ->bindImage( + vkNonDedicatedImage2DList2[bIdx], 0); + nonDedicatedExternalMemory2.push_back( + new clExternalMemoryImage( + *vkNonDedicatedImage2DListDeviceMemory2 + [bIdx], + vkExternalMemoryHandleType, context, + totalImageMemSize, width, height, 0, + vkNonDedicatedImage2DList2[bIdx], + deviceId)); + } + VulkanImageViewList vkDedicatedImage2DViewList( + vkDevice, vkNonDedicatedImage2DList2); + + cl_mem external_mem_image1[5]; + cl_mem external_mem_image2[5]; + for (int i = 0; i < num2DImages; i++) + { + external_mem_image1[i] = + nonDedicatedExternalMemory1[i] + ->getExternalMemoryImage(); + external_mem_image2[i] = + nonDedicatedExternalMemory2[i] + ->getExternalMemoryImage(); + } + VulkanImage2DList &vkImage2DList = + vkNonDedicatedImage2DList; + VulkanImageViewList &vkImage2DViewList = + vkNonDedicatedImage2DViewList; + + clCl2VkExternalSemaphore->signal(cmd_queue1); + if (!useSingleImageKernel) + { + for (size_t i2DIdx = 0; + i2DIdx < vkImage2DList.size(); i2DIdx++) + { + for (uint32_t mipLevel = 0; + mipLevel < numMipLevels; mipLevel++) + { + uint32_t i2DvIdx = + (uint32_t)(i2DIdx * numMipLevels) + + mipLevel; + vkDescriptorSet.update( + 1 + i2DvIdx, + vkImage2DViewList[i2DvIdx]); + } + } + vkCopyCommandBuffer.begin(); + vkCopyCommandBuffer.pipelineBarrier( + vkImage2DList, + VULKAN_IMAGE_LAYOUT_UNDEFINED, + VULKAN_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL); + for (size_t 
i2DIdx = 0; + i2DIdx < vkImage2DList.size(); i2DIdx++) + { + vkCopyCommandBuffer.copyBufferToImage( + vkSrcBuffer, vkImage2DList[i2DIdx], + VULKAN_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL); + } + vkCopyCommandBuffer.pipelineBarrier( + vkImage2DList, + VULKAN_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL, + VULKAN_IMAGE_LAYOUT_GENERAL); + vkCopyCommandBuffer.end(); + memset(dstBufferPtr, 0, srcBufSize); + vkQueue.submit(vkCopyCommandBuffer); + vkShaderCommandBuffer.begin(); + vkShaderCommandBuffer.bindPipeline( + vkComputePipeline); + vkShaderCommandBuffer.bindDescriptorSets( + vkComputePipeline, vkPipelineLayout, + vkDescriptorSet); + vkShaderCommandBuffer.dispatch( + NUM_BLOCKS(width, NUM_THREADS_PER_GROUP_X), + NUM_BLOCKS(height, + NUM_THREADS_PER_GROUP_Y / 2), + 1); + vkShaderCommandBuffer.end(); + } + for (uint32_t iter = 0; iter < innerIterations; + iter++) + { + if (useSingleImageKernel) + { + for (size_t i2DIdx = 0; + i2DIdx < vkImage2DList.size(); + i2DIdx++) + { + vkDescriptorSet.update( + 1, vkImage2DViewList[i2DIdx]); + vkCopyCommandBuffer.begin(); + vkCopyCommandBuffer.pipelineBarrier( + vkImage2DList, + VULKAN_IMAGE_LAYOUT_UNDEFINED, + VULKAN_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL); + + vkCopyCommandBuffer.copyBufferToImage( + vkSrcBuffer, vkImage2DList[i2DIdx], + VULKAN_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL); + vkCopyCommandBuffer.pipelineBarrier( + vkImage2DList, + VULKAN_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL, + VULKAN_IMAGE_LAYOUT_GENERAL); + vkCopyCommandBuffer.end(); + memset(dstBufferPtr, 0, srcBufSize); + vkQueue.submit(vkCopyCommandBuffer); + vkShaderCommandBuffer.begin(); + vkShaderCommandBuffer.bindPipeline( + vkComputePipeline); + vkShaderCommandBuffer + .bindDescriptorSets( + vkComputePipeline, + vkPipelineLayout, + vkDescriptorSet); + vkShaderCommandBuffer.dispatch( + NUM_BLOCKS(width, + NUM_THREADS_PER_GROUP_X), + NUM_BLOCKS(height, + NUM_THREADS_PER_GROUP_Y + / 2), + 1); + vkShaderCommandBuffer.end(); + if (i2DIdx < vkImage2DList.size() - 1) + { + vkQueue.submit( + 
vkShaderCommandBuffer); + } + } + } + vkQueue.submit(vkCl2VkSemaphore, + vkShaderCommandBuffer, + vkVk2CLSemaphore); + clVk2CLExternalSemaphore->wait(cmd_queue1); + switch (num2DImages) + { + case 2: + updateKernelCQ1 = getKernelType( + vkFormat, kernel_float[0], + kernel_signed[0], + kernel_unsigned[0]); + break; + case 3: + updateKernelCQ1 = getKernelType( + vkFormat, kernel_float[1], + kernel_signed[1], + kernel_unsigned[1]); + break; + case 5: + updateKernelCQ1 = getKernelType( + vkFormat, kernel_float[2], + kernel_signed[2], + kernel_unsigned[2]); + break; + } + updateKernelCQ2 = getKernelType( + vkFormat, kernel_float[3], kernel_signed[3], + kernel_unsigned[3]); + // similar kernel-type based on vkFormat + int j = 0; + // Setting arguments of updateKernelCQ2 + + err = clSetKernelArg(updateKernelCQ2, 0, + sizeof(cl_mem), + &external_mem_image1[0]); + err |= clSetKernelArg(updateKernelCQ2, 1, + sizeof(cl_mem), + &external_mem_image2[0]); + err |= clSetKernelArg( + updateKernelCQ2, 2, sizeof(cl_mem), + &external_mem_image1[num2DImages - 1]); + err |= clSetKernelArg( + updateKernelCQ2, 3, sizeof(cl_mem), + &external_mem_image2[num2DImages - 1]); + err |= clSetKernelArg(updateKernelCQ2, 4, + sizeof(unsigned int), + &num2DImages); + err |= clSetKernelArg(updateKernelCQ2, 5, + sizeof(unsigned int), + &width); + err |= clSetKernelArg(updateKernelCQ2, 6, + sizeof(unsigned int), + &height); + err |= clSetKernelArg(updateKernelCQ2, 7, + sizeof(unsigned int), + &numMipLevels); + for (int i = 0; i < num2DImages - 1; i++, ++j) + { + err = clSetKernelArg( + updateKernelCQ1, j, sizeof(cl_mem), + &external_mem_image1[i]); + err |= clSetKernelArg( + updateKernelCQ1, ++j, sizeof(cl_mem), + &external_mem_image2[i]); + } + err |= clSetKernelArg(updateKernelCQ1, j, + sizeof(unsigned int), + &num2DImages); + err |= clSetKernelArg(updateKernelCQ1, ++j, + sizeof(unsigned int), + &width); + err |= clSetKernelArg(updateKernelCQ1, ++j, + sizeof(unsigned int), + &height); + err |= 
clSetKernelArg(updateKernelCQ1, ++j, + sizeof(unsigned int), + &numMipLevels); + + if (err != CL_SUCCESS) + { + print_error( + err, + "Error: Failed to set arg values \n"); + goto CLEANUP; + } + // clVk2CLExternalSemaphore->wait(cmd_queue1); + size_t global_work_size[3] = { width, height, + 1 }; + cl_event first_launch; + err = clEnqueueNDRangeKernel( + cmd_queue1, updateKernelCQ1, 2, NULL, + global_work_size, NULL, 0, NULL, + &first_launch); + if (err != CL_SUCCESS) + { + goto CLEANUP; + } + err = clEnqueueNDRangeKernel( + cmd_queue2, updateKernelCQ2, 2, NULL, + global_work_size, NULL, 1, &first_launch, + NULL); + if (err != CL_SUCCESS) + { + goto CLEANUP; + } + + clFinish(cmd_queue2); + clCl2VkExternalSemaphore->signal(cmd_queue2); + } + + unsigned int flags = 0; + size_t mipmapLevelOffset = 0; + cl_event eventReadImage = NULL; + clFinish(cmd_queue2); + for (int i = 0; i < num2DImages; i++) + { + err = clEnqueueReadImage( + cmd_queue1, external_mem_image2[i], CL_TRUE, + origin, region, 0, 0, dstBufferPtr, 0, NULL, + &eventReadImage); + + if (err != CL_SUCCESS) + { + print_error(err, + "clEnqueueReadImage failed with" + "error\n"); + } + + if (memcmp(srcBufferPtr, dstBufferPtr, + srcBufSize)) + { + log_info("Source and destination buffers " + "don't match\n"); + if (debug_trace) + { + log_info("Source buffer contents: \n"); + for (uint64_t sIdx = 0; + sIdx < srcBufSize; sIdx++) + { + log_info( + "%d ", + (int)vkSrcBufferDeviceMemoryPtr + [sIdx]); + } + log_info("Destination buffer contents:" + "\n"); + for (uint64_t dIdx = 0; + dIdx < srcBufSize; dIdx++) + { + log_info("%d ", + (int)dstBufferPtr[dIdx]); + } + } + err = -1; + break; + } + } + for (int i = 0; i < num2DImages; i++) + { + delete vkNonDedicatedImage2DListDeviceMemory1 + [i]; + delete vkNonDedicatedImage2DListDeviceMemory2 + [i]; + delete nonDedicatedExternalMemory1[i]; + delete nonDedicatedExternalMemory2[i]; + } + vkNonDedicatedImage2DListDeviceMemory1.erase( + 
vkNonDedicatedImage2DListDeviceMemory1.begin(), + vkNonDedicatedImage2DListDeviceMemory1.begin() + + num2DImages); + vkNonDedicatedImage2DListDeviceMemory2.erase( + vkNonDedicatedImage2DListDeviceMemory2.begin(), + vkNonDedicatedImage2DListDeviceMemory2.begin() + + num2DImages); + nonDedicatedExternalMemory1.erase( + nonDedicatedExternalMemory1.begin(), + nonDedicatedExternalMemory1.begin() + + num2DImages); + nonDedicatedExternalMemory2.erase( + nonDedicatedExternalMemory2.begin(), + nonDedicatedExternalMemory2.begin() + + num2DImages); + if (CL_SUCCESS != err) + { + goto CLEANUP; + } + } + } + } + } + } + + vkImage2DShader.clear(); + } +CLEANUP: + if (clVk2CLExternalSemaphore) delete clVk2CLExternalSemaphore; + if (clCl2VkExternalSemaphore) delete clCl2VkExternalSemaphore; + + if (srcBufferPtr) free(srcBufferPtr); + if (dstBufferPtr) free(dstBufferPtr); + return err; +} + +int run_test_with_one_queue(cl_context &context, cl_command_queue &cmd_queue1, + cl_kernel *kernel_unsigned, + cl_kernel *kernel_signed, cl_kernel *kernel_float, + VulkanDevice &vkDevice) +{ + cl_int err = CL_SUCCESS; + size_t origin[3] = { 0, 0, 0 }; + size_t region[3] = { 1, 1, 1 }; + cl_kernel updateKernelCQ1; + std::vector<VulkanFormat> vkFormatList = getSupportedVulkanFormatList(); + const std::vector<VulkanExternalMemoryHandleType> + vkExternalMemoryHandleTypeList = + getSupportedVulkanExternalMemoryHandleTypeList(); + char magicValue = 0; + + VulkanBuffer vkParamsBuffer(vkDevice, sizeof(Params)); + VulkanDeviceMemory vkParamsDeviceMemory( + vkDevice, vkParamsBuffer.getSize(), + getVulkanMemoryType(vkDevice, + VULKAN_MEMORY_TYPE_PROPERTY_HOST_VISIBLE_COHERENT)); + vkParamsDeviceMemory.bindBuffer(vkParamsBuffer); + + uint64_t maxImage2DSize = + max_width * max_height * MAX_2D_IMAGE_ELEMENT_SIZE * 2; + VulkanBuffer vkSrcBuffer(vkDevice, maxImage2DSize); + VulkanDeviceMemory vkSrcBufferDeviceMemory( + vkDevice, vkSrcBuffer.getSize(), + getVulkanMemoryType(vkDevice, + 
VULKAN_MEMORY_TYPE_PROPERTY_HOST_VISIBLE_COHERENT)); + vkSrcBufferDeviceMemory.bindBuffer(vkSrcBuffer); + + char *srcBufferPtr, *dstBufferPtr; + srcBufferPtr = (char *)malloc(maxImage2DSize); + dstBufferPtr = (char *)malloc(maxImage2DSize); + + VulkanDescriptorSetLayoutBindingList vkDescriptorSetLayoutBindingList( + VULKAN_DESCRIPTOR_TYPE_STORAGE_BUFFER, 1, + VULKAN_DESCRIPTOR_TYPE_STORAGE_IMAGE, MAX_2D_IMAGE_DESCRIPTORS); + VulkanDescriptorSetLayout vkDescriptorSetLayout( + vkDevice, vkDescriptorSetLayoutBindingList); + VulkanPipelineLayout vkPipelineLayout(vkDevice, vkDescriptorSetLayout); + + VulkanDescriptorPool vkDescriptorPool(vkDevice, + vkDescriptorSetLayoutBindingList); + VulkanDescriptorSet vkDescriptorSet(vkDevice, vkDescriptorPool, + vkDescriptorSetLayout); + + VulkanCommandPool vkCommandPool(vkDevice); + VulkanCommandBuffer vkCopyCommandBuffer(vkDevice, vkCommandPool); + VulkanCommandBuffer vkShaderCommandBuffer(vkDevice, vkCommandPool); + VulkanQueue &vkQueue = vkDevice.getQueue(); + + VulkanExternalSemaphoreHandleType vkExternalSemaphoreHandleType = + getSupportedVulkanExternalSemaphoreHandleTypeList()[0]; + VulkanSemaphore vkVk2CLSemaphore(vkDevice, vkExternalSemaphoreHandleType); + VulkanSemaphore vkCl2VkSemaphore(vkDevice, vkExternalSemaphoreHandleType); + clExternalSemaphore *clVk2CLExternalSemaphore = NULL; + clExternalSemaphore *clCl2VkExternalSemaphore = NULL; + + clVk2CLExternalSemaphore = new clExternalSemaphore( + vkVk2CLSemaphore, context, vkExternalSemaphoreHandleType, deviceId); + clCl2VkExternalSemaphore = new clExternalSemaphore( + vkCl2VkSemaphore, context, vkExternalSemaphoreHandleType, deviceId); + + std::vector<VulkanDeviceMemory *> vkNonDedicatedImage2DListDeviceMemory1; + std::vector<VulkanDeviceMemory *> vkNonDedicatedImage2DListDeviceMemory2; + std::vector<clExternalMemoryImage *> nonDedicatedExternalMemory1; + std::vector<clExternalMemoryImage *> nonDedicatedExternalMemory2; + std::vector<char> vkImage2DShader; + + for (size_t 
fIdx = 0; fIdx < vkFormatList.size(); fIdx++) + { + VulkanFormat vkFormat = vkFormatList[fIdx]; + log_info("Format: %d\n", vkFormat); + uint32_t elementSize = getVulkanFormatElementSize(vkFormat); + ASSERT_LEQ(elementSize, (uint32_t)MAX_2D_IMAGE_ELEMENT_SIZE); + log_info("elementSize= %d\n", elementSize); + + std::string fileName = "image2D_" + + std::string(getVulkanFormatGLSLFormat(vkFormat)) + ".spv"; + log_info("Load %s file", fileName.c_str()); + vkImage2DShader = readFile(fileName); + VulkanShaderModule vkImage2DShaderModule(vkDevice, vkImage2DShader); + + VulkanComputePipeline vkComputePipeline(vkDevice, vkPipelineLayout, + vkImage2DShaderModule); + + for (size_t wIdx = 0; wIdx < ARRAY_SIZE(widthList); wIdx++) + { + uint32_t width = widthList[wIdx]; + log_info("Width: %d\n", width); + if (width > max_width) continue; + region[0] = width; + for (size_t hIdx = 0; hIdx < ARRAY_SIZE(heightList); hIdx++) + { + uint32_t height = heightList[hIdx]; + log_info("Height: %d\n", height); + if (height > max_height) continue; + region[1] = height; + + uint32_t numMipLevels = 1; + log_info("Number of mipmap levels: %d\n", numMipLevels); + + magicValue++; + char *vkSrcBufferDeviceMemoryPtr = + (char *)vkSrcBufferDeviceMemory.map(); + uint64_t srcBufSize = 0; + memset(vkSrcBufferDeviceMemoryPtr, 0, maxImage2DSize); + memset(srcBufferPtr, 0, maxImage2DSize); + uint32_t mipLevel = 0; + for (uint32_t row = 0; + row < std::max(height >> mipLevel, uint32_t(1)); row++) + { + for (uint32_t col = 0; + col < std::max(width >> mipLevel, uint32_t(1)); col++) + { + for (uint32_t elementByte = 0; + elementByte < elementSize; elementByte++) + { + vkSrcBufferDeviceMemoryPtr[srcBufSize] = + (char)(magicValue + mipLevel + row + col); + srcBufferPtr[srcBufSize] = + (char)(magicValue + mipLevel + row + col); + srcBufSize++; + } + } + } + srcBufSize = ROUND_UP( + srcBufSize, + std::max( + elementSize, + (uint32_t)VULKAN_MIN_BUFFER_OFFSET_COPY_ALIGNMENT)); + vkSrcBufferDeviceMemory.unmap(); + + 
for (size_t niIdx = 0; niIdx < ARRAY_SIZE(num2DImagesList); + niIdx++) + { + uint32_t num2DImages = num2DImagesList[niIdx]; + log_info("Number of images: %d\n", num2DImages); + ASSERT_LEQ(num2DImages, (uint32_t)MAX_2D_IMAGES); + + Params *params = (Params *)vkParamsDeviceMemory.map(); + uint32_t num_2D_image; + if (useSingleImageKernel) + { + num_2D_image = 1; + } + else + { + num_2D_image = num2DImages; + } + params->numImage2DDescriptors = num_2D_image * numMipLevels; + vkParamsDeviceMemory.unmap(); + vkDescriptorSet.update(0, vkParamsBuffer); + for (size_t emhtIdx = 0; + emhtIdx < vkExternalMemoryHandleTypeList.size(); + emhtIdx++) + { + VulkanExternalMemoryHandleType + vkExternalMemoryHandleType = + vkExternalMemoryHandleTypeList[emhtIdx]; + log_info("External memory handle type: %d \n", + vkExternalMemoryHandleType); + if ((true == disableNTHandleType) + && (VULKAN_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_WIN32_NT + == vkExternalMemoryHandleType)) + { + // Skip running for WIN32 NT handle. 
+ continue; + } + VulkanImage2D vkDummyImage2D( + vkDevice, vkFormatList[0], widthList[0], + heightList[0], 1, vkExternalMemoryHandleType); + const VulkanMemoryTypeList &memoryTypeList = + vkDummyImage2D.getMemoryTypeList(); + + for (size_t mtIdx = 0; mtIdx < memoryTypeList.size(); + mtIdx++) + { + const VulkanMemoryType &memoryType = + memoryTypeList[mtIdx]; + log_info("Memory type index: %d\n", + (uint32_t)memoryType); + log_info("Memory type property: %d\n", + memoryType.getMemoryTypeProperty()); + if (!useDeviceLocal) + { + if (VULKAN_MEMORY_TYPE_PROPERTY_DEVICE_LOCAL + == memoryType.getMemoryTypeProperty()) + { + continue; + } + } + size_t totalImageMemSize = 0; + uint64_t interImageOffset = 0; + { + VulkanImage2D vkImage2D( + vkDevice, vkFormat, width, height, + numMipLevels, vkExternalMemoryHandleType); + ASSERT_LEQ(vkImage2D.getSize(), maxImage2DSize); + totalImageMemSize = + ROUND_UP(vkImage2D.getSize(), + vkImage2D.getAlignment()); + } + VulkanImage2DList vkNonDedicatedImage2DList( + num2DImages, vkDevice, vkFormat, width, height, + numMipLevels, vkExternalMemoryHandleType); + for (size_t bIdx = 0; + bIdx < vkNonDedicatedImage2DList.size(); + bIdx++) + { + // Create list of Vulkan device memories and + // bind the list of Vulkan images. 
+ vkNonDedicatedImage2DListDeviceMemory1 + .push_back(new VulkanDeviceMemory( + vkDevice, totalImageMemSize, memoryType, + vkExternalMemoryHandleType)); + vkNonDedicatedImage2DListDeviceMemory1[bIdx] + ->bindImage(vkNonDedicatedImage2DList[bIdx], + 0); + nonDedicatedExternalMemory1.push_back( + new clExternalMemoryImage( + *vkNonDedicatedImage2DListDeviceMemory1 + [bIdx], + vkExternalMemoryHandleType, context, + totalImageMemSize, width, height, 0, + vkNonDedicatedImage2DList[bIdx], + deviceId)); + } + VulkanImageViewList vkNonDedicatedImage2DViewList( + vkDevice, vkNonDedicatedImage2DList); + + VulkanImage2DList vkNonDedicatedImage2DList2( + num2DImages, vkDevice, vkFormat, width, height, + numMipLevels, vkExternalMemoryHandleType); + for (size_t bIdx = 0; + bIdx < vkNonDedicatedImage2DList2.size(); + bIdx++) + { + vkNonDedicatedImage2DListDeviceMemory2 + .push_back(new VulkanDeviceMemory( + vkDevice, totalImageMemSize, memoryType, + vkExternalMemoryHandleType)); + vkNonDedicatedImage2DListDeviceMemory2[bIdx] + ->bindImage( + vkNonDedicatedImage2DList2[bIdx], 0); + nonDedicatedExternalMemory2.push_back( + new clExternalMemoryImage( + *vkNonDedicatedImage2DListDeviceMemory2 + [bIdx], + vkExternalMemoryHandleType, context, + totalImageMemSize, width, height, 0, + vkNonDedicatedImage2DList2[bIdx], + deviceId)); + } + VulkanImageViewList vkDedicatedImage2DViewList( + vkDevice, vkNonDedicatedImage2DList2); + cl_mem external_mem_image1[4]; + cl_mem external_mem_image2[4]; + for (int i = 0; i < num2DImages; i++) + { + external_mem_image1[i] = + nonDedicatedExternalMemory1[i] + ->getExternalMemoryImage(); + external_mem_image2[i] = + nonDedicatedExternalMemory2[i] + ->getExternalMemoryImage(); + } + VulkanImage2DList &vkImage2DList = + vkNonDedicatedImage2DList; + VulkanImageViewList &vkImage2DViewList = + vkNonDedicatedImage2DViewList; + + clCl2VkExternalSemaphore->signal(cmd_queue1); + if (!useSingleImageKernel) + { + for (size_t i2DIdx = 0; + i2DIdx < 
vkImage2DList.size(); i2DIdx++) + { + for (uint32_t mipLevel = 0; + mipLevel < numMipLevels; mipLevel++) + { + uint32_t i2DvIdx = + (uint32_t)(i2DIdx * numMipLevels) + + mipLevel; + vkDescriptorSet.update( + 1 + i2DvIdx, + vkImage2DViewList[i2DvIdx]); + } + } + vkCopyCommandBuffer.begin(); + vkCopyCommandBuffer.pipelineBarrier( + vkImage2DList, + VULKAN_IMAGE_LAYOUT_UNDEFINED, + VULKAN_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL); + for (size_t i2DIdx = 0; + i2DIdx < vkImage2DList.size(); i2DIdx++) + { + vkCopyCommandBuffer.copyBufferToImage( + vkSrcBuffer, vkImage2DList[i2DIdx], + VULKAN_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL); + } + vkCopyCommandBuffer.pipelineBarrier( + vkImage2DList, + VULKAN_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL, + VULKAN_IMAGE_LAYOUT_GENERAL); + vkCopyCommandBuffer.end(); + memset(dstBufferPtr, 0, srcBufSize); + vkQueue.submit(vkCopyCommandBuffer); + vkShaderCommandBuffer.begin(); + vkShaderCommandBuffer.bindPipeline( + vkComputePipeline); + vkShaderCommandBuffer.bindDescriptorSets( + vkComputePipeline, vkPipelineLayout, + vkDescriptorSet); + vkShaderCommandBuffer.dispatch( + NUM_BLOCKS(width, NUM_THREADS_PER_GROUP_X), + NUM_BLOCKS(height, + NUM_THREADS_PER_GROUP_Y / 2), + 1); + vkShaderCommandBuffer.end(); + } + for (uint32_t iter = 0; iter < innerIterations; + iter++) + { + if (useSingleImageKernel) + { + for (size_t i2DIdx = 0; + i2DIdx < vkImage2DList.size(); + i2DIdx++) + { + vkDescriptorSet.update( + 1, vkImage2DViewList[i2DIdx]); + vkCopyCommandBuffer.begin(); + vkCopyCommandBuffer.pipelineBarrier( + vkImage2DList, + VULKAN_IMAGE_LAYOUT_UNDEFINED, + VULKAN_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL); + + vkCopyCommandBuffer.copyBufferToImage( + vkSrcBuffer, vkImage2DList[i2DIdx], + VULKAN_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL); + vkCopyCommandBuffer.pipelineBarrier( + vkImage2DList, + VULKAN_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL, + VULKAN_IMAGE_LAYOUT_GENERAL); + vkCopyCommandBuffer.end(); + memset(dstBufferPtr, 0, srcBufSize); + vkQueue.submit(vkCopyCommandBuffer); + 
vkShaderCommandBuffer.begin(); + vkShaderCommandBuffer.bindPipeline( + vkComputePipeline); + vkShaderCommandBuffer + .bindDescriptorSets( + vkComputePipeline, + vkPipelineLayout, + vkDescriptorSet); + vkShaderCommandBuffer.dispatch( + NUM_BLOCKS(width, + NUM_THREADS_PER_GROUP_X), + NUM_BLOCKS(height, + NUM_THREADS_PER_GROUP_Y + / 2), + 1); + vkShaderCommandBuffer.end(); + if (i2DIdx < vkImage2DList.size() - 1) + { + vkQueue.submit( + vkShaderCommandBuffer); + } + } + } + vkQueue.submit(vkCl2VkSemaphore, + vkShaderCommandBuffer, + vkVk2CLSemaphore); + clVk2CLExternalSemaphore->wait(cmd_queue1); + switch (num2DImages) + { + case 1: + updateKernelCQ1 = getKernelType( + vkFormat, kernel_float[0], + kernel_signed[0], + kernel_unsigned[0]); + break; + case 2: + updateKernelCQ1 = getKernelType( + vkFormat, kernel_float[1], + kernel_signed[1], + kernel_unsigned[1]); + break; + case 4: + updateKernelCQ1 = getKernelType( + vkFormat, kernel_float[2], + kernel_signed[2], + kernel_unsigned[2]); + break; + } + int j = 0; + for (int i = 0; i < num2DImages; i++, ++j) + { + err = clSetKernelArg( + updateKernelCQ1, j, sizeof(cl_mem), + &external_mem_image1[i]); + err |= clSetKernelArg( + updateKernelCQ1, ++j, sizeof(cl_mem), + &external_mem_image2[i]); + } + err |= clSetKernelArg(updateKernelCQ1, j, + sizeof(unsigned int), + &num2DImages); + err |= clSetKernelArg(updateKernelCQ1, ++j, + sizeof(unsigned int), + &width); + err |= clSetKernelArg(updateKernelCQ1, ++j, + sizeof(unsigned int), + &height); + err |= clSetKernelArg(updateKernelCQ1, ++j, + sizeof(unsigned int), + &numMipLevels); + + if (err != CL_SUCCESS) + { + print_error(err, + "Error: Failed to set arg " + "values for kernel-1\n"); + goto CLEANUP; + } + + size_t global_work_size[3] = { width, height, + 1 }; + err = clEnqueueNDRangeKernel( + cmd_queue1, updateKernelCQ1, 2, NULL, + global_work_size, NULL, 0, NULL, NULL); + if (err != CL_SUCCESS) + { + goto CLEANUP; + } + clCl2VkExternalSemaphore->signal(cmd_queue1); + } + + 
unsigned int flags = 0; + size_t mipmapLevelOffset = 0; + cl_event eventReadImage = NULL; + for (int i = 0; i < num2DImages; i++) + { + err = clEnqueueReadImage( + cmd_queue1, external_mem_image2[i], CL_TRUE, + origin, region, 0, 0, dstBufferPtr, 0, NULL, + &eventReadImage); + + if (err != CL_SUCCESS) + { + print_error(err, + "clEnqueueReadImage failed with" + "error\n"); + } + + if (memcmp(srcBufferPtr, dstBufferPtr, + srcBufSize)) + { + log_info("Source and destination buffers " + "don't match\n"); + if (debug_trace) + { + log_info("Source buffer contents: \n"); + for (uint64_t sIdx = 0; + sIdx < srcBufSize; sIdx++) + { + log_info( + "%d", + (int)vkSrcBufferDeviceMemoryPtr + [sIdx]); + } + log_info( + "Destination buffer contents:"); + for (uint64_t dIdx = 0; + dIdx < srcBufSize; dIdx++) + { + log_info("%d", + (int)dstBufferPtr[dIdx]); + } + } + err = -1; + break; + } + } + for (int i = 0; i < num2DImages; i++) + { + delete vkNonDedicatedImage2DListDeviceMemory1 + [i]; + delete vkNonDedicatedImage2DListDeviceMemory2 + [i]; + delete nonDedicatedExternalMemory1[i]; + delete nonDedicatedExternalMemory2[i]; + } + vkNonDedicatedImage2DListDeviceMemory1.erase( + vkNonDedicatedImage2DListDeviceMemory1.begin(), + vkNonDedicatedImage2DListDeviceMemory1.begin() + + num2DImages); + vkNonDedicatedImage2DListDeviceMemory2.erase( + vkNonDedicatedImage2DListDeviceMemory2.begin(), + vkNonDedicatedImage2DListDeviceMemory2.begin() + + num2DImages); + nonDedicatedExternalMemory1.erase( + nonDedicatedExternalMemory1.begin(), + nonDedicatedExternalMemory1.begin() + + num2DImages); + nonDedicatedExternalMemory2.erase( + nonDedicatedExternalMemory2.begin(), + nonDedicatedExternalMemory2.begin() + + num2DImages); + if (CL_SUCCESS != err) + { + goto CLEANUP; + } + } + } + } + } + } + vkImage2DShader.clear(); + } +CLEANUP: + if (clVk2CLExternalSemaphore) delete clVk2CLExternalSemaphore; + if (clCl2VkExternalSemaphore) delete clCl2VkExternalSemaphore; + + if (srcBufferPtr) 
free(srcBufferPtr); + if (dstBufferPtr) free(dstBufferPtr); + return err; +} + +int test_image_common(cl_device_id device_, cl_context context_, + cl_command_queue queue_, int numElements_) +{ + int current_device = 0; + int device_count = 0; + int devices_prohibited = 0; + cl_int err = CL_SUCCESS; + cl_platform_id platform = NULL; + size_t extensionSize = 0; + cl_uint num_devices = 0; + cl_uint device_no = 0; + cl_device_id *devices; + char *extensions = NULL; + const char *program_source_const; + cl_command_queue cmd_queue1 = NULL; + cl_command_queue cmd_queue2 = NULL; + cl_context context = NULL; + const uint32_t num_kernels = ARRAY_SIZE(num2DImagesList) + 1; + // One kernel for Cross-CQ case + const uint32_t num_kernel_types = 3; + const char *kernel_source[num_kernels] = { kernel_text_numImage_1, + kernel_text_numImage_2, + kernel_text_numImage_4 }; + char source_1[4096]; + char source_2[4096]; + char source_3[4096]; + size_t program_source_length; + cl_program program[num_kernel_types]; + cl_kernel kernel_float[num_kernels] = { NULL, NULL, NULL, NULL }; + cl_kernel kernel_signed[num_kernels] = { NULL, NULL, NULL, NULL }; + cl_kernel kernel_unsigned[num_kernels] = { NULL, NULL, NULL, NULL }; + cl_mem external_mem_image1; + cl_mem external_mem_image2; + + VulkanDevice vkDevice; + + cl_context_properties contextProperties[] = { CL_CONTEXT_PLATFORM, 0, 0 }; + // get the platform ID + err = clGetPlatformIDs(1, &platform, NULL); + if (err != CL_SUCCESS) + { + print_error(err, "Error: Failed to get platform\n"); + goto CLEANUP; + } + + err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 0, NULL, &num_devices); + if (CL_SUCCESS != err) + { + print_error(err, "clGetDeviceIDs failed in returning no. 
of devices\n"); + goto CLEANUP; + } + devices = (cl_device_id *)malloc(num_devices * sizeof(cl_device_id)); + if (NULL == devices) + { + err = CL_OUT_OF_HOST_MEMORY; + print_error(err, "Unable to allocate memory for devices\n"); + goto CLEANUP; + } + err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, num_devices, devices, + NULL); + if (CL_SUCCESS != err) + { + print_error(err, "Failed to get deviceID.\n"); + goto CLEANUP; + } + contextProperties[1] = (cl_context_properties)platform; + log_info("Assigned contextproperties for platform\n"); + for (device_no = 0; device_no < num_devices; device_no++) + { + err = clGetDeviceInfo(devices[device_no], CL_DEVICE_EXTENSIONS, 0, NULL, + &extensionSize); + if (CL_SUCCESS != err) + { + print_error( + err, + "Error in clGetDeviceInfo for getting device_extension size\n"); + goto CLEANUP; + } + extensions = (char *)malloc(extensionSize); + if (NULL == extensions) + { + err = CL_OUT_OF_HOST_MEMORY; + print_error(err, "Unable to allocate memory for extensions\n"); + goto CLEANUP; + } + err = clGetDeviceInfo(devices[device_no], CL_DEVICE_EXTENSIONS, + extensionSize, extensions, NULL); + if (CL_SUCCESS != err) + { + print_error( + err, "Error in clGetDeviceInfo for getting device_extension\n"); + goto CLEANUP; + } + err = clGetDeviceInfo(devices[device_no], CL_DEVICE_UUID_KHR, + CL_UUID_SIZE_KHR, uuid, &extensionSize); + if (CL_SUCCESS != err) + { + print_error(err, "clGetDeviceInfo failed with error"); + goto CLEANUP; + } + err = + memcmp(uuid, vkDevice.getPhysicalDevice().getUUID(), VK_UUID_SIZE); + if (err == 0) + { + break; + } + } + if (device_no >= num_devices) + { + err = EXIT_FAILURE; + print_error(err, + "OpenCL error:" + "No Vulkan-OpenCL Interop capable GPU found.\n"); + goto CLEANUP; + } + deviceId = devices[device_no]; + err = setMaxImageDimensions(deviceId, max_width, max_height); + if (CL_SUCCESS != err) + { + print_error(err, "error setting max image dimensions"); + goto CLEANUP; + } + log_info("Set max_width to %lu 
and max_height to %lu\n", max_width, + max_height); + context = clCreateContextFromType(contextProperties, CL_DEVICE_TYPE_GPU, + NULL, NULL, &err); + if (CL_SUCCESS != err) + { + print_error(err, "error creating context"); + goto CLEANUP; + } + log_info("Successfully created context !!!\n"); + + cmd_queue1 = clCreateCommandQueue(context, devices[device_no], 0, &err); + if (CL_SUCCESS != err) + { + err = CL_INVALID_COMMAND_QUEUE; + print_error(err, "Error: Failed to create command queue!\n"); + goto CLEANUP; + } + log_info("clCreateCommandQueue successfull \n"); + + cmd_queue2 = clCreateCommandQueue(context, devices[device_no], 0, &err); + if (CL_SUCCESS != err) + { + err = CL_INVALID_COMMAND_QUEUE; + print_error(err, "Error: Failed to create command queue!\n"); + goto CLEANUP; + } + log_info("clCreateCommandQueue2 successful \n"); + + for (int i = 0; i < num_kernels; i++) + { + switch (i) + { + case 0: + sprintf(source_1, kernel_source[i], "float4", "f", "float4", + "f", "f", "f"); + sprintf(source_2, kernel_source[i], "int4", "i", "int4", "i", + "i", "i"); + sprintf(source_3, kernel_source[i], "uint4", "ui", "uint4", + "ui", "ui", "ui"); + break; + case 1: + sprintf(source_1, kernel_source[i], "float4", "f", "float4", + "f", "float4", "f", "float4", "f", "f", "f", "f", "f"); + sprintf(source_2, kernel_source[i], "int4", "i", "int4", "i", + "int4", "i", "int4", "i", "i", "i", "i", "i"); + sprintf(source_3, kernel_source[i], "uint4", "ui", "uint4", + "ui", "uint4", "ui", "uint4", "ui", "ui", "ui", "ui", + "ui"); + break; + case 2: + sprintf(source_1, kernel_source[i], "float4", "f", "float4", + "f", "float4", "f", "float4", "f", "float4", "f", + "float4", "f", "float4", "f", "float4", "f", "f", "f", + "f", "f", "f", "f", "f", "f"); + sprintf(source_2, kernel_source[i], "int4", "i", "int4", "i", + "int4", "i", "int4", "i", "int4", "i", "int4", "i", + "int4", "i", "int4", "i", "i", "i", "i", "i", "i", "i", + "i", "i"); + sprintf(source_3, kernel_source[i], "uint4", 
"ui", "uint4", + "ui", "uint4", "ui", "uint4", "ui", "uint4", "ui", + "uint4", "ui", "uint4", "ui", "uint4", "ui", "ui", "ui", + "ui", "ui", "ui", "ui", "ui", "ui"); + break; + case 3: + // Addtional case for creating updateKernelCQ2 which takes two + // images + sprintf(source_1, kernel_source[1], "float4", "f", "float4", + "f", "float4", "f", "float4", "f", "f", "f", "f", "f"); + sprintf(source_2, kernel_source[1], "int4", "i", "int4", "i", + "int4", "i", "int4", "i", "i", "i", "i", "i"); + sprintf(source_3, kernel_source[1], "uint4", "ui", "uint4", + "ui", "uint4", "ui", "uint4", "ui", "ui", "ui", "ui", + "ui"); + break; + } + const char *sourceTexts[num_kernel_types] = { source_1, source_2, + source_3 }; + for (int k = 0; k < num_kernel_types; k++) + { + program_source_length = strlen(sourceTexts[k]); + program[k] = clCreateProgramWithSource( + context, 1, &sourceTexts[k], &program_source_length, &err); + err |= clBuildProgram(program[k], 0, NULL, NULL, NULL, NULL); + } + + if (err != CL_SUCCESS) + { + print_error(err, "Error: Failed to build program"); + goto CLEANUP; + } + // create the kernel + kernel_float[i] = clCreateKernel(program[0], "image2DKernel", &err); + if (err != CL_SUCCESS) + { + print_error(err, "clCreateKernel failed"); + goto CLEANUP; + } + kernel_signed[i] = clCreateKernel(program[1], "image2DKernel", &err); + if (err != CL_SUCCESS) + { + print_error(err, "clCreateKernel failed"); + goto CLEANUP; + } + kernel_unsigned[i] = clCreateKernel(program[2], "image2DKernel", &err); + if (err != CL_SUCCESS) + { + print_error(err, "clCreateKernel failed "); + goto CLEANUP; + } + } + if (numCQ == 2) + { + err = run_test_with_two_queue(context, cmd_queue1, cmd_queue2, + kernel_unsigned, kernel_signed, + kernel_float, vkDevice); + } + else + { + err = run_test_with_one_queue(context, cmd_queue1, kernel_unsigned, + kernel_signed, kernel_float, vkDevice); + } +CLEANUP: + for (int i = 0; i < num_kernels; i++) + { + if (kernel_float[i]) + { + 
clReleaseKernel(kernel_float[i]); + } + if (kernel_unsigned[i]) + { + clReleaseKernel(kernel_unsigned[i]); + } + if (kernel_signed[i]) + { + clReleaseKernel(kernel_signed[i]); + } + } + for (int i = 0; i < num_kernel_types; i++) + { + if (program[i]) + { + clReleaseProgram(program[i]); + } + } + if (cmd_queue1) clReleaseCommandQueue(cmd_queue1); + if (cmd_queue2) clReleaseCommandQueue(cmd_queue2); + if (context) clReleaseContext(context); + + if (extensions) free(extensions); + if (devices) free(devices); + + return err; +} diff --git a/test_conformance/vulkan/test_vulkan_platform_device_info.cpp b/test_conformance/vulkan/test_vulkan_platform_device_info.cpp new file mode 100644 index 00000000..12f373b5 --- /dev/null +++ b/test_conformance/vulkan/test_vulkan_platform_device_info.cpp @@ -0,0 +1,146 @@ +// +// Copyright (c) 2022 The Khronos Group Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+// + +#include <CL/cl.h> +#include <CL/cl_ext.h> +#include "harness/testHarness.h" +#include <iostream> +#include <string> + +typedef struct +{ + cl_uint info; + const char *name; +} _info; + +_info platform_info_table[] = { +#define STRING(x) \ + { \ + x, #x \ + } + STRING(CL_PLATFORM_EXTERNAL_MEMORY_IMPORT_HANDLE_TYPES_KHR), + STRING(CL_PLATFORM_SEMAPHORE_EXPORT_HANDLE_TYPES_KHR), + STRING(CL_PLATFORM_SEMAPHORE_IMPORT_HANDLE_TYPES_KHR) +#undef STRING +}; + +_info device_info_table[] = { +#define STRING(x) \ + { \ + x, #x \ + } + STRING(CL_DEVICE_SEMAPHORE_IMPORT_HANDLE_TYPES_KHR), + STRING(CL_DEVICE_SEMAPHORE_EXPORT_HANDLE_TYPES_KHR), + STRING(CL_DEVICE_EXTERNAL_MEMORY_IMPORT_HANDLE_TYPES_KHR) +#undef STRING +}; + +int test_platform_info(cl_device_id deviceID, cl_context _context, + cl_command_queue _queue, int num_elements) +{ + cl_uint num_platforms; + cl_uint i, j; + cl_platform_id *platforms; + cl_int errNum; + cl_uint *handle_type; + size_t handle_type_size = 0; + cl_uint num_handles = 0; + + // get total # of platforms + errNum = clGetPlatformIDs(0, NULL, &num_platforms); + test_error(errNum, "clGetPlatformIDs (getting count) failed"); + + platforms = + (cl_platform_id *)malloc(num_platforms * sizeof(cl_platform_id)); + if (!platforms) + { + printf("error allocating memory\n"); + exit(1); + } + log_info("%d platforms available\n", num_platforms); + errNum = clGetPlatformIDs(num_platforms, platforms, NULL); + test_error(errNum, "clGetPlatformIDs (getting IDs) failed"); + + for (i = 0; i < num_platforms; i++) + { + log_info("Platform%d (id %lu) info:\n", i, (unsigned long)platforms[i]); + for (j = 0; + j < sizeof(platform_info_table) / sizeof(platform_info_table[0]); + j++) + { + errNum = + clGetPlatformInfo(platforms[i], platform_info_table[j].info, 0, + NULL, &handle_type_size); + test_error(errNum, "clGetPlatformInfo failed"); + num_handles = handle_type_size / sizeof(cl_uint); + handle_type = (cl_uint *)malloc(handle_type_size); + errNum = + 
clGetPlatformInfo(platforms[i], platform_info_table[j].info, + handle_type_size, handle_type, NULL); + test_error(errNum, "clGetPlatformInfo failed"); + + log_info("%s: \n", platform_info_table[j].name); + while (num_handles--) + { + log_info("%x \n", handle_type[num_handles]); + } + if (handle_type) + { + free(handle_type); + } + } + } + if (platforms) + { + free(platforms); + } + return TEST_PASS; +} + +int test_device_info(cl_device_id deviceID, cl_context _context, + cl_command_queue _queue, int num_elements) +{ + cl_uint j; + cl_uint *handle_type; + size_t handle_type_size = 0; + cl_uint num_handles = 0; + cl_int errNum = CL_SUCCESS; + for (j = 0; j < sizeof(device_info_table) / sizeof(device_info_table[0]); + j++) + { + errNum = clGetDeviceInfo(deviceID, device_info_table[j].info, 0, NULL, + &handle_type_size); + test_error(errNum, "clGetDeviceInfo failed"); + + num_handles = handle_type_size / sizeof(cl_uint); + handle_type = (cl_uint *)malloc(handle_type_size); + + errNum = clGetDeviceInfo(deviceID, device_info_table[j].info, + handle_type_size, handle_type, NULL); + test_error(errNum, "clGetDeviceInfo failed"); + + log_info("%s: \n", device_info_table[j].name); + while (num_handles--) + { + log_info("%x \n", handle_type[num_handles]); + } + if (handle_type) + { + free(handle_type); + } + } + return TEST_PASS; +} diff --git a/test_conformance/vulkan/vulkan_interop_common/opencl_vulkan_wrapper.cpp b/test_conformance/vulkan/vulkan_interop_common/opencl_vulkan_wrapper.cpp new file mode 100644 index 00000000..9d9a6601 --- /dev/null +++ b/test_conformance/vulkan/vulkan_interop_common/opencl_vulkan_wrapper.cpp @@ -0,0 +1,853 @@ +// +// Copyright (c) 2022 The Khronos Group Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +#include <CL/cl_ext.h> +#include "opencl_vulkan_wrapper.hpp" +#include "vulkan_wrapper.hpp" +#include "harness/errorHelpers.h" +#include "harness/deviceInfo.h" +#include <assert.h> +#include <iostream> +#include <stdexcept> + +#define ASSERT(x) assert((x)) +#define GB(x) ((unsigned long long)(x) << 30) + +pfnclCreateSemaphoreWithPropertiesKHR clCreateSemaphoreWithPropertiesKHRptr; +pfnclEnqueueWaitSemaphoresKHR clEnqueueWaitSemaphoresKHRptr; +pfnclEnqueueSignalSemaphoresKHR clEnqueueSignalSemaphoresKHRptr; +pfnclEnqueueAcquireExternalMemObjectsKHR + clEnqueueAcquireExternalMemObjectsKHRptr; +pfnclEnqueueReleaseExternalMemObjectsKHR + clEnqueueReleaseExternalMemObjectsKHRptr; +pfnclReleaseSemaphoreKHR clReleaseSemaphoreKHRptr; + +void init_cl_vk_ext(cl_platform_id opencl_platform) +{ + clEnqueueWaitSemaphoresKHRptr = + (pfnclEnqueueWaitSemaphoresKHR)clGetExtensionFunctionAddressForPlatform( + opencl_platform, "clEnqueueWaitSemaphoresKHR"); + if (NULL == clEnqueueWaitSemaphoresKHRptr) + { + throw std::runtime_error("Failed to get the function pointer of " + "clEnqueueWaitSemaphoresKHRptr!"); + } + clEnqueueSignalSemaphoresKHRptr = (pfnclEnqueueSignalSemaphoresKHR) + clGetExtensionFunctionAddressForPlatform( + opencl_platform, "clEnqueueSignalSemaphoresKHR"); + if (NULL == clEnqueueSignalSemaphoresKHRptr) + { + throw std::runtime_error("Failed to get the function pointer of " + "clEnqueueSignalSemaphoresKHRptr!"); + } + clReleaseSemaphoreKHRptr = + (pfnclReleaseSemaphoreKHR)clGetExtensionFunctionAddressForPlatform( + opencl_platform, 
"clReleaseSemaphoreKHR"); + if (NULL == clReleaseSemaphoreKHRptr) + { + throw std::runtime_error("Failed to get the function pointer of " + "clReleaseSemaphoreKHRptr!"); + } + clCreateSemaphoreWithPropertiesKHRptr = + (pfnclCreateSemaphoreWithPropertiesKHR) + clGetExtensionFunctionAddressForPlatform( + opencl_platform, "clCreateSemaphoreWithPropertiesKHR"); + if (NULL == clCreateSemaphoreWithPropertiesKHRptr) + { + throw std::runtime_error("Failed to get the function pointer of " + "clCreateSemaphoreWithPropertiesKHRptr!"); + } +} + +cl_int setMaxImageDimensions(cl_device_id deviceID, size_t &max_width, + size_t &max_height) +{ + cl_int result = CL_SUCCESS; + cl_ulong val; + size_t paramSize; + + result = clGetDeviceInfo(deviceID, CL_DEVICE_GLOBAL_MEM_SIZE, + sizeof(cl_ulong), &val, &paramSize); + + if (result != CL_SUCCESS) + { + return result; + } + + if (val < GB(4)) + { + max_width = 256; + max_height = 256; + } + else if (val < GB(8)) + { + max_width = 512; + max_height = 256; + } + else + { + max_width = 1024; + max_height = 512; + } + + return result; +} + +cl_int getCLFormatFromVkFormat(VkFormat vkFormat, + cl_image_format *clImageFormat) +{ + cl_int result = CL_SUCCESS; + switch (vkFormat) + { + case VK_FORMAT_R8G8B8A8_UNORM: + clImageFormat->image_channel_order = CL_RGBA; + clImageFormat->image_channel_data_type = CL_UNORM_INT8; + break; + case VK_FORMAT_B8G8R8A8_UNORM: + clImageFormat->image_channel_order = CL_BGRA; + clImageFormat->image_channel_data_type = CL_UNORM_INT8; + break; + case VK_FORMAT_R16G16B16A16_UNORM: + clImageFormat->image_channel_order = CL_RGBA; + clImageFormat->image_channel_data_type = CL_UNORM_INT16; + break; + case VK_FORMAT_R8G8B8A8_SINT: + clImageFormat->image_channel_order = CL_RGBA; + clImageFormat->image_channel_data_type = CL_SIGNED_INT8; + break; + case VK_FORMAT_R16G16B16A16_SINT: + clImageFormat->image_channel_order = CL_RGBA; + clImageFormat->image_channel_data_type = CL_SIGNED_INT16; + break; + case 
VK_FORMAT_R32G32B32A32_SINT: + clImageFormat->image_channel_order = CL_RGBA; + clImageFormat->image_channel_data_type = CL_SIGNED_INT32; + break; + case VK_FORMAT_R8G8B8A8_UINT: + clImageFormat->image_channel_order = CL_RGBA; + clImageFormat->image_channel_data_type = CL_UNSIGNED_INT8; + break; + case VK_FORMAT_R16G16B16A16_UINT: + clImageFormat->image_channel_order = CL_RGBA; + clImageFormat->image_channel_data_type = CL_UNSIGNED_INT16; + break; + case VK_FORMAT_R32G32B32A32_UINT: + clImageFormat->image_channel_order = CL_RGBA; + clImageFormat->image_channel_data_type = CL_UNSIGNED_INT32; + break; + case VK_FORMAT_R16G16B16A16_SFLOAT: + clImageFormat->image_channel_order = CL_RGBA; + clImageFormat->image_channel_data_type = CL_HALF_FLOAT; + break; + case VK_FORMAT_R32G32B32A32_SFLOAT: + clImageFormat->image_channel_order = CL_RGBA; + clImageFormat->image_channel_data_type = CL_FLOAT; + break; + case VK_FORMAT_R8_SNORM: + clImageFormat->image_channel_order = CL_R; + clImageFormat->image_channel_data_type = CL_SNORM_INT8; + break; + case VK_FORMAT_R16_SNORM: + clImageFormat->image_channel_order = CL_R; + clImageFormat->image_channel_data_type = CL_SNORM_INT16; + break; + case VK_FORMAT_R8_UNORM: + clImageFormat->image_channel_order = CL_R; + clImageFormat->image_channel_data_type = CL_UNORM_INT8; + break; + case VK_FORMAT_R16_UNORM: + clImageFormat->image_channel_order = CL_R; + clImageFormat->image_channel_data_type = CL_UNORM_INT16; + break; + case VK_FORMAT_R8_SINT: + clImageFormat->image_channel_order = CL_R; + clImageFormat->image_channel_data_type = CL_SIGNED_INT8; + break; + case VK_FORMAT_R16_SINT: + clImageFormat->image_channel_order = CL_R; + clImageFormat->image_channel_data_type = CL_SIGNED_INT16; + break; + case VK_FORMAT_R32_SINT: + clImageFormat->image_channel_order = CL_R; + clImageFormat->image_channel_data_type = CL_SIGNED_INT32; + break; + case VK_FORMAT_R8_UINT: + clImageFormat->image_channel_order = CL_R; + clImageFormat->image_channel_data_type 
= CL_UNSIGNED_INT8; + break; + case VK_FORMAT_R16_UINT: + clImageFormat->image_channel_order = CL_R; + clImageFormat->image_channel_data_type = CL_UNSIGNED_INT16; + break; + case VK_FORMAT_R32_UINT: + clImageFormat->image_channel_order = CL_R; + clImageFormat->image_channel_data_type = CL_UNSIGNED_INT32; + break; + case VK_FORMAT_R16_SFLOAT: + clImageFormat->image_channel_order = CL_R; + clImageFormat->image_channel_data_type = CL_HALF_FLOAT; + break; + case VK_FORMAT_R32_SFLOAT: + clImageFormat->image_channel_order = CL_R; + clImageFormat->image_channel_data_type = CL_FLOAT; + break; + case VK_FORMAT_R8G8_SNORM: + clImageFormat->image_channel_order = CL_RG; + clImageFormat->image_channel_data_type = CL_SNORM_INT8; + break; + case VK_FORMAT_R16G16_SNORM: + clImageFormat->image_channel_order = CL_RG; + clImageFormat->image_channel_data_type = CL_SNORM_INT16; + break; + case VK_FORMAT_R8G8_UNORM: + clImageFormat->image_channel_order = CL_RG; + clImageFormat->image_channel_data_type = CL_UNORM_INT8; + break; + case VK_FORMAT_R16G16_UNORM: + clImageFormat->image_channel_order = CL_RG; + clImageFormat->image_channel_data_type = CL_UNORM_INT16; + break; + case VK_FORMAT_R8G8_SINT: + clImageFormat->image_channel_order = CL_RG; + clImageFormat->image_channel_data_type = CL_SIGNED_INT8; + break; + case VK_FORMAT_R16G16_SINT: + clImageFormat->image_channel_order = CL_RG; + clImageFormat->image_channel_data_type = CL_SIGNED_INT16; + break; + case VK_FORMAT_R32G32_SINT: + clImageFormat->image_channel_order = CL_RG; + clImageFormat->image_channel_data_type = CL_SIGNED_INT32; + break; + case VK_FORMAT_R8G8_UINT: + clImageFormat->image_channel_order = CL_RG; + clImageFormat->image_channel_data_type = CL_UNSIGNED_INT8; + break; + case VK_FORMAT_R16G16_UINT: + clImageFormat->image_channel_order = CL_RG; + clImageFormat->image_channel_data_type = CL_UNSIGNED_INT16; + break; + case VK_FORMAT_R32G32_UINT: + clImageFormat->image_channel_order = CL_RG; + 
clImageFormat->image_channel_data_type = CL_UNSIGNED_INT32; + break; + case VK_FORMAT_R16G16_SFLOAT: + clImageFormat->image_channel_order = CL_RG; + clImageFormat->image_channel_data_type = CL_HALF_FLOAT; + break; + case VK_FORMAT_R32G32_SFLOAT: + clImageFormat->image_channel_order = CL_RG; + clImageFormat->image_channel_data_type = CL_FLOAT; + break; + case VK_FORMAT_R5G6B5_UNORM_PACK16: + clImageFormat->image_channel_order = CL_RGBA; + clImageFormat->image_channel_data_type = CL_UNORM_SHORT_565; + break; + case VK_FORMAT_R5G5B5A1_UNORM_PACK16: + clImageFormat->image_channel_order = CL_RGBA; + clImageFormat->image_channel_data_type = CL_UNORM_SHORT_555; + break; + case VK_FORMAT_R8G8B8A8_SNORM: + clImageFormat->image_channel_order = CL_RGBA; + clImageFormat->image_channel_data_type = CL_SNORM_INT8; + break; + case VK_FORMAT_R16G16B16A16_SNORM: + clImageFormat->image_channel_order = CL_RGBA; + clImageFormat->image_channel_data_type = CL_SNORM_INT16; + break; + case VK_FORMAT_B8G8R8A8_SNORM: + clImageFormat->image_channel_order = CL_BGRA; + clImageFormat->image_channel_data_type = CL_SNORM_INT8; + break; + case VK_FORMAT_B5G6R5_UNORM_PACK16: + clImageFormat->image_channel_order = CL_BGRA; + clImageFormat->image_channel_data_type = CL_UNORM_SHORT_565; + break; + case VK_FORMAT_B5G5R5A1_UNORM_PACK16: + clImageFormat->image_channel_order = CL_BGRA; + clImageFormat->image_channel_data_type = CL_UNORM_SHORT_555; + break; + case VK_FORMAT_B8G8R8A8_SINT: + clImageFormat->image_channel_order = CL_BGRA; + clImageFormat->image_channel_data_type = CL_SIGNED_INT8; + break; + case VK_FORMAT_B8G8R8A8_UINT: + clImageFormat->image_channel_order = CL_BGRA; + clImageFormat->image_channel_data_type = CL_UNSIGNED_INT8; + break; + case VK_FORMAT_A8B8G8R8_SNORM_PACK32: result = CL_INVALID_VALUE; break; + case VK_FORMAT_A8B8G8R8_UNORM_PACK32: result = CL_INVALID_VALUE; break; + case VK_FORMAT_A8B8G8R8_SINT_PACK32: result = CL_INVALID_VALUE; break; + case VK_FORMAT_A8B8G8R8_UINT_PACK32: 
result = CL_INVALID_VALUE; break; + default: + log_error("Unsupported format\n"); + ASSERT(0); + break; + } + return result; +} + +cl_mem_object_type getImageTypeFromVk(VkImageType imageType) +{ + cl_mem_object_type cl_image_type = CL_INVALID_VALUE; + switch (imageType) + { + case VK_IMAGE_TYPE_1D: cl_image_type = CL_MEM_OBJECT_IMAGE1D; break; + case VK_IMAGE_TYPE_2D: cl_image_type = CL_MEM_OBJECT_IMAGE2D; break; + case VK_IMAGE_TYPE_3D: cl_image_type = CL_MEM_OBJECT_IMAGE3D; break; + default: break; + } + return cl_image_type; +} + +size_t GetElementNBytes(const cl_image_format *format) +{ + size_t result; + + switch (format->image_channel_order) + { + case CL_R: + case CL_A: + case CL_INTENSITY: + case CL_LUMINANCE: + case CL_DEPTH: result = 1; break; + case CL_RG: + case CL_RA: result = 2; break; + case CL_RGB: result = 3; break; + case CL_RGBA: + case CL_ARGB: + case CL_BGRA: + case CL_sRGBA: result = 4; break; + default: result = 0; break; + } + + switch (format->image_channel_data_type) + { + case CL_SNORM_INT8: + case CL_UNORM_INT8: + case CL_SIGNED_INT8: + case CL_UNSIGNED_INT8: + // result *= 1; + break; + + case CL_SNORM_INT16: + case CL_UNORM_INT16: + case CL_SIGNED_INT16: + case CL_UNSIGNED_INT16: + case CL_HALF_FLOAT: result *= 2; break; + + case CL_SIGNED_INT32: + case CL_UNSIGNED_INT32: + case CL_FLOAT: result *= 4; break; + + case CL_UNORM_SHORT_565: + case CL_UNORM_SHORT_555: + if (result == 3) + { + result = 2; + } + else + { + result = 0; + } + break; + + case CL_UNORM_INT_101010: + if (result == 3) + { + result = 4; + } + else + { + result = 0; + } + break; + + default: result = 0; break; + } + + return result; +} + +cl_int get2DImageDimensions(const VkImageCreateInfo *VulkanImageCreateInfo, + cl_image_format *img_fmt, size_t totalImageSize, + size_t &width, size_t &height) +{ + cl_int result = CL_SUCCESS; + if (totalImageSize == 0) + { + result = CL_INVALID_VALUE; + } + size_t element_size = GetElementNBytes(img_fmt); + size_t row_pitch = 
element_size * VulkanImageCreateInfo->extent.width; + row_pitch = row_pitch % 64 == 0 ? row_pitch : ((row_pitch / 64) + 1) * 64; + + width = row_pitch / element_size; + height = totalImageSize / row_pitch; + + return result; +} + +cl_int +getCLImageInfoFromVkImageInfo(const VkImageCreateInfo *VulkanImageCreateInfo, + size_t totalImageSize, cl_image_format *img_fmt, + cl_image_desc *img_desc) +{ + cl_int result = CL_SUCCESS; + + cl_image_format clImgFormat = { 0 }; + result = + getCLFormatFromVkFormat(VulkanImageCreateInfo->format, &clImgFormat); + if (CL_SUCCESS != result) + { + return result; + } + memcpy(img_fmt, &clImgFormat, sizeof(cl_image_format)); + + img_desc->image_type = getImageTypeFromVk(VulkanImageCreateInfo->imageType); + if (CL_INVALID_VALUE == img_desc->image_type) + { + return CL_INVALID_VALUE; + } + + result = + get2DImageDimensions(VulkanImageCreateInfo, img_fmt, totalImageSize, + img_desc->image_width, img_desc->image_height); + if (CL_SUCCESS != result) + { + throw std::runtime_error("get2DImageDimensions failed!!!"); + } + + img_desc->image_depth = 0; // VulkanImageCreateInfo->extent.depth; + img_desc->image_array_size = 0; + img_desc->image_row_pitch = 0; // Row pitch set to zero as host_ptr is NULL + img_desc->image_slice_pitch = + img_desc->image_row_pitch * img_desc->image_height; + img_desc->num_mip_levels = 1; + img_desc->num_samples = 0; + img_desc->buffer = NULL; + + return result; +} + +cl_int check_external_memory_handle_type( + cl_device_id deviceID, + cl_external_memory_handle_type_khr requiredHandleType) +{ + unsigned int i; + cl_external_memory_handle_type_khr *handle_type; + size_t handle_type_size = 0; + + cl_int errNum = CL_SUCCESS; + + errNum = clGetDeviceInfo(deviceID, + CL_DEVICE_EXTERNAL_MEMORY_IMPORT_HANDLE_TYPES_KHR, + 0, NULL, &handle_type_size); + handle_type = + (cl_external_memory_handle_type_khr *)malloc(handle_type_size); + + errNum = clGetDeviceInfo(deviceID, + CL_DEVICE_EXTERNAL_MEMORY_IMPORT_HANDLE_TYPES_KHR, + 
handle_type_size, handle_type, NULL); + + test_error( + errNum, + "Unable to query CL_DEVICE_EXTERNAL_MEMORY_IMPORT_HANDLE_TYPES_KHR \n"); + + for (i = 0; i < handle_type_size; i++) + { + if (requiredHandleType == handle_type[i]) + { + return CL_SUCCESS; + } + } + log_error("cl_khr_external_memory extension is missing support for %d\n", + requiredHandleType); + + return CL_INVALID_VALUE; +} + +cl_int check_external_semaphore_handle_type( + cl_device_id deviceID, + cl_external_semaphore_handle_type_khr requiredHandleType) +{ + unsigned int i; + cl_external_semaphore_handle_type_khr *handle_type; + size_t handle_type_size = 0; + cl_int errNum = CL_SUCCESS; + + errNum = + clGetDeviceInfo(deviceID, CL_DEVICE_SEMAPHORE_IMPORT_HANDLE_TYPES_KHR, + 0, NULL, &handle_type_size); + handle_type = + (cl_external_semaphore_handle_type_khr *)malloc(handle_type_size); + + errNum = + clGetDeviceInfo(deviceID, CL_DEVICE_SEMAPHORE_IMPORT_HANDLE_TYPES_KHR, + handle_type_size, handle_type, NULL); + + test_error( + errNum, + "Unable to query CL_DEVICE_SEMAPHORE_IMPORT_HANDLE_TYPES_KHR \n"); + + for (i = 0; i < handle_type_size; i++) + { + if (requiredHandleType == handle_type[i]) + { + return CL_SUCCESS; + } + } + log_error("cl_khr_external_semaphore extension is missing support for %d\n", + requiredHandleType); + + return CL_INVALID_VALUE; +} +clExternalMemory::clExternalMemory() {} + +clExternalMemory::clExternalMemory(const clExternalMemory &externalMemory) + : m_externalMemory(externalMemory.m_externalMemory) +{} + +clExternalMemory::clExternalMemory( + const VulkanDeviceMemory *deviceMemory, + VulkanExternalMemoryHandleType externalMemoryHandleType, uint64_t offset, + uint64_t size, cl_context context, cl_device_id deviceId) +{ + int err = 0; + m_externalMemory = NULL; + cl_device_id devList[] = { deviceId, NULL }; + std::vector<cl_mem_properties> extMemProperties; +#ifdef _WIN32 + if (!is_extension_available(devList[0], "cl_khr_external_memory_win32")) + { + throw 
std::runtime_error( + "Device does not support cl_khr_external_memory_win32 extension\n"); + } +#else + if (!is_extension_available(devList[0], "cl_khr_external_memory_opaque_fd")) + { + throw std::runtime_error( + "Device does not support cl_khr_external_memory_opaque_fd " + "extension \n"); + } +#endif + + switch (externalMemoryHandleType) + { + case VULKAN_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_FD: +#ifdef _WIN32 + ASSERT(0); +#endif + log_info("Opaque file descriptors are not supported on Windows\n"); + fd = (int)deviceMemory->getHandle(externalMemoryHandleType); + err = check_external_memory_handle_type( + devList[0], CL_EXTERNAL_MEMORY_HANDLE_OPAQUE_FD_KHR); + extMemProperties.push_back( + (cl_mem_properties)CL_EXTERNAL_MEMORY_HANDLE_OPAQUE_FD_KHR); + extMemProperties.push_back((cl_mem_properties)fd); + break; + case VULKAN_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_WIN32_NT: +#ifndef _WIN32 + ASSERT(0); +#else + log_info(" Opaque NT handles are only supported on Windows\n"); + handle = deviceMemory->getHandle(externalMemoryHandleType); + err = check_external_memory_handle_type( + devList[0], CL_EXTERNAL_MEMORY_HANDLE_OPAQUE_WIN32_KHR); + extMemProperties.push_back( + (cl_mem_properties)CL_EXTERNAL_MEMORY_HANDLE_OPAQUE_WIN32_KHR); + extMemProperties.push_back((cl_mem_properties)handle); +#endif + break; + case VULKAN_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_WIN32_KMT: +#ifndef _WIN32 + ASSERT(0); +#else + log_info("Opaque D3DKMT handles are only supported on Windows\n"); + handle = deviceMemory->getHandle(externalMemoryHandleType); + err = check_external_memory_handle_type( + devList[0], CL_EXTERNAL_MEMORY_HANDLE_OPAQUE_WIN32_KMT_KHR); + extMemProperties.push_back( + (cl_mem_properties) + CL_EXTERNAL_MEMORY_HANDLE_OPAQUE_WIN32_KMT_KHR); + extMemProperties.push_back((cl_mem_properties)handle); +#endif + break; + default: + ASSERT(0); + log_error("Unsupported external memory handle type\n"); + break; + } + if (CL_SUCCESS != err) + { + throw std::runtime_error("Unsupported 
external memory type\n "); + } + + extMemProperties.push_back((cl_mem_properties)CL_DEVICE_HANDLE_LIST_KHR); + extMemProperties.push_back((cl_mem_properties)devList[0]); + extMemProperties.push_back( + (cl_mem_properties)CL_DEVICE_HANDLE_LIST_END_KHR); + extMemProperties.push_back(0); + + m_externalMemory = clCreateBufferWithProperties( + context, extMemProperties.data(), 1, size, NULL, &err); + if (CL_SUCCESS != err) + { + log_error("clCreateBufferWithProperties failed with %d\n", err); + throw std::runtime_error("clCreateBufferWithProperties failed "); + } +} +clExternalMemoryImage::clExternalMemoryImage( + const VulkanDeviceMemory &deviceMemory, + VulkanExternalMemoryHandleType externalMemoryHandleType, cl_context context, + size_t totalImageMemSize, size_t imageWidth, size_t imageHeight, + size_t totalSize, const VulkanImage2D &image2D, cl_device_id deviceId) +{ + cl_int errcode_ret = 0; + std::vector<cl_mem_properties> extMemProperties1; + cl_device_id devList[] = { deviceId, NULL }; + +#ifdef _WIN32 + if (!is_extension_available(devList[0], "cl_khr_external_memory_win32")) + { + throw std::runtime_error("Device does not support " + "cl_khr_external_memory_win32 extension \n"); + } +#elif !defined(__APPLE__) + if (!is_extension_available(devList[0], "cl_khr_external_memory_opaque_fd")) + { + throw std::runtime_error( + "Device does not support cl_khr_external_memory_opaque_fd " + "extension\n"); + } +#endif + + switch (externalMemoryHandleType) + { +#ifdef _WIN32 + case VULKAN_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_WIN32_NT: + log_info("Opaque NT handles are only supported on Windows\n"); + handle = deviceMemory.getHandle(externalMemoryHandleType); + errcode_ret = check_external_memory_handle_type( + devList[0], CL_EXTERNAL_MEMORY_HANDLE_OPAQUE_WIN32_KHR); + extMemProperties1.push_back( + (cl_mem_properties)CL_EXTERNAL_MEMORY_HANDLE_OPAQUE_WIN32_KHR); + extMemProperties1.push_back((cl_mem_properties)handle); + break; + case 
VULKAN_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_WIN32_KMT: + log_info("Opaque D3DKMT handles are only supported on Windows\n"); + handle = deviceMemory.getHandle(externalMemoryHandleType); + errcode_ret = check_external_memory_handle_type( + devList[0], CL_EXTERNAL_MEMORY_HANDLE_OPAQUE_WIN32_KMT_KHR); + extMemProperties1.push_back( + (cl_mem_properties) + CL_EXTERNAL_MEMORY_HANDLE_OPAQUE_WIN32_KMT_KHR); + extMemProperties1.push_back((cl_mem_properties)handle); + break; +#elif !defined(__APPLE__) + case VULKAN_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_FD: + log_info(" Opaque file descriptors are not supported on Windows\n"); + fd = (int)deviceMemory.getHandle(externalMemoryHandleType); + errcode_ret = check_external_memory_handle_type( + devList[0], CL_EXTERNAL_MEMORY_HANDLE_OPAQUE_FD_KHR); + extMemProperties1.push_back( + (cl_mem_properties)CL_EXTERNAL_MEMORY_HANDLE_OPAQUE_FD_KHR); + extMemProperties1.push_back((cl_mem_properties)fd); + break; +#endif + default: + ASSERT(0); + log_error("Unsupported external memory handle type\n"); + break; + } + if (CL_SUCCESS != errcode_ret) + { + throw std::runtime_error("Unsupported external memory type\n "); + } + // Set cl_image_desc + size_t clImageFormatSize; + cl_image_desc image_desc; + memset(&image_desc, 0x0, sizeof(cl_image_desc)); + cl_image_format img_format = { 0 }; + const VkImageCreateInfo VulkanImageCreateInfo = + image2D.getVkImageCreateInfo(); + + errcode_ret = getCLImageInfoFromVkImageInfo( + &VulkanImageCreateInfo, image2D.getSize(), &img_format, &image_desc); + if (CL_SUCCESS != errcode_ret) + { + throw std::runtime_error("getCLImageInfoFromVkImageInfo failed!!!"); + } + + extMemProperties1.push_back((cl_mem_properties)CL_DEVICE_HANDLE_LIST_KHR); + extMemProperties1.push_back((cl_mem_properties)devList[0]); + extMemProperties1.push_back( + (cl_mem_properties)CL_DEVICE_HANDLE_LIST_END_KHR); + extMemProperties1.push_back(0); + m_externalMemory = clCreateImageWithProperties( + context, extMemProperties1.data(), 
CL_MEM_READ_WRITE, &img_format, + &image_desc, NULL, &errcode_ret); + if (CL_SUCCESS != errcode_ret) + { + throw std::runtime_error("clCreateImageWithProperties failed!!!"); + } +} + +cl_mem clExternalMemory::getExternalMemoryBuffer() { return m_externalMemory; } + +cl_mem clExternalMemoryImage::getExternalMemoryImage() +{ + return m_externalMemory; +} + +clExternalMemoryImage::~clExternalMemoryImage() +{ + clReleaseMemObject(m_externalMemory); +} + +clExternalMemory::~clExternalMemory() { clReleaseMemObject(m_externalMemory); } + +clExternalMemoryImage::clExternalMemoryImage() {} + + +////////////////////////////////////////// +// clExternalSemaphore implementation // +////////////////////////////////////////// + +clExternalSemaphore::clExternalSemaphore( + const clExternalSemaphore &externalSemaphore) + : m_externalSemaphore(externalSemaphore.m_externalSemaphore) +{} + +clExternalSemaphore::clExternalSemaphore( + const VulkanSemaphore &semaphore, cl_context context, + VulkanExternalSemaphoreHandleType externalSemaphoreHandleType, + cl_device_id deviceId) +{ + + cl_int err = 0; + cl_device_id devList[] = { deviceId, NULL }; + +#ifdef _WIN32 + if (!is_extension_available(devList[0], "cl_khr_external_semaphore_win32")) + { + throw std::runtime_error("Device does not support " + "cl_khr_external_semaphore_win32 extension\n"); + } +#elif !defined(__APPLE__) + if (!is_extension_available(devList[0], + "cl_khr_external_semaphore_opaque_fd")) + { + throw std::runtime_error( + "Device does not support cl_khr_external_semaphore_opaque_fd " + "extension \n"); + } +#endif + + std::vector<cl_semaphore_properties_khr> sema_props{ + (cl_semaphore_properties_khr)CL_SEMAPHORE_TYPE_KHR, + (cl_semaphore_properties_khr)CL_SEMAPHORE_TYPE_BINARY_KHR, + }; + switch (externalSemaphoreHandleType) + { + case VULKAN_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_FD: +#ifdef _WIN32 + ASSERT(0); +#else + log_info(" Opaque file descriptors are not supported on Windows\n"); + fd = 
(int)semaphore.getHandle(externalSemaphoreHandleType); + err = check_external_semaphore_handle_type( + devList[0], CL_SEMAPHORE_HANDLE_OPAQUE_FD_KHR); + sema_props.push_back( + (cl_semaphore_properties_khr)CL_SEMAPHORE_HANDLE_OPAQUE_FD_KHR); + sema_props.push_back((cl_semaphore_properties_khr)fd); +#endif + break; + case VULKAN_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32_NT: +#ifndef _WIN32 + ASSERT(0); +#else + log_info(" Opaque NT handles are only supported on Windows\n"); + handle = semaphore.getName().size() + ? NULL + : semaphore.getHandle(externalSemaphoreHandleType); + err = check_external_semaphore_handle_type( + devList[0], CL_SEMAPHORE_HANDLE_OPAQUE_WIN32_KHR); + sema_props.push_back((cl_semaphore_properties_khr) + CL_SEMAPHORE_HANDLE_OPAQUE_WIN32_KHR); + sema_props.push_back((cl_semaphore_properties_khr)handle); +#endif + break; + case VULKAN_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32_KMT: +#ifndef _WIN32 + ASSERT(0); +#else + log_info(" Opaque D3DKMT handles are only supported on Windows\n"); + handle = semaphore.getHandle(externalSemaphoreHandleType); + err = check_external_semaphore_handle_type( + devList[0], CL_SEMAPHORE_HANDLE_OPAQUE_WIN32_KMT_KHR); + sema_props.push_back((cl_semaphore_properties_khr) + CL_SEMAPHORE_HANDLE_OPAQUE_WIN32_KMT_KHR); + sema_props.push_back((cl_semaphore_properties_khr)handle); +#endif + break; + default: + ASSERT(0); + log_error("Unsupported external memory handle type\n"); + break; + } + if (CL_SUCCESS != err) + { + throw std::runtime_error( + "Unsupported external sempahore handle type\n "); + } + + sema_props.push_back( + (cl_semaphore_properties_khr)CL_DEVICE_HANDLE_LIST_KHR); + sema_props.push_back((cl_semaphore_properties_khr)devList[0]); + sema_props.push_back( + (cl_semaphore_properties_khr)CL_DEVICE_HANDLE_LIST_END_KHR); + sema_props.push_back(0); + m_externalSemaphore = + clCreateSemaphoreWithPropertiesKHRptr(context, sema_props.data(), &err); + if (CL_SUCCESS != err) + { + 
log_error("clCreateSemaphoreWithPropertiesKHRptr failed with %d\n", + err); + throw std::runtime_error( + "clCreateSemaphoreWithPropertiesKHRptr failed! "); + } +} + +clExternalSemaphore::~clExternalSemaphore() +{ + cl_int err = clReleaseSemaphoreKHRptr(m_externalSemaphore); + if (err != CL_SUCCESS) + { + throw std::runtime_error("clReleaseSemaphoreKHR failed!"); + } +} + +void clExternalSemaphore::signal(cl_command_queue cmd_queue) +{ + clEnqueueSignalSemaphoresKHRptr(cmd_queue, 1, &m_externalSemaphore, NULL, 0, + NULL, NULL); +} + +void clExternalSemaphore::wait(cl_command_queue cmd_queue) +{ + clEnqueueWaitSemaphoresKHRptr(cmd_queue, 1, &m_externalSemaphore, NULL, 0, + NULL, NULL); +} diff --git a/test_conformance/vulkan/vulkan_interop_common/opencl_vulkan_wrapper.hpp b/test_conformance/vulkan/vulkan_interop_common/opencl_vulkan_wrapper.hpp new file mode 100644 index 00000000..d9f8dccb --- /dev/null +++ b/test_conformance/vulkan/vulkan_interop_common/opencl_vulkan_wrapper.hpp @@ -0,0 +1,131 @@ +// +// Copyright (c) 2022 The Khronos Group Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+// +#ifndef _opencl_vulkan_wrapper_hpp_ +#define _opencl_vulkan_wrapper_hpp_ + +#include "vulkan_wrapper.hpp" + +#if !defined(__APPLE__) +#include <CL/cl.h> +#include <CL/cl_ext.h> +#else +#include <OpenCL/cl.h> +#include <OpenCL/cl_ext.h> +#endif + +typedef cl_semaphore_khr (*pfnclCreateSemaphoreWithPropertiesKHR)( + cl_context context, cl_semaphore_properties_khr *sema_props, + cl_int *errcode_ret); +typedef cl_int (*pfnclEnqueueWaitSemaphoresKHR)( + cl_command_queue command_queue, cl_uint num_semaphores, + const cl_semaphore_khr *sema_list, + const cl_semaphore_payload_khr *sema_payload_list, + cl_uint num_events_in_wait_list, const cl_event *event_wait_list, + cl_event *event); +typedef cl_int (*pfnclEnqueueSignalSemaphoresKHR)( + cl_command_queue command_queue, cl_uint num_semaphores, + const cl_semaphore_khr *sema_list, + const cl_semaphore_payload_khr *sema_payload_list, + cl_uint num_events_in_wait_list, const cl_event *event_wait_list, + cl_event *event); +typedef cl_int (*pfnclEnqueueAcquireExternalMemObjectsKHR)( + cl_command_queue command_queue, cl_uint num_mem_objects, + const cl_mem *mem_objects, cl_uint num_events_in_wait_list, + const cl_event *event_wait_list, cl_event *event); +typedef cl_int (*pfnclEnqueueReleaseExternalMemObjectsKHR)( + cl_command_queue command_queue, cl_uint num_mem_objects, + const cl_mem *mem_objects, cl_uint num_events_in_wait_list, + const cl_event *event_wait_list, cl_event *event); +typedef cl_int (*pfnclReleaseSemaphoreKHR)(cl_semaphore_khr sema_object); + +extern pfnclCreateSemaphoreWithPropertiesKHR + clCreateSemaphoreWithPropertiesKHRptr; +extern pfnclEnqueueWaitSemaphoresKHR clEnqueueWaitSemaphoresKHRptr; +extern pfnclEnqueueSignalSemaphoresKHR clEnqueueSignalSemaphoresKHRptr; +extern pfnclEnqueueAcquireExternalMemObjectsKHR + clEnqueueAcquireExternalMemObjectsKHRptr; +extern pfnclEnqueueReleaseExternalMemObjectsKHR + clEnqueueReleaseExternalMemObjectsKHRptr; +extern pfnclReleaseSemaphoreKHR 
clReleaseSemaphoreKHRptr; + +cl_int getCLImageInfoFromVkImageInfo(const VkImageCreateInfo *, size_t, + cl_image_format *, cl_image_desc *); +cl_int check_external_memory_handle_type( + cl_device_id deviceID, + cl_external_memory_handle_type_khr requiredHandleType); +cl_int check_external_semaphore_handle_type( + cl_device_id deviceID, + cl_external_semaphore_handle_type_khr requiredHandleType); +cl_int setMaxImageDimensions(cl_device_id deviceID, size_t &width, + size_t &height); + +class clExternalMemory { +protected: + cl_mem m_externalMemory; + int fd; + void *handle; + clExternalMemory(const clExternalMemory &externalMemory); + +public: + clExternalMemory(); + clExternalMemory(const VulkanDeviceMemory *deviceMemory, + VulkanExternalMemoryHandleType externalMemoryHandleType, + uint64_t offset, uint64_t size, cl_context context, + cl_device_id deviceId); + + virtual ~clExternalMemory(); + cl_mem getExternalMemoryBuffer(); +}; +class clExternalMemoryImage { +protected: + cl_mem m_externalMemory; + int fd; + void *handle; + cl_command_queue cmd_queue; + clExternalMemoryImage(); + +public: + clExternalMemoryImage( + const VulkanDeviceMemory &deviceMemory, + VulkanExternalMemoryHandleType externalMemoryHandleType, + cl_context context, size_t totalImageMemSize, size_t imageWidth, + size_t imageHeight, size_t totalSize, const VulkanImage2D &image2D, + cl_device_id deviceId); + virtual ~clExternalMemoryImage(); + cl_mem getExternalMemoryImage(); +}; + +class clExternalSemaphore { +protected: + cl_semaphore_khr m_externalSemaphore; + int fd; + void *handle; + clExternalSemaphore(const clExternalSemaphore &externalSemaphore); + +public: + clExternalSemaphore( + const VulkanSemaphore &deviceSemaphore, cl_context context, + VulkanExternalSemaphoreHandleType externalSemaphoreHandleType, + cl_device_id deviceId); + virtual ~clExternalSemaphore(); + void signal(cl_command_queue command_queue); + void wait(cl_command_queue command_queue); + // operator 
openclExternalSemaphore_t() const; +}; + +extern void init_cl_vk_ext(cl_platform_id); + +#endif // _opencl_vulkan_wrapper_hpp_ diff --git a/test_conformance/vulkan/vulkan_interop_common/vulkan_api_list.hpp b/test_conformance/vulkan/vulkan_interop_common/vulkan_api_list.hpp new file mode 100644 index 00000000..017aefd2 --- /dev/null +++ b/test_conformance/vulkan/vulkan_interop_common/vulkan_api_list.hpp @@ -0,0 +1,195 @@ +// +// Copyright (c) 2022 The Khronos Group Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
//

#ifndef _vulkan_api_list_hpp_
#define _vulkan_api_list_hpp_

// X-macro list of the Vulkan entry points used by the interop tests. Each
// entry is wrapped in VK_FUNC_DECL, which the including file must define
// before expanding the list (its definition is not part of this header).
#define VK_FUNC_LIST                                                           \
    VK_FUNC_DECL(vkEnumerateInstanceVersion)                                   \
    VK_FUNC_DECL(vkEnumerateInstanceExtensionProperties)                       \
    VK_FUNC_DECL(vkEnumerateInstanceLayerProperties)                           \
    VK_FUNC_DECL(vkCreateInstance)                                             \
    VK_FUNC_DECL(vkGetInstanceProcAddr)                                        \
    VK_FUNC_DECL(vkGetDeviceProcAddr)                                          \
    VK_FUNC_DECL(vkEnumeratePhysicalDevices)                                   \
    VK_FUNC_DECL(vkGetPhysicalDeviceProperties)                                \
    VK_FUNC_DECL(vkCreateDevice)                                               \
    VK_FUNC_DECL(vkDestroyDevice)                                              \
    VK_FUNC_DECL(vkGetDeviceQueue)                                             \
    VK_FUNC_DECL(vkQueueWaitIdle)                                              \
    VK_FUNC_DECL(vkCreateDescriptorSetLayout)                                  \
    VK_FUNC_DECL(vkCreatePipelineLayout)                                       \
    VK_FUNC_DECL(vkCreateShaderModule)                                         \
    VK_FUNC_DECL(vkCreateComputePipelines)                                     \
    VK_FUNC_DECL(vkCreateDescriptorPool)                                       \
    VK_FUNC_DECL(vkAllocateDescriptorSets)                                     \
    VK_FUNC_DECL(vkFreeDescriptorSets)                                         \
    VK_FUNC_DECL(vkAllocateCommandBuffers)                                     \
    VK_FUNC_DECL(vkBeginCommandBuffer)                                         \
    VK_FUNC_DECL(vkCmdBindPipeline)                                            \
    VK_FUNC_DECL(vkCmdBindDescriptorSets)                                      \
    VK_FUNC_DECL(vkCmdPipelineBarrier)                                         \
    VK_FUNC_DECL(vkCmdDispatch)                                                \
    VK_FUNC_DECL(vkCmdFillBuffer)                                              \
    VK_FUNC_DECL(vkCmdCopyBuffer)                                              \
    VK_FUNC_DECL(vkCmdUpdateBuffer)                                            \
    VK_FUNC_DECL(vkCmdCopyBufferToImage)                                       \
    VK_FUNC_DECL(vkCmdCopyImageToBuffer)                                       \
    VK_FUNC_DECL(vkEndCommandBuffer)                                           \
    VK_FUNC_DECL(vkCreateBuffer)                                               \
    VK_FUNC_DECL(vkCreateImageView)                                            \
    VK_FUNC_DECL(vkAllocateMemory)                                             \
    VK_FUNC_DECL(vkMapMemory)                                                  \
    VK_FUNC_DECL(vkBindBufferMemory)                                           \
    VK_FUNC_DECL(vkBindImageMemory)                                            \
    VK_FUNC_DECL(vkUnmapMemory)                                                \
    VK_FUNC_DECL(vkFreeMemory)                                                 \
    VK_FUNC_DECL(vkCreateCommandPool)                                          \
    VK_FUNC_DECL(vkResetCommandPool)                                           \
    VK_FUNC_DECL(vkDestroyCommandPool)                                         \
    VK_FUNC_DECL(vkResetCommandBuffer)                                         \
    VK_FUNC_DECL(vkFreeCommandBuffers)                                         \
    VK_FUNC_DECL(vkQueueSubmit)                                                \
    VK_FUNC_DECL(vkCmdExecuteCommands)                                         \
    VK_FUNC_DECL(vkCreateFence)                                                \
    VK_FUNC_DECL(vkDestroyFence)                                               \
    VK_FUNC_DECL(vkGetFenceStatus)                                             \
    VK_FUNC_DECL(vkResetFences)                                                \
    VK_FUNC_DECL(vkWaitForFences)                                              \
    VK_FUNC_DECL(vkCreateSemaphore)                                            \
    VK_FUNC_DECL(vkDestroySemaphore)                                           \
    VK_FUNC_DECL(vkCreateEvent)                                                \
    VK_FUNC_DECL(vkDestroyImageView)                                           \
    VK_FUNC_DECL(vkCreateImage)                                                \
    VK_FUNC_DECL(vkGetImageMemoryRequirements)                                 \
    VK_FUNC_DECL(vkDestroyImage)                                               \
    VK_FUNC_DECL(vkDestroyBuffer)                                              \
    VK_FUNC_DECL(vkDestroyPipeline)                                            \
    VK_FUNC_DECL(vkDestroyShaderModule)                                        \
    VK_FUNC_DECL(vkGetPhysicalDeviceMemoryProperties)                          \
    VK_FUNC_DECL(vkDestroyInstance)                                            \
    VK_FUNC_DECL(vkUpdateDescriptorSets)                                       \
    VK_FUNC_DECL(vkDestroyDescriptorPool)                                      \
    VK_FUNC_DECL(vkDestroyPipelineLayout)                                      \
    VK_FUNC_DECL(vkDestroyDescriptorSetLayout)                                 \
    VK_FUNC_DECL(vkGetPhysicalDeviceQueueFamilyProperties)                     \
    VK_FUNC_DECL(vkGetPhysicalDeviceFeatures)                                  \
    VK_FUNC_DECL(vkGetPhysicalDeviceProperties2KHR)                            \
    VK_FUNC_DECL(vkGetBufferMemoryRequirements)                                \
    VK_FUNC_DECL(vkGetMemoryFdKHR)                                             \
    VK_FUNC_DECL(vkGetSemaphoreFdKHR)                                          \
    VK_FUNC_DECL(vkEnumeratePhysicalDeviceGroups)                              \
    VK_FUNC_DECL(vkGetPhysicalDeviceSurfaceCapabilitiesKHR)                    \
    VK_FUNC_DECL(vkGetPhysicalDeviceSurfaceFormatsKHR)                         \
    VK_FUNC_DECL(vkGetPhysicalDeviceSurfacePresentModesKHR)                    \
    VK_FUNC_DECL(vkEnumerateDeviceExtensionProperties)                         \
    VK_FUNC_DECL(vkGetPhysicalDeviceSurfaceSupportKHR)

// Entry points listed separately from VK_FUNC_LIST for the Win32 handle
// extensions.
#define VK_WINDOWS_FUNC_LIST                                                   \
    VK_FUNC_DECL(vkGetMemoryWin32HandleKHR)                                    \
    VK_FUNC_DECL(vkGetSemaphoreWin32HandleKHR)

// Redirect every Vulkan name in the lists above to the identifier with a
// leading underscore. There must be exactly one redirect per list entry; a
// missing or misspelled one leaves that name unredirected.
#define vkEnumerateInstanceVersion _vkEnumerateInstanceVersion
#define vkEnumerateInstanceExtensionProperties                                 \
    _vkEnumerateInstanceExtensionProperties
#define vkEnumerateInstanceLayerProperties _vkEnumerateInstanceLayerProperties
#define vkCreateInstance _vkCreateInstance
#define vkGetInstanceProcAddr _vkGetInstanceProcAddr
#define vkGetDeviceProcAddr _vkGetDeviceProcAddr
#define vkEnumeratePhysicalDevices _vkEnumeratePhysicalDevices
#define vkGetPhysicalDeviceProperties _vkGetPhysicalDeviceProperties
#define vkCreateDevice _vkCreateDevice
#define vkDestroyDevice _vkDestroyDevice
#define vkGetDeviceQueue _vkGetDeviceQueue
#define vkQueueWaitIdle _vkQueueWaitIdle
#define vkCreateDescriptorSetLayout _vkCreateDescriptorSetLayout
#define vkCreatePipelineLayout _vkCreatePipelineLayout
#define vkCreateShaderModule _vkCreateShaderModule
#define vkCreateComputePipelines _vkCreateComputePipelines
#define vkCreateDescriptorPool _vkCreateDescriptorPool
#define vkAllocateDescriptorSets _vkAllocateDescriptorSets
#define vkFreeDescriptorSets _vkFreeDescriptorSets
#define vkAllocateCommandBuffers _vkAllocateCommandBuffers
#define vkBeginCommandBuffer _vkBeginCommandBuffer
#define vkCmdBindPipeline _vkCmdBindPipeline
#define vkCmdBindDescriptorSets _vkCmdBindDescriptorSets
#define vkCmdPipelineBarrier _vkCmdPipelineBarrier
#define vkCmdDispatch _vkCmdDispatch
#define vkCmdFillBuffer _vkCmdFillBuffer
#define vkCmdCopyBuffer _vkCmdCopyBuffer
#define vkCmdUpdateBuffer _vkCmdUpdateBuffer
#define vkCmdCopyBufferToImage _vkCmdCopyBufferToImage
#define vkCmdCopyImageToBuffer _vkCmdCopyImageToBuffer
#define vkEndCommandBuffer _vkEndCommandBuffer
#define vkCreateBuffer _vkCreateBuffer
#define vkCreateImageView _vkCreateImageView
#define vkAllocateMemory _vkAllocateMemory
#define vkMapMemory _vkMapMemory
#define vkBindBufferMemory _vkBindBufferMemory
#define vkBindImageMemory _vkBindImageMemory
#define vkUnmapMemory _vkUnmapMemory
#define vkFreeMemory _vkFreeMemory
#define vkCreateCommandPool _vkCreateCommandPool
#define vkResetCommandPool _vkResetCommandPool
#define vkDestroyCommandPool _vkDestroyCommandPool
#define vkResetCommandBuffer _vkResetCommandBuffer
#define vkFreeCommandBuffers _vkFreeCommandBuffers
#define vkQueueSubmit _vkQueueSubmit
#define vkCmdExecuteCommands _vkCmdExecuteCommands
#define vkCreateFence _vkCreateFence
#define vkDestroyFence _vkDestroyFence
#define vkGetFenceStatus _vkGetFenceStatus
#define vkResetFences _vkResetFences
#define vkWaitForFences _vkWaitForFences
#define vkCreateSemaphore _vkCreateSemaphore
#define vkDestroySemaphore _vkDestroySemaphore
#define vkCreateEvent _vkCreateEvent
#define vkDestroyImageView _vkDestroyImageView
#define vkCreateImage _vkCreateImage
#define vkGetImageMemoryRequirements _vkGetImageMemoryRequirements
#define vkDestroyImage _vkDestroyImage
// Was misspelled "vkDestroyBuffe", which left vkDestroyBuffer unredirected.
#define vkDestroyBuffer _vkDestroyBuffer
#define vkDestroyPipeline _vkDestroyPipeline
#define vkDestroyShaderModule _vkDestroyShaderModule
#define vkGetPhysicalDeviceMemoryProperties _vkGetPhysicalDeviceMemoryProperties
#define vkDestroyInstance _vkDestroyInstance
#define vkUpdateDescriptorSets _vkUpdateDescriptorSets
#define vkDestroyDescriptorPool _vkDestroyDescriptorPool
#define vkDestroyPipelineLayout _vkDestroyPipelineLayout
#define vkDestroyDescriptorSetLayout _vkDestroyDescriptorSetLayout
#define vkGetPhysicalDeviceQueueFamilyProperties                               \
    _vkGetPhysicalDeviceQueueFamilyProperties
#define vkGetPhysicalDeviceFeatures _vkGetPhysicalDeviceFeatures
#define vkGetPhysicalDeviceProperties2KHR _vkGetPhysicalDeviceProperties2KHR
#define vkGetBufferMemoryRequirements _vkGetBufferMemoryRequirements
#define vkGetMemoryFdKHR _vkGetMemoryFdKHR
#define vkGetSemaphoreFdKHR _vkGetSemaphoreFdKHR
#define vkEnumeratePhysicalDeviceGroups _vkEnumeratePhysicalDeviceGroups
#define vkGetPhysicalDeviceSurfaceCapabilitiesKHR                              \
    _vkGetPhysicalDeviceSurfaceCapabilitiesKHR
#define vkGetPhysicalDeviceSurfaceFormatsKHR                                   \
    _vkGetPhysicalDeviceSurfaceFormatsKHR
#define vkGetPhysicalDeviceSurfacePresentModesKHR                              \
    _vkGetPhysicalDeviceSurfacePresentModesKHR
#define vkEnumerateDeviceExtensionProperties                                   \
    _vkEnumerateDeviceExtensionProperties
#define vkGetPhysicalDeviceSurfaceSupportKHR                                   \
    _vkGetPhysicalDeviceSurfaceSupportKHR

#define vkGetMemoryWin32HandleKHR _vkGetMemoryWin32HandleKHR
#define vkGetSemaphoreWin32HandleKHR _vkGetSemaphoreWin32HandleKHR

#endif //_vulkan_api_list_hpp_
// diff --git a/test_conformance/vulkan/vulkan_interop_common/vulkan_interop_common.cpp
b/test_conformance/vulkan/vulkan_interop_common/vulkan_interop_common.cpp new file mode 100644 index 00000000..db9d168f --- /dev/null +++ b/test_conformance/vulkan/vulkan_interop_common/vulkan_interop_common.cpp @@ -0,0 +1,22 @@ +// +// Copyright (c) 2022 The Khronos Group Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// + +#include "vulkan_interop_common.hpp" + +uint32_t innerIterations(5); +uint32_t perfIterations(100); +uint32_t stressIterations(1000); +size_t cpuThreadsPerGpu(3); diff --git a/test_conformance/vulkan/vulkan_interop_common/vulkan_interop_common.hpp b/test_conformance/vulkan/vulkan_interop_common/vulkan_interop_common.hpp new file mode 100644 index 00000000..18d84f09 --- /dev/null +++ b/test_conformance/vulkan/vulkan_interop_common/vulkan_interop_common.hpp @@ -0,0 +1,50 @@ +// +// Copyright (c) 2022 The Khronos Group Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+// + +#ifndef _vulkan_interop_common_hpp_ +#define _vulkan_interop_common_hpp_ + +#include "vulkan_wrapper_types.hpp" +#include "vulkan_wrapper.hpp" +#include "vulkan_list_map.hpp" +#include "vulkan_utility.hpp" +#include "opencl_vulkan_wrapper.hpp" + +// Number of iterations for loops within tests (default value 5) +extern unsigned int innerIterations; +// Number of iterations for loops within perf tests (default value 100) +extern unsigned int perfIterations; +// Number of iterations for loops within stress tests (default value 1000) +extern unsigned int stressIterations; +// Number of CPU threads per GPU (default value 3) +extern size_t cpuThreadsPerGpu; +// Number of command queues (default value 1) +extern unsigned int numCQ; +// Enable Multi-import of vulkan device memory +extern bool multiImport; +// Enable Multi-import of vulkan device memory under different context +extern bool multiCtx; +// Enable additional debug info logging +extern bool debug_trace; + +extern bool useSingleImageKernel; +extern bool useDeviceLocal; +extern bool disableNTHandleType; +// Enable offset for multiImport of vulkan device memory +extern bool enableOffset; +extern bool non_dedicated; + +#endif // _vulkan_interop_common_hpp_ diff --git a/test_conformance/vulkan/vulkan_interop_common/vulkan_list_map.cpp b/test_conformance/vulkan/vulkan_interop_common/vulkan_list_map.cpp new file mode 100644 index 00000000..bdae5d22 --- /dev/null +++ b/test_conformance/vulkan/vulkan_interop_common/vulkan_list_map.cpp @@ -0,0 +1,424 @@ +// +// Copyright (c) 2022 The Khronos Group Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// + +#ifdef _WIN32 +#define NOMINMAX +#endif +#include "vulkan_list_map.hpp" +#include "vulkan_utility.hpp" +#include "vulkan_wrapper.hpp" + +///////////////////////////////////////////// +// VulkanPhysicalDeviceList implementation // +///////////////////////////////////////////// + +VulkanPhysicalDeviceList::VulkanPhysicalDeviceList( + const VulkanPhysicalDeviceList &physicalDeviceList) +{} + +VulkanPhysicalDeviceList::VulkanPhysicalDeviceList() {} + +VulkanPhysicalDeviceList::~VulkanPhysicalDeviceList() {} + +///////////////////////////////////////// +// VulkanMemoryHeapList implementation // +///////////////////////////////////////// + +VulkanMemoryHeapList::VulkanMemoryHeapList( + const VulkanMemoryHeapList &memoryHeapList) +{} + +VulkanMemoryHeapList::VulkanMemoryHeapList() {} + +VulkanMemoryHeapList::~VulkanMemoryHeapList() {} + +///////////////////////////////////////// +// VulkanMemoryTypeList implementation // +///////////////////////////////////////// + +VulkanMemoryTypeList::VulkanMemoryTypeList( + const VulkanMemoryTypeList &memoryTypeList) +{} + +VulkanMemoryTypeList::VulkanMemoryTypeList() {} + +VulkanMemoryTypeList::~VulkanMemoryTypeList() {} + +////////////////////////////////////////// +// VulkanQueueFamilyList implementation // +////////////////////////////////////////// + +VulkanQueueFamilyList::VulkanQueueFamilyList( + const VulkanQueueFamilyList &queueFamilyList) +{} + +VulkanQueueFamilyList::VulkanQueueFamilyList() {} + +VulkanQueueFamilyList::~VulkanQueueFamilyList() {} + 
+///////////////////////////////////////////////////// +// VulkanQueueFamilyToQueueCountMap implementation // +///////////////////////////////////////////////////// + +VulkanQueueFamilyToQueueCountMap::VulkanQueueFamilyToQueueCountMap( + const VulkanQueueFamilyToQueueCountMap &queueFamilyToQueueCountMap) +{} + +VulkanQueueFamilyToQueueCountMap::VulkanQueueFamilyToQueueCountMap( + uint32_t numQueuesPerFamily) +{ + uint32_t maxQueueFamilyCount = 0; + const VulkanPhysicalDeviceList &physicalDeviceList = + getVulkanInstance().getPhysicalDeviceList(); + for (size_t pdIdx = 0; pdIdx < physicalDeviceList.size(); pdIdx++) + { + maxQueueFamilyCount = std::max( + maxQueueFamilyCount, + (uint32_t)physicalDeviceList[pdIdx].getQueueFamilyList().size()); + } + + for (uint32_t qfIdx = 0; qfIdx < maxQueueFamilyCount; qfIdx++) + { + insert(qfIdx, numQueuesPerFamily); + } +} + +VulkanQueueFamilyToQueueCountMap::~VulkanQueueFamilyToQueueCountMap() {} + +//////////////////////////////////////////////////// +// VulkanQueueFamilyToQueueListMap implementation // +//////////////////////////////////////////////////// + +VulkanQueueFamilyToQueueListMap::VulkanQueueFamilyToQueueListMap( + const VulkanQueueFamilyToQueueListMap &queueFamilyToQueueMap) +{} + +VulkanQueueFamilyToQueueListMap::VulkanQueueFamilyToQueueListMap() {} + +VulkanQueueFamilyToQueueListMap::~VulkanQueueFamilyToQueueListMap() {} + +void VulkanQueueFamilyToQueueListMap::insert(uint32_t key, + VulkanQueueList &queueList) +{ + m_map.insert(std::pair<uint32_t, std::reference_wrapper<VulkanQueueList>>( + key, std::reference_wrapper<VulkanQueueList>(queueList))); +} + +VulkanQueueList &VulkanQueueFamilyToQueueListMap::operator[](uint32_t key) +{ + return m_map.at(key).get(); +} + +//////////////////////////////////// +// VulkanQueueList implementation // +//////////////////////////////////// + +VulkanQueueList::VulkanQueueList(const VulkanQueueList &queueList) {} + +VulkanQueueList::VulkanQueueList() {} + 
+VulkanQueueList::~VulkanQueueList() {}
+
+/////////////////////////////////////////////////////////
+// VulkanDescriptorSetLayoutBindingList implementation //
+/////////////////////////////////////////////////////////
+
+// Protected copy constructor: intentionally copies nothing.
+VulkanDescriptorSetLayoutBindingList::VulkanDescriptorSetLayoutBindingList(
+    const VulkanDescriptorSetLayoutBindingList &descriptorSetLayoutBindingList)
+{}
+
+VulkanDescriptorSetLayoutBindingList::VulkanDescriptorSetLayoutBindingList() {}
+
+// Creates numDescriptorSetLayoutBindings bindings numbered 0..n-1, all with
+// the same descriptor type, descriptor count and shader stage. The bindings
+// are heap-allocated and released by the destructor.
+VulkanDescriptorSetLayoutBindingList::VulkanDescriptorSetLayoutBindingList(
+    size_t numDescriptorSetLayoutBindings, VulkanDescriptorType descriptorType,
+    uint32_t descriptorCount, VulkanShaderStage shaderStage)
+{
+    for (size_t bindingIdx = 0; bindingIdx < numDescriptorSetLayoutBindings;
+         ++bindingIdx)
+    {
+        add(*new VulkanDescriptorSetLayoutBinding(
+            static_cast<uint32_t>(bindingIdx), descriptorType, descriptorCount,
+            shaderStage));
+    }
+}
+
+// Creates descriptorCount0 single-descriptor bindings of descriptorType0
+// followed by descriptorCount1 single-descriptor bindings of
+// descriptorType1; binding numbers run consecutively across both groups.
+VulkanDescriptorSetLayoutBindingList::VulkanDescriptorSetLayoutBindingList(
+    VulkanDescriptorType descriptorType0, uint32_t descriptorCount0,
+    VulkanDescriptorType descriptorType1, uint32_t descriptorCount1,
+    VulkanShaderStage shaderStage)
+{
+    uint32_t binding = 0;
+    for (uint32_t n = 0; n < descriptorCount0; ++n, ++binding)
+    {
+        add(*new VulkanDescriptorSetLayoutBinding(binding, descriptorType0, 1,
+                                                  shaderStage));
+    }
+    // binding == descriptorCount0 here, so the second group continues the
+    // numbering where the first one stopped.
+    for (uint32_t n = 0; n < descriptorCount1; ++n, ++binding)
+    {
+        add(*new VulkanDescriptorSetLayoutBinding(binding, descriptorType1, 1,
+                                                  shaderStage));
+    }
+}
+
+// Releases every binding allocated by the constructors.
+VulkanDescriptorSetLayoutBindingList::~VulkanDescriptorSetLayoutBindingList()
+{
+    for (VulkanDescriptorSetLayoutBinding &binding : m_wrapperList)
+    {
+        delete &binding;
+    }
+}
+
+//////////////////////////////////////////////////
+// VulkanDescriptorSetLayoutList implementation //
+//////////////////////////////////////////////////
+
+// Protected copy constructor: intentionally copies nothing.
+VulkanDescriptorSetLayoutList::VulkanDescriptorSetLayoutList(
+    const VulkanDescriptorSetLayoutList &descriptorSetLayoutList)
+{}
+
+VulkanDescriptorSetLayoutList::VulkanDescriptorSetLayoutList() {}
+
+VulkanDescriptorSetLayoutList::~VulkanDescriptorSetLayoutList() {}
+
+////////////////////////////////////////////
+// VulkanCommandBufferList implementation //
+////////////////////////////////////////////
+
+// Protected copy constructor: intentionally copies nothing.
+VulkanCommandBufferList::VulkanCommandBufferList(
+    const VulkanCommandBufferList &commandBufferList)
+{}
+
+VulkanCommandBufferList::VulkanCommandBufferList() {}
+
+// Allocates numCommandBuffers command buffers from commandPool on device.
+// The buffers are owned by this list and deleted in the destructor.
+VulkanCommandBufferList::VulkanCommandBufferList(
+    size_t numCommandBuffers, const VulkanDevice &device,
+    const VulkanCommandPool &commandPool)
+{
+    for (size_t remaining = numCommandBuffers; remaining != 0; --remaining)
+    {
+        add(*new VulkanCommandBuffer(device, commandPool));
+    }
+}
+
+// Releases every command buffer allocated by the constructor.
+VulkanCommandBufferList::~VulkanCommandBufferList()
+{
+    for (VulkanCommandBuffer &commandBuffer : m_wrapperList)
+    {
+        delete &commandBuffer;
+    }
+}
+
+/////////////////////////////////////
+// VulkanBufferList implementation //
+/////////////////////////////////////
+
+// Protected copy constructor: intentionally copies nothing.
+VulkanBufferList::VulkanBufferList(const VulkanBufferList &bufferList) {}
+
+// Allocates numBuffers identical buffers on device; every buffer is created
+// with the same size, handle type, usage, sharing mode and queue families.
+VulkanBufferList::VulkanBufferList(
+    size_t numBuffers, const VulkanDevice &device, uint64_t size,
+    VulkanExternalMemoryHandleType externalMemoryHandleType,
+    VulkanBufferUsage bufferUsage, VulkanSharingMode sharingMode,
+    const VulkanQueueFamilyList &queueFamilyList)
+{
+    for (size_t remaining = numBuffers; remaining != 0; --remaining)
+    {
+        add(*new VulkanBuffer(device, size, externalMemoryHandleType,
+                              bufferUsage, sharingMode, queueFamilyList));
+    }
+}
+
+// Releases every buffer allocated by the constructor.
+VulkanBufferList::~VulkanBufferList()
+{
+    for (VulkanBuffer &buffer : m_wrapperList)
+    {
+        delete &buffer;
+    }
+}
+
+//////////////////////////////////////
+// VulkanImage2DList implementation //
+//////////////////////////////////////
+
+// Protected copy constructor: intentionally copies nothing.
+VulkanImage2DList::VulkanImage2DList(const VulkanImage2DList &image2DList) {}
+
+// Creates numImages identical 2D images and binds image i to
+// deviceMemory[i] at offset baseOffset + i * interImageOffset.
+// deviceMemory is indexed without a bounds check, so it must hold at least
+// numImages entries.
+VulkanImage2DList::VulkanImage2DList(
+    size_t numImages, std::vector<VulkanDeviceMemory *> &deviceMemory,
+    uint64_t baseOffset, uint64_t interImageOffset, const VulkanDevice &device,
+    VulkanFormat format, uint32_t width, uint32_t height, uint32_t mipLevels,
+    VulkanExternalMemoryHandleType externalMemoryHandleType,
+    VulkanImageCreateFlag imageCreateFlag, VulkanImageUsage imageUsage,
+    VulkanSharingMode sharingMode)
+{
+    for (size_t imageIdx = 0; imageIdx < numImages; ++imageIdx)
+    {
+        VulkanImage2D *const image = new VulkanImage2D(
+            device, format, width, height, mipLevels, externalMemoryHandleType,
+            imageCreateFlag, imageUsage, sharingMode);
+        // Register the image first so the list owns it, then bind memory.
+        add(*image);
+
+        const uint64_t bindOffset = baseOffset + (imageIdx * interImageOffset);
+        deviceMemory[imageIdx]->bindImage(*image, bindOffset);
+    }
+}
+
+// Creates numImages identical 2D images without binding any memory to them.
+VulkanImage2DList::VulkanImage2DList(
+    size_t numImages, const VulkanDevice &device, VulkanFormat format,
+    uint32_t width, uint32_t height, uint32_t mipLevels,
+    VulkanExternalMemoryHandleType externalMemoryHandleType,
+    VulkanImageCreateFlag imageCreateFlag, VulkanImageUsage imageUsage,
+    VulkanSharingMode sharingMode)
+{
+    for (size_t remaining = numImages; remaining != 0; --remaining)
+    {
+        add(*new VulkanImage2D(device, format, width, height, mipLevels,
+                               externalMemoryHandleType, imageCreateFlag,
+                               imageUsage, sharingMode));
+    }
+}
+
+// Releases every image allocated by the constructors.
+VulkanImage2DList::~VulkanImage2DList()
+{
+    for (VulkanImage2D &image : m_wrapperList)
+    {
+        delete &image;
+    }
+}
+
+////////////////////////////////////////
+// VulkanImageViewList implementation //
+////////////////////////////////////////
+
+// Protected copy constructor: intentionally copies nothing.
+VulkanImageViewList::VulkanImageViewList(const VulkanImageViewList &image2DList)
+{}
+
+// Creates 2D image views for every image in image2DList. With
+// createImageViewPerMipLevel set, one single-level view is created per mip
+// level of each image; otherwise one view is created per image.
+VulkanImageViewList::VulkanImageViewList(const VulkanDevice &device,
+                                         const VulkanImage2DList &image2DList,
+                                         bool createImageViewPerMipLevel)
+{
+    for (size_t imageIdx = 0; imageIdx < image2DList.size(); ++imageIdx)
+    {
+        const VulkanImage2D &image = image2DList[imageIdx];
+
+        if (!createImageViewPerMipLevel)
+        {
+            add(*new VulkanImageView(device, image,
+                                     VULKAN_IMAGE_VIEW_TYPE_2D));
+            continue;
+        }
+
+        for (uint32_t level = 0; level < image.getNumMipLevels(); ++level)
+        {
+            add(*new VulkanImageView(device, image, VULKAN_IMAGE_VIEW_TYPE_2D,
+                                     level, 1));
+        }
+    }
+}
+
+// Releases every image view allocated by the constructor.
+VulkanImageViewList::~VulkanImageViewList()
+{
+    for (VulkanImageView &view : m_wrapperList)
+    {
+        delete &view;
+    }
+}
+
+///////////////////////////////////////////
+// VulkanDeviceMemoryList implementation //
+///////////////////////////////////////////
+
+// Protected copy constructor: intentionally copies nothing.
+VulkanDeviceMemoryList::VulkanDeviceMemoryList(
+    const VulkanDeviceMemoryList &deviceMemoryList)
+{}
+
+// Allocates one device memory object per image in image2DList and binds the
+// image to it. Note that numImages is not consulted: the number of
+// allocations is always image2DList.size().
+VulkanDeviceMemoryList::VulkanDeviceMemoryList(
+    size_t numImages, const VulkanImage2DList &image2DList,
+    const VulkanDevice &device, const VulkanMemoryType &memoryType,
+    VulkanExternalMemoryHandleType externalMemoryHandleType)
+{
+    for (size_t imageIdx = 0; imageIdx < image2DList.size(); ++imageIdx)
+    {
+        const VulkanImage2D &image = image2DList[imageIdx];
+        VulkanDeviceMemory *const memory = new VulkanDeviceMemory(
+            device, image, memoryType, externalMemoryHandleType);
+        // Register the allocation first so the list owns it, then bind.
+        add(*memory);
+        memory->bindImage(image);
+    }
+}
+
+// Releases every device memory object allocated by the constructor.
+VulkanDeviceMemoryList::~VulkanDeviceMemoryList()
+{
+    for (VulkanDeviceMemory &memory : m_wrapperList)
+    {
+        delete &memory;
+    }
+}
+
+//////////////////////////////////////// +// VulkanSemaphoreList implementation // +//////////////////////////////////////// + +VulkanSemaphoreList::VulkanSemaphoreList( + const VulkanSemaphoreList &semaphoreList) +{} + +VulkanSemaphoreList::VulkanSemaphoreList() {} + +VulkanSemaphoreList::VulkanSemaphoreList( + size_t numSemaphores, const VulkanDevice &device, + VulkanExternalSemaphoreHandleType externalSemaphoreHandleType, + const std::wstring namePrefix) +{ + std::wstring name = L""; + for (size_t idx = 0; idx < numSemaphores; idx++) + { + if (namePrefix.size()) + { + const size_t maxNameSize = 256; + wchar_t tempName[maxNameSize]; + swprintf(tempName, maxNameSize, L"%s%d", namePrefix.c_str(), + (int)idx); + name = tempName; + } + VulkanSemaphore *semaphore = + new VulkanSemaphore(device, externalSemaphoreHandleType, name); + add(*semaphore); + } +} + +VulkanSemaphoreList::~VulkanSemaphoreList() +{ + for (size_t idx = 0; idx < m_wrapperList.size(); idx++) + { + VulkanSemaphore &Semaphore = m_wrapperList[idx]; + delete &Semaphore; + } +} diff --git a/test_conformance/vulkan/vulkan_interop_common/vulkan_list_map.hpp b/test_conformance/vulkan/vulkan_interop_common/vulkan_list_map.hpp new file mode 100644 index 00000000..52206779 --- /dev/null +++ b/test_conformance/vulkan/vulkan_interop_common/vulkan_list_map.hpp @@ -0,0 +1,386 @@ +// +// Copyright (c) 2022 The Khronos Group Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+// + +#ifndef _vulkan_list_map_hpp_ +#define _vulkan_list_map_hpp_ + +#include <functional> +#include "vulkan_wrapper_types.hpp" +#include "vulkan_utility.hpp" +#include <iostream> +template <class VulkanWrapper, class VulkanNative> class VulkanList { +protected: + std::vector<std::reference_wrapper<VulkanWrapper>> m_wrapperList; + std::vector<std::reference_wrapper<const VulkanWrapper>> m_constWrapperList; + std::vector<VulkanNative> m_nativeList; + + VulkanList(const VulkanList &list); + VulkanList(); + virtual ~VulkanList(); + virtual void add(VulkanWrapper &wrapper); + +public: + virtual void add(const VulkanWrapper &wrapper); + virtual size_t size() const; + virtual const VulkanWrapper &operator[](size_t idx) const; + virtual VulkanWrapper &operator[](size_t idx); + virtual const VulkanNative *operator()() const; +}; + +template <class VulkanKey, class VulkanValue> class VulkanMap { +protected: + std::map<VulkanKey, VulkanValue> m_map; + + VulkanMap(const VulkanMap &map); + VulkanMap(); + virtual ~VulkanMap(); + +public: + void insert(const VulkanKey &key, VulkanValue &value); + const VulkanValue &operator[](const VulkanKey &key) const; + VulkanValue &operator[](const VulkanKey &key); +}; + +class VulkanPhysicalDeviceList + : public VulkanList<VulkanPhysicalDevice, VkPhysicalDevice> { + friend class VulkanInstance; + +protected: + VulkanPhysicalDeviceList( + const VulkanPhysicalDeviceList &physicalDeviceList); + +public: + VulkanPhysicalDeviceList(); + virtual ~VulkanPhysicalDeviceList(); +}; + +class VulkanQueueFamilyList : public VulkanList<VulkanQueueFamily, uint32_t> { + friend class VulkanPhysicalDevice; + +protected: + VulkanQueueFamilyList(const VulkanQueueFamilyList &queueFamilyList); + +public: + VulkanQueueFamilyList(); + virtual ~VulkanQueueFamilyList(); +}; + +class VulkanMemoryHeapList : public VulkanList<VulkanMemoryHeap, uint32_t> { + friend class VulkanPhysicalDevice; + +protected: + VulkanMemoryHeapList(const VulkanMemoryHeapList 
&memoryHeapList); + +public: + VulkanMemoryHeapList(); + virtual ~VulkanMemoryHeapList(); +}; + +class VulkanMemoryTypeList : public VulkanList<VulkanMemoryType, uint32_t> { + friend class VulkanPhysicalDevice; + friend class VulkanBuffer; + friend class VulkanImage; + +protected: + VulkanMemoryTypeList(const VulkanMemoryTypeList &memoryTypeList); + +public: + VulkanMemoryTypeList(); + virtual ~VulkanMemoryTypeList(); +}; + +class VulkanQueueFamilyToQueueCountMap : public VulkanMap<uint32_t, uint32_t> { +protected: + VulkanQueueFamilyToQueueCountMap( + const VulkanQueueFamilyToQueueCountMap &queueFamilyToQueueCountMap); + +public: + VulkanQueueFamilyToQueueCountMap(uint32_t numQueuesPerFamily = 0); + virtual ~VulkanQueueFamilyToQueueCountMap(); +}; + +class VulkanQueueList : public VulkanList<VulkanQueue, VkQueue> { + friend class VulkanDevice; + +protected: + VulkanQueueList(const VulkanQueueList &queueList); + +public: + VulkanQueueList(); + virtual ~VulkanQueueList(); +}; + +class VulkanQueueFamilyToQueueListMap + : public VulkanMap<uint32_t, std::reference_wrapper<VulkanQueueList>> { +protected: + VulkanQueueFamilyToQueueListMap( + const VulkanQueueFamilyToQueueListMap &queueFamilyToQueueMap); + +public: + VulkanQueueFamilyToQueueListMap(); + virtual ~VulkanQueueFamilyToQueueListMap(); + void insert(uint32_t key, VulkanQueueList &queueList); + VulkanQueueList &operator[](uint32_t key); +}; + +class VulkanDescriptorSetLayoutBindingList + : public VulkanList<VulkanDescriptorSetLayoutBinding, + VkDescriptorSetLayoutBinding> { +protected: + VulkanDescriptorSetLayoutBindingList( + const VulkanDescriptorSetLayoutBindingList + &descriptorSetLayoutBindingList); + +public: + VulkanDescriptorSetLayoutBindingList(); + VulkanDescriptorSetLayoutBindingList( + size_t numDescriptorSetLayoutBindings, + VulkanDescriptorType descriptorType, uint32_t descriptorCount = 1, + VulkanShaderStage shaderStage = VULKAN_SHADER_STAGE_COMPUTE); + VulkanDescriptorSetLayoutBindingList( + 
VulkanDescriptorType descriptorType0, uint32_t descriptorCount0, + VulkanDescriptorType descriptorType1, uint32_t descriptorCount1, + VulkanShaderStage shaderStage = VULKAN_SHADER_STAGE_COMPUTE); + virtual ~VulkanDescriptorSetLayoutBindingList(); +}; + +class VulkanDescriptorSetLayoutList + : public VulkanList<VulkanDescriptorSetLayout, VkDescriptorSetLayout> { +protected: + VulkanDescriptorSetLayoutList( + const VulkanDescriptorSetLayoutList &descriptorSetLayoutList); + +public: + VulkanDescriptorSetLayoutList(); + virtual ~VulkanDescriptorSetLayoutList(); +}; + +class VulkanCommandBufferList + : public VulkanList<VulkanCommandBuffer, VkCommandBuffer> { +protected: + VulkanCommandBufferList(const VulkanCommandBufferList &commandBufferList); + +public: + VulkanCommandBufferList(); + VulkanCommandBufferList(size_t numCommandBuffers, + const VulkanDevice &device, + const VulkanCommandPool &commandPool); + virtual ~VulkanCommandBufferList(); +}; + +class VulkanBufferList : public VulkanList<VulkanBuffer, VkBuffer> { +protected: + VulkanBufferList(const VulkanBufferList &bufferList); + +public: + VulkanBufferList( + size_t numBuffers, const VulkanDevice &device, uint64_t size, + VulkanExternalMemoryHandleType externalMemoryHandleType = + VULKAN_EXTERNAL_MEMORY_HANDLE_TYPE_NONE, + VulkanBufferUsage bufferUsage = + VULKAN_BUFFER_USAGE_STORAGE_BUFFER_TRANSFER_SRC_DST, + VulkanSharingMode sharingMode = VULKAN_SHARING_MODE_EXCLUSIVE, + const VulkanQueueFamilyList &queueFamilyList = + getEmptyVulkanQueueFamilyList()); + virtual ~VulkanBufferList(); +}; + +class VulkanImage2DList : public VulkanList<VulkanImage2D, VkImage> { +protected: + VulkanImage2DList(const VulkanImage2DList &image2DList); + +public: + VulkanImage2DList( + size_t numImages, std::vector<VulkanDeviceMemory *> &deviceMemory, + uint64_t baseOffset, uint64_t interImageOffset, + const VulkanDevice &device, VulkanFormat format, uint32_t width, + uint32_t height, uint32_t mipLevels, + 
VulkanExternalMemoryHandleType externalMemoryHandleType = + VULKAN_EXTERNAL_MEMORY_HANDLE_TYPE_NONE, + VulkanImageCreateFlag imageCreateFlag = VULKAN_IMAGE_CREATE_FLAG_NONE, + VulkanImageUsage imageUsage = + VULKAN_IMAGE_USAGE_SAMPLED_STORAGE_TRANSFER_SRC_DST, + VulkanSharingMode sharingMode = VULKAN_SHARING_MODE_EXCLUSIVE); + VulkanImage2DList( + size_t numImages, const VulkanDevice &device, VulkanFormat format, + uint32_t width, uint32_t height, uint32_t mipLevels = 1, + VulkanExternalMemoryHandleType externalMemoryHandleType = + VULKAN_EXTERNAL_MEMORY_HANDLE_TYPE_NONE, + VulkanImageCreateFlag imageCreateFlag = VULKAN_IMAGE_CREATE_FLAG_NONE, + VulkanImageUsage imageUsage = + VULKAN_IMAGE_USAGE_SAMPLED_STORAGE_TRANSFER_SRC_DST, + VulkanSharingMode sharingMode = VULKAN_SHARING_MODE_EXCLUSIVE); + virtual ~VulkanImage2DList(); +}; + +class VulkanImageViewList : public VulkanList<VulkanImageView, VkImageView> { +protected: + VulkanImageViewList(const VulkanImageViewList &imageViewList); + +public: + VulkanImageViewList(const VulkanDevice &device, + const VulkanImage2DList &image2DList, + bool createImageViewPerMipLevel = true); + virtual ~VulkanImageViewList(); +}; + +class VulkanDeviceMemoryList + : public VulkanList<VulkanDeviceMemory, VkDeviceMemory> { +protected: + VulkanDeviceMemoryList(const VulkanDeviceMemoryList &deviceMemoryList); + +public: + VulkanDeviceMemoryList( + size_t numImages, const VulkanImage2DList &image2DList, + const VulkanDevice &device, const VulkanMemoryType &memoryType, + VulkanExternalMemoryHandleType externalMemoryHandleType = + VULKAN_EXTERNAL_MEMORY_HANDLE_TYPE_NONE); + virtual ~VulkanDeviceMemoryList(); +}; + +class VulkanSemaphoreList : public VulkanList<VulkanSemaphore, VkSemaphore> { +protected: + VulkanSemaphoreList(const VulkanSemaphoreList &semaphoreList); + +public: + VulkanSemaphoreList(); + VulkanSemaphoreList( + size_t numSemaphores, const VulkanDevice &device, + VulkanExternalSemaphoreHandleType externalSemaphoreHandleType = 
+ VULKAN_EXTERNAL_SEMAPHORE_HANDLE_TYPE_NONE, + const std::wstring namePrefix = L""); + virtual ~VulkanSemaphoreList(); +}; + +/////////////////////////////// +// VulkanList implementation // +/////////////////////////////// + +template <class VulkanWrapper, class VulkanNative> +VulkanList<VulkanWrapper, VulkanNative>::VulkanList(const VulkanList &list) + : m_wrapperList(list.m_wrapperList), + m_constWrapperList(list.m_constWrapperList), + m_nativeList(list.m_nativeList) +{} + +template <class VulkanWrapper, class VulkanNative> +VulkanList<VulkanWrapper, VulkanNative>::VulkanList() +{} + +template <class VulkanWrapper, class VulkanNative> +VulkanList<VulkanWrapper, VulkanNative>::~VulkanList() +{} + +template <class VulkanWrapper, class VulkanNative> +void VulkanList<VulkanWrapper, VulkanNative>::add(VulkanWrapper &wrapper) +{ + + if (m_constWrapperList.size() != size_t(0)) + { + std::cout << "This list can only contain externally allocated objects" + << std::endl; + return; + } + m_wrapperList.push_back(std::reference_wrapper<VulkanWrapper>(wrapper)); + m_nativeList.push_back((VulkanNative)wrapper); +} + +template <class VulkanWrapper, class VulkanNative> +void VulkanList<VulkanWrapper, VulkanNative>::add(const VulkanWrapper &wrapper) +{ + if (m_wrapperList.size() != size_t(0)) + { + std::cout << "This list cannot contain externally allocated objects" + << std::endl; + return; + } + + m_constWrapperList.push_back( + std::reference_wrapper<const VulkanWrapper>(wrapper)); + m_nativeList.push_back((VulkanNative)wrapper); +} + +template <class VulkanWrapper, class VulkanNative> +size_t VulkanList<VulkanWrapper, VulkanNative>::size() const +{ + return (m_wrapperList.size() > 0) ? 
m_wrapperList.size() + : m_constWrapperList.size(); +} + +template <class VulkanWrapper, class VulkanNative> +const VulkanWrapper & + VulkanList<VulkanWrapper, VulkanNative>::operator[](size_t idx) const +{ + if (idx < size()) + { + // CHECK_LT(idx, size()); + return (m_wrapperList.size() > 0) ? m_wrapperList[idx].get() + : m_constWrapperList[idx].get(); + } +} + +template <class VulkanWrapper, class VulkanNative> +VulkanWrapper &VulkanList<VulkanWrapper, VulkanNative>::operator[](size_t idx) +{ + // CHECK_LT(idx, m_wrapperList.size()); + return m_wrapperList[idx].get(); +} + +template <class VulkanWrapper, class VulkanNative> +const VulkanNative *VulkanList<VulkanWrapper, VulkanNative>::operator()() const +{ + return m_nativeList.data(); +} + +////////////////////////////// +// VulkanMap implementation // +////////////////////////////// + +template <class VulkanKey, class VulkanValue> +VulkanMap<VulkanKey, VulkanValue>::VulkanMap(const VulkanMap &map) + : m_map(map.m_map) +{} + +template <class VulkanKey, class VulkanValue> +VulkanMap<VulkanKey, VulkanValue>::VulkanMap() +{} + +template <class VulkanKey, class VulkanValue> +VulkanMap<VulkanKey, VulkanValue>::~VulkanMap() +{} + +template <class VulkanKey, class VulkanValue> +void VulkanMap<VulkanKey, VulkanValue>::insert(const VulkanKey &key, + VulkanValue &value) +{ + m_map.insert(std::pair<VulkanKey, std::reference_wrapper<VulkanValue>>( + key, std::reference_wrapper<VulkanValue>(value))); +} + +template <class VulkanKey, class VulkanValue> +const VulkanValue & + VulkanMap<VulkanKey, VulkanValue>::operator[](const VulkanKey &key) const +{ + return m_map.at(key); +} + +template <class VulkanKey, class VulkanValue> +VulkanValue &VulkanMap<VulkanKey, VulkanValue>::operator[](const VulkanKey &key) +{ + return m_map.at(key); +} + +#endif // _vulkan_list_map_hpp_ diff --git a/test_conformance/vulkan/vulkan_interop_common/vulkan_utility.cpp b/test_conformance/vulkan/vulkan_interop_common/vulkan_utility.cpp new file mode 
100644 index 00000000..1a313cce --- /dev/null +++ b/test_conformance/vulkan/vulkan_interop_common/vulkan_utility.cpp @@ -0,0 +1,692 @@ +// +// Copyright (c) 2022 The Khronos Group Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// + +#include "vulkan_utility.hpp" +#include "vulkan_wrapper.hpp" +#include <assert.h> +#include <iostream> +#include <fstream> +#include <set> +#include <string> +#include <CL/cl.h> +#include <CL/cl_ext.h> +#if defined(_WIN32) || defined(_WIN64) +#include <versionhelpers.h> +#endif +#define ASSERT(x) assert((x)) +#define BUFFERSIZE 3000 + + +const VulkanInstance &getVulkanInstance() +{ + static VulkanInstance instance; + return instance; +} + +const VulkanPhysicalDevice &getVulkanPhysicalDevice() +{ + size_t pdIdx; + cl_int errNum = 0; + cl_platform_id platform = NULL; + cl_uchar uuid[CL_UUID_SIZE_KHR]; + cl_device_id *devices; + char *extensions = NULL; + size_t extensionSize = 0; + cl_uint num_devices = 0; + cl_uint device_no = 0; + const size_t bufsize = BUFFERSIZE; + char buf[BUFFERSIZE]; + const VulkanInstance &instance = getVulkanInstance(); + const VulkanPhysicalDeviceList &physicalDeviceList = + instance.getPhysicalDeviceList(); + + // get the platform ID + errNum = clGetPlatformIDs(1, &platform, NULL); + if (errNum != CL_SUCCESS) + { + printf("Error: Failed to get platform\n"); + throw std::runtime_error("Error: Failed to get number of platform\n"); + } + + errNum = + clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 0, 
NULL, &num_devices); + if (CL_SUCCESS != errNum) + { + throw std::runtime_error( + "Error: clGetDeviceIDs failed in returning of devices\n"); + } + devices = (cl_device_id *)malloc(num_devices * sizeof(cl_device_id)); + if (NULL == devices) + { + throw std::runtime_error( + "Error: Unable to allocate memory for devices\n"); + } + errNum = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, num_devices, devices, + NULL); + if (CL_SUCCESS != errNum) + { + throw std::runtime_error("Error: Failed to get deviceID.\n"); + } + bool is_selected = false; + for (device_no = 0; device_no < num_devices; device_no++) + { + errNum = clGetDeviceInfo(devices[device_no], CL_DEVICE_EXTENSIONS, 0, + NULL, &extensionSize); + if (CL_SUCCESS != errNum) + { + throw std::runtime_error("Error in clGetDeviceInfo for getting " + "device_extension size....\n"); + } + extensions = (char *)malloc(extensionSize); + if (NULL == extensions) + { + throw std::runtime_error( + "Unable to allocate memory for extensions\n"); + } + errNum = clGetDeviceInfo(devices[device_no], CL_DEVICE_EXTENSIONS, + extensionSize, extensions, NULL); + if (CL_SUCCESS != errNum) + { + throw std::runtime_error("Error: Error in clGetDeviceInfo for " + "getting device_extension\n"); + } + errNum = clGetDeviceInfo(devices[device_no], CL_DEVICE_UUID_KHR, + CL_UUID_SIZE_KHR, uuid, &extensionSize); + if (CL_SUCCESS != errNum) + { + throw std::runtime_error( + "Error: clGetDeviceInfo failed with error\n"); + } + free(extensions); + for (pdIdx = 0; pdIdx < physicalDeviceList.size(); pdIdx++) + { + if (!memcmp(&uuid, physicalDeviceList[pdIdx].getUUID(), + VK_UUID_SIZE)) + { + std::cout << "Selected physical device = " + << physicalDeviceList[pdIdx] << std::endl; + is_selected = true; + break; + } + } + if (is_selected) + { + break; + } + } + + if ((pdIdx >= physicalDeviceList.size()) + || (physicalDeviceList[pdIdx] == (VkPhysicalDevice)VK_NULL_HANDLE)) + { + throw std::runtime_error("failed to find a suitable GPU!"); + } + std::cout << 
"Selected physical device is: " << physicalDeviceList[pdIdx] + << std::endl; + return physicalDeviceList[pdIdx]; +} + +const VulkanQueueFamily &getVulkanQueueFamily(uint32_t queueFlags) +{ + size_t qfIdx; + const VulkanPhysicalDevice &physicalDevice = getVulkanPhysicalDevice(); + const VulkanQueueFamilyList &queueFamilyList = + physicalDevice.getQueueFamilyList(); + + for (qfIdx = 0; qfIdx < queueFamilyList.size(); qfIdx++) + { + if ((queueFamilyList[qfIdx].getQueueFlags() & queueFlags) == queueFlags) + { + break; + } + } + + return queueFamilyList[qfIdx]; +} + +const VulkanMemoryType & +getVulkanMemoryType(const VulkanDevice &device, + VulkanMemoryTypeProperty memoryTypeProperty) +{ + size_t mtIdx; + const VulkanMemoryTypeList &memoryTypeList = + device.getPhysicalDevice().getMemoryTypeList(); + + for (mtIdx = 0; mtIdx < memoryTypeList.size(); mtIdx++) + { + if ((memoryTypeList[mtIdx].getMemoryTypeProperty() & memoryTypeProperty) + == memoryTypeProperty) + { + break; + } + } + + // CHECK_LT(mtIdx, memoryTypeList.size()); + return memoryTypeList[mtIdx]; +} + +bool checkVkSupport() +{ + bool result = true; + const VulkanInstance &instance = getVulkanInstance(); + const VulkanPhysicalDeviceList &physicalDeviceList = + instance.getPhysicalDeviceList(); + if (physicalDeviceList() == NULL) + { + std::cout << "physicalDeviceList is null, No GPUs found with " + "Vulkan support !!!\n"; + result = false; + } + return result; +} + +const VulkanQueueFamilyList &getEmptyVulkanQueueFamilyList() +{ + static VulkanQueueFamilyList queueFamilyList; + return queueFamilyList; +} + +const VulkanDescriptorSetLayoutList &getEmptyVulkanDescriptorSetLayoutList() +{ + static VulkanDescriptorSetLayoutList descriptorSetLayoutList; + + return descriptorSetLayoutList; +} + +const VulkanQueueFamilyToQueueCountMap & +getDefaultVulkanQueueFamilyToQueueCountMap() +{ + static VulkanQueueFamilyToQueueCountMap queueFamilyToQueueCountMap(1); + + return queueFamilyToQueueCountMap; +} + +const 
std::vector<VulkanExternalMemoryHandleType> +getSupportedVulkanExternalMemoryHandleTypeList() +{ + std::vector<VulkanExternalMemoryHandleType> externalMemoryHandleTypeList; + +#if _WIN32 + if (IsWindows8OrGreater()) + { + externalMemoryHandleTypeList.push_back( + VULKAN_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_WIN32_NT); + } + externalMemoryHandleTypeList.push_back( + VULKAN_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_WIN32_KMT); +#else + externalMemoryHandleTypeList.push_back( + VULKAN_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_FD); +#endif + + return externalMemoryHandleTypeList; +} + +const std::vector<VulkanExternalSemaphoreHandleType> +getSupportedVulkanExternalSemaphoreHandleTypeList() +{ + std::vector<VulkanExternalSemaphoreHandleType> + externalSemaphoreHandleTypeList; + +#if _WIN32 + if (IsWindows8OrGreater()) + { + externalSemaphoreHandleTypeList.push_back( + VULKAN_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32_NT); + } + externalSemaphoreHandleTypeList.push_back( + VULKAN_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32_KMT); +#else + externalSemaphoreHandleTypeList.push_back( + VULKAN_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_FD); +#endif + + return externalSemaphoreHandleTypeList; +} + +const std::vector<VulkanFormat> getSupportedVulkanFormatList() +{ + std::vector<VulkanFormat> formatList; + + formatList.push_back(VULKAN_FORMAT_R8_UINT); + formatList.push_back(VULKAN_FORMAT_R8_SINT); + formatList.push_back(VULKAN_FORMAT_R8G8_UINT); + formatList.push_back(VULKAN_FORMAT_R8G8_SINT); + formatList.push_back(VULKAN_FORMAT_R8G8B8A8_UINT); + formatList.push_back(VULKAN_FORMAT_R8G8B8A8_SINT); + formatList.push_back(VULKAN_FORMAT_R16_UINT); + formatList.push_back(VULKAN_FORMAT_R16_SINT); + formatList.push_back(VULKAN_FORMAT_R16G16_UINT); + formatList.push_back(VULKAN_FORMAT_R16G16_SINT); + formatList.push_back(VULKAN_FORMAT_R16G16B16A16_UINT); + formatList.push_back(VULKAN_FORMAT_R16G16B16A16_SINT); + formatList.push_back(VULKAN_FORMAT_R32_UINT); + formatList.push_back(VULKAN_FORMAT_R32_SINT); 
+ formatList.push_back(VULKAN_FORMAT_R32_SFLOAT); + formatList.push_back(VULKAN_FORMAT_R32G32_UINT); + formatList.push_back(VULKAN_FORMAT_R32G32_SINT); + formatList.push_back(VULKAN_FORMAT_R32G32_SFLOAT); + formatList.push_back(VULKAN_FORMAT_R32G32B32A32_UINT); + formatList.push_back(VULKAN_FORMAT_R32G32B32A32_SINT); + formatList.push_back(VULKAN_FORMAT_R32G32B32A32_SFLOAT); + + for (size_t fIdx = 0; fIdx < formatList.size(); fIdx++) + { + switch (formatList[fIdx]) + { + case VULKAN_FORMAT_R8_UINT: + case VULKAN_FORMAT_R8_SINT: + case VULKAN_FORMAT_R8G8_UINT: + case VULKAN_FORMAT_R8G8_SINT: + case VULKAN_FORMAT_R8G8B8A8_UINT: + case VULKAN_FORMAT_R8G8B8A8_SINT: + case VULKAN_FORMAT_R16_UINT: + case VULKAN_FORMAT_R16_SINT: + case VULKAN_FORMAT_R16G16_UINT: + case VULKAN_FORMAT_R16G16_SINT: + case VULKAN_FORMAT_R16G16B16A16_UINT: + case VULKAN_FORMAT_R16G16B16A16_SINT: + case VULKAN_FORMAT_R32_UINT: + case VULKAN_FORMAT_R32_SINT: + case VULKAN_FORMAT_R32_SFLOAT: + case VULKAN_FORMAT_R32G32_UINT: + case VULKAN_FORMAT_R32G32_SINT: + case VULKAN_FORMAT_R32G32_SFLOAT: + case VULKAN_FORMAT_R32G32B32A32_UINT: + case VULKAN_FORMAT_R32G32B32A32_SINT: + case VULKAN_FORMAT_R32G32B32A32_SFLOAT: break; + + case VULKAN_FORMAT_UNDEFINED: + case VULKAN_FORMAT_R4G4_UNORM_PACK8: + case VULKAN_FORMAT_R4G4B4A4_UNORM_PACK16: + case VULKAN_FORMAT_B4G4R4A4_UNORM_PACK16: + case VULKAN_FORMAT_R5G6B5_UNORM_PACK16: + case VULKAN_FORMAT_B5G6R5_UNORM_PACK16: + case VULKAN_FORMAT_R5G5B5A1_UNORM_PACK16: + case VULKAN_FORMAT_B5G5R5A1_UNORM_PACK16: + case VULKAN_FORMAT_A1R5G5B5_UNORM_PACK16: + case VULKAN_FORMAT_R8_UNORM: + case VULKAN_FORMAT_R8_SNORM: + case VULKAN_FORMAT_R8_USCALED: + case VULKAN_FORMAT_R8_SSCALED: + case VULKAN_FORMAT_R8_SRGB: + case VULKAN_FORMAT_R8G8_SNORM: + case VULKAN_FORMAT_R8G8_UNORM: + case VULKAN_FORMAT_R8G8_USCALED: + case VULKAN_FORMAT_R8G8_SSCALED: + case VULKAN_FORMAT_R8G8_SRGB: + case VULKAN_FORMAT_R8G8B8_UNORM: + case VULKAN_FORMAT_R8G8B8_SNORM: + case 
VULKAN_FORMAT_R8G8B8_USCALED: + case VULKAN_FORMAT_R8G8B8_SSCALED: + case VULKAN_FORMAT_R8G8B8_UINT: + case VULKAN_FORMAT_R8G8B8_SINT: + case VULKAN_FORMAT_R8G8B8_SRGB: + case VULKAN_FORMAT_B8G8R8_UNORM: + case VULKAN_FORMAT_B8G8R8_SNORM: + case VULKAN_FORMAT_B8G8R8_USCALED: + case VULKAN_FORMAT_B8G8R8_SSCALED: + case VULKAN_FORMAT_B8G8R8_UINT: + case VULKAN_FORMAT_B8G8R8_SINT: + case VULKAN_FORMAT_B8G8R8_SRGB: + case VULKAN_FORMAT_R8G8B8A8_UNORM: + case VULKAN_FORMAT_R8G8B8A8_SNORM: + case VULKAN_FORMAT_R8G8B8A8_USCALED: + case VULKAN_FORMAT_R8G8B8A8_SSCALED: + case VULKAN_FORMAT_R8G8B8A8_SRGB: + case VULKAN_FORMAT_B8G8R8A8_UNORM: + case VULKAN_FORMAT_B8G8R8A8_SNORM: + case VULKAN_FORMAT_B8G8R8A8_USCALED: + case VULKAN_FORMAT_B8G8R8A8_SSCALED: + case VULKAN_FORMAT_B8G8R8A8_UINT: + case VULKAN_FORMAT_B8G8R8A8_SINT: + case VULKAN_FORMAT_B8G8R8A8_SRGB: + case VULKAN_FORMAT_A8B8G8R8_UNORM_PACK32: + case VULKAN_FORMAT_A8B8G8R8_SNORM_PACK32: + case VULKAN_FORMAT_A8B8G8R8_USCALED_PACK32: + case VULKAN_FORMAT_A8B8G8R8_SSCALED_PACK32: + case VULKAN_FORMAT_A8B8G8R8_UINT_PACK32: + case VULKAN_FORMAT_A8B8G8R8_SINT_PACK32: + case VULKAN_FORMAT_A8B8G8R8_SRGB_PACK32: + case VULKAN_FORMAT_A2R10G10B10_UNORM_PACK32: + case VULKAN_FORMAT_A2R10G10B10_SNORM_PACK32: + case VULKAN_FORMAT_A2R10G10B10_USCALED_PACK32: + case VULKAN_FORMAT_A2R10G10B10_SSCALED_PACK32: + case VULKAN_FORMAT_A2R10G10B10_UINT_PACK32: + case VULKAN_FORMAT_A2R10G10B10_SINT_PACK32: + case VULKAN_FORMAT_A2B10G10R10_UNORM_PACK32: + case VULKAN_FORMAT_A2B10G10R10_SNORM_PACK32: + case VULKAN_FORMAT_A2B10G10R10_USCALED_PACK32: + case VULKAN_FORMAT_A2B10G10R10_SSCALED_PACK32: + case VULKAN_FORMAT_A2B10G10R10_UINT_PACK32: + case VULKAN_FORMAT_A2B10G10R10_SINT_PACK32: + case VULKAN_FORMAT_R16_UNORM: + case VULKAN_FORMAT_R16_SNORM: + case VULKAN_FORMAT_R16_USCALED: + case VULKAN_FORMAT_R16_SSCALED: + case VULKAN_FORMAT_R16_SFLOAT: + case VULKAN_FORMAT_R16G16_UNORM: + case VULKAN_FORMAT_R16G16_SNORM: + case 
VULKAN_FORMAT_R16G16_USCALED: + case VULKAN_FORMAT_R16G16_SSCALED: + case VULKAN_FORMAT_R16G16_SFLOAT: + case VULKAN_FORMAT_R16G16B16_UNORM: + case VULKAN_FORMAT_R16G16B16_SNORM: + case VULKAN_FORMAT_R16G16B16_USCALED: + case VULKAN_FORMAT_R16G16B16_SSCALED: + case VULKAN_FORMAT_R16G16B16_UINT: + case VULKAN_FORMAT_R16G16B16_SINT: + case VULKAN_FORMAT_R16G16B16_SFLOAT: + case VULKAN_FORMAT_R16G16B16A16_UNORM: + case VULKAN_FORMAT_R16G16B16A16_SNORM: + case VULKAN_FORMAT_R16G16B16A16_USCALED: + case VULKAN_FORMAT_R16G16B16A16_SSCALED: + case VULKAN_FORMAT_R16G16B16A16_SFLOAT: + case VULKAN_FORMAT_R32G32B32_UINT: + case VULKAN_FORMAT_R32G32B32_SINT: + case VULKAN_FORMAT_R32G32B32_SFLOAT: + case VULKAN_FORMAT_R64_UINT: + case VULKAN_FORMAT_R64_SINT: + case VULKAN_FORMAT_R64_SFLOAT: + case VULKAN_FORMAT_R64G64_UINT: + case VULKAN_FORMAT_R64G64_SINT: + case VULKAN_FORMAT_R64G64_SFLOAT: + case VULKAN_FORMAT_R64G64B64_UINT: + case VULKAN_FORMAT_R64G64B64_SINT: + case VULKAN_FORMAT_R64G64B64_SFLOAT: + case VULKAN_FORMAT_R64G64B64A64_UINT: + case VULKAN_FORMAT_R64G64B64A64_SINT: + case VULKAN_FORMAT_R64G64B64A64_SFLOAT: + case VULKAN_FORMAT_B10G11R11_UFLOAT_PACK32: + case VULKAN_FORMAT_E5B9G9R9_UFLOAT_PACK32: + case VULKAN_FORMAT_D16_UNORM: + case VULKAN_FORMAT_X8_D24_UNORM_PACK32: + case VULKAN_FORMAT_D32_SFLOAT: + case VULKAN_FORMAT_S8_UINT: + case VULKAN_FORMAT_D16_UNORM_S8_UINT: + case VULKAN_FORMAT_D24_UNORM_S8_UINT: + case VULKAN_FORMAT_D32_SFLOAT_S8_UINT: + case VULKAN_FORMAT_BC1_RGB_UNORM_BLOCK: + case VULKAN_FORMAT_BC1_RGB_SRGB_BLOCK: + case VULKAN_FORMAT_BC1_RGBA_UNORM_BLOCK: + case VULKAN_FORMAT_BC1_RGBA_SRGB_BLOCK: + case VULKAN_FORMAT_BC2_UNORM_BLOCK: + case VULKAN_FORMAT_BC2_SRGB_BLOCK: + case VULKAN_FORMAT_BC3_UNORM_BLOCK: + case VULKAN_FORMAT_BC3_SRGB_BLOCK: + case VULKAN_FORMAT_BC4_UNORM_BLOCK: + case VULKAN_FORMAT_BC4_SNORM_BLOCK: + case VULKAN_FORMAT_BC5_UNORM_BLOCK: + case VULKAN_FORMAT_BC5_SNORM_BLOCK: + case VULKAN_FORMAT_BC6H_UFLOAT_BLOCK: + case 
VULKAN_FORMAT_BC6H_SFLOAT_BLOCK: + case VULKAN_FORMAT_BC7_UNORM_BLOCK: + case VULKAN_FORMAT_BC7_SRGB_BLOCK: + case VULKAN_FORMAT_ETC2_R8G8B8_UNORM_BLOCK: + case VULKAN_FORMAT_ETC2_R8G8B8_SRGB_BLOCK: + case VULKAN_FORMAT_ETC2_R8G8B8A1_UNORM_BLOCK: + case VULKAN_FORMAT_ETC2_R8G8B8A1_SRGB_BLOCK: + case VULKAN_FORMAT_ETC2_R8G8B8A8_UNORM_BLOCK: + case VULKAN_FORMAT_ETC2_R8G8B8A8_SRGB_BLOCK: + case VULKAN_FORMAT_EAC_R11_UNORM_BLOCK: + case VULKAN_FORMAT_EAC_R11_SNORM_BLOCK: + case VULKAN_FORMAT_EAC_R11G11_UNORM_BLOCK: + case VULKAN_FORMAT_EAC_R11G11_SNORM_BLOCK: + case VULKAN_FORMAT_ASTC_4x4_UNORM_BLOCK: + case VULKAN_FORMAT_ASTC_4x4_SRGB_BLOCK: + case VULKAN_FORMAT_ASTC_5x4_UNORM_BLOCK: + case VULKAN_FORMAT_ASTC_5x4_SRGB_BLOCK: + case VULKAN_FORMAT_ASTC_5x5_UNORM_BLOCK: + case VULKAN_FORMAT_ASTC_5x5_SRGB_BLOCK: + case VULKAN_FORMAT_ASTC_6x5_UNORM_BLOCK: + case VULKAN_FORMAT_ASTC_6x5_SRGB_BLOCK: + case VULKAN_FORMAT_ASTC_6x6_UNORM_BLOCK: + case VULKAN_FORMAT_ASTC_6x6_SRGB_BLOCK: + case VULKAN_FORMAT_ASTC_8x5_UNORM_BLOCK: + case VULKAN_FORMAT_ASTC_8x5_SRGB_BLOCK: + case VULKAN_FORMAT_ASTC_8x6_UNORM_BLOCK: + case VULKAN_FORMAT_ASTC_8x6_SRGB_BLOCK: + case VULKAN_FORMAT_ASTC_8x8_UNORM_BLOCK: + case VULKAN_FORMAT_ASTC_8x8_SRGB_BLOCK: + case VULKAN_FORMAT_ASTC_10x5_UNORM_BLOCK: + case VULKAN_FORMAT_ASTC_10x5_SRGB_BLOCK: + case VULKAN_FORMAT_ASTC_10x6_UNORM_BLOCK: + case VULKAN_FORMAT_ASTC_10x6_SRGB_BLOCK: + case VULKAN_FORMAT_ASTC_10x8_UNORM_BLOCK: + case VULKAN_FORMAT_ASTC_10x8_SRGB_BLOCK: + case VULKAN_FORMAT_ASTC_10x10_UNORM_BLOCK: + case VULKAN_FORMAT_ASTC_10x10_SRGB_BLOCK: + case VULKAN_FORMAT_ASTC_12x10_UNORM_BLOCK: + case VULKAN_FORMAT_ASTC_12x10_SRGB_BLOCK: + case VULKAN_FORMAT_ASTC_12x12_UNORM_BLOCK: + case VULKAN_FORMAT_ASTC_12x12_SRGB_BLOCK: + ASSERT(0); + std::cout << "Unsupport texture format"; + } + } + + return formatList; +} + +uint32_t getVulkanFormatElementSize(VulkanFormat format) +{ + switch (format) + { + case VULKAN_FORMAT_R8_UINT: return uint32_t(1); + 
case VULKAN_FORMAT_R8_SINT: return uint32_t(1); + case VULKAN_FORMAT_R8G8_UINT: return uint32_t(2); + case VULKAN_FORMAT_R8G8_SINT: return uint32_t(2); + case VULKAN_FORMAT_R8G8B8A8_UINT: return uint32_t(4); + case VULKAN_FORMAT_R8G8B8A8_SINT: return uint32_t(4); + case VULKAN_FORMAT_R16_UINT: return uint32_t(2); + case VULKAN_FORMAT_R16_SINT: return uint32_t(2); + case VULKAN_FORMAT_R16G16_UINT: return uint32_t(4); + case VULKAN_FORMAT_R16G16_SINT: return uint32_t(4); + case VULKAN_FORMAT_R16G16B16A16_UINT: return uint32_t(8); + case VULKAN_FORMAT_R16G16B16A16_SINT: return uint32_t(8); + case VULKAN_FORMAT_R32_UINT: return uint32_t(4); + case VULKAN_FORMAT_R32_SINT: return uint32_t(4); + case VULKAN_FORMAT_R32_SFLOAT: return uint32_t(4); + case VULKAN_FORMAT_R32G32_UINT: return uint32_t(8); + case VULKAN_FORMAT_R32G32_SINT: return uint32_t(8); + case VULKAN_FORMAT_R32G32_SFLOAT: return uint32_t(8); + case VULKAN_FORMAT_R32G32B32A32_UINT: return uint32_t(16); + case VULKAN_FORMAT_R32G32B32A32_SINT: return uint32_t(16); + case VULKAN_FORMAT_R32G32B32A32_SFLOAT: return uint32_t(16); + default: ASSERT(0); std::cout << "Unknown format"; + } + + return uint32_t(0); +} + +const char *getVulkanFormatGLSLFormat(VulkanFormat format) +{ + switch (format) + { + case VULKAN_FORMAT_R8_UINT: return "r8ui"; + case VULKAN_FORMAT_R8_SINT: return "r8i"; + case VULKAN_FORMAT_R8G8_UINT: return "rg8ui"; + case VULKAN_FORMAT_R8G8_SINT: return "rg8i"; + case VULKAN_FORMAT_R8G8B8A8_UINT: return "rgba8ui"; + case VULKAN_FORMAT_R8G8B8A8_SINT: return "rgba8i"; + case VULKAN_FORMAT_R16_UINT: return "r16ui"; + case VULKAN_FORMAT_R16_SINT: return "r16i"; + case VULKAN_FORMAT_R16G16_UINT: return "rg16ui"; + case VULKAN_FORMAT_R16G16_SINT: return "rg16i"; + case VULKAN_FORMAT_R16G16B16A16_UINT: return "rgba16ui"; + case VULKAN_FORMAT_R16G16B16A16_SINT: return "rgba16i"; + case VULKAN_FORMAT_R32_UINT: return "r32ui"; + case VULKAN_FORMAT_R32_SINT: return "r32i"; + case VULKAN_FORMAT_R32_SFLOAT: 
return "r32f"; + case VULKAN_FORMAT_R32G32_UINT: return "rg32ui"; + case VULKAN_FORMAT_R32G32_SINT: return "rg32i"; + case VULKAN_FORMAT_R32G32_SFLOAT: return "rg32f"; + case VULKAN_FORMAT_R32G32B32A32_UINT: return "rgba32ui"; + case VULKAN_FORMAT_R32G32B32A32_SINT: return "rgba32i"; + case VULKAN_FORMAT_R32G32B32A32_SFLOAT: return "rgba32f"; + default: ASSERT(0); std::cout << "Unknown format"; + } + + return (const char *)size_t(0); +} + +std::ostream &operator<<(std::ostream &os, + VulkanMemoryTypeProperty memoryTypeProperty) +{ + switch (memoryTypeProperty) + { + case VULKAN_MEMORY_TYPE_PROPERTY_NONE: return os << "None"; + case VULKAN_MEMORY_TYPE_PROPERTY_DEVICE_LOCAL: + return os << "Device local"; + case VULKAN_MEMORY_TYPE_PROPERTY_HOST_VISIBLE_COHERENT: + return os << "Host visible and coherent"; + case VULKAN_MEMORY_TYPE_PROPERTY_HOST_VISIBLE_CACHED: + return os << "Host visible and cached"; + case VULKAN_MEMORY_TYPE_PROPERTY_HOST_VISIBLE_CACHED_COHERENT: + return os << "Host visible, cached and coherent"; + case VULKAN_MEMORY_TYPE_PROPERTY_DEVICE_LOCAL_HOST_VISIBLE_COHERENT: + return os << "Device local, Host visible and coherent"; + case VULKAN_MEMORY_TYPE_PROPERTY_DEVICE_LOCAL_HOST_VISIBLE_CACHED: + return os << "Device local, Host visible and cached"; + case VULKAN_MEMORY_TYPE_PROPERTY_DEVICE_LOCAL_HOST_VISIBLE_CACHED_COHERENT: + return os << "Device local, Host visible, cached and coherent"; + } + + return os; +} + +std::ostream & +operator<<(std::ostream &os, + VulkanExternalMemoryHandleType externalMemoryHandleType) +{ + switch (externalMemoryHandleType) + { + case VULKAN_EXTERNAL_MEMORY_HANDLE_TYPE_NONE: return os << "None"; + case VULKAN_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_FD: + return os << "Opaque file descriptor"; + case VULKAN_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_WIN32_NT: + return os << "Opaque NT handle"; + case VULKAN_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_WIN32_KMT: + return os << "Opaque D3DKMT handle"; + case 
VULKAN_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_WIN32_NT_KMT: + return os << "Opaque NT and D3DKMT handle"; + } + + return os; +} + +std::ostream & +operator<<(std::ostream &os, + VulkanExternalSemaphoreHandleType externalSemaphoreHandleType) +{ + switch (externalSemaphoreHandleType) + { + case VULKAN_EXTERNAL_SEMAPHORE_HANDLE_TYPE_NONE: return os << "None"; + case VULKAN_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_FD: + return os << "Opaque file descriptor"; + case VULKAN_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32_NT: + return os << "Opaque NT handle"; + case VULKAN_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32_KMT: + return os << "Opaque D3DKMT handle"; + case VULKAN_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32_NT_KMT: + return os << "Opaque NT and D3DKMT handle"; + } + + return os; +} + +std::ostream &operator<<(std::ostream &os, VulkanFormat format) +{ + switch (format) + { + case VULKAN_FORMAT_R8_UINT: return os << "R8_UINT"; + case VULKAN_FORMAT_R8_SINT: return os << "R8_SINT"; + case VULKAN_FORMAT_R8G8_UINT: return os << "R8G8_UINT"; + case VULKAN_FORMAT_R8G8_SINT: return os << "R8G8_SINT"; + case VULKAN_FORMAT_R8G8B8A8_UINT: return os << "R8G8B8A8_UINT"; + case VULKAN_FORMAT_R8G8B8A8_SINT: return os << "R8G8B8A8_SINT"; + case VULKAN_FORMAT_R16_UINT: return os << "R16_UINT"; + case VULKAN_FORMAT_R16_SINT: return os << "R16_SINT"; + case VULKAN_FORMAT_R16G16_UINT: return os << "R16G16_UINT"; + case VULKAN_FORMAT_R16G16_SINT: return os << "R16G16_SINT"; + case VULKAN_FORMAT_R16G16B16A16_UINT: return os << "R16G16B16A16_UINT"; + case VULKAN_FORMAT_R16G16B16A16_SINT: return os << "R16G16B16A16_SINT"; + case VULKAN_FORMAT_R32_UINT: return os << "R32_UINT"; + case VULKAN_FORMAT_R32_SINT: return os << "R32_SINT"; + case VULKAN_FORMAT_R32_SFLOAT: return os << "R32_SFLOAT"; + case VULKAN_FORMAT_R32G32_UINT: return os << "R32G32_UINT"; + case VULKAN_FORMAT_R32G32_SINT: return os << "R32G32_SINT"; + case VULKAN_FORMAT_R32G32_SFLOAT: return os << "R32G32_SFLOAT"; + case 
VULKAN_FORMAT_R32G32B32A32_UINT: return os << "R32G32B32A32_UINT"; + case VULKAN_FORMAT_R32G32B32A32_SINT: return os << "R32G32B32A32_SINT"; + case VULKAN_FORMAT_R32G32B32A32_SFLOAT: + return os << "R32G32B32A32_SFLOAT"; + break; + default: ASSERT(0); std::cout << "Unknown format"; + } + + return os; +} + +// Probes the known shader directories for filename and returns a malloc'ed +// copy of the first path that can be opened for reading. The caller owns the +// returned buffer and must free() it. Returns 0 when the file is not found in +// any directory or when the copy cannot be allocated. +static char *findFilePath(const std::string filename) +{ + const char *searchPath[] = { + "./", // Same dir + "./shaders/", // In shaders folder in same dir + "../test_conformance/vulkan/shaders/" // In src folder + }; + for (unsigned int i = 0; i < sizeof(searchPath) / sizeof(char *); ++i) + { + std::string path(searchPath[i]); + + path.append(filename); + FILE *fp = fopen(path.c_str(), "rb"); + if (fp == NULL) + { + // Not in this directory, try the next one. + continue; + } + // The open was only an existence probe. + fclose(fp); + + // File found + char *file_path = (char *)(malloc(path.length() + 1)); + if (file_path != NULL) + { + // length() + 1 also copies the terminating NUL. + memcpy(file_path, path.c_str(), path.length() + 1); + } + return file_path; + } + // File not found + return 0; +} + +// Locates filename through findFilePath() and returns its complete contents. +// Throws std::runtime_error when the file cannot be located or opened. +std::vector<char> readFile(const std::string &filename) +{ + char *file_path = findFilePath(filename); + if (file_path == NULL) + { + // A null path must never reach the std::ifstream constructor. + throw std::runtime_error("failed to open shader spv file!\n"); + } + // Move the path into a std::string and release the malloc'ed buffer + // right away so that it is not leaked by the throw or the return below. + const std::string resolvedPath(file_path); + free(file_path); + + std::ifstream file(resolvedPath.c_str(), std::ios::ate | std::ios::binary); + + if (!file.is_open()) + { + throw std::runtime_error("failed to open shader spv file!\n"); + } + // The stream was opened with std::ios::ate, so tellg() is the file size. + size_t fileSize = (size_t)file.tellg(); + std::vector<char> buffer(fileSize); + file.seekg(0); + file.read(buffer.data(), fileSize); + file.close(); + // size_t is not int: cast and print with a matching conversion. + printf("filesize is %llu", (unsigned long long)fileSize); + return buffer; +} diff --git a/test_conformance/vulkan/vulkan_interop_common/vulkan_utility.hpp b/test_conformance/vulkan/vulkan_interop_common/vulkan_utility.hpp new file mode 100644 index 00000000..04f5a594 --- /dev/null +++ b/test_conformance/vulkan/vulkan_interop_common/vulkan_utility.hpp @@ -0,0 +1,70 @@ +// +// Copyright (c) 2022 The Khronos Group Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// + +#ifndef _vulkan_utility_hpp_ +#define _vulkan_utility_hpp_ + +#include "vulkan_wrapper_types.hpp" +#include <vector> +#include <ostream> +#include <string.h> +#include <map> +#include "../../../test_common/harness/testHarness.h" + +#define STRING_(str) #str +#define STRING(str) STRING_(str) + +#define ROUND_UP(n, multiple) \ + (((n) + (multiple)-1) - ((((n) + (multiple)-1)) % (multiple))) + +const VulkanInstance& getVulkanInstance(); +const VulkanPhysicalDevice& getVulkanPhysicalDevice(); +const VulkanQueueFamily& +getVulkanQueueFamily(uint32_t queueFlags = VULKAN_QUEUE_FLAG_MASK_ALL); +const VulkanMemoryType& +getVulkanMemoryType(const VulkanDevice& device, + VulkanMemoryTypeProperty memoryTypeProperty); +bool checkVkSupport(); +const VulkanQueueFamilyList& getEmptyVulkanQueueFamilyList(); +const VulkanDescriptorSetLayoutList& getEmptyVulkanDescriptorSetLayoutList(); +const VulkanQueueFamilyToQueueCountMap& +getDefaultVulkanQueueFamilyToQueueCountMap(); +const std::vector<VulkanExternalMemoryHandleType> +getSupportedVulkanExternalMemoryHandleTypeList(); +const std::vector<VulkanExternalSemaphoreHandleType> +getSupportedVulkanExternalSemaphoreHandleTypeList(); +const std::vector<VulkanFormat> getSupportedVulkanFormatList(); + +uint32_t getVulkanFormatElementSize(VulkanFormat format); +const char* getVulkanFormatGLSLFormat(VulkanFormat format); +const char* getVulkanFormatGLSLTypePrefix(VulkanFormat format); + +std::string prepareVulkanShader( + std::string shaderCode, + const std::map<std::string, std::string>& patternToSubstituteMap); + 
+std::ostream& operator<<(std::ostream& os, + VulkanMemoryTypeProperty memoryTypeProperty); +std::ostream& +operator<<(std::ostream& os, + VulkanExternalMemoryHandleType externalMemoryHandleType); +std::ostream& +operator<<(std::ostream& os, + VulkanExternalSemaphoreHandleType externalSemaphoreHandleType); +std::ostream& operator<<(std::ostream& os, VulkanFormat format); + +std::vector<char> readFile(const std::string& filename); +#endif // _vulkan_utility_hpp_ diff --git a/test_conformance/vulkan/vulkan_interop_common/vulkan_wrapper.cpp b/test_conformance/vulkan/vulkan_interop_common/vulkan_wrapper.cpp new file mode 100644 index 00000000..6209a747 --- /dev/null +++ b/test_conformance/vulkan/vulkan_interop_common/vulkan_wrapper.cpp @@ -0,0 +1,2072 @@ +// +// Copyright (c) 2022 The Khronos Group Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+// + +#ifdef _WIN32 +#define NOMINMAX +#include <Windows.h> +#include <dxgi1_2.h> +#include <aclapi.h> +#endif +#include <vulkan/vulkan.h> +#include "vulkan_wrapper.hpp" +#if defined(__linux__) && !defined(__ANDROID__) +#include <gnu/libc-version.h> +#include <dlfcn.h> +#elif defined(__ANDROID__) +#include <dlfcn.h> +#endif +#if defined _WIN32 +#define LoadFunction GetProcAddress +#elif defined __linux +#define LoadFunction dlsym +#endif + +extern "C" { +#define VK_FUNC_DECL(name) PFN_##name _##name = NULL; +VK_FUNC_LIST +#if defined(_WIN32) || defined(_WIN64) +VK_WINDOWS_FUNC_LIST +#endif +#undef VK_FUNC_DECL +} + +#define WAIVED 2 +#define HANDLE_ERROR -1 + +// Evaluates `call` exactly once; when the result is not VK_SUCCESS the +// enclosing function returns that result. The result is captured in a local +// so that a failing call is not executed a second time by the return. +#define CHECK_VK(call) \ + { \ + auto vkCheckResult_ = (call); \ + if (vkCheckResult_ != VK_SUCCESS) return vkCheckResult_; \ + } +/////////////////////////////////// +// VulkanInstance implementation // +/////////////////////////////////// + +VulkanInstance::VulkanInstance(const VulkanInstance &instance) + : m_vkInstance(instance.m_vkInstance), + m_physicalDeviceList(instance.m_physicalDeviceList) +{} + +VulkanInstance::VulkanInstance(): m_vkInstance(VK_NULL_HANDLE) +{ +#if defined(__linux__) && !defined(__ANDROID__) + char *glibcVersion = strdup(gnu_get_libc_version()); + int majNum = (int)atoi(strtok(glibcVersion, ".")); + int minNum = (int)atoi(strtok(NULL, ".")); + free(glibcVersion); + if ((majNum < 2) || (majNum == 2 && minNum < 17)) + { + // WAIVE_TEST() << "Insufficient GLIBC version. 
Test waived!"; + } +#endif + +#if defined(_WIN32) || defined(_WIN64) + const char *vulkanLoaderLibraryName = "vulkan-1.dll"; +#elif defined(__linux__) + const char *vulkanLoaderLibraryName = "libvulkan.so.1"; +#endif +#ifdef _WIN32 + HINSTANCE hDLL; + hDLL = LoadLibrary(vulkanLoaderLibraryName); + if (hDLL == NULL) + { + throw std::runtime_error("LoadLibrary failed!"); + } + vkGetInstanceProcAddr = + (PFN_vkGetInstanceProcAddr)LoadFunction(hDLL, "vkGetInstanceProcAddr"); +#else +#if !defined(__APPLE__) + void *handle; + handle = dlopen(vulkanLoaderLibraryName, RTLD_LAZY); + if (!handle) + { + fputs(dlerror(), stderr); + throw std::runtime_error("dlopen failed !!!"); + } + vkGetInstanceProcAddr = (PFN_vkGetInstanceProcAddr)LoadFunction( + handle, "vkGetInstanceProcAddr"); +#endif +#endif + if ((unsigned long long)vkGetInstanceProcAddr == (unsigned long long)NULL) + { + throw std::runtime_error("vkGetInstanceProcAddr() not found!"); + } +#define VK_GET_NULL_INSTANCE_PROC_ADDR(name) \ + _##name = (PFN_##name)vkGetInstanceProcAddr(NULL, #name); + + if ((unsigned long long)vkGetInstanceProcAddr == (unsigned long long)NULL) + { + throw std::runtime_error("Couldn't obtain address for function"); + } + VK_GET_NULL_INSTANCE_PROC_ADDR(vkEnumerateInstanceExtensionProperties); + uint32_t instanceExtensionPropertiesCount; + VkResult vkStatus = VK_SUCCESS; + vkStatus = vkEnumerateInstanceExtensionProperties( + NULL, &instanceExtensionPropertiesCount, NULL); + // Something went wrong in vulkan initialization (most likely incompatible + // device/driver combination) + if (vkStatus == VK_ERROR_INCOMPATIBLE_DRIVER) + { + throw std::runtime_error( + "Waiving vulkan test because " + "vkEnumerateInstanceExtensionProperties failed."); + // return WAIVED; + } + + VK_GET_NULL_INSTANCE_PROC_ADDR(vkEnumerateInstanceVersion); + VK_GET_NULL_INSTANCE_PROC_ADDR(vkEnumerateInstanceLayerProperties); + VK_GET_NULL_INSTANCE_PROC_ADDR(vkCreateInstance); +#undef VK_GET_NULL_INSTANCE_PROC_ADDR + + 
VkApplicationInfo vkApplicationInfo = {}; + vkApplicationInfo.sType = VK_STRUCTURE_TYPE_APPLICATION_INFO; + vkApplicationInfo.pNext = NULL; + vkApplicationInfo.pApplicationName = "Default app"; + vkApplicationInfo.applicationVersion = VK_MAKE_VERSION(1, 0, 0); + vkApplicationInfo.pEngineName = "No engine"; + vkApplicationInfo.engineVersion = VK_MAKE_VERSION(1, 0, 0); + vkApplicationInfo.apiVersion = VK_API_VERSION_1_0; + + std::vector<const char *> enabledExtensionNameList; + enabledExtensionNameList.push_back( + VK_KHR_GET_PHYSICAL_DEVICE_PROPERTIES_2_EXTENSION_NAME); + enabledExtensionNameList.push_back( + VK_KHR_EXTERNAL_MEMORY_CAPABILITIES_EXTENSION_NAME); + enabledExtensionNameList.push_back( + VK_KHR_EXTERNAL_SEMAPHORE_CAPABILITIES_EXTENSION_NAME); + + std::vector<VkExtensionProperties> vkExtensionPropertiesList( + instanceExtensionPropertiesCount); + vkEnumerateInstanceExtensionProperties(NULL, + &instanceExtensionPropertiesCount, + vkExtensionPropertiesList.data()); + + for (size_t eenIdx = 0; eenIdx < enabledExtensionNameList.size(); eenIdx++) + { + bool isSupported = false; + for (size_t epIdx = 0; epIdx < vkExtensionPropertiesList.size(); + epIdx++) + { + if (!strcmp(enabledExtensionNameList[eenIdx], + vkExtensionPropertiesList[epIdx].extensionName)) + { + isSupported = true; + break; + } + } + if (!isSupported) + { + return; + } + } + + VkInstanceCreateInfo vkInstanceCreateInfo = {}; + vkInstanceCreateInfo.sType = VK_STRUCTURE_TYPE_INSTANCE_CREATE_INFO; + vkInstanceCreateInfo.pNext = NULL; + vkInstanceCreateInfo.flags = 0; + vkInstanceCreateInfo.pApplicationInfo = &vkApplicationInfo; + vkInstanceCreateInfo.enabledLayerCount = 0; + vkInstanceCreateInfo.ppEnabledLayerNames = NULL; + vkInstanceCreateInfo.enabledExtensionCount = + (uint32_t)enabledExtensionNameList.size(); + vkInstanceCreateInfo.ppEnabledExtensionNames = + enabledExtensionNameList.data(); + + vkCreateInstance(&vkInstanceCreateInfo, NULL, &m_vkInstance); + +#define VK_FUNC_DECL(name) \ + 
_##name = (PFN_##name)vkGetInstanceProcAddr(m_vkInstance, #name); \ + // ASSERT_NEQ((unsigned long long)name, 0ULL) << "Couldn't obtain address + // for function" << #name; + + VK_FUNC_LIST +#if defined(_WIN32) || defined(_WIN64) + VK_WINDOWS_FUNC_LIST +#endif +#undef VK_FUNC_DECL + + uint32_t physicalDeviceCount = 0; + vkEnumeratePhysicalDevices(m_vkInstance, &physicalDeviceCount, NULL); + // CHECK_NEQ(physicalDeviceCount, uint32_t(0)); + + if (physicalDeviceCount == uint32_t(0)) + { + std::cout << "failed to find GPUs with Vulkan support!\n"; + return; + } + + std::vector<VkPhysicalDevice> vkPhysicalDeviceList(physicalDeviceCount, + VK_NULL_HANDLE); + vkEnumeratePhysicalDevices(m_vkInstance, &physicalDeviceCount, + vkPhysicalDeviceList.data()); + + for (size_t ppdIdx = 0; ppdIdx < vkPhysicalDeviceList.size(); ppdIdx++) + { + VulkanPhysicalDevice *physicalDevice = + new VulkanPhysicalDevice(vkPhysicalDeviceList[ppdIdx]); + m_physicalDeviceList.add(*physicalDevice); + } +} + +VulkanInstance::~VulkanInstance() +{ + for (size_t pdIdx = 0; pdIdx < m_physicalDeviceList.size(); pdIdx++) + { + const VulkanPhysicalDevice &physicalDevice = + m_physicalDeviceList[pdIdx]; + delete &physicalDevice; + } + if (m_vkInstance) + { + vkDestroyInstance(m_vkInstance, NULL); + } +} + +const VulkanPhysicalDeviceList &VulkanInstance::getPhysicalDeviceList() const +{ + return m_physicalDeviceList; +} + +VulkanInstance::operator VkInstance() const { return m_vkInstance; } + +///////////////////////////////////////// +// VulkanPhysicalDevice implementation // +///////////////////////////////////////// + +VulkanPhysicalDevice::VulkanPhysicalDevice( + const VulkanPhysicalDevice &physicalDevice) + : m_vkPhysicalDevice(physicalDevice.m_vkPhysicalDevice), + m_vkPhysicalDeviceProperties(physicalDevice.m_vkPhysicalDeviceProperties), + m_vkDeviceNodeMask(physicalDevice.m_vkDeviceNodeMask), + m_vkPhysicalDeviceFeatures(physicalDevice.m_vkPhysicalDeviceFeatures), + 
m_vkPhysicalDeviceMemoryProperties( + physicalDevice.m_vkPhysicalDeviceMemoryProperties), + m_queueFamilyList(physicalDevice.m_queueFamilyList) +{ + memcpy(m_vkDeviceUUID, physicalDevice.m_vkDeviceUUID, VK_UUID_SIZE); +} + +VulkanPhysicalDevice::VulkanPhysicalDevice(VkPhysicalDevice vkPhysicalDevice) + : m_vkPhysicalDevice(vkPhysicalDevice) +{ + if (m_vkPhysicalDevice == (VkPhysicalDevice)VK_NULL_HANDLE) + { + throw std::runtime_error("failed to find a suitable GPU!"); + } + + vkGetPhysicalDeviceProperties(m_vkPhysicalDevice, + &m_vkPhysicalDeviceProperties); + vkGetPhysicalDeviceFeatures(m_vkPhysicalDevice, + &m_vkPhysicalDeviceFeatures); + + VkPhysicalDeviceIDPropertiesKHR vkPhysicalDeviceIDPropertiesKHR = {}; + vkPhysicalDeviceIDPropertiesKHR.sType = + VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_ID_PROPERTIES_KHR; + vkPhysicalDeviceIDPropertiesKHR.pNext = NULL; + + VkPhysicalDeviceProperties2KHR vkPhysicalDeviceProperties2KHR = {}; + vkPhysicalDeviceProperties2KHR.sType = + VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PROPERTIES_2_KHR; + vkPhysicalDeviceProperties2KHR.pNext = &vkPhysicalDeviceIDPropertiesKHR; + + vkGetPhysicalDeviceProperties2KHR(m_vkPhysicalDevice, + &vkPhysicalDeviceProperties2KHR); + + memcpy(m_vkDeviceUUID, vkPhysicalDeviceIDPropertiesKHR.deviceUUID, + sizeof(m_vkDeviceUUID)); + memcpy(m_vkDeviceLUID, vkPhysicalDeviceIDPropertiesKHR.deviceLUID, + sizeof(m_vkDeviceLUID)); + m_vkDeviceNodeMask = vkPhysicalDeviceIDPropertiesKHR.deviceNodeMask; + + uint32_t queueFamilyCount = 0; + vkGetPhysicalDeviceQueueFamilyProperties(m_vkPhysicalDevice, + &queueFamilyCount, NULL); + + std::vector<VkQueueFamilyProperties> vkQueueFamilyPropertiesList( + queueFamilyCount); + vkGetPhysicalDeviceQueueFamilyProperties( + m_vkPhysicalDevice, &queueFamilyCount, + vkQueueFamilyPropertiesList.data()); + + for (size_t qfpIdx = 0; qfpIdx < vkQueueFamilyPropertiesList.size(); + qfpIdx++) + { + VulkanQueueFamily *queueFamily = new VulkanQueueFamily( + uint32_t(qfpIdx), 
vkQueueFamilyPropertiesList[qfpIdx]); + m_queueFamilyList.add(*queueFamily); + } + + vkGetPhysicalDeviceMemoryProperties(m_vkPhysicalDevice, + &m_vkPhysicalDeviceMemoryProperties); + + for (uint32_t mhIdx = 0; + mhIdx < m_vkPhysicalDeviceMemoryProperties.memoryHeapCount; mhIdx++) + { + VulkanMemoryHeap *memoryHeap = new VulkanMemoryHeap( + mhIdx, m_vkPhysicalDeviceMemoryProperties.memoryHeaps[mhIdx].size, + (VulkanMemoryHeapFlag)m_vkPhysicalDeviceMemoryProperties + .memoryHeaps[mhIdx] + .flags); + m_memoryHeapList.add(*memoryHeap); + } + + for (uint32_t mtIdx = 0; + mtIdx < m_vkPhysicalDeviceMemoryProperties.memoryTypeCount; mtIdx++) + { + const VulkanMemoryHeap &memoryHeap = m_memoryHeapList + [m_vkPhysicalDeviceMemoryProperties.memoryTypes[mtIdx].heapIndex]; + VulkanMemoryType *memoryType = new VulkanMemoryType( + mtIdx, + (VulkanMemoryTypeProperty)m_vkPhysicalDeviceMemoryProperties + .memoryTypes[mtIdx] + .propertyFlags, + memoryHeap); + m_memoryTypeList.add(*memoryType); + } +} + +VulkanPhysicalDevice::~VulkanPhysicalDevice() +{ + for (size_t mtIdx = 0; mtIdx < m_memoryTypeList.size(); mtIdx++) + { + const VulkanMemoryType &memoryType = m_memoryTypeList[mtIdx]; + delete &memoryType; + } + + for (size_t mhIdx = 0; mhIdx < m_memoryHeapList.size(); mhIdx++) + { + const VulkanMemoryHeap &memoryHeap = m_memoryHeapList[mhIdx]; + delete &memoryHeap; + } + + for (size_t qfIdx = 0; qfIdx < m_queueFamilyList.size(); qfIdx++) + { + const VulkanQueueFamily &queueFamily = m_queueFamilyList[qfIdx]; + delete &queueFamily; + } +} + + +const VulkanQueueFamilyList &VulkanPhysicalDevice::getQueueFamilyList() const +{ + return m_queueFamilyList; +} + +const VulkanMemoryHeapList &VulkanPhysicalDevice::getMemoryHeapList() const +{ + return m_memoryHeapList; +} + +const VulkanMemoryTypeList &VulkanPhysicalDevice::getMemoryTypeList() const +{ + return m_memoryTypeList; +} + +const uint8_t *VulkanPhysicalDevice::getUUID() const { return m_vkDeviceUUID; } + +const uint8_t 
*VulkanPhysicalDevice::getLUID() const { return m_vkDeviceLUID; } + +uint32_t VulkanPhysicalDevice::getNodeMask() const +{ + return m_vkDeviceNodeMask; +} + +VulkanPhysicalDevice::operator VkPhysicalDevice() const +{ + return m_vkPhysicalDevice; +} + +bool operator<(const VulkanQueueFamily &queueFamilyA, + const VulkanQueueFamily &queueFamilyB) +{ + return (uint32_t)queueFamilyA < (uint32_t)queueFamilyB; +} + +///////////////////////////////////// +// VulkanMemoryHeap implementation // +///////////////////////////////////// + +VulkanMemoryHeap::VulkanMemoryHeap(const VulkanMemoryHeap &memoryHeap) + : m_memoryHeapIndex(memoryHeap.m_memoryHeapIndex), + m_size(memoryHeap.m_size), m_memoryHeapFlag(memoryHeap.m_memoryHeapFlag) +{} + +VulkanMemoryHeap::VulkanMemoryHeap(uint32_t memoryHeapIndex, uint64_t size, + VulkanMemoryHeapFlag memoryHeapFlag) + : m_memoryHeapIndex(memoryHeapIndex), m_size(size), + m_memoryHeapFlag(memoryHeapFlag) +{} + +VulkanMemoryHeap::~VulkanMemoryHeap() {} + +uint64_t VulkanMemoryHeap::getSize() const { return m_size; } + + +VulkanMemoryHeapFlag VulkanMemoryHeap::getMemoryHeapFlag() const +{ + return m_memoryHeapFlag; +} + +VulkanMemoryHeap::operator uint32_t() const { return m_memoryHeapIndex; } + +///////////////////////////////////// +// VulkanMemoryType implementation // +///////////////////////////////////// + +VulkanMemoryType::VulkanMemoryType(const VulkanMemoryType &memoryType) + : m_memoryTypeIndex(memoryType.m_memoryTypeIndex), + m_memoryTypeProperty(memoryType.m_memoryTypeProperty), + m_memoryHeap(memoryType.m_memoryHeap) +{} + +VulkanMemoryType::VulkanMemoryType(uint32_t memoryTypeIndex, + VulkanMemoryTypeProperty memoryTypeProperty, + const VulkanMemoryHeap &memoryHeap) + : m_memoryTypeIndex(memoryTypeIndex), + m_memoryTypeProperty(memoryTypeProperty), m_memoryHeap(memoryHeap) +{} + +VulkanMemoryType::~VulkanMemoryType() {} + +VulkanMemoryTypeProperty VulkanMemoryType::getMemoryTypeProperty() const +{ + return m_memoryTypeProperty; 
+} + +const VulkanMemoryHeap &VulkanMemoryType::getMemoryHeap() const +{ + return m_memoryHeap; +} + +VulkanMemoryType::operator uint32_t() const { return m_memoryTypeIndex; } + +////////////////////////////////////// +// VulkanQueueFamily implementation // +////////////////////////////////////// + +VulkanQueueFamily::VulkanQueueFamily(const VulkanQueueFamily &queueFamily) + : m_queueFamilyIndex(queueFamily.m_queueFamilyIndex), + m_vkQueueFamilyProperties(queueFamily.m_vkQueueFamilyProperties) +{} + +VulkanQueueFamily::VulkanQueueFamily( + uint32_t queueFamilyIndex, VkQueueFamilyProperties vkQueueFamilyProperties) + : m_queueFamilyIndex(queueFamilyIndex), + m_vkQueueFamilyProperties(vkQueueFamilyProperties) +{} + +VulkanQueueFamily::~VulkanQueueFamily() {} + +uint32_t VulkanQueueFamily::getQueueFlags() const +{ + return m_vkQueueFamilyProperties.queueFlags + & (uint32_t)VULKAN_QUEUE_FLAG_MASK_ALL; +} + +uint32_t VulkanQueueFamily::getQueueCount() const +{ + return m_vkQueueFamilyProperties.queueCount; +} + +VulkanQueueFamily::operator uint32_t() const { return m_queueFamilyIndex; } + +///////////////////////////////// +// VulkanDevice implementation // +///////////////////////////////// + +VulkanDevice::VulkanDevice(const VulkanDevice &device) + : m_physicalDevice(device.m_physicalDevice), m_vkDevice(device.m_vkDevice) +{} + +VulkanDevice::VulkanDevice( + const VulkanPhysicalDevice &physicalDevice, + const VulkanQueueFamilyToQueueCountMap &queueFamilyToQueueCountMap) + : m_physicalDevice(physicalDevice), m_vkDevice(NULL) +{ + uint32_t maxQueueCount = 0; + for (uint32_t qfIdx = 0; + qfIdx < (uint32_t)physicalDevice.getQueueFamilyList().size(); qfIdx++) + { + maxQueueCount = + std::max(maxQueueCount, queueFamilyToQueueCountMap[qfIdx]); + } + + std::vector<VkDeviceQueueCreateInfo> vkDeviceQueueCreateInfoList; + std::vector<float> queuePriorities(maxQueueCount); + for (uint32_t qfIdx = 0; + qfIdx < (uint32_t)physicalDevice.getQueueFamilyList().size(); qfIdx++) + { + if 
(queueFamilyToQueueCountMap[qfIdx]) + { + VkDeviceQueueCreateInfo vkDeviceQueueCreateInfo = {}; + vkDeviceQueueCreateInfo.sType = + VK_STRUCTURE_TYPE_DEVICE_QUEUE_CREATE_INFO; + vkDeviceQueueCreateInfo.pNext = NULL; + vkDeviceQueueCreateInfo.flags = 0; + vkDeviceQueueCreateInfo.queueFamilyIndex = qfIdx; + vkDeviceQueueCreateInfo.queueCount = + queueFamilyToQueueCountMap[qfIdx]; + vkDeviceQueueCreateInfo.pQueuePriorities = queuePriorities.data(); + + vkDeviceQueueCreateInfoList.push_back(vkDeviceQueueCreateInfo); + } + } + + std::vector<const char *> enabledExtensionNameList; + enabledExtensionNameList.push_back(VK_KHR_EXTERNAL_MEMORY_EXTENSION_NAME); + enabledExtensionNameList.push_back( + VK_KHR_EXTERNAL_SEMAPHORE_EXTENSION_NAME); +#if defined(_WIN32) || defined(_WIN64) + enabledExtensionNameList.push_back( + VK_KHR_EXTERNAL_MEMORY_WIN32_EXTENSION_NAME); + enabledExtensionNameList.push_back( + VK_KHR_EXTERNAL_SEMAPHORE_WIN32_EXTENSION_NAME); +#else + enabledExtensionNameList.push_back( + VK_KHR_EXTERNAL_MEMORY_FD_EXTENSION_NAME); + enabledExtensionNameList.push_back( + VK_KHR_EXTERNAL_SEMAPHORE_FD_EXTENSION_NAME); +#endif + + + VkDeviceCreateInfo vkDeviceCreateInfo = {}; + vkDeviceCreateInfo.sType = VK_STRUCTURE_TYPE_DEVICE_CREATE_INFO; + vkDeviceCreateInfo.pNext = NULL; + vkDeviceCreateInfo.flags = 0; + vkDeviceCreateInfo.queueCreateInfoCount = + (uint32_t)vkDeviceQueueCreateInfoList.size(); + vkDeviceCreateInfo.pQueueCreateInfos = vkDeviceQueueCreateInfoList.data(); + vkDeviceCreateInfo.enabledLayerCount = 0; + vkDeviceCreateInfo.ppEnabledLayerNames = NULL; + vkDeviceCreateInfo.enabledExtensionCount = + (uint32_t)enabledExtensionNameList.size(); + vkDeviceCreateInfo.ppEnabledExtensionNames = + enabledExtensionNameList.data(); + vkDeviceCreateInfo.pEnabledFeatures = NULL; + + vkCreateDevice(physicalDevice, &vkDeviceCreateInfo, NULL, &m_vkDevice); + + for (uint32_t qfIdx = 0; + qfIdx < (uint32_t)m_physicalDevice.getQueueFamilyList().size(); + qfIdx++) + { + 
VulkanQueueList *queueList = new VulkanQueueList(); + m_queueFamilyIndexToQueueListMap.insert(qfIdx, *queueList); + for (uint32_t qIdx = 0; qIdx < queueFamilyToQueueCountMap[qfIdx]; + qIdx++) + { + VkQueue vkQueue; + vkGetDeviceQueue(m_vkDevice, qfIdx, qIdx, &vkQueue); + VulkanQueue *queue = new VulkanQueue(vkQueue); + m_queueFamilyIndexToQueueListMap[qfIdx].add(*queue); + } + } +} + +VulkanDevice::~VulkanDevice() +{ + for (uint32_t qfIdx = 0; + qfIdx < (uint32_t)m_physicalDevice.getQueueFamilyList().size(); + qfIdx++) + { + for (size_t qIdx = 0; + qIdx < m_queueFamilyIndexToQueueListMap[qfIdx].size(); qIdx++) + { + VulkanQueue &queue = m_queueFamilyIndexToQueueListMap[qfIdx][qIdx]; + delete &queue; + } + VulkanQueueList &queueList = m_queueFamilyIndexToQueueListMap[qfIdx]; + delete &queueList; + } + vkDestroyDevice(m_vkDevice, NULL); +} + +const VulkanPhysicalDevice &VulkanDevice::getPhysicalDevice() const +{ + return m_physicalDevice; +} + +VulkanQueue &VulkanDevice::getQueue(const VulkanQueueFamily &queueFamily, + uint32_t queueIndex) +{ + return m_queueFamilyIndexToQueueListMap[queueFamily][queueIndex]; +} + +VulkanDevice::operator VkDevice() const { return m_vkDevice; } + +//////////////////////////////// +// VulkanQueue implementation // +//////////////////////////////// + +VulkanQueue::VulkanQueue(const VulkanQueue &queue): m_vkQueue(queue.m_vkQueue) +{} + +VulkanQueue::VulkanQueue(VkQueue vkQueue): m_vkQueue(vkQueue) {} + +VulkanQueue::~VulkanQueue() {} + +void VulkanQueue::submit(const VulkanSemaphoreList &waitSemaphoreList, + const VulkanCommandBufferList &commandBufferList, + const VulkanSemaphoreList &signalSemaphoreList) +{ + std::vector<VkPipelineStageFlags> vkPipelineStageFlagsList( + waitSemaphoreList.size(), VK_PIPELINE_STAGE_ALL_COMMANDS_BIT); + + VkSubmitInfo vkSubmitInfo = {}; + vkSubmitInfo.sType = VK_STRUCTURE_TYPE_SUBMIT_INFO; + vkSubmitInfo.pNext = NULL; + vkSubmitInfo.waitSemaphoreCount = (uint32_t)waitSemaphoreList.size(); + 
vkSubmitInfo.pWaitSemaphores = waitSemaphoreList(); + vkSubmitInfo.pWaitDstStageMask = vkPipelineStageFlagsList.data(); + vkSubmitInfo.commandBufferCount = (uint32_t)commandBufferList.size(); + vkSubmitInfo.pCommandBuffers = commandBufferList(); + vkSubmitInfo.signalSemaphoreCount = (uint32_t)signalSemaphoreList.size(); + vkSubmitInfo.pSignalSemaphores = signalSemaphoreList(); + + vkQueueSubmit(m_vkQueue, 1, &vkSubmitInfo, NULL); +} + +void VulkanQueue::submit(const VulkanSemaphore &waitSemaphore, + const VulkanCommandBuffer &commandBuffer, + const VulkanSemaphore &signalSemaphore) +{ + VulkanSemaphoreList waitSemaphoreList; + VulkanCommandBufferList commandBufferList; + VulkanSemaphoreList signalSemaphoreList; + + waitSemaphoreList.add(waitSemaphore); + commandBufferList.add(commandBuffer); + signalSemaphoreList.add(signalSemaphore); + + submit(waitSemaphoreList, commandBufferList, signalSemaphoreList); +} + +void VulkanQueue::submit(const VulkanCommandBuffer &commandBuffer, + const VulkanSemaphore &signalSemaphore) +{ + VulkanSemaphoreList waitSemaphoreList; + VulkanCommandBufferList commandBufferList; + VulkanSemaphoreList signalSemaphoreList; + + commandBufferList.add(commandBuffer); + signalSemaphoreList.add(signalSemaphore); + + submit(waitSemaphoreList, commandBufferList, signalSemaphoreList); +} + +void VulkanQueue::submit(const VulkanCommandBuffer &commandBuffer) +{ + VulkanSemaphoreList waitSemaphoreList; + VulkanCommandBufferList commandBufferList; + VulkanSemaphoreList signalSemaphoreList; + + commandBufferList.add(commandBuffer); + + submit(waitSemaphoreList, commandBufferList, signalSemaphoreList); +} + +void VulkanQueue::waitIdle() { vkQueueWaitIdle(m_vkQueue); } + +VulkanQueue::operator VkQueue() const { return m_vkQueue; } + +///////////////////////////////////////////////////// +// VulkanDescriptorSetLayoutBinding implementation // +///////////////////////////////////////////////////// + 
+VulkanDescriptorSetLayoutBinding::VulkanDescriptorSetLayoutBinding( + const VulkanDescriptorSetLayoutBinding &descriptorSetLayoutBinding) + : m_vkDescriptorSetLayoutBinding( + descriptorSetLayoutBinding.m_vkDescriptorSetLayoutBinding) +{} + +VulkanDescriptorSetLayoutBinding::VulkanDescriptorSetLayoutBinding( + uint32_t binding, VulkanDescriptorType descriptorType, + uint32_t descriptorCount, VulkanShaderStage shaderStage) +{ + m_vkDescriptorSetLayoutBinding.binding = binding; + m_vkDescriptorSetLayoutBinding.descriptorType = + (VkDescriptorType)descriptorType; + m_vkDescriptorSetLayoutBinding.descriptorCount = descriptorCount; + m_vkDescriptorSetLayoutBinding.stageFlags = + (VkShaderStageFlags)(VkShaderStageFlagBits)shaderStage; + m_vkDescriptorSetLayoutBinding.pImmutableSamplers = NULL; +} + +VulkanDescriptorSetLayoutBinding::~VulkanDescriptorSetLayoutBinding() {} + +VulkanDescriptorSetLayoutBinding::operator VkDescriptorSetLayoutBinding() const +{ + return m_vkDescriptorSetLayoutBinding; +} + +////////////////////////////////////////////// +// VulkanDescriptorSetLayout implementation // +////////////////////////////////////////////// + +VulkanDescriptorSetLayout::VulkanDescriptorSetLayout( + const VulkanDescriptorSetLayout &descriptorSetLayout) + : m_device(descriptorSetLayout.m_device), + m_vkDescriptorSetLayout(descriptorSetLayout.m_vkDescriptorSetLayout) +{} + +void VulkanDescriptorSetLayout::VulkanDescriptorSetLayoutCommon( + const VulkanDescriptorSetLayoutBindingList &descriptorSetLayoutBindingList) +{ + VkDescriptorSetLayoutCreateInfo vkDescriptorSetLayoutCreateInfo = {}; + vkDescriptorSetLayoutCreateInfo.sType = + VK_STRUCTURE_TYPE_DESCRIPTOR_SET_LAYOUT_CREATE_INFO; + vkDescriptorSetLayoutCreateInfo.pNext = NULL; + vkDescriptorSetLayoutCreateInfo.flags = 0; + vkDescriptorSetLayoutCreateInfo.bindingCount = + (uint32_t)descriptorSetLayoutBindingList.size(); + vkDescriptorSetLayoutCreateInfo.pBindings = + descriptorSetLayoutBindingList(); + + 
vkCreateDescriptorSetLayout(m_device, &vkDescriptorSetLayoutCreateInfo, + NULL, &m_vkDescriptorSetLayout); +} + +VulkanDescriptorSetLayout::VulkanDescriptorSetLayout( + const VulkanDevice &device, + const VulkanDescriptorSetLayoutBinding &descriptorSetLayoutBinding) + : m_device(device), m_vkDescriptorSetLayout(VK_NULL_HANDLE) +{ + VulkanDescriptorSetLayoutBindingList descriptorSetLayoutBindingList; + descriptorSetLayoutBindingList.add(descriptorSetLayoutBinding); + + VulkanDescriptorSetLayoutCommon(descriptorSetLayoutBindingList); +} + +VulkanDescriptorSetLayout::VulkanDescriptorSetLayout( + const VulkanDevice &device, + const VulkanDescriptorSetLayoutBinding &descriptorSetLayoutBinding0, + const VulkanDescriptorSetLayoutBinding &descriptorSetLayoutBinding1) + : m_device(device), m_vkDescriptorSetLayout(VK_NULL_HANDLE) +{ + VulkanDescriptorSetLayoutBindingList descriptorSetLayoutBindingList; + descriptorSetLayoutBindingList.add(descriptorSetLayoutBinding0); + descriptorSetLayoutBindingList.add(descriptorSetLayoutBinding1); + + VulkanDescriptorSetLayoutCommon(descriptorSetLayoutBindingList); +} + +VulkanDescriptorSetLayout::VulkanDescriptorSetLayout( + const VulkanDevice &device, + const VulkanDescriptorSetLayoutBindingList &descriptorSetLayoutBindingList) + : m_device(device), m_vkDescriptorSetLayout(VK_NULL_HANDLE) +{ + VulkanDescriptorSetLayoutCommon(descriptorSetLayoutBindingList); +} + +VulkanDescriptorSetLayout::~VulkanDescriptorSetLayout() +{ + if (m_vkDescriptorSetLayout != VK_NULL_HANDLE) + { + vkDestroyDescriptorSetLayout(m_device, m_vkDescriptorSetLayout, NULL); + } +} + +VulkanDescriptorSetLayout::operator VkDescriptorSetLayout() const +{ + return m_vkDescriptorSetLayout; +} + +///////////////////////////////////////// +// VulkanPipelineLayout implementation // +///////////////////////////////////////// + +VulkanPipelineLayout::VulkanPipelineLayout( + const VulkanPipelineLayout &pipelineLayout) + : m_device(pipelineLayout.m_device), + 
m_vkPipelineLayout(pipelineLayout.m_vkPipelineLayout) +{} + +void VulkanPipelineLayout::VulkanPipelineLayoutCommon( + const VulkanDescriptorSetLayoutList &descriptorSetLayoutList) +{ + VkPipelineLayoutCreateInfo vkPipelineLayoutCreateInfo = {}; + vkPipelineLayoutCreateInfo.sType = + VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO; + vkPipelineLayoutCreateInfo.pNext = NULL; + vkPipelineLayoutCreateInfo.flags = 0; + vkPipelineLayoutCreateInfo.setLayoutCount = + (uint32_t)descriptorSetLayoutList.size(); + vkPipelineLayoutCreateInfo.pSetLayouts = descriptorSetLayoutList(); + vkPipelineLayoutCreateInfo.pushConstantRangeCount = 0; + vkPipelineLayoutCreateInfo.pPushConstantRanges = NULL; + + vkCreatePipelineLayout(m_device, &vkPipelineLayoutCreateInfo, NULL, + &m_vkPipelineLayout); +} + +VulkanPipelineLayout::VulkanPipelineLayout( + const VulkanDevice &device, + const VulkanDescriptorSetLayout &descriptorSetLayout) + : m_device(device), m_vkPipelineLayout(VK_NULL_HANDLE) +{ + VulkanDescriptorSetLayoutList descriptorSetLayoutList; + descriptorSetLayoutList.add(descriptorSetLayout); + + VulkanPipelineLayoutCommon(descriptorSetLayoutList); +} + +VulkanPipelineLayout::VulkanPipelineLayout( + const VulkanDevice &device, + const VulkanDescriptorSetLayoutList &descriptorSetLayoutList) + : m_device(device), m_vkPipelineLayout(VK_NULL_HANDLE) +{ + VulkanPipelineLayoutCommon(descriptorSetLayoutList); +} + +VulkanPipelineLayout::~VulkanPipelineLayout() +{ + vkDestroyPipelineLayout(m_device, m_vkPipelineLayout, NULL); +} + +VulkanPipelineLayout::operator VkPipelineLayout() const +{ + return m_vkPipelineLayout; +} + +/////////////////////////////////////// +// VulkanShaderModule implementation // +/////////////////////////////////////// + +VulkanShaderModule::VulkanShaderModule(const VulkanShaderModule &shaderModule) + : m_device(shaderModule.m_device), + m_vkShaderModule(shaderModule.m_vkShaderModule) +{} + +VulkanShaderModule::VulkanShaderModule(const VulkanDevice &device, + const 
std::vector<char> &code) + : m_device(device) +{ + + VkShaderModuleCreateInfo vkShaderModuleCreateInfo = {}; + vkShaderModuleCreateInfo.sType = + VK_STRUCTURE_TYPE_SHADER_MODULE_CREATE_INFO; + vkShaderModuleCreateInfo.pNext = NULL; + vkShaderModuleCreateInfo.flags = 0; + vkShaderModuleCreateInfo.codeSize = code.size(); + vkShaderModuleCreateInfo.pCode = + reinterpret_cast<const uint32_t *>(code.data()); + + vkCreateShaderModule(m_device, &vkShaderModuleCreateInfo, NULL, + &m_vkShaderModule); +} + +VulkanShaderModule::~VulkanShaderModule() +{ + vkDestroyShaderModule(m_device, m_vkShaderModule, NULL); +} + +VulkanShaderModule::operator VkShaderModule() const { return m_vkShaderModule; } + +/////////////////////////////////// +// VulkanPipeline implementation // +/////////////////////////////////// + +VulkanPipeline::VulkanPipeline(const VulkanPipeline &pipeline) + : m_device(pipeline.m_device), m_vkPipeline(pipeline.m_vkPipeline) +{} + +VulkanPipeline::VulkanPipeline(const VulkanDevice &device) + : m_device(device), m_vkPipeline(VK_NULL_HANDLE) +{} + +VulkanPipeline::~VulkanPipeline() +{ + vkDestroyPipeline(m_device, m_vkPipeline, NULL); +} + +VulkanPipeline::operator VkPipeline() const { return m_vkPipeline; } + +////////////////////////////////////////// +// VulkanComputePipeline implementation // +////////////////////////////////////////// + +VulkanComputePipeline::VulkanComputePipeline( + const VulkanComputePipeline &computePipeline) + : VulkanPipeline(computePipeline) +{} + +VulkanComputePipeline::VulkanComputePipeline( + const VulkanDevice &device, const VulkanPipelineLayout &pipelineLayout, + const VulkanShaderModule &shaderModule, const std::string &entryFuncName) + : VulkanPipeline(device) +{ + VkPipelineShaderStageCreateInfo vkPipelineShaderStageCreateInfo = {}; + vkPipelineShaderStageCreateInfo.sType = + VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO; + vkPipelineShaderStageCreateInfo.pNext = NULL; + vkPipelineShaderStageCreateInfo.flags = 0; + 
vkPipelineShaderStageCreateInfo.stage = VK_SHADER_STAGE_COMPUTE_BIT; + vkPipelineShaderStageCreateInfo.module = shaderModule; + vkPipelineShaderStageCreateInfo.pName = entryFuncName.c_str(); + vkPipelineShaderStageCreateInfo.pSpecializationInfo = NULL; + + VkComputePipelineCreateInfo vkComputePipelineCreateInfo = {}; + vkComputePipelineCreateInfo.sType = + VK_STRUCTURE_TYPE_COMPUTE_PIPELINE_CREATE_INFO; + vkComputePipelineCreateInfo.pNext = NULL; + vkComputePipelineCreateInfo.flags = 0; + vkComputePipelineCreateInfo.stage = vkPipelineShaderStageCreateInfo; + vkComputePipelineCreateInfo.layout = pipelineLayout; + vkComputePipelineCreateInfo.basePipelineHandle = VK_NULL_HANDLE; + vkComputePipelineCreateInfo.basePipelineIndex = 0; + + vkCreateComputePipelines(device, VK_NULL_HANDLE, 1, + &vkComputePipelineCreateInfo, NULL, &m_vkPipeline); +} + +VulkanComputePipeline::~VulkanComputePipeline() {} + +VulkanPipelineBindPoint VulkanComputePipeline::getPipelineBindPoint() const +{ + return VULKAN_PIPELINE_BIND_POINT_COMPUTE; +} + +///////////////////////////////////////// +// VulkanDescriptorPool implementation // +///////////////////////////////////////// + +VulkanDescriptorPool::VulkanDescriptorPool( + const VulkanDescriptorPool &descriptorPool) + : m_device(descriptorPool.m_device), + m_vkDescriptorPool(descriptorPool.m_vkDescriptorPool) +{} + +void VulkanDescriptorPool::VulkanDescriptorPoolCommon( + const VulkanDescriptorSetLayoutBindingList &descriptorSetLayoutBindingList) +{ + if (descriptorSetLayoutBindingList.size()) + { + std::map<VkDescriptorType, uint32_t> + vkDescriptorTypeToDescriptorCountMap; + + for (size_t dslbIdx = 0; + dslbIdx < descriptorSetLayoutBindingList.size(); dslbIdx++) + { + VkDescriptorSetLayoutBinding vkDescriptorSetLayoutBinding = + descriptorSetLayoutBindingList[dslbIdx]; + if (vkDescriptorTypeToDescriptorCountMap.find( + vkDescriptorSetLayoutBinding.descriptorType) + == vkDescriptorTypeToDescriptorCountMap.end()) + { + 
vkDescriptorTypeToDescriptorCountMap + [vkDescriptorSetLayoutBinding.descriptorType] = 1; + } + else + { + vkDescriptorTypeToDescriptorCountMap + [vkDescriptorSetLayoutBinding.descriptorType]++; + } + } + + std::vector<VkDescriptorPoolSize> vkDescriptorPoolSizeList; + std::map<VkDescriptorType, uint32_t>::iterator dtdcIt; + for (dtdcIt = vkDescriptorTypeToDescriptorCountMap.begin(); + dtdcIt != vkDescriptorTypeToDescriptorCountMap.end(); ++dtdcIt) + { + VkDescriptorPoolSize vkDescriptorPoolSize = {}; + vkDescriptorPoolSize.type = dtdcIt->first; + vkDescriptorPoolSize.descriptorCount = dtdcIt->second; + + vkDescriptorPoolSizeList.push_back(vkDescriptorPoolSize); + } + + VkDescriptorPoolCreateInfo vkDescriptorPoolCreateInfo = {}; + vkDescriptorPoolCreateInfo.sType = + VK_STRUCTURE_TYPE_DESCRIPTOR_POOL_CREATE_INFO; + vkDescriptorPoolCreateInfo.pNext = NULL; + vkDescriptorPoolCreateInfo.flags = + VK_DESCRIPTOR_POOL_CREATE_FREE_DESCRIPTOR_SET_BIT; + vkDescriptorPoolCreateInfo.maxSets = 1; + vkDescriptorPoolCreateInfo.poolSizeCount = + (uint32_t)vkDescriptorPoolSizeList.size(); + vkDescriptorPoolCreateInfo.pPoolSizes = vkDescriptorPoolSizeList.data(); + + vkCreateDescriptorPool(m_device, &vkDescriptorPoolCreateInfo, NULL, + &m_vkDescriptorPool); + } +} + +VulkanDescriptorPool::VulkanDescriptorPool( + const VulkanDevice &device, + const VulkanDescriptorSetLayoutBinding &descriptorSetLayoutBinding) + : m_device(device), m_vkDescriptorPool(VK_NULL_HANDLE) +{ + VulkanDescriptorSetLayoutBindingList descriptorSetLayoutBindingList; + descriptorSetLayoutBindingList.add(descriptorSetLayoutBinding); + + VulkanDescriptorPoolCommon(descriptorSetLayoutBindingList); +} + +VulkanDescriptorPool::VulkanDescriptorPool( + const VulkanDevice &device, + const VulkanDescriptorSetLayoutBinding &descriptorSetLayoutBinding0, + const VulkanDescriptorSetLayoutBinding &descriptorSetLayoutBinding1) + : m_device(device), m_vkDescriptorPool(VK_NULL_HANDLE) +{ + VulkanDescriptorSetLayoutBindingList 
descriptorSetLayoutBindingList; + descriptorSetLayoutBindingList.add(descriptorSetLayoutBinding0); + descriptorSetLayoutBindingList.add(descriptorSetLayoutBinding1); + + VulkanDescriptorPoolCommon(descriptorSetLayoutBindingList); +} + +VulkanDescriptorPool::VulkanDescriptorPool( + const VulkanDevice &device, + const VulkanDescriptorSetLayoutBindingList &descriptorSetLayoutBindingList) + : m_device(device), m_vkDescriptorPool(VK_NULL_HANDLE) +{ + VulkanDescriptorPoolCommon(descriptorSetLayoutBindingList); +} + +VulkanDescriptorPool::~VulkanDescriptorPool() +{ + if (m_vkDescriptorPool != VK_NULL_HANDLE) + { + vkDestroyDescriptorPool(m_device, m_vkDescriptorPool, NULL); + } +} + +VulkanDescriptorPool::operator VkDescriptorPool() const +{ + return m_vkDescriptorPool; +} + +//////////////////////////////////////// +// VulkanDescriptorSet implementation // +//////////////////////////////////////// + +VulkanDescriptorSet::VulkanDescriptorSet( + const VulkanDescriptorSet &descriptorSet) + : m_device(descriptorSet.m_device), + m_descriptorPool(descriptorSet.m_descriptorPool), + m_vkDescriptorSet(descriptorSet.m_vkDescriptorSet) +{} + +VulkanDescriptorSet::VulkanDescriptorSet( + const VulkanDevice &device, const VulkanDescriptorPool &descriptorPool, + const VulkanDescriptorSetLayout &descriptorSetLayout) + : m_device(device), m_descriptorPool(descriptorPool), + m_vkDescriptorSet(VK_NULL_HANDLE) +{ + VkDescriptorSetLayout vkDescriptorSetLayout = descriptorSetLayout; + + if ((VkDescriptorPool)m_descriptorPool) + { + VkDescriptorSetAllocateInfo vkDescriptorSetAllocateInfo = {}; + vkDescriptorSetAllocateInfo.sType = + VK_STRUCTURE_TYPE_DESCRIPTOR_SET_ALLOCATE_INFO; + vkDescriptorSetAllocateInfo.pNext = NULL; + vkDescriptorSetAllocateInfo.descriptorPool = descriptorPool; + vkDescriptorSetAllocateInfo.descriptorSetCount = 1; + vkDescriptorSetAllocateInfo.pSetLayouts = &vkDescriptorSetLayout; + + vkAllocateDescriptorSets(m_device, &vkDescriptorSetAllocateInfo, + 
&m_vkDescriptorSet); + } +} + +VulkanDescriptorSet::~VulkanDescriptorSet() +{ + if ((VkDescriptorPool)m_descriptorPool) + { + vkFreeDescriptorSets(m_device, m_descriptorPool, 1, &m_vkDescriptorSet); + } +} + +void VulkanDescriptorSet::update(uint32_t binding, const VulkanBuffer &buffer) +{ + VkDescriptorBufferInfo vkDescriptorBufferInfo = {}; + vkDescriptorBufferInfo.buffer = buffer; + vkDescriptorBufferInfo.offset = 0; + vkDescriptorBufferInfo.range = VK_WHOLE_SIZE; + + VkWriteDescriptorSet vkWriteDescriptorSet = {}; + vkWriteDescriptorSet.sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET; + vkWriteDescriptorSet.pNext = NULL; + vkWriteDescriptorSet.dstSet = m_vkDescriptorSet; + vkWriteDescriptorSet.dstBinding = binding; + vkWriteDescriptorSet.dstArrayElement = 0; + vkWriteDescriptorSet.descriptorCount = 1; + vkWriteDescriptorSet.descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER; + vkWriteDescriptorSet.pImageInfo = NULL; + vkWriteDescriptorSet.pBufferInfo = &vkDescriptorBufferInfo; + vkWriteDescriptorSet.pTexelBufferView = NULL; + + vkUpdateDescriptorSets(m_device, 1, &vkWriteDescriptorSet, 0, NULL); +} + +void VulkanDescriptorSet::update(uint32_t binding, + const VulkanImageView &imageView) +{ + VkDescriptorImageInfo vkDescriptorImageInfo = {}; + vkDescriptorImageInfo.sampler = VK_NULL_HANDLE; + vkDescriptorImageInfo.imageView = imageView; + vkDescriptorImageInfo.imageLayout = VK_IMAGE_LAYOUT_GENERAL; + + VkWriteDescriptorSet vkWriteDescriptorSet = {}; + vkWriteDescriptorSet.sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET; + vkWriteDescriptorSet.pNext = NULL; + vkWriteDescriptorSet.dstSet = m_vkDescriptorSet; + vkWriteDescriptorSet.dstBinding = binding; + vkWriteDescriptorSet.dstArrayElement = 0; + vkWriteDescriptorSet.descriptorCount = 1; + vkWriteDescriptorSet.descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_IMAGE; + vkWriteDescriptorSet.pImageInfo = &vkDescriptorImageInfo; + vkWriteDescriptorSet.pBufferInfo = NULL; + vkWriteDescriptorSet.pTexelBufferView = NULL; + + 
vkUpdateDescriptorSets(m_device, 1, &vkWriteDescriptorSet, 0, NULL); +} + +VulkanDescriptorSet::operator VkDescriptorSet() const +{ + return m_vkDescriptorSet; +} + +/////////////////////////////////// +// VulkanOffset3D implementation // +/////////////////////////////////// + +VulkanOffset3D::VulkanOffset3D(const VulkanOffset3D &offset3D) + : m_vkOffset3D(offset3D.m_vkOffset3D) +{} + +VulkanOffset3D::VulkanOffset3D(uint32_t x, uint32_t y, uint32_t z) +{ + m_vkOffset3D.x = x; + m_vkOffset3D.y = y; + m_vkOffset3D.z = z; +} + +VulkanOffset3D::~VulkanOffset3D() {} + +uint32_t VulkanOffset3D::getX() const { return m_vkOffset3D.x; } + +uint32_t VulkanOffset3D::getY() const { return m_vkOffset3D.y; } + +uint32_t VulkanOffset3D::getZ() const { return m_vkOffset3D.z; } + +VulkanOffset3D::operator VkOffset3D() const { return m_vkOffset3D; } + +/////////////////////////////////// +// VulkanExtent3D implementation // +/////////////////////////////////// + +VulkanExtent3D::VulkanExtent3D(const VulkanExtent3D &extent3D) + : m_vkExtent3D(extent3D.m_vkExtent3D) +{} + +VulkanExtent3D::VulkanExtent3D(uint32_t width, uint32_t height, uint32_t depth) +{ + m_vkExtent3D.width = width; + m_vkExtent3D.height = height; + m_vkExtent3D.depth = depth; +} + +VulkanExtent3D::~VulkanExtent3D() {} + +uint32_t VulkanExtent3D::getWidth() const { return m_vkExtent3D.width; } + +uint32_t VulkanExtent3D::getHeight() const { return m_vkExtent3D.height; } + +uint32_t VulkanExtent3D::getDepth() const { return m_vkExtent3D.depth; } + +VulkanExtent3D::operator VkExtent3D() const { return m_vkExtent3D; } + +////////////////////////////////////// +// VulkanCommandPool implementation // +////////////////////////////////////// + +VulkanCommandPool::VulkanCommandPool(const VulkanCommandPool &commandPool) + : m_device(commandPool.m_device), + m_vkCommandPool(commandPool.m_vkCommandPool) +{} + +VulkanCommandPool::VulkanCommandPool(const VulkanDevice &device, + const VulkanQueueFamily &queueFamily) + : 
m_device(device) +{ + VkCommandPoolCreateInfo vkCommandPoolCreateInfo = {}; + vkCommandPoolCreateInfo.sType = VK_STRUCTURE_TYPE_COMMAND_POOL_CREATE_INFO; + vkCommandPoolCreateInfo.pNext = NULL; + vkCommandPoolCreateInfo.flags = + VK_COMMAND_POOL_CREATE_RESET_COMMAND_BUFFER_BIT; + vkCommandPoolCreateInfo.queueFamilyIndex = queueFamily; + + vkCreateCommandPool(m_device, &vkCommandPoolCreateInfo, NULL, + &m_vkCommandPool); +} + +VulkanCommandPool::~VulkanCommandPool() +{ + vkDestroyCommandPool(m_device, m_vkCommandPool, NULL); +} + +VulkanCommandPool::operator VkCommandPool() const { return m_vkCommandPool; } + +//////////////////////////////////////// +// VulkanCommandBuffer implementation // +//////////////////////////////////////// + +VulkanCommandBuffer::VulkanCommandBuffer( + const VulkanCommandBuffer &commandBuffer) + : m_device(commandBuffer.m_device), + m_commandPool(commandBuffer.m_commandPool), + m_vkCommandBuffer(commandBuffer.m_vkCommandBuffer) +{} + +VulkanCommandBuffer::VulkanCommandBuffer(const VulkanDevice &device, + const VulkanCommandPool &commandPool) + : m_device(device), m_commandPool(commandPool) +{ + VkCommandBufferAllocateInfo vkCommandBufferAllocateInfo = {}; + vkCommandBufferAllocateInfo.sType = + VK_STRUCTURE_TYPE_COMMAND_BUFFER_ALLOCATE_INFO; + vkCommandBufferAllocateInfo.pNext = NULL; + vkCommandBufferAllocateInfo.commandPool = commandPool; + vkCommandBufferAllocateInfo.level = VK_COMMAND_BUFFER_LEVEL_PRIMARY; + vkCommandBufferAllocateInfo.commandBufferCount = 1; + + vkAllocateCommandBuffers(m_device, &vkCommandBufferAllocateInfo, + &m_vkCommandBuffer); +} + +VulkanCommandBuffer::~VulkanCommandBuffer() +{ + vkFreeCommandBuffers(m_device, m_commandPool, 1, &m_vkCommandBuffer); +} + +void VulkanCommandBuffer::begin() +{ + VkCommandBufferBeginInfo vkCommandBufferBeginInfo = {}; + vkCommandBufferBeginInfo.sType = + VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO; + vkCommandBufferBeginInfo.pNext = NULL; + vkCommandBufferBeginInfo.flags = + 
VK_COMMAND_BUFFER_USAGE_SIMULTANEOUS_USE_BIT; + vkCommandBufferBeginInfo.pInheritanceInfo = NULL; + + vkBeginCommandBuffer(m_vkCommandBuffer, &vkCommandBufferBeginInfo); +} + +void VulkanCommandBuffer::bindPipeline(const VulkanPipeline &pipeline) +{ + VkPipelineBindPoint vkPipelineBindPoint = + (VkPipelineBindPoint)pipeline.getPipelineBindPoint(); + + vkCmdBindPipeline(m_vkCommandBuffer, vkPipelineBindPoint, pipeline); +} + +void VulkanCommandBuffer::bindDescriptorSets( + const VulkanPipeline &pipeline, const VulkanPipelineLayout &pipelineLayout, + const VulkanDescriptorSet &descriptorSet) +{ + VkPipelineBindPoint vkPipelineBindPoint = + (VkPipelineBindPoint)pipeline.getPipelineBindPoint(); + VkDescriptorSet vkDescriptorSet = descriptorSet; + + vkCmdBindDescriptorSets(m_vkCommandBuffer, vkPipelineBindPoint, + pipelineLayout, 0, 1, &vkDescriptorSet, 0, NULL); +} + +void VulkanCommandBuffer::pipelineBarrier(const VulkanImage2DList &image2DList, + VulkanImageLayout oldImageLayout, + VulkanImageLayout newImageLayout) +{ + std::vector<VkImageMemoryBarrier> vkImageMemoryBarrierList; + for (size_t i2DIdx = 0; i2DIdx < image2DList.size(); i2DIdx++) + { + VkImageSubresourceRange vkImageSubresourceRange = {}; + vkImageSubresourceRange.aspectMask = VK_IMAGE_ASPECT_COLOR_BIT; + vkImageSubresourceRange.baseMipLevel = 0; + vkImageSubresourceRange.levelCount = VK_REMAINING_MIP_LEVELS; + vkImageSubresourceRange.baseArrayLayer = 0; + vkImageSubresourceRange.layerCount = VK_REMAINING_ARRAY_LAYERS; + + VkImageMemoryBarrier vkImageMemoryBarrier = {}; + vkImageMemoryBarrier.sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER; + vkImageMemoryBarrier.pNext = NULL; + vkImageMemoryBarrier.srcAccessMask = 0; + vkImageMemoryBarrier.dstAccessMask = 0; + vkImageMemoryBarrier.oldLayout = (VkImageLayout)oldImageLayout; + vkImageMemoryBarrier.newLayout = (VkImageLayout)newImageLayout; + vkImageMemoryBarrier.srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED; + vkImageMemoryBarrier.dstQueueFamilyIndex = 
VK_QUEUE_FAMILY_IGNORED; + vkImageMemoryBarrier.image = image2DList[i2DIdx]; + vkImageMemoryBarrier.subresourceRange = vkImageSubresourceRange; + + vkImageMemoryBarrierList.push_back(vkImageMemoryBarrier); + } + + vkCmdPipelineBarrier(m_vkCommandBuffer, VK_PIPELINE_STAGE_ALL_COMMANDS_BIT, + VK_PIPELINE_STAGE_ALL_COMMANDS_BIT, 0, 0, NULL, 0, + NULL, (uint32_t)vkImageMemoryBarrierList.size(), + vkImageMemoryBarrierList.data()); +} + +void VulkanCommandBuffer::dispatch(uint32_t groupCountX, uint32_t groupCountY, + uint32_t groupCountZ) +{ + vkCmdDispatch(m_vkCommandBuffer, groupCountX, groupCountY, groupCountZ); +} + +void VulkanCommandBuffer::fillBuffer(const VulkanBuffer &buffer, uint32_t data, + uint64_t offset, uint64_t size) +{ + vkCmdFillBuffer(m_vkCommandBuffer, buffer, offset, size, data); +} + +void VulkanCommandBuffer::updateBuffer(const VulkanBuffer &buffer, void *pdata, + uint64_t offset, uint64_t size) +{ + vkCmdUpdateBuffer(m_vkCommandBuffer, buffer, offset, size, pdata); +} + +void VulkanCommandBuffer::copyBufferToImage(const VulkanBuffer &buffer, + const VulkanImage &image, + VulkanImageLayout imageLayout) +{ + VkDeviceSize bufferOffset = 0; + + std::vector<VkBufferImageCopy> vkBufferImageCopyList; + for (uint32_t mipLevel = 0; mipLevel < image.getNumMipLevels(); mipLevel++) + { + VulkanExtent3D extent3D = image.getExtent3D(mipLevel); + size_t elementSize = getVulkanFormatElementSize(image.getFormat()); + + VkImageSubresourceLayers vkImageSubresourceLayers = {}; + vkImageSubresourceLayers.aspectMask = VK_IMAGE_ASPECT_COLOR_BIT; + vkImageSubresourceLayers.mipLevel = mipLevel; + vkImageSubresourceLayers.baseArrayLayer = 0; + vkImageSubresourceLayers.layerCount = image.getNumLayers(); + + VkBufferImageCopy vkBufferImageCopy = {}; + vkBufferImageCopy.bufferOffset = bufferOffset; + vkBufferImageCopy.bufferRowLength = 0; + vkBufferImageCopy.bufferImageHeight = 0; + vkBufferImageCopy.imageSubresource = vkImageSubresourceLayers; + vkBufferImageCopy.imageOffset 
= VulkanOffset3D(0, 0, 0); + vkBufferImageCopy.imageExtent = extent3D; + + vkBufferImageCopyList.push_back(vkBufferImageCopy); + + bufferOffset += extent3D.getWidth() * extent3D.getHeight() + * extent3D.getDepth() * elementSize; + bufferOffset = + ROUND_UP(bufferOffset, + std::max(elementSize, + (size_t)VULKAN_MIN_BUFFER_OFFSET_COPY_ALIGNMENT)); + } + + vkCmdCopyBufferToImage( + m_vkCommandBuffer, buffer, image, (VkImageLayout)imageLayout, + (uint32_t)vkBufferImageCopyList.size(), vkBufferImageCopyList.data()); +} + +void VulkanCommandBuffer::copyBufferToImage( + const VulkanBuffer &buffer, const VulkanImage &image, uint64_t bufferOffset, + uint32_t mipLevel, uint32_t baseArrayLayer, uint32_t layerCount, + VulkanOffset3D offset3D, VulkanExtent3D extent3D) +{ + VkImageSubresourceLayers vkImageSubresourceLayers = {}; + vkImageSubresourceLayers.aspectMask = VK_IMAGE_ASPECT_COLOR_BIT; + vkImageSubresourceLayers.mipLevel = mipLevel; + vkImageSubresourceLayers.baseArrayLayer = baseArrayLayer; + vkImageSubresourceLayers.layerCount = layerCount; + + VkExtent3D vkExtent3D = extent3D; + if ((extent3D.getWidth() == 0) && (extent3D.getHeight() == 0) + && (extent3D.getDepth() == 0)) + { + vkExtent3D = image.getExtent3D(mipLevel); + } + + VkBufferImageCopy vkBufferImageCopy = {}; + vkBufferImageCopy.bufferOffset = bufferOffset; + vkBufferImageCopy.bufferRowLength = 0; + vkBufferImageCopy.bufferImageHeight = 0; + vkBufferImageCopy.imageSubresource = vkImageSubresourceLayers; + vkBufferImageCopy.imageOffset = offset3D; + vkBufferImageCopy.imageExtent = vkExtent3D; + + vkCmdCopyBufferToImage(m_vkCommandBuffer, buffer, image, + VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL, 1, + &vkBufferImageCopy); +} + +void VulkanCommandBuffer::copyImageToBuffer( + const VulkanImage &image, const VulkanBuffer &buffer, uint64_t bufferOffset, + uint32_t mipLevel, uint32_t baseArrayLayer, uint32_t layerCount, + VulkanOffset3D offset3D, VulkanExtent3D extent3D) +{ + VkImageSubresourceLayers 
vkImageSubresourceLayers = {}; + vkImageSubresourceLayers.aspectMask = VK_IMAGE_ASPECT_COLOR_BIT; + vkImageSubresourceLayers.mipLevel = mipLevel; + vkImageSubresourceLayers.baseArrayLayer = baseArrayLayer; + vkImageSubresourceLayers.layerCount = layerCount; + + VkExtent3D vkExtent3D = extent3D; + if ((extent3D.getWidth() == 0) && (extent3D.getHeight() == 0) + && (extent3D.getDepth() == 0)) + { + vkExtent3D = image.getExtent3D(mipLevel); + } + + VkBufferImageCopy vkBufferImageCopy = {}; + vkBufferImageCopy.bufferOffset = bufferOffset; + vkBufferImageCopy.bufferRowLength = 0; + vkBufferImageCopy.bufferImageHeight = 0; + vkBufferImageCopy.imageSubresource = vkImageSubresourceLayers; + vkBufferImageCopy.imageOffset = offset3D; + vkBufferImageCopy.imageExtent = vkExtent3D; + + vkCmdCopyImageToBuffer(m_vkCommandBuffer, image, VK_IMAGE_LAYOUT_GENERAL, + buffer, 1, &vkBufferImageCopy); +} + +void VulkanCommandBuffer::end() { vkEndCommandBuffer(m_vkCommandBuffer); } + +VulkanCommandBuffer::operator VkCommandBuffer() const +{ + return m_vkCommandBuffer; +} + +///////////////////////////////// +// VulkanBuffer implementation // +///////////////////////////////// + +VulkanBuffer::VulkanBuffer(const VulkanBuffer &buffer) + : m_device(buffer.m_device), m_vkBuffer(buffer.m_vkBuffer), + m_size(buffer.m_size), m_alignment(buffer.m_alignment), + m_memoryTypeList(buffer.m_memoryTypeList) +{} + +VulkanBuffer::VulkanBuffer( + const VulkanDevice &device, uint64_t size, + VulkanExternalMemoryHandleType externalMemoryHandleType, + VulkanBufferUsage bufferUsage, VulkanSharingMode sharingMode, + const VulkanQueueFamilyList &queueFamilyList) + : m_device(device), m_vkBuffer(VK_NULL_HANDLE) +{ + std::vector<uint32_t> queueFamilyIndexList; + if (queueFamilyList.size() == 0) + { + for (size_t qfIdx = 0; + qfIdx < device.getPhysicalDevice().getQueueFamilyList().size(); + qfIdx++) + { + queueFamilyIndexList.push_back( + device.getPhysicalDevice().getQueueFamilyList()[qfIdx]); + } + } + else + { + 
for (size_t qfIdx = 0; qfIdx < queueFamilyList.size(); qfIdx++) + { + queueFamilyIndexList.push_back(queueFamilyList[qfIdx]); + } + } + + VkBufferCreateInfo vkBufferCreateInfo = {}; + vkBufferCreateInfo.sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO; + vkBufferCreateInfo.pNext = NULL; + vkBufferCreateInfo.flags = 0; + vkBufferCreateInfo.size = (VkDeviceSize)size; + vkBufferCreateInfo.usage = (VkBufferUsageFlags)bufferUsage; + vkBufferCreateInfo.sharingMode = (VkSharingMode)sharingMode; + vkBufferCreateInfo.queueFamilyIndexCount = + (uint32_t)queueFamilyIndexList.size(); + vkBufferCreateInfo.pQueueFamilyIndices = queueFamilyIndexList.data(); + + VkExternalMemoryBufferCreateInfo vkExternalMemoryBufferCreateInfo = {}; + if (externalMemoryHandleType != VULKAN_EXTERNAL_MEMORY_HANDLE_TYPE_NONE) + { + vkExternalMemoryBufferCreateInfo.sType = + VK_STRUCTURE_TYPE_EXTERNAL_MEMORY_BUFFER_CREATE_INFO_KHR; + vkExternalMemoryBufferCreateInfo.pNext = NULL; + vkExternalMemoryBufferCreateInfo.handleTypes = + (VkExternalMemoryHandleTypeFlags)externalMemoryHandleType; + + vkBufferCreateInfo.pNext = &vkExternalMemoryBufferCreateInfo; + } + + vkCreateBuffer(m_device, &vkBufferCreateInfo, NULL, &m_vkBuffer); + + VkMemoryRequirements vkMemoryRequirements = {}; + vkGetBufferMemoryRequirements(m_device, m_vkBuffer, &vkMemoryRequirements); + m_size = vkMemoryRequirements.size; + m_alignment = vkMemoryRequirements.alignment; + const VulkanMemoryTypeList &memoryTypeList = + m_device.getPhysicalDevice().getMemoryTypeList(); + for (size_t mtIdx = 0; mtIdx < memoryTypeList.size(); mtIdx++) + { + uint32_t memoryTypeIndex = memoryTypeList[mtIdx]; + if ((1 << memoryTypeIndex) & vkMemoryRequirements.memoryTypeBits) + { + m_memoryTypeList.add(memoryTypeList[mtIdx]); + } + } +} + +VulkanBuffer::~VulkanBuffer() { vkDestroyBuffer(m_device, m_vkBuffer, NULL); } + +uint64_t VulkanBuffer::getSize() const { return m_size; } + +uint64_t VulkanBuffer::getAlignment() const { return m_alignment; } + +const 
VulkanMemoryTypeList &VulkanBuffer::getMemoryTypeList() const +{ + return m_memoryTypeList; +} + +VulkanBuffer::operator VkBuffer() const { return m_vkBuffer; } + +//////////////////////////////// +// VulkanImage implementation // +//////////////////////////////// + +VulkanImage::VulkanImage(const VulkanImage &image) + : m_device(image.m_device), m_imageType(image.m_imageType), + m_extent3D(image.m_extent3D), m_format(image.m_format), + m_numMipLevels(image.m_numMipLevels), m_numLayers(image.m_numLayers), + m_vkImage(image.m_vkImage), m_size(image.m_size), + m_alignment(image.m_alignment), m_memoryTypeList(image.m_memoryTypeList) +{} + +VulkanImage::VulkanImage( + const VulkanDevice &device, VulkanImageType imageType, VulkanFormat format, + const VulkanExtent3D &extent3D, uint32_t numMipLevels, uint32_t arrayLayers, + VulkanExternalMemoryHandleType externalMemoryHandleType, + VulkanImageCreateFlag imageCreateFlag, VulkanImageTiling imageTiling, + VulkanImageUsage imageUsage, VulkanSharingMode sharingMode) + : m_device(device), m_imageType(imageType), m_extent3D(extent3D), + m_format(format), m_numMipLevels(numMipLevels), m_numLayers(arrayLayers), + m_vkImage(VK_NULL_HANDLE) +{ + VkImageCreateInfo vkImageCreateInfo = {}; + vkImageCreateInfo.sType = VK_STRUCTURE_TYPE_IMAGE_CREATE_INFO; + vkImageCreateInfo.pNext = NULL; + vkImageCreateInfo.flags = (VkImageCreateFlags)imageCreateFlag; + vkImageCreateInfo.imageType = (VkImageType)imageType; + vkImageCreateInfo.format = (VkFormat)format; + vkImageCreateInfo.extent = extent3D; + vkImageCreateInfo.mipLevels = numMipLevels; + vkImageCreateInfo.arrayLayers = arrayLayers; + vkImageCreateInfo.samples = VK_SAMPLE_COUNT_1_BIT; + vkImageCreateInfo.tiling = (VkImageTiling)imageTiling; + vkImageCreateInfo.usage = (VkImageUsageFlags)imageUsage; + vkImageCreateInfo.sharingMode = (VkSharingMode)sharingMode; + vkImageCreateInfo.queueFamilyIndexCount = + (uint32_t)m_device.getPhysicalDevice().getQueueFamilyList().size(); + 
vkImageCreateInfo.pQueueFamilyIndices = + m_device.getPhysicalDevice().getQueueFamilyList()(); + vkImageCreateInfo.initialLayout = VK_IMAGE_LAYOUT_UNDEFINED; + + VkExternalMemoryImageCreateInfo vkExternalMemoryImageCreateInfo = {}; + if (externalMemoryHandleType != VULKAN_EXTERNAL_MEMORY_HANDLE_TYPE_NONE) + { + vkExternalMemoryImageCreateInfo.sType = + VK_STRUCTURE_TYPE_EXTERNAL_MEMORY_IMAGE_CREATE_INFO; + vkExternalMemoryImageCreateInfo.pNext = NULL; + vkExternalMemoryImageCreateInfo.handleTypes = + (VkExternalMemoryHandleTypeFlags)externalMemoryHandleType; + + vkImageCreateInfo.pNext = &vkExternalMemoryImageCreateInfo; + } + + vkCreateImage(m_device, &vkImageCreateInfo, NULL, &m_vkImage); + VulkanImageCreateInfo = vkImageCreateInfo; + VkMemoryRequirements vkMemoryRequirements = {}; + vkGetImageMemoryRequirements(m_device, m_vkImage, &vkMemoryRequirements); + m_size = vkMemoryRequirements.size; + m_alignment = vkMemoryRequirements.alignment; + const VulkanMemoryTypeList &memoryTypeList = + m_device.getPhysicalDevice().getMemoryTypeList(); + for (size_t mtIdx = 0; mtIdx < memoryTypeList.size(); mtIdx++) + { + uint32_t memoryTypeIndex = memoryTypeList[mtIdx]; + if ((1 << memoryTypeIndex) & vkMemoryRequirements.memoryTypeBits) + { + m_memoryTypeList.add(memoryTypeList[mtIdx]); + } + } +} + +VulkanImage::~VulkanImage() { vkDestroyImage(m_device, m_vkImage, NULL); } + +VulkanExtent3D VulkanImage::getExtent3D(uint32_t mipLevel) const +{ + return VulkanExtent3D(0, 0, 0); +} + +VulkanFormat VulkanImage::getFormat() const { return m_format; } + +VkImageCreateInfo VulkanImage::getVkImageCreateInfo() const +{ + return VulkanImageCreateInfo; +} + +uint32_t VulkanImage::getNumMipLevels() const { return m_numMipLevels; } + +uint32_t VulkanImage::getNumLayers() const { return m_numLayers; } + +uint64_t VulkanImage::getSize() const { return m_size; } + +uint64_t VulkanImage::getAlignment() const { return m_alignment; } + +const VulkanMemoryTypeList 
&VulkanImage::getMemoryTypeList() const +{ + return m_memoryTypeList; +} + +VulkanImage::operator VkImage() const { return m_vkImage; } + +////////////////////////////////// +// VulkanImage2D implementation // +////////////////////////////////// + +VulkanImage2D::VulkanImage2D(const VulkanImage2D &image2D): VulkanImage(image2D) +{} + +VulkanImage2D::VulkanImage2D( + const VulkanDevice &device, VulkanFormat format, uint32_t width, + uint32_t height, uint32_t numMipLevels, + VulkanExternalMemoryHandleType externalMemoryHandleType, + VulkanImageCreateFlag imageCreateFlag, VulkanImageUsage imageUsage, + VulkanSharingMode sharingMode) + : VulkanImage(device, VULKAN_IMAGE_TYPE_2D, format, + VulkanExtent3D(width, height, 1), numMipLevels, 1, + externalMemoryHandleType, imageCreateFlag, + VULKAN_IMAGE_TILING_OPTIMAL, imageUsage, sharingMode) +{} + +VulkanImage2D::~VulkanImage2D() {} + +VulkanExtent3D VulkanImage2D::getExtent3D(uint32_t mipLevel) const +{ + uint32_t width = std::max(m_extent3D.getWidth() >> mipLevel, uint32_t(1)); + uint32_t height = std::max(m_extent3D.getHeight() >> mipLevel, uint32_t(1)); + uint32_t depth = 1; + + return VulkanExtent3D(width, height, depth); +} + +//////////////////////////////////// +// VulkanImageView implementation // +//////////////////////////////////// + +VulkanImageView::VulkanImageView(const VulkanImageView &imageView) + : m_device(imageView.m_device), m_vkImageView(imageView.m_vkImageView) +{} + +VulkanImageView::VulkanImageView(const VulkanDevice &device, + const VulkanImage &image, + VulkanImageViewType imageViewType, + uint32_t baseMipLevel, uint32_t levelCount, + uint32_t baseArrayLayer, uint32_t layerCount) + : m_device(device), m_vkImageView(VK_NULL_HANDLE) +{ + VkComponentMapping vkComponentMapping = {}; + vkComponentMapping.r = VK_COMPONENT_SWIZZLE_IDENTITY; + vkComponentMapping.g = VK_COMPONENT_SWIZZLE_IDENTITY; + vkComponentMapping.b = VK_COMPONENT_SWIZZLE_IDENTITY; + vkComponentMapping.a = 
VK_COMPONENT_SWIZZLE_IDENTITY; + + VkImageSubresourceRange vkImageSubresourceRange = {}; + vkImageSubresourceRange.aspectMask = VK_IMAGE_ASPECT_COLOR_BIT; + vkImageSubresourceRange.baseMipLevel = baseMipLevel; + vkImageSubresourceRange.levelCount = levelCount; + vkImageSubresourceRange.baseArrayLayer = baseArrayLayer; + vkImageSubresourceRange.layerCount = layerCount; + + VkImageViewCreateInfo vkImageViewCreateInfo = {}; + vkImageViewCreateInfo.sType = VK_STRUCTURE_TYPE_IMAGE_VIEW_CREATE_INFO; + vkImageViewCreateInfo.pNext = NULL; + vkImageViewCreateInfo.flags = 0; + vkImageViewCreateInfo.image = image; + vkImageViewCreateInfo.viewType = (VkImageViewType)imageViewType; + vkImageViewCreateInfo.format = (VkFormat)image.getFormat(); + vkImageViewCreateInfo.components = vkComponentMapping; + vkImageViewCreateInfo.subresourceRange = vkImageSubresourceRange; + + vkCreateImageView(m_device, &vkImageViewCreateInfo, NULL, &m_vkImageView); +} + +VulkanImageView::~VulkanImageView() +{ + vkDestroyImageView(m_device, m_vkImageView, NULL); +} + +VulkanImageView::operator VkImageView() const { return m_vkImageView; } + +/////////////////////////////////////// +// VulkanDeviceMemory implementation // +/////////////////////////////////////// + +#if defined(_WIN32) || defined(_WIN64) + +class WindowsSecurityAttributes { +protected: + SECURITY_ATTRIBUTES m_winSecurityAttributes; + PSECURITY_DESCRIPTOR m_winPSecurityDescriptor; + +public: + WindowsSecurityAttributes(); + SECURITY_ATTRIBUTES *operator&(); + ~WindowsSecurityAttributes(); +}; + + +WindowsSecurityAttributes::WindowsSecurityAttributes() +{ + m_winPSecurityDescriptor = (PSECURITY_DESCRIPTOR)calloc( + 1, SECURITY_DESCRIPTOR_MIN_LENGTH + 2 * sizeof(void **)); + // CHECK_NEQ(m_winPSecurityDescriptor, (PSECURITY_DESCRIPTOR)NULL); + PSID *ppSID = (PSID *)((PBYTE)m_winPSecurityDescriptor + + SECURITY_DESCRIPTOR_MIN_LENGTH); + PACL *ppACL = (PACL *)((PBYTE)ppSID + sizeof(PSID *)); + 
InitializeSecurityDescriptor(m_winPSecurityDescriptor, + SECURITY_DESCRIPTOR_REVISION); + SID_IDENTIFIER_AUTHORITY sidIdentifierAuthority = + SECURITY_WORLD_SID_AUTHORITY; + AllocateAndInitializeSid(&sidIdentifierAuthority, 1, SECURITY_WORLD_RID, 0, + 0, 0, 0, 0, 0, 0, ppSID); + EXPLICIT_ACCESS explicitAccess; + ZeroMemory(&explicitAccess, sizeof(EXPLICIT_ACCESS)); + explicitAccess.grfAccessPermissions = + STANDARD_RIGHTS_ALL | SPECIFIC_RIGHTS_ALL; + explicitAccess.grfAccessMode = SET_ACCESS; + explicitAccess.grfInheritance = INHERIT_ONLY; + explicitAccess.Trustee.TrusteeForm = TRUSTEE_IS_SID; + explicitAccess.Trustee.TrusteeType = TRUSTEE_IS_WELL_KNOWN_GROUP; + explicitAccess.Trustee.ptstrName = (LPTSTR)*ppSID; + SetEntriesInAcl(1, &explicitAccess, NULL, ppACL); + SetSecurityDescriptorDacl(m_winPSecurityDescriptor, TRUE, *ppACL, FALSE); + m_winSecurityAttributes.nLength = sizeof(m_winSecurityAttributes); + m_winSecurityAttributes.lpSecurityDescriptor = m_winPSecurityDescriptor; + m_winSecurityAttributes.bInheritHandle = TRUE; +} + +SECURITY_ATTRIBUTES *WindowsSecurityAttributes::operator&() +{ + return &m_winSecurityAttributes; +} + +WindowsSecurityAttributes::~WindowsSecurityAttributes() +{ + PSID *ppSID = (PSID *)((PBYTE)m_winPSecurityDescriptor + + SECURITY_DESCRIPTOR_MIN_LENGTH); + PACL *ppACL = (PACL *)((PBYTE)ppSID + sizeof(PSID *)); + if (*ppSID) + { + FreeSid(*ppSID); + } + if (*ppACL) + { + LocalFree(*ppACL); + } + free(m_winPSecurityDescriptor); +} + +#endif + +VulkanDeviceMemory::VulkanDeviceMemory(const VulkanDeviceMemory &deviceMemory) + : m_device(deviceMemory.m_device), + m_vkDeviceMemory(deviceMemory.m_vkDeviceMemory), + m_size(deviceMemory.m_size), m_isDedicated(deviceMemory.m_isDedicated) +{} + +VulkanDeviceMemory::VulkanDeviceMemory( + const VulkanDevice &device, uint64_t size, + const VulkanMemoryType &memoryType, + VulkanExternalMemoryHandleType externalMemoryHandleType, const void *name) + : m_device(device), m_size(size), 
m_isDedicated(false) +{ +#if defined(_WIN32) || defined(_WIN64) + WindowsSecurityAttributes winSecurityAttributes; + + VkExportMemoryWin32HandleInfoKHR vkExportMemoryWin32HandleInfoKHR = {}; + vkExportMemoryWin32HandleInfoKHR.sType = + VK_STRUCTURE_TYPE_EXPORT_MEMORY_WIN32_HANDLE_INFO_KHR; + vkExportMemoryWin32HandleInfoKHR.pNext = NULL; + vkExportMemoryWin32HandleInfoKHR.pAttributes = &winSecurityAttributes; + vkExportMemoryWin32HandleInfoKHR.dwAccess = + DXGI_SHARED_RESOURCE_READ | DXGI_SHARED_RESOURCE_WRITE; + vkExportMemoryWin32HandleInfoKHR.name = (LPCWSTR)name; + +#endif + + VkExportMemoryAllocateInfoKHR vkExportMemoryAllocateInfoKHR = {}; + vkExportMemoryAllocateInfoKHR.sType = + VK_STRUCTURE_TYPE_EXPORT_MEMORY_ALLOCATE_INFO_KHR; +#if defined(_WIN32) || defined(_WIN64) + vkExportMemoryAllocateInfoKHR.pNext = externalMemoryHandleType + & VULKAN_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_WIN32_NT + ? &vkExportMemoryWin32HandleInfoKHR + : NULL; +#else + vkExportMemoryAllocateInfoKHR.pNext = NULL; +#endif + vkExportMemoryAllocateInfoKHR.handleTypes = + (VkExternalMemoryHandleTypeFlagsKHR)externalMemoryHandleType; + + VkMemoryAllocateInfo vkMemoryAllocateInfo = {}; + vkMemoryAllocateInfo.sType = VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_INFO; + vkMemoryAllocateInfo.pNext = + externalMemoryHandleType ? 
&vkExportMemoryAllocateInfoKHR : NULL; + vkMemoryAllocateInfo.allocationSize = m_size; + vkMemoryAllocateInfo.memoryTypeIndex = (uint32_t)memoryType; + + vkAllocateMemory(m_device, &vkMemoryAllocateInfo, NULL, &m_vkDeviceMemory); +} + +VulkanDeviceMemory::VulkanDeviceMemory( + const VulkanDevice &device, const VulkanImage &image, + const VulkanMemoryType &memoryType, + VulkanExternalMemoryHandleType externalMemoryHandleType, const void *name) + : m_device(device), m_size(image.getSize()), m_isDedicated(true) +{ +#if defined(_WIN32) || defined(_WIN64) + WindowsSecurityAttributes winSecurityAttributes; + + VkExportMemoryWin32HandleInfoKHR vkExportMemoryWin32HandleInfoKHR = {}; + vkExportMemoryWin32HandleInfoKHR.sType = + VK_STRUCTURE_TYPE_EXPORT_MEMORY_WIN32_HANDLE_INFO_KHR; + vkExportMemoryWin32HandleInfoKHR.pNext = NULL; + vkExportMemoryWin32HandleInfoKHR.pAttributes = &winSecurityAttributes; + vkExportMemoryWin32HandleInfoKHR.dwAccess = + DXGI_SHARED_RESOURCE_READ | DXGI_SHARED_RESOURCE_WRITE; + vkExportMemoryWin32HandleInfoKHR.name = (LPCWSTR)name; + +#endif + + VkExportMemoryAllocateInfoKHR vkExportMemoryAllocateInfoKHR = {}; + vkExportMemoryAllocateInfoKHR.sType = + VK_STRUCTURE_TYPE_EXPORT_MEMORY_ALLOCATE_INFO_KHR; +#if defined(_WIN32) || defined(_WIN64) + vkExportMemoryAllocateInfoKHR.pNext = externalMemoryHandleType + & VULKAN_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_WIN32_NT + ? &vkExportMemoryWin32HandleInfoKHR + : NULL; +#else + vkExportMemoryAllocateInfoKHR.pNext = NULL; +#endif + vkExportMemoryAllocateInfoKHR.handleTypes = + (VkExternalMemoryHandleTypeFlagsKHR)externalMemoryHandleType; + + VkMemoryDedicatedAllocateInfo vkMemoryDedicatedAllocateInfo = {}; + vkMemoryDedicatedAllocateInfo.sType = + VK_STRUCTURE_TYPE_MEMORY_DEDICATED_ALLOCATE_INFO; + vkMemoryDedicatedAllocateInfo.pNext = + externalMemoryHandleType ? 
&vkExportMemoryAllocateInfoKHR : NULL; + vkMemoryDedicatedAllocateInfo.image = image; + vkMemoryDedicatedAllocateInfo.buffer = VK_NULL_HANDLE; + + VkMemoryAllocateInfo vkMemoryAllocateInfo = {}; + vkMemoryAllocateInfo.sType = VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_INFO; + vkMemoryAllocateInfo.pNext = &vkMemoryDedicatedAllocateInfo; + vkMemoryAllocateInfo.allocationSize = m_size; + vkMemoryAllocateInfo.memoryTypeIndex = (uint32_t)memoryType; + + vkAllocateMemory(m_device, &vkMemoryAllocateInfo, NULL, &m_vkDeviceMemory); +} + +VulkanDeviceMemory::~VulkanDeviceMemory() +{ + vkFreeMemory(m_device, m_vkDeviceMemory, NULL); +} + +uint64_t VulkanDeviceMemory::getSize() const { return m_size; } + +#ifdef _WIN32 +HANDLE VulkanDeviceMemory::getHandle( + VulkanExternalMemoryHandleType externalMemoryHandleType) const +{ + HANDLE handle; + + VkMemoryGetWin32HandleInfoKHR vkMemoryGetWin32HandleInfoKHR = {}; + vkMemoryGetWin32HandleInfoKHR.sType = + VK_STRUCTURE_TYPE_MEMORY_GET_WIN32_HANDLE_INFO_KHR; + vkMemoryGetWin32HandleInfoKHR.pNext = NULL; + vkMemoryGetWin32HandleInfoKHR.memory = m_vkDeviceMemory; + vkMemoryGetWin32HandleInfoKHR.handleType = + (VkExternalMemoryHandleTypeFlagBitsKHR)externalMemoryHandleType; + + vkGetMemoryWin32HandleKHR(m_device, &vkMemoryGetWin32HandleInfoKHR, + &handle); + + return handle; +} +#else +int VulkanDeviceMemory::getHandle( + VulkanExternalMemoryHandleType externalMemoryHandleType) const +{ + if (externalMemoryHandleType + == VULKAN_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_FD) + { + int fd; + + VkMemoryGetFdInfoKHR vkMemoryGetFdInfoKHR = {}; + vkMemoryGetFdInfoKHR.sType = VK_STRUCTURE_TYPE_MEMORY_GET_FD_INFO_KHR; + vkMemoryGetFdInfoKHR.pNext = NULL; + vkMemoryGetFdInfoKHR.memory = m_vkDeviceMemory; + vkMemoryGetFdInfoKHR.handleType = + VK_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_FD_BIT_KHR; + + vkGetMemoryFdKHR(m_device, &vkMemoryGetFdInfoKHR, &fd); + + return fd; + } + return HANDLE_ERROR; +} +#endif + +bool VulkanDeviceMemory::isDedicated() const { return 
m_isDedicated; } + +void *VulkanDeviceMemory::map(size_t offset, size_t size) +{ + void *pData; + + vkMapMemory(m_device, m_vkDeviceMemory, (VkDeviceSize)offset, + (VkDeviceSize)size, 0, &pData); + + return pData; +} + +void VulkanDeviceMemory::unmap() { vkUnmapMemory(m_device, m_vkDeviceMemory); } + +void VulkanDeviceMemory::bindBuffer(const VulkanBuffer &buffer, uint64_t offset) +{ + vkBindBufferMemory(m_device, buffer, m_vkDeviceMemory, offset); +} + +void VulkanDeviceMemory::bindImage(const VulkanImage &image, uint64_t offset) +{ + vkBindImageMemory(m_device, image, m_vkDeviceMemory, offset); +} + +VulkanDeviceMemory::operator VkDeviceMemory() const { return m_vkDeviceMemory; } + +//////////////////////////////////// +// VulkanSemaphore implementation // +//////////////////////////////////// + +VulkanSemaphore::VulkanSemaphore(const VulkanSemaphore &semaphore) + : m_device(semaphore.m_device), m_vkSemaphore(semaphore.m_vkSemaphore) +{} + +VulkanSemaphore::VulkanSemaphore( + const VulkanDevice &device, + VulkanExternalSemaphoreHandleType externalSemaphoreHandleType, + const std::wstring name) + : m_device(device), m_name(name) +{ +#if defined(_WIN32) || defined(_WIN64) + WindowsSecurityAttributes winSecurityAttributes; + + VkExportSemaphoreWin32HandleInfoKHR + vkExportSemaphoreWin32HandleInfoKHR = {}; + vkExportSemaphoreWin32HandleInfoKHR.sType = + VK_STRUCTURE_TYPE_EXPORT_SEMAPHORE_WIN32_HANDLE_INFO_KHR; + vkExportSemaphoreWin32HandleInfoKHR.pNext = NULL; + vkExportSemaphoreWin32HandleInfoKHR.pAttributes = &winSecurityAttributes; + vkExportSemaphoreWin32HandleInfoKHR.dwAccess = + DXGI_SHARED_RESOURCE_READ | DXGI_SHARED_RESOURCE_WRITE; + vkExportSemaphoreWin32HandleInfoKHR.name = + m_name.size() ? 
(LPCWSTR)m_name.c_str() : NULL; +#endif + + VkExportSemaphoreCreateInfoKHR vkExportSemaphoreCreateInfoKHR = {}; + vkExportSemaphoreCreateInfoKHR.sType = + VK_STRUCTURE_TYPE_EXPORT_SEMAPHORE_CREATE_INFO_KHR; +#if defined(_WIN32) || defined(_WIN64) + vkExportSemaphoreCreateInfoKHR.pNext = + (externalSemaphoreHandleType + & VULKAN_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_WIN32_NT) + ? &vkExportSemaphoreWin32HandleInfoKHR + : NULL; +#else + vkExportSemaphoreCreateInfoKHR.pNext = NULL; +#endif + vkExportSemaphoreCreateInfoKHR.handleTypes = + (VkExternalSemaphoreHandleTypeFlagsKHR)externalSemaphoreHandleType; + + VkSemaphoreCreateInfo vkSemaphoreCreateInfo = {}; + vkSemaphoreCreateInfo.sType = VK_STRUCTURE_TYPE_SEMAPHORE_CREATE_INFO; + vkSemaphoreCreateInfo.pNext = + (externalSemaphoreHandleType + != VULKAN_EXTERNAL_SEMAPHORE_HANDLE_TYPE_NONE) + ? &vkExportSemaphoreCreateInfoKHR + : NULL; + vkSemaphoreCreateInfo.flags = 0; + + vkCreateSemaphore(m_device, &vkSemaphoreCreateInfo, NULL, &m_vkSemaphore); +} + +VulkanSemaphore::~VulkanSemaphore() +{ + vkDestroySemaphore(m_device, m_vkSemaphore, NULL); +} + +#if defined(_WIN32) || defined(_WIN64) +HANDLE VulkanSemaphore::getHandle( + VulkanExternalSemaphoreHandleType externalSemaphoreHandleType) const +{ + HANDLE handle; + + VkSemaphoreGetWin32HandleInfoKHR vkSemaphoreGetWin32HandleInfoKHR = {}; + vkSemaphoreGetWin32HandleInfoKHR.sType = + VK_STRUCTURE_TYPE_SEMAPHORE_GET_WIN32_HANDLE_INFO_KHR; + vkSemaphoreGetWin32HandleInfoKHR.pNext = NULL; + vkSemaphoreGetWin32HandleInfoKHR.semaphore = m_vkSemaphore; + vkSemaphoreGetWin32HandleInfoKHR.handleType = + (VkExternalSemaphoreHandleTypeFlagBitsKHR)externalSemaphoreHandleType; + + vkGetSemaphoreWin32HandleKHR(m_device, &vkSemaphoreGetWin32HandleInfoKHR, + &handle); + + return handle; +} +#else +int VulkanSemaphore::getHandle( + VulkanExternalSemaphoreHandleType externalSemaphoreHandleType) const +{ + if (externalSemaphoreHandleType + == VULKAN_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_FD) + { 
+ int fd; + + VkSemaphoreGetFdInfoKHR vkSemaphoreGetFdInfoKHR = {}; + vkSemaphoreGetFdInfoKHR.sType = + VK_STRUCTURE_TYPE_SEMAPHORE_GET_FD_INFO_KHR; + vkSemaphoreGetFdInfoKHR.pNext = NULL; + vkSemaphoreGetFdInfoKHR.semaphore = m_vkSemaphore; + vkSemaphoreGetFdInfoKHR.handleType = + VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_FD_BIT_KHR; + + vkGetSemaphoreFdKHR(m_device, &vkSemaphoreGetFdInfoKHR, &fd); + + return fd; + } + return HANDLE_ERROR; +} +#endif + +const std::wstring &VulkanSemaphore::getName() const { return m_name; } + +VulkanSemaphore::operator VkSemaphore() const { return m_vkSemaphore; } diff --git a/test_conformance/vulkan/vulkan_interop_common/vulkan_wrapper.hpp b/test_conformance/vulkan/vulkan_interop_common/vulkan_wrapper.hpp new file mode 100644 index 00000000..37925ee4 --- /dev/null +++ b/test_conformance/vulkan/vulkan_interop_common/vulkan_wrapper.hpp @@ -0,0 +1,580 @@ +// +// Copyright (c) 2022 The Khronos Group Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+// + +#ifndef _vulkan_wrapper_hpp_ +#define _vulkan_wrapper_hpp_ + +#include <vulkan/vulkan.h> +#include "vulkan_wrapper_types.hpp" +#include "vulkan_list_map.hpp" +#include "vulkan_api_list.hpp" + +class VulkanInstance { + friend const VulkanInstance &getVulkanInstance(); + +protected: + VkInstance m_vkInstance; + VulkanPhysicalDeviceList m_physicalDeviceList; + + VulkanInstance(); + VulkanInstance(const VulkanInstance &); + virtual ~VulkanInstance(); + +public: + const VulkanPhysicalDeviceList &getPhysicalDeviceList() const; + operator VkInstance() const; +}; + +class VulkanPhysicalDevice { + friend class VulkanInstance; + +protected: + VkPhysicalDevice m_vkPhysicalDevice; + VkPhysicalDeviceProperties m_vkPhysicalDeviceProperties; + uint8_t m_vkDeviceUUID[VK_UUID_SIZE]; + uint8_t m_vkDeviceLUID[VK_LUID_SIZE]; + uint32_t m_vkDeviceNodeMask; + VkPhysicalDeviceFeatures m_vkPhysicalDeviceFeatures; + VkPhysicalDeviceMemoryProperties m_vkPhysicalDeviceMemoryProperties; + VulkanQueueFamilyList m_queueFamilyList; + VulkanMemoryHeapList m_memoryHeapList; + VulkanMemoryTypeList m_memoryTypeList; + + VulkanPhysicalDevice(const VulkanPhysicalDevice &physicalDevice); + VulkanPhysicalDevice(VkPhysicalDevice vkPhysicalDevice); + virtual ~VulkanPhysicalDevice(); + +public: + const VulkanQueueFamilyList &getQueueFamilyList() const; + const VulkanMemoryHeapList &getMemoryHeapList() const; + const VulkanMemoryTypeList &getMemoryTypeList() const; + const uint8_t *getUUID() const; + const uint8_t *getLUID() const; + uint32_t getNodeMask() const; + operator VkPhysicalDevice() const; +}; + +class VulkanMemoryHeap { + friend class VulkanPhysicalDevice; + +protected: + uint32_t m_memoryHeapIndex; + uint64_t m_size; + VulkanMemoryHeapFlag m_memoryHeapFlag; + + VulkanMemoryHeap(const VulkanMemoryHeap &memoryHeap); + VulkanMemoryHeap(uint32_t m_memoryHeapIndex, uint64_t m_size, + VulkanMemoryHeapFlag m_memoryHeapFlag); + virtual ~VulkanMemoryHeap(); + +public: + uint64_t getSize() const; + 
VulkanMemoryHeapFlag getMemoryHeapFlag() const; + operator uint32_t() const; +}; + +class VulkanMemoryType { + friend class VulkanPhysicalDevice; + +protected: + uint32_t m_memoryTypeIndex; + const VulkanMemoryTypeProperty m_memoryTypeProperty; + const VulkanMemoryHeap &m_memoryHeap; + + VulkanMemoryType(const VulkanMemoryType &memoryType); + VulkanMemoryType(uint32_t memoryTypeIndex, + VulkanMemoryTypeProperty memoryTypeProperty, + const VulkanMemoryHeap &memoryHeap); + virtual ~VulkanMemoryType(); + +public: + VulkanMemoryTypeProperty getMemoryTypeProperty() const; + const VulkanMemoryHeap &getMemoryHeap() const; + operator uint32_t() const; +}; + +class VulkanQueueFamily { + friend class VulkanPhysicalDevice; + +protected: + uint32_t m_queueFamilyIndex; + VkQueueFamilyProperties m_vkQueueFamilyProperties; + + VulkanQueueFamily(const VulkanQueueFamily &queueFamily); + VulkanQueueFamily(uint32_t queueFamilyIndex, + VkQueueFamilyProperties vkQueueFamilyProperties); + virtual ~VulkanQueueFamily(); + +public: + uint32_t getQueueFlags() const; + uint32_t getQueueCount() const; + operator uint32_t() const; +}; + +class VulkanDevice { +protected: + const VulkanPhysicalDevice &m_physicalDevice; + VkDevice m_vkDevice; + VulkanQueueFamilyToQueueListMap m_queueFamilyIndexToQueueListMap; + + VulkanDevice(const VulkanDevice &device); + +public: + VulkanDevice( + const VulkanPhysicalDevice &physicalDevice = getVulkanPhysicalDevice(), + const VulkanQueueFamilyToQueueCountMap &queueFamilyToQueueCountMap = + getDefaultVulkanQueueFamilyToQueueCountMap()); + virtual ~VulkanDevice(); + const VulkanPhysicalDevice &getPhysicalDevice() const; + VulkanQueue & + getQueue(const VulkanQueueFamily &queueFamily = getVulkanQueueFamily(), + uint32_t queueIndex = 0); + operator VkDevice() const; +}; + +class VulkanQueue { + friend class VulkanDevice; + +protected: + VkQueue m_vkQueue; + + VulkanQueue(VkQueue vkQueue); + VulkanQueue(const VulkanQueue &queue); + virtual ~VulkanQueue(); + +public: 
+ const VulkanQueueFamily &getQueueFamily(); + void submit(const VulkanSemaphoreList &waitSemaphoreList, + const VulkanCommandBufferList &commandBufferList, + const VulkanSemaphoreList &signalSemaphoreList); + void submit(const VulkanSemaphore &waitSemaphore, + const VulkanCommandBuffer &commandBuffer, + const VulkanSemaphore &signalSemaphore); + void submit(const VulkanCommandBuffer &commandBuffer, + const VulkanSemaphore &signalSemaphore); + void submit(const VulkanCommandBuffer &commandBuffer); + void waitIdle(); + operator VkQueue() const; +}; + +class VulkanDescriptorSetLayoutBinding { +protected: + VkDescriptorSetLayoutBinding m_vkDescriptorSetLayoutBinding; + + VulkanDescriptorSetLayoutBinding( + const VulkanDescriptorSetLayoutBinding &descriptorSetLayoutBinding); + +public: + VulkanDescriptorSetLayoutBinding( + uint32_t binding, VulkanDescriptorType descriptorType, + uint32_t descriptorCount = 1, + VulkanShaderStage shaderStage = VULKAN_SHADER_STAGE_COMPUTE); + virtual ~VulkanDescriptorSetLayoutBinding(); + operator VkDescriptorSetLayoutBinding() const; +}; + +class VulkanDescriptorSetLayout { +protected: + const VulkanDevice &m_device; + VkDescriptorSetLayout m_vkDescriptorSetLayout; + + VulkanDescriptorSetLayout( + const VulkanDescriptorSetLayout &descriptorSetLayout); + void + VulkanDescriptorSetLayoutCommon(const VulkanDescriptorSetLayoutBindingList + &descriptorSetLayoutBindingList); + +public: + VulkanDescriptorSetLayout( + const VulkanDevice &device, + const VulkanDescriptorSetLayoutBinding &descriptorSetLayoutBinding); + VulkanDescriptorSetLayout( + const VulkanDevice &device, + const VulkanDescriptorSetLayoutBinding &descriptorSetLayoutBinding0, + const VulkanDescriptorSetLayoutBinding &descriptorSetLayoutBinding1); + VulkanDescriptorSetLayout(const VulkanDevice &device, + const VulkanDescriptorSetLayoutBindingList + &descriptorSetLayoutBindingList); + virtual ~VulkanDescriptorSetLayout(); + operator VkDescriptorSetLayout() const; +}; + +class 
VulkanPipelineLayout { +protected: + const VulkanDevice &m_device; + VkPipelineLayout m_vkPipelineLayout; + + VulkanPipelineLayout(const VulkanPipelineLayout &pipelineLayout); + void VulkanPipelineLayoutCommon( + const VulkanDescriptorSetLayoutList &descriptorSetLayoutList); + +public: + VulkanPipelineLayout(const VulkanDevice &device, + const VulkanDescriptorSetLayout &descriptorSetLayout); + VulkanPipelineLayout( + const VulkanDevice &device, + const VulkanDescriptorSetLayoutList &descriptorSetLayoutList = + getEmptyVulkanDescriptorSetLayoutList()); + virtual ~VulkanPipelineLayout(); + operator VkPipelineLayout() const; +}; + +class VulkanShaderModule { +protected: + const VulkanDevice &m_device; + VkShaderModule m_vkShaderModule; + + VulkanShaderModule(const VulkanShaderModule &shaderModule); + +public: + VulkanShaderModule(const VulkanDevice &device, + const std::vector<char> &code); + virtual ~VulkanShaderModule(); + operator VkShaderModule() const; +}; + +class VulkanPipeline { +protected: + const VulkanDevice &m_device; + VkPipeline m_vkPipeline; + + VulkanPipeline(const VulkanPipeline &pipeline); + +public: + VulkanPipeline(const VulkanDevice &device); + virtual ~VulkanPipeline(); + virtual VulkanPipelineBindPoint getPipelineBindPoint() const = 0; + operator VkPipeline() const; +}; + +class VulkanComputePipeline : public VulkanPipeline { +protected: + VulkanComputePipeline(const VulkanComputePipeline &computePipeline); + +public: + VulkanComputePipeline(const VulkanDevice &device, + const VulkanPipelineLayout &pipelineLayout, + const VulkanShaderModule &shaderModule, + const std::string &entryFuncName = "main"); + virtual ~VulkanComputePipeline(); + VulkanPipelineBindPoint getPipelineBindPoint() const; +}; + +class VulkanDescriptorPool { +protected: + const VulkanDevice &m_device; + VkDescriptorPool m_vkDescriptorPool; + + VulkanDescriptorPool(const VulkanDescriptorPool &descriptorPool); + void VulkanDescriptorPoolCommon(const 
VulkanDescriptorSetLayoutBindingList + &descriptorSetLayoutBindingList); + +public: + VulkanDescriptorPool( + const VulkanDevice &device, + const VulkanDescriptorSetLayoutBinding &descriptorSetLayoutBinding); + VulkanDescriptorPool( + const VulkanDevice &device, + const VulkanDescriptorSetLayoutBinding &descriptorSetLayoutBinding0, + const VulkanDescriptorSetLayoutBinding &descriptorSetLayoutBinding1); + VulkanDescriptorPool(const VulkanDevice &device, + const VulkanDescriptorSetLayoutBindingList + &descriptorSetLayoutBindingList); + virtual ~VulkanDescriptorPool(); + operator VkDescriptorPool() const; +}; + +class VulkanDescriptorSet { +protected: + const VulkanDevice &m_device; + const VulkanDescriptorPool &m_descriptorPool; + VkDescriptorSet m_vkDescriptorSet; + + VulkanDescriptorSet(const VulkanDescriptorSet &descriptorSet); + +public: + VulkanDescriptorSet(const VulkanDevice &device, + const VulkanDescriptorPool &descriptorPool, + const VulkanDescriptorSetLayout &descriptorSetLayout); + virtual ~VulkanDescriptorSet(); + void update(uint32_t binding, const VulkanBuffer &buffer); + void update(uint32_t binding, const VulkanImageView &imageView); + operator VkDescriptorSet() const; +}; + +class VulkanOffset3D { +protected: + VkOffset3D m_vkOffset3D; + +public: + VulkanOffset3D(const VulkanOffset3D &extent3D); + VulkanOffset3D(uint32_t x = 0, uint32_t y = 0, uint32_t z = 0); + virtual ~VulkanOffset3D(); + uint32_t getX() const; + uint32_t getY() const; + uint32_t getZ() const; + operator VkOffset3D() const; +}; + +class VulkanExtent3D { +protected: + VkExtent3D m_vkExtent3D; + +public: + VulkanExtent3D(const VulkanExtent3D &extent3D); + VulkanExtent3D(uint32_t width, uint32_t height = 1, uint32_t depth = 1); + virtual ~VulkanExtent3D(); + uint32_t getWidth() const; + uint32_t getHeight() const; + uint32_t getDepth() const; + operator VkExtent3D() const; +}; + +class VulkanCommandPool { +protected: + const VulkanDevice &m_device; + VkCommandPool m_vkCommandPool; + 
+ VulkanCommandPool(const VulkanCommandPool &commandPool); + +public: + VulkanCommandPool( + const VulkanDevice &device, + const VulkanQueueFamily &queueFamily = getVulkanQueueFamily()); + virtual ~VulkanCommandPool(); + operator VkCommandPool() const; +}; + +class VulkanCommandBuffer { +protected: + const VulkanDevice &m_device; + const VulkanCommandPool &m_commandPool; + VkCommandBuffer m_vkCommandBuffer; + + VulkanCommandBuffer(const VulkanCommandBuffer &commandBuffer); + +public: + VulkanCommandBuffer(const VulkanDevice &device, + const VulkanCommandPool &commandPool); + virtual ~VulkanCommandBuffer(); + void begin(); + void bindPipeline(const VulkanPipeline &pipeline); + void bindDescriptorSets(const VulkanPipeline &pipeline, + const VulkanPipelineLayout &pipelineLayout, + const VulkanDescriptorSet &descriptorSet); + void pipelineBarrier(const VulkanImage2DList &image2DList, + VulkanImageLayout oldImageLayout, + VulkanImageLayout newImageLayout); + void dispatch(uint32_t groupCountX, uint32_t groupCountY, + uint32_t groupCountZ); + void fillBuffer(const VulkanBuffer &buffer, uint32_t data, + uint64_t offset = 0, uint64_t size = VK_WHOLE_SIZE); + void updateBuffer(const VulkanBuffer &buffer, void *pdata, + uint64_t offset = 0, uint64_t size = VK_WHOLE_SIZE); + void copyBufferToImage(const VulkanBuffer &buffer, const VulkanImage &image, + VulkanImageLayout imageLayout = + VULKAN_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL); + void copyBufferToImage(const VulkanBuffer &buffer, const VulkanImage &image, + uint64_t bufferOffset = 0, uint32_t mipLevel = 0, + uint32_t baseArrayLayer = 0, uint32_t layerCount = 1, + VulkanOffset3D offset3D = VulkanOffset3D(0, 0, 0), + VulkanExtent3D extent3D = VulkanExtent3D(0, 0, 0)); + void copyImageToBuffer(const VulkanImage &image, const VulkanBuffer &buffer, + uint64_t bufferOffset = 0, uint32_t mipLevel = 0, + uint32_t baseArrayLayer = 0, uint32_t layerCount = 1, + VulkanOffset3D offset3D = VulkanOffset3D(0, 0, 0), + VulkanExtent3D 
extent3D = VulkanExtent3D(0, 0, 0)); + void end(); + operator VkCommandBuffer() const; +}; + +class VulkanBuffer { +protected: + const VulkanDevice &m_device; + VkBuffer m_vkBuffer; + uint64_t m_size; + uint64_t m_alignment; + VulkanMemoryTypeList m_memoryTypeList; + + VulkanBuffer(const VulkanBuffer &buffer); + +public: + VulkanBuffer(const VulkanDevice &device, uint64_t size, + VulkanExternalMemoryHandleType externalMemoryHandleType = + VULKAN_EXTERNAL_MEMORY_HANDLE_TYPE_NONE, + VulkanBufferUsage bufferUsage = + VULKAN_BUFFER_USAGE_STORAGE_BUFFER_TRANSFER_SRC_DST, + VulkanSharingMode sharingMode = VULKAN_SHARING_MODE_EXCLUSIVE, + const VulkanQueueFamilyList &queueFamilyList = + getEmptyVulkanQueueFamilyList()); + virtual ~VulkanBuffer(); + uint64_t getSize() const; + uint64_t getAlignment() const; + const VulkanMemoryTypeList &getMemoryTypeList() const; + operator VkBuffer() const; +}; + +class VulkanImage { +protected: + const VulkanDevice &m_device; + const VulkanImageType m_imageType; + const VulkanExtent3D m_extent3D; + const VulkanFormat m_format; + const uint32_t m_numMipLevels; + const uint32_t m_numLayers; + VkImage m_vkImage; + uint64_t m_size; + uint64_t m_alignment; + VulkanMemoryTypeList m_memoryTypeList; + VkImageCreateInfo VulkanImageCreateInfo; + VulkanImage(const VulkanImage &image); + +public: + VulkanImage( + const VulkanDevice &device, VulkanImageType imageType, + VulkanFormat format, const VulkanExtent3D &extent3D, + uint32_t numMipLevels = 1, uint32_t arrayLayers = 1, + VulkanExternalMemoryHandleType externalMemoryHandleType = + VULKAN_EXTERNAL_MEMORY_HANDLE_TYPE_NONE, + VulkanImageCreateFlag imageCreateFlags = VULKAN_IMAGE_CREATE_FLAG_NONE, + VulkanImageTiling imageTiling = VULKAN_IMAGE_TILING_OPTIMAL, + VulkanImageUsage imageUsage = + VULKAN_IMAGE_USAGE_SAMPLED_STORAGE_TRANSFER_SRC_DST, + VulkanSharingMode sharingMode = VULKAN_SHARING_MODE_EXCLUSIVE); + virtual ~VulkanImage(); + virtual VulkanExtent3D getExtent3D(uint32_t mipLevel = 0) 
const; + VulkanFormat getFormat() const; + uint32_t getNumMipLevels() const; + uint32_t getNumLayers() const; + uint64_t getSize() const; + uint64_t getAlignment() const; + const VulkanMemoryTypeList &getMemoryTypeList() const; + VkImageCreateInfo getVkImageCreateInfo() const; + operator VkImage() const; +}; + +class VulkanImage2D : public VulkanImage { +protected: + VkImageView m_vkImageView; + + VulkanImage2D(const VulkanImage2D &image2D); + +public: + VulkanImage2D( + const VulkanDevice &device, VulkanFormat format, uint32_t width, + uint32_t height, uint32_t numMipLevels = 1, + VulkanExternalMemoryHandleType externalMemoryHandleType = + VULKAN_EXTERNAL_MEMORY_HANDLE_TYPE_NONE, + VulkanImageCreateFlag imageCreateFlag = VULKAN_IMAGE_CREATE_FLAG_NONE, + VulkanImageUsage imageUsage = + VULKAN_IMAGE_USAGE_SAMPLED_STORAGE_TRANSFER_SRC_DST, + VulkanSharingMode sharingMode = VULKAN_SHARING_MODE_EXCLUSIVE); + virtual ~VulkanImage2D(); + virtual VulkanExtent3D getExtent3D(uint32_t mipLevel = 0) const; +}; + +class VulkanImageView { +protected: + const VulkanDevice &m_device; + VkImageView m_vkImageView; + + VulkanImageView(const VulkanImageView &imageView); + +public: + VulkanImageView(const VulkanDevice &device, const VulkanImage &image, + VulkanImageViewType imageViewType, + uint32_t baseMipLevel = 0, + uint32_t mipLevelCount = VULKAN_REMAINING_MIP_LEVELS, + uint32_t baseArrayLayer = 0, + uint32_t layerCount = VULKAN_REMAINING_ARRAY_LAYERS); + virtual ~VulkanImageView(); + operator VkImageView() const; +}; + +class VulkanDeviceMemory { +protected: + const VulkanDevice &m_device; + VkDeviceMemory m_vkDeviceMemory; + uint64_t m_size; + bool m_isDedicated; + + VulkanDeviceMemory(const VulkanDeviceMemory &deviceMemory); + +public: + VulkanDeviceMemory(const VulkanDevice &device, uint64_t size, + const VulkanMemoryType &memoryType, + VulkanExternalMemoryHandleType externalMemoryHandleType = + VULKAN_EXTERNAL_MEMORY_HANDLE_TYPE_NONE, + const void *name = NULL); + 
VulkanDeviceMemory(const VulkanDevice &device, const VulkanImage &image, + const VulkanMemoryType &memoryType, + VulkanExternalMemoryHandleType externalMemoryHandleType = + VULKAN_EXTERNAL_MEMORY_HANDLE_TYPE_NONE, + const void *name = NULL); + virtual ~VulkanDeviceMemory(); + uint64_t getSize() const; +#ifdef _WIN32 + HANDLE + getHandle(VulkanExternalMemoryHandleType externalMemoryHandleType) const; +#else + int + getHandle(VulkanExternalMemoryHandleType externalMemoryHandleType) const; +#endif + bool isDedicated() const; + void *map(size_t offset = 0, size_t size = VK_WHOLE_SIZE); + void unmap(); + void bindBuffer(const VulkanBuffer &buffer, uint64_t offset = 0); + void bindImage(const VulkanImage &image, uint64_t offset = 0); + operator VkDeviceMemory() const; +}; + +class VulkanSemaphore { + friend class VulkanQueue; + +protected: + const VulkanDevice &m_device; + VkSemaphore m_vkSemaphore; + const std::wstring m_name; + + VulkanSemaphore(const VulkanSemaphore &semaphore); + +public: + VulkanSemaphore( + const VulkanDevice &device, + VulkanExternalSemaphoreHandleType externalSemaphoreHandleType = + VULKAN_EXTERNAL_SEMAPHORE_HANDLE_TYPE_NONE, + const std::wstring name = L""); + virtual ~VulkanSemaphore(); +#ifdef _WIN32 + HANDLE getHandle( + VulkanExternalSemaphoreHandleType externalSemaphoreHandleType) const; +#else + int getHandle( + VulkanExternalSemaphoreHandleType externalSemaphoreHandleType) const; +#endif + const std::wstring &getName() const; + operator VkSemaphore() const; +}; + + +#define VK_FUNC_DECL(name) extern "C" PFN_##name _##name; +VK_FUNC_LIST +#if defined(_WIN32) || defined(_WIN64) +VK_WINDOWS_FUNC_LIST +#endif +#undef VK_FUNC_DECL + +#endif // _vulkan_wrapper_hpp_ diff --git a/test_conformance/vulkan/vulkan_interop_common/vulkan_wrapper_types.hpp b/test_conformance/vulkan/vulkan_interop_common/vulkan_wrapper_types.hpp new file mode 100644 index 00000000..359bcae4 --- /dev/null +++ 
b/test_conformance/vulkan/vulkan_interop_common/vulkan_wrapper_types.hpp @@ -0,0 +1,463 @@ +//
+// Copyright (c) 2022 The Khronos Group Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+
+#ifndef _vulkan_wrapper_types_hpp_
+#define _vulkan_wrapper_types_hpp_
+
+#include <vulkan/vulkan.h>
+
+// Wrapper-level constants. The two REMAINING_* names forward directly to the
+// Vulkan macros of the same name.
+// NOTE(review): 4 presumably mirrors the 4-byte multiple Vulkan requires of
+// buffer offsets in buffer<->image copies -- confirm against the users.
+#define VULKAN_MIN_BUFFER_OFFSET_COPY_ALIGNMENT 4
+#define VULKAN_REMAINING_MIP_LEVELS VK_REMAINING_MIP_LEVELS
+#define VULKAN_REMAINING_ARRAY_LAYERS VK_REMAINING_ARRAY_LAYERS
+
+// Forward declarations of the Vulkan wrapper classes.
+class VulkanInstance;
+class VulkanPhysicalDevice;
+class VulkanMemoryHeap;
+class VulkanMemoryType;
+class VulkanQueueFamily;
+class VulkanDevice;
+class VulkanQueue;
+class VulkanDescriptorSetLayoutBinding;
+class VulkanDescriptorSetLayout;
+class VulkanPipelineLayout;
+class VulkanShaderModule;
+class VulkanPipeline;
+class VulkanComputePipeline;
+class VulkanDescriptorPool;
+class VulkanDescriptorSet;
+class VulkanCommandPool;
+class VulkanCommandBuffer;
+class VulkanBuffer;
+class VulkanOffset3D;
+class VulkanExtent3D;
+class VulkanImage;
+class VulkanImage2D;
+class VulkanImageView;
+class VulkanDeviceMemory;
+class VulkanSemaphore;
+
+// Forward declarations of the *List / *Map companion types.
+// NOTE(review): presumably container classes defined in a separate header --
+// confirm.
+class VulkanPhysicalDeviceList;
+class VulkanMemoryHeapList;
+class VulkanMemoryTypeList;
+class VulkanQueueFamilyList;
+class VulkanQueueFamilyToQueueCountMap;
+class VulkanQueueFamilyToQueueListMap;
+class VulkanQueueList;
+class VulkanCommandBufferList;
+class VulkanDescriptorSetLayoutList;
+class VulkanBufferList;
+class VulkanImage2DList;
+class VulkanImageViewList;
+class VulkanDeviceMemoryList;
+class VulkanSemaphoreList;
+
+// Queue capability flags: aliases for the VK_QUEUE_*_BIT values for graphics,
+// compute and transfer, plus a mask that ORs those three together.
+enum VulkanQueueFlag
+{
+ VULKAN_QUEUE_FLAG_GRAPHICS = VK_QUEUE_GRAPHICS_BIT,
+ VULKAN_QUEUE_FLAG_COMPUTE = VK_QUEUE_COMPUTE_BIT,
+ VULKAN_QUEUE_FLAG_TRANSFER = VK_QUEUE_TRANSFER_BIT,
+ // Built from the three wrapper enumerators above, not from the VK_ bits.
+ VULKAN_QUEUE_FLAG_MASK_ALL = VULKAN_QUEUE_FLAG_GRAPHICS
+ | VULKAN_QUEUE_FLAG_COMPUTE | VULKAN_QUEUE_FLAG_TRANSFER
+};
+
+// Descriptor types: one-to-one aliases for the VK_DESCRIPTOR_TYPE_* values
+// listed below. Every enumerator carries the value of its VK_ counterpart.
+enum VulkanDescriptorType
+{
+ VULKAN_DESCRIPTOR_TYPE_SAMPLER = VK_DESCRIPTOR_TYPE_SAMPLER,
+ VULKAN_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER =
+ VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER,
+ VULKAN_DESCRIPTOR_TYPE_SAMPLED_IMAGE = VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE,
+ VULKAN_DESCRIPTOR_TYPE_STORAGE_IMAGE = VK_DESCRIPTOR_TYPE_STORAGE_IMAGE,
+ VULKAN_DESCRIPTOR_TYPE_UNIFORM_TEXEL_BUFFER =
+ VK_DESCRIPTOR_TYPE_UNIFORM_TEXEL_BUFFER,
+ VULKAN_DESCRIPTOR_TYPE_STORAGE_TEXEL_BUFFER =
+ VK_DESCRIPTOR_TYPE_STORAGE_TEXEL_BUFFER,
+ VULKAN_DESCRIPTOR_TYPE_UNIFORM_BUFFER = VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER,
+ VULKAN_DESCRIPTOR_TYPE_STORAGE_BUFFER = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
+ VULKAN_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC =
+ VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC,
+ VULKAN_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC =
+ VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC,
+ VULKAN_DESCRIPTOR_TYPE_INPUT_ATTACHMENT =
+ VK_DESCRIPTOR_TYPE_INPUT_ATTACHMENT,
+};
+
+// Shader stage flags: aliases for the vertex, fragment and compute
+// VK_SHADER_STAGE_*_BIT values and for the two predefined Vulkan masks
+// (ALL_GRAPHICS and ALL).
+enum VulkanShaderStage
+{
+ VULKAN_SHADER_STAGE_VERTEX = VK_SHADER_STAGE_VERTEX_BIT,
+ VULKAN_SHADER_STAGE_FRAGMENT = VK_SHADER_STAGE_FRAGMENT_BIT,
+ VULKAN_SHADER_STAGE_COMPUTE = VK_SHADER_STAGE_COMPUTE_BIT,
+ VULKAN_SHADER_STAGE_ALL_GRAPHICS = VK_SHADER_STAGE_ALL_GRAPHICS,
+ VULKAN_SHADER_STAGE_ALL = VK_SHADER_STAGE_ALL
+};
+
+// Pipeline bind points: aliases for the graphics and compute
+// VK_PIPELINE_BIND_POINT_* values.
+enum VulkanPipelineBindPoint
+{
+ VULKAN_PIPELINE_BIND_POINT_GRAPHICS = VK_PIPELINE_BIND_POINT_GRAPHICS,
+ VULKAN_PIPELINE_BIND_POINT_COMPUTE = VK_PIPELINE_BIND_POINT_COMPUTE
+};
+
+// Memory type property sets. NONE is 0; every other enumerator is a fixed
+// OR of VK_MEMORY_PROPERTY_*_BIT values, and its name spells out exactly the
+// bits it contains (DEVICE_LOCAL, HOST_VISIBLE, CACHED, COHERENT).
+// Only the combinations listed here have a name; HOST_VISIBLE alone, for
+// example, does not.
+enum VulkanMemoryTypeProperty
+{
+ VULKAN_MEMORY_TYPE_PROPERTY_NONE = 0,
+ VULKAN_MEMORY_TYPE_PROPERTY_DEVICE_LOCAL =
+ VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT,
+ VULKAN_MEMORY_TYPE_PROPERTY_HOST_VISIBLE_COHERENT =
+ VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT
+ | VK_MEMORY_PROPERTY_HOST_COHERENT_BIT,
+ VULKAN_MEMORY_TYPE_PROPERTY_HOST_VISIBLE_CACHED =
+ VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT
+ | VK_MEMORY_PROPERTY_HOST_CACHED_BIT,
+ VULKAN_MEMORY_TYPE_PROPERTY_HOST_VISIBLE_CACHED_COHERENT =
+ VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | VK_MEMORY_PROPERTY_HOST_CACHED_BIT
+ | VK_MEMORY_PROPERTY_HOST_COHERENT_BIT,
+ VULKAN_MEMORY_TYPE_PROPERTY_DEVICE_LOCAL_HOST_VISIBLE_COHERENT =
+ VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT
+ | VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT
+ | VK_MEMORY_PROPERTY_HOST_COHERENT_BIT,
+ VULKAN_MEMORY_TYPE_PROPERTY_DEVICE_LOCAL_HOST_VISIBLE_CACHED =
+ VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT
+ | VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT
+ | VK_MEMORY_PROPERTY_HOST_CACHED_BIT,
+ VULKAN_MEMORY_TYPE_PROPERTY_DEVICE_LOCAL_HOST_VISIBLE_CACHED_COHERENT =
+ VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT
+ | VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT
+ | VK_MEMORY_PROPERTY_HOST_CACHED_BIT
+ | VK_MEMORY_PROPERTY_HOST_COHERENT_BIT
+};
+
+// Memory heap flags: NONE (0) or the VK_MEMORY_HEAP_DEVICE_LOCAL_BIT value.
+enum VulkanMemoryHeapFlag
+{
+ VULKAN_MEMORY_HEAP_FLAG_NONE = 0,
+ VULKAN_MEMORY_HEAP_FLAG_DEVICE_LOCAL = VK_MEMORY_HEAP_DEVICE_LOCAL_BIT
+};
+
+// External memory handle types. NONE is 0; the others alias the
+// VK_EXTERNAL_MEMORY_HANDLE_TYPE_*_BIT_KHR values.
+// Naming note: the wrapper's "OPAQUE_WIN32_NT" is Vulkan's OPAQUE_WIN32 bit
+// (the Vulkan name has no "NT" suffix), and "OPAQUE_WIN32_NT_KMT" is the OR
+// of the OPAQUE_WIN32 and OPAQUE_WIN32_KMT bits, not a distinct Vulkan value.
+enum VulkanExternalMemoryHandleType
+{
+ VULKAN_EXTERNAL_MEMORY_HANDLE_TYPE_NONE = 0,
+ VULKAN_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_FD =
+ VK_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_FD_BIT_KHR,
+ VULKAN_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_WIN32_NT =
+ VK_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_WIN32_BIT_KHR,
+ VULKAN_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_WIN32_KMT =
+ VK_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_WIN32_KMT_BIT_KHR,
+ VULKAN_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_WIN32_NT_KMT =
+ VK_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_WIN32_BIT_KHR
+ | VK_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_WIN32_KMT_BIT_KHR
+};
+
+// External semaphore handle types. NONE is 0; the others alias the
+// VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_*_BIT_KHR values.
+// Naming note: "OPAQUE_WIN32_NT" is Vulkan's OPAQUE_WIN32 bit, and
+// "OPAQUE_WIN32_NT_KMT" is the OR of the OPAQUE_WIN32 and OPAQUE_WIN32_KMT
+// bits, not a distinct Vulkan value.
+enum VulkanExternalSemaphoreHandleType
+{
+ VULKAN_EXTERNAL_SEMAPHORE_HANDLE_TYPE_NONE = 0,
+ VULKAN_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_FD =
+ VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_FD_BIT_KHR,
+ VULKAN_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32_NT =
+ VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32_BIT_KHR,
+ VULKAN_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32_KMT =
+ VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32_KMT_BIT_KHR,
+ VULKAN_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32_NT_KMT =
+ VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32_BIT_KHR
+ | VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32_KMT_BIT_KHR
+};
+
+// Buffer usage flags: single-bit aliases for the VK_BUFFER_USAGE_*_BIT
+// values, followed by two named combinations that add both transfer
+// directions to the storage-buffer and uniform-buffer bits respectively.
+enum VulkanBufferUsage
+{
+ VULKAN_BUFFER_USAGE_TRANSFER_SRC = VK_BUFFER_USAGE_TRANSFER_SRC_BIT,
+ VULKAN_BUFFER_USAGE_TRANSFER_DST = VK_BUFFER_USAGE_TRANSFER_DST_BIT,
+ VULKAN_BUFFER_USAGE_UNIFORM_TEXEL_BUFFER =
+ VK_BUFFER_USAGE_UNIFORM_TEXEL_BUFFER_BIT,
+ VULKAN_BUFFER_USAGE_STORAGE_TEXEL_BUFFER =
+ VK_BUFFER_USAGE_STORAGE_TEXEL_BUFFER_BIT,
+ VULKAN_BUFFER_USAGE_UNIFORM_BUFFER = VK_BUFFER_USAGE_UNIFORM_BUFFER_BIT,
+ VULKAN_BUFFER_USAGE_STORAGE_BUFFER = VK_BUFFER_USAGE_STORAGE_BUFFER_BIT,
+ VULKAN_BUFFER_USAGE_INDEX_BUFFER = VK_BUFFER_USAGE_INDEX_BUFFER_BIT,
+ VULKAN_BUFFER_USAGE_VERTEX_BUFFER = VK_BUFFER_USAGE_VERTEX_BUFFER_BIT,
+ VULKAN_BUFFER_USAGE_INDIRECT_BUFFER = VK_BUFFER_USAGE_INDIRECT_BUFFER_BIT,
+ // Combined values: usage bit | TRANSFER_SRC | TRANSFER_DST.
+ VULKAN_BUFFER_USAGE_STORAGE_BUFFER_TRANSFER_SRC_DST =
+ VK_BUFFER_USAGE_STORAGE_BUFFER_BIT | VK_BUFFER_USAGE_TRANSFER_SRC_BIT
+ | VK_BUFFER_USAGE_TRANSFER_DST_BIT,
+ VULKAN_BUFFER_USAGE_UNIFORM_BUFFER_TRANSFER_SRC_DST =
+ VK_BUFFER_USAGE_UNIFORM_BUFFER_BIT | VK_BUFFER_USAGE_TRANSFER_SRC_BIT
+ | VK_BUFFER_USAGE_TRANSFER_DST_BIT,
+};
+
+// Resource sharing modes: aliases for VK_SHARING_MODE_EXCLUSIVE and
+// VK_SHARING_MODE_CONCURRENT.
+enum VulkanSharingMode
+{
+ VULKAN_SHARING_MODE_EXCLUSIVE = VK_SHARING_MODE_EXCLUSIVE,
+ VULKAN_SHARING_MODE_CONCURRENT = VK_SHARING_MODE_CONCURRENT
+};
+
+// Image dimensionality: aliases for VK_IMAGE_TYPE_1D / _2D / _3D.
+enum VulkanImageType
+{
+ VULKAN_IMAGE_TYPE_1D = VK_IMAGE_TYPE_1D,
+ VULKAN_IMAGE_TYPE_2D = VK_IMAGE_TYPE_2D,
+ VULKAN_IMAGE_TYPE_3D = VK_IMAGE_TYPE_3D
+};
+
+// Image/texel formats: one-to-one aliases for the VK_FORMAT_* values from
+// VK_FORMAT_UNDEFINED through VK_FORMAT_ASTC_12x12_SRGB_BLOCK. Each wrapper
+// name is the Vulkan name with the VK_ prefix replaced by VULKAN_.
+// Every enumerator has an explicit initializer, so listing order has no
+// effect on values (R8G8_SNORM is listed before R8G8_UNORM below).
+enum VulkanFormat
+{
+ VULKAN_FORMAT_UNDEFINED = VK_FORMAT_UNDEFINED,
+ VULKAN_FORMAT_R4G4_UNORM_PACK8 = VK_FORMAT_R4G4_UNORM_PACK8,
+ VULKAN_FORMAT_R4G4B4A4_UNORM_PACK16 = VK_FORMAT_R4G4B4A4_UNORM_PACK16,
+ VULKAN_FORMAT_B4G4R4A4_UNORM_PACK16 = VK_FORMAT_B4G4R4A4_UNORM_PACK16,
+ VULKAN_FORMAT_R5G6B5_UNORM_PACK16 = VK_FORMAT_R5G6B5_UNORM_PACK16,
+ VULKAN_FORMAT_B5G6R5_UNORM_PACK16 = VK_FORMAT_B5G6R5_UNORM_PACK16,
+ VULKAN_FORMAT_R5G5B5A1_UNORM_PACK16 = VK_FORMAT_R5G5B5A1_UNORM_PACK16,
+ VULKAN_FORMAT_B5G5R5A1_UNORM_PACK16 = VK_FORMAT_B5G5R5A1_UNORM_PACK16,
+ VULKAN_FORMAT_A1R5G5B5_UNORM_PACK16 = VK_FORMAT_A1R5G5B5_UNORM_PACK16,
+ VULKAN_FORMAT_R8_UNORM = VK_FORMAT_R8_UNORM,
+ VULKAN_FORMAT_R8_SNORM = VK_FORMAT_R8_SNORM,
+ VULKAN_FORMAT_R8_USCALED = VK_FORMAT_R8_USCALED,
+ VULKAN_FORMAT_R8_SSCALED = VK_FORMAT_R8_SSCALED,
+ VULKAN_FORMAT_R8_UINT = VK_FORMAT_R8_UINT,
+ VULKAN_FORMAT_R8_SINT = VK_FORMAT_R8_SINT,
+ VULKAN_FORMAT_R8_SRGB = VK_FORMAT_R8_SRGB,
+ VULKAN_FORMAT_R8G8_SNORM = VK_FORMAT_R8G8_SNORM,
+ VULKAN_FORMAT_R8G8_UNORM = VK_FORMAT_R8G8_UNORM,
+ VULKAN_FORMAT_R8G8_USCALED = VK_FORMAT_R8G8_USCALED,
+ VULKAN_FORMAT_R8G8_SSCALED = VK_FORMAT_R8G8_SSCALED,
+ VULKAN_FORMAT_R8G8_UINT = VK_FORMAT_R8G8_UINT,
+ VULKAN_FORMAT_R8G8_SINT = VK_FORMAT_R8G8_SINT,
+ VULKAN_FORMAT_R8G8_SRGB = VK_FORMAT_R8G8_SRGB,
+ VULKAN_FORMAT_R8G8B8_UNORM = VK_FORMAT_R8G8B8_UNORM,
+ VULKAN_FORMAT_R8G8B8_SNORM = VK_FORMAT_R8G8B8_SNORM,
+ VULKAN_FORMAT_R8G8B8_USCALED = VK_FORMAT_R8G8B8_USCALED,
+ VULKAN_FORMAT_R8G8B8_SSCALED = VK_FORMAT_R8G8B8_SSCALED,
+ VULKAN_FORMAT_R8G8B8_UINT = VK_FORMAT_R8G8B8_UINT,
+ VULKAN_FORMAT_R8G8B8_SINT = VK_FORMAT_R8G8B8_SINT,
+ VULKAN_FORMAT_R8G8B8_SRGB = VK_FORMAT_R8G8B8_SRGB,
+ VULKAN_FORMAT_B8G8R8_UNORM = VK_FORMAT_B8G8R8_UNORM,
+ VULKAN_FORMAT_B8G8R8_SNORM = VK_FORMAT_B8G8R8_SNORM,
+ VULKAN_FORMAT_B8G8R8_USCALED = VK_FORMAT_B8G8R8_USCALED,
+ VULKAN_FORMAT_B8G8R8_SSCALED = VK_FORMAT_B8G8R8_SSCALED,
+ VULKAN_FORMAT_B8G8R8_UINT = VK_FORMAT_B8G8R8_UINT,
+ VULKAN_FORMAT_B8G8R8_SINT = VK_FORMAT_B8G8R8_SINT,
+ VULKAN_FORMAT_B8G8R8_SRGB = VK_FORMAT_B8G8R8_SRGB,
+ VULKAN_FORMAT_R8G8B8A8_UNORM = VK_FORMAT_R8G8B8A8_UNORM,
+ VULKAN_FORMAT_R8G8B8A8_SNORM = VK_FORMAT_R8G8B8A8_SNORM,
+ VULKAN_FORMAT_R8G8B8A8_USCALED = VK_FORMAT_R8G8B8A8_USCALED,
+ VULKAN_FORMAT_R8G8B8A8_SSCALED = VK_FORMAT_R8G8B8A8_SSCALED,
+ VULKAN_FORMAT_R8G8B8A8_UINT = VK_FORMAT_R8G8B8A8_UINT,
+ VULKAN_FORMAT_R8G8B8A8_SINT = VK_FORMAT_R8G8B8A8_SINT,
+ VULKAN_FORMAT_R8G8B8A8_SRGB = VK_FORMAT_R8G8B8A8_SRGB,
+ VULKAN_FORMAT_B8G8R8A8_UNORM = VK_FORMAT_B8G8R8A8_UNORM,
+ VULKAN_FORMAT_B8G8R8A8_SNORM = VK_FORMAT_B8G8R8A8_SNORM,
+ VULKAN_FORMAT_B8G8R8A8_USCALED = VK_FORMAT_B8G8R8A8_USCALED,
+ VULKAN_FORMAT_B8G8R8A8_SSCALED = VK_FORMAT_B8G8R8A8_SSCALED,
+ VULKAN_FORMAT_B8G8R8A8_UINT = VK_FORMAT_B8G8R8A8_UINT,
+ VULKAN_FORMAT_B8G8R8A8_SINT = VK_FORMAT_B8G8R8A8_SINT,
+ VULKAN_FORMAT_B8G8R8A8_SRGB = VK_FORMAT_B8G8R8A8_SRGB,
+ VULKAN_FORMAT_A8B8G8R8_UNORM_PACK32 = VK_FORMAT_A8B8G8R8_UNORM_PACK32,
+ VULKAN_FORMAT_A8B8G8R8_SNORM_PACK32 = VK_FORMAT_A8B8G8R8_SNORM_PACK32,
+ VULKAN_FORMAT_A8B8G8R8_USCALED_PACK32 = VK_FORMAT_A8B8G8R8_USCALED_PACK32,
+ VULKAN_FORMAT_A8B8G8R8_SSCALED_PACK32 = VK_FORMAT_A8B8G8R8_SSCALED_PACK32,
+ VULKAN_FORMAT_A8B8G8R8_UINT_PACK32 = VK_FORMAT_A8B8G8R8_UINT_PACK32,
+ VULKAN_FORMAT_A8B8G8R8_SINT_PACK32 = VK_FORMAT_A8B8G8R8_SINT_PACK32,
+ VULKAN_FORMAT_A8B8G8R8_SRGB_PACK32 = VK_FORMAT_A8B8G8R8_SRGB_PACK32,
+ VULKAN_FORMAT_A2R10G10B10_UNORM_PACK32 = VK_FORMAT_A2R10G10B10_UNORM_PACK32,
+ VULKAN_FORMAT_A2R10G10B10_SNORM_PACK32 = VK_FORMAT_A2R10G10B10_SNORM_PACK32,
+ VULKAN_FORMAT_A2R10G10B10_USCALED_PACK32 =
+ VK_FORMAT_A2R10G10B10_USCALED_PACK32,
+ VULKAN_FORMAT_A2R10G10B10_SSCALED_PACK32 =
+ VK_FORMAT_A2R10G10B10_SSCALED_PACK32,
+ VULKAN_FORMAT_A2R10G10B10_UINT_PACK32 = VK_FORMAT_A2R10G10B10_UINT_PACK32,
+ VULKAN_FORMAT_A2R10G10B10_SINT_PACK32 = VK_FORMAT_A2R10G10B10_SINT_PACK32,
+ VULKAN_FORMAT_A2B10G10R10_UNORM_PACK32 = VK_FORMAT_A2B10G10R10_UNORM_PACK32,
+ VULKAN_FORMAT_A2B10G10R10_SNORM_PACK32 = VK_FORMAT_A2B10G10R10_SNORM_PACK32,
+ VULKAN_FORMAT_A2B10G10R10_USCALED_PACK32 =
+ VK_FORMAT_A2B10G10R10_USCALED_PACK32,
+ VULKAN_FORMAT_A2B10G10R10_SSCALED_PACK32 =
+ VK_FORMAT_A2B10G10R10_SSCALED_PACK32,
+ VULKAN_FORMAT_A2B10G10R10_UINT_PACK32 = VK_FORMAT_A2B10G10R10_UINT_PACK32,
+ VULKAN_FORMAT_A2B10G10R10_SINT_PACK32 = VK_FORMAT_A2B10G10R10_SINT_PACK32,
+ VULKAN_FORMAT_R16_UNORM = VK_FORMAT_R16_UNORM,
+ VULKAN_FORMAT_R16_SNORM = VK_FORMAT_R16_SNORM,
+ VULKAN_FORMAT_R16_USCALED = VK_FORMAT_R16_USCALED,
+ VULKAN_FORMAT_R16_SSCALED = VK_FORMAT_R16_SSCALED,
+ VULKAN_FORMAT_R16_UINT = VK_FORMAT_R16_UINT,
+ VULKAN_FORMAT_R16_SINT = VK_FORMAT_R16_SINT,
+ VULKAN_FORMAT_R16_SFLOAT = VK_FORMAT_R16_SFLOAT,
+ VULKAN_FORMAT_R16G16_UNORM = VK_FORMAT_R16G16_UNORM,
+ VULKAN_FORMAT_R16G16_SNORM = VK_FORMAT_R16G16_SNORM,
+ VULKAN_FORMAT_R16G16_USCALED = VK_FORMAT_R16G16_USCALED,
+ VULKAN_FORMAT_R16G16_SSCALED = VK_FORMAT_R16G16_SSCALED,
+ VULKAN_FORMAT_R16G16_UINT = VK_FORMAT_R16G16_UINT,
+ VULKAN_FORMAT_R16G16_SINT = VK_FORMAT_R16G16_SINT,
+ VULKAN_FORMAT_R16G16_SFLOAT = VK_FORMAT_R16G16_SFLOAT,
+ VULKAN_FORMAT_R16G16B16_UNORM = VK_FORMAT_R16G16B16_UNORM,
+ VULKAN_FORMAT_R16G16B16_SNORM = VK_FORMAT_R16G16B16_SNORM,
+ VULKAN_FORMAT_R16G16B16_USCALED = VK_FORMAT_R16G16B16_USCALED,
+ VULKAN_FORMAT_R16G16B16_SSCALED = VK_FORMAT_R16G16B16_SSCALED,
+ VULKAN_FORMAT_R16G16B16_UINT = VK_FORMAT_R16G16B16_UINT,
+ VULKAN_FORMAT_R16G16B16_SINT = VK_FORMAT_R16G16B16_SINT,
+ VULKAN_FORMAT_R16G16B16_SFLOAT = VK_FORMAT_R16G16B16_SFLOAT,
+ VULKAN_FORMAT_R16G16B16A16_UNORM = VK_FORMAT_R16G16B16A16_UNORM,
+ VULKAN_FORMAT_R16G16B16A16_SNORM = VK_FORMAT_R16G16B16A16_SNORM,
+ VULKAN_FORMAT_R16G16B16A16_USCALED = VK_FORMAT_R16G16B16A16_USCALED,
+ VULKAN_FORMAT_R16G16B16A16_SSCALED = VK_FORMAT_R16G16B16A16_SSCALED,
+ VULKAN_FORMAT_R16G16B16A16_UINT = VK_FORMAT_R16G16B16A16_UINT,
+ VULKAN_FORMAT_R16G16B16A16_SINT = VK_FORMAT_R16G16B16A16_SINT,
+ VULKAN_FORMAT_R16G16B16A16_SFLOAT = VK_FORMAT_R16G16B16A16_SFLOAT,
+ VULKAN_FORMAT_R32_UINT = VK_FORMAT_R32_UINT,
+ VULKAN_FORMAT_R32_SINT = VK_FORMAT_R32_SINT,
+ VULKAN_FORMAT_R32_SFLOAT = VK_FORMAT_R32_SFLOAT,
+ VULKAN_FORMAT_R32G32_UINT = VK_FORMAT_R32G32_UINT,
+ VULKAN_FORMAT_R32G32_SINT = VK_FORMAT_R32G32_SINT,
+ VULKAN_FORMAT_R32G32_SFLOAT = VK_FORMAT_R32G32_SFLOAT,
+ VULKAN_FORMAT_R32G32B32_UINT = VK_FORMAT_R32G32B32_UINT,
+ VULKAN_FORMAT_R32G32B32_SINT = VK_FORMAT_R32G32B32_SINT,
+ VULKAN_FORMAT_R32G32B32_SFLOAT = VK_FORMAT_R32G32B32_SFLOAT,
+ VULKAN_FORMAT_R32G32B32A32_UINT = VK_FORMAT_R32G32B32A32_UINT,
+ VULKAN_FORMAT_R32G32B32A32_SINT = VK_FORMAT_R32G32B32A32_SINT,
+ VULKAN_FORMAT_R32G32B32A32_SFLOAT = VK_FORMAT_R32G32B32A32_SFLOAT,
+ VULKAN_FORMAT_R64_UINT = VK_FORMAT_R64_UINT,
+ VULKAN_FORMAT_R64_SINT = VK_FORMAT_R64_SINT,
+ VULKAN_FORMAT_R64_SFLOAT = VK_FORMAT_R64_SFLOAT,
+ VULKAN_FORMAT_R64G64_UINT = VK_FORMAT_R64G64_UINT,
+ VULKAN_FORMAT_R64G64_SINT = VK_FORMAT_R64G64_SINT,
+ VULKAN_FORMAT_R64G64_SFLOAT = VK_FORMAT_R64G64_SFLOAT,
+ VULKAN_FORMAT_R64G64B64_UINT = VK_FORMAT_R64G64B64_UINT,
+ VULKAN_FORMAT_R64G64B64_SINT = VK_FORMAT_R64G64B64_SINT,
+ VULKAN_FORMAT_R64G64B64_SFLOAT = VK_FORMAT_R64G64B64_SFLOAT,
+ VULKAN_FORMAT_R64G64B64A64_UINT = VK_FORMAT_R64G64B64A64_UINT,
+ VULKAN_FORMAT_R64G64B64A64_SINT = VK_FORMAT_R64G64B64A64_SINT,
+ VULKAN_FORMAT_R64G64B64A64_SFLOAT = VK_FORMAT_R64G64B64A64_SFLOAT,
+ VULKAN_FORMAT_B10G11R11_UFLOAT_PACK32 = VK_FORMAT_B10G11R11_UFLOAT_PACK32,
+ VULKAN_FORMAT_E5B9G9R9_UFLOAT_PACK32 = VK_FORMAT_E5B9G9R9_UFLOAT_PACK32,
+ VULKAN_FORMAT_D16_UNORM = VK_FORMAT_D16_UNORM,
+ VULKAN_FORMAT_X8_D24_UNORM_PACK32 = VK_FORMAT_X8_D24_UNORM_PACK32,
+ VULKAN_FORMAT_D32_SFLOAT = VK_FORMAT_D32_SFLOAT,
+ VULKAN_FORMAT_S8_UINT = VK_FORMAT_S8_UINT,
+ VULKAN_FORMAT_D16_UNORM_S8_UINT = VK_FORMAT_D16_UNORM_S8_UINT,
+ VULKAN_FORMAT_D24_UNORM_S8_UINT = VK_FORMAT_D24_UNORM_S8_UINT,
+ VULKAN_FORMAT_D32_SFLOAT_S8_UINT = VK_FORMAT_D32_SFLOAT_S8_UINT,
+ VULKAN_FORMAT_BC1_RGB_UNORM_BLOCK = VK_FORMAT_BC1_RGB_UNORM_BLOCK,
+ VULKAN_FORMAT_BC1_RGB_SRGB_BLOCK = VK_FORMAT_BC1_RGB_SRGB_BLOCK,
+ VULKAN_FORMAT_BC1_RGBA_UNORM_BLOCK = VK_FORMAT_BC1_RGBA_UNORM_BLOCK,
+ VULKAN_FORMAT_BC1_RGBA_SRGB_BLOCK = VK_FORMAT_BC1_RGBA_SRGB_BLOCK,
+ VULKAN_FORMAT_BC2_UNORM_BLOCK = VK_FORMAT_BC2_UNORM_BLOCK,
+ VULKAN_FORMAT_BC2_SRGB_BLOCK = VK_FORMAT_BC2_SRGB_BLOCK,
+ VULKAN_FORMAT_BC3_UNORM_BLOCK = VK_FORMAT_BC3_UNORM_BLOCK,
+ VULKAN_FORMAT_BC3_SRGB_BLOCK = VK_FORMAT_BC3_SRGB_BLOCK,
+ VULKAN_FORMAT_BC4_UNORM_BLOCK = VK_FORMAT_BC4_UNORM_BLOCK,
+ VULKAN_FORMAT_BC4_SNORM_BLOCK = VK_FORMAT_BC4_SNORM_BLOCK,
+ VULKAN_FORMAT_BC5_UNORM_BLOCK = VK_FORMAT_BC5_UNORM_BLOCK,
+ VULKAN_FORMAT_BC5_SNORM_BLOCK = VK_FORMAT_BC5_SNORM_BLOCK,
+ VULKAN_FORMAT_BC6H_UFLOAT_BLOCK = VK_FORMAT_BC6H_UFLOAT_BLOCK,
+ VULKAN_FORMAT_BC6H_SFLOAT_BLOCK = VK_FORMAT_BC6H_SFLOAT_BLOCK,
+ VULKAN_FORMAT_BC7_UNORM_BLOCK = VK_FORMAT_BC7_UNORM_BLOCK,
+ VULKAN_FORMAT_BC7_SRGB_BLOCK = VK_FORMAT_BC7_SRGB_BLOCK,
+ VULKAN_FORMAT_ETC2_R8G8B8_UNORM_BLOCK = VK_FORMAT_ETC2_R8G8B8_UNORM_BLOCK,
+ VULKAN_FORMAT_ETC2_R8G8B8_SRGB_BLOCK = VK_FORMAT_ETC2_R8G8B8_SRGB_BLOCK,
+ VULKAN_FORMAT_ETC2_R8G8B8A1_UNORM_BLOCK =
+ VK_FORMAT_ETC2_R8G8B8A1_UNORM_BLOCK,
+ VULKAN_FORMAT_ETC2_R8G8B8A1_SRGB_BLOCK = VK_FORMAT_ETC2_R8G8B8A1_SRGB_BLOCK,
+ VULKAN_FORMAT_ETC2_R8G8B8A8_UNORM_BLOCK =
+ VK_FORMAT_ETC2_R8G8B8A8_UNORM_BLOCK,
+ VULKAN_FORMAT_ETC2_R8G8B8A8_SRGB_BLOCK = VK_FORMAT_ETC2_R8G8B8A8_SRGB_BLOCK,
+ VULKAN_FORMAT_EAC_R11_UNORM_BLOCK = VK_FORMAT_EAC_R11_UNORM_BLOCK,
+ VULKAN_FORMAT_EAC_R11_SNORM_BLOCK = VK_FORMAT_EAC_R11_SNORM_BLOCK,
+ VULKAN_FORMAT_EAC_R11G11_UNORM_BLOCK = VK_FORMAT_EAC_R11G11_UNORM_BLOCK,
+ VULKAN_FORMAT_EAC_R11G11_SNORM_BLOCK = VK_FORMAT_EAC_R11G11_SNORM_BLOCK,
+ VULKAN_FORMAT_ASTC_4x4_UNORM_BLOCK = VK_FORMAT_ASTC_4x4_UNORM_BLOCK,
+ VULKAN_FORMAT_ASTC_4x4_SRGB_BLOCK = VK_FORMAT_ASTC_4x4_SRGB_BLOCK,
+ VULKAN_FORMAT_ASTC_5x4_UNORM_BLOCK = VK_FORMAT_ASTC_5x4_UNORM_BLOCK,
+ VULKAN_FORMAT_ASTC_5x4_SRGB_BLOCK = VK_FORMAT_ASTC_5x4_SRGB_BLOCK,
+ VULKAN_FORMAT_ASTC_5x5_UNORM_BLOCK = VK_FORMAT_ASTC_5x5_UNORM_BLOCK,
+ VULKAN_FORMAT_ASTC_5x5_SRGB_BLOCK = VK_FORMAT_ASTC_5x5_SRGB_BLOCK,
+ VULKAN_FORMAT_ASTC_6x5_UNORM_BLOCK = VK_FORMAT_ASTC_6x5_UNORM_BLOCK,
+ VULKAN_FORMAT_ASTC_6x5_SRGB_BLOCK = VK_FORMAT_ASTC_6x5_SRGB_BLOCK,
+ VULKAN_FORMAT_ASTC_6x6_UNORM_BLOCK = VK_FORMAT_ASTC_6x6_UNORM_BLOCK,
+ VULKAN_FORMAT_ASTC_6x6_SRGB_BLOCK = VK_FORMAT_ASTC_6x6_SRGB_BLOCK,
+ VULKAN_FORMAT_ASTC_8x5_UNORM_BLOCK = VK_FORMAT_ASTC_8x5_UNORM_BLOCK,
+ VULKAN_FORMAT_ASTC_8x5_SRGB_BLOCK = VK_FORMAT_ASTC_8x5_SRGB_BLOCK,
+ VULKAN_FORMAT_ASTC_8x6_UNORM_BLOCK = VK_FORMAT_ASTC_8x6_UNORM_BLOCK,
+ VULKAN_FORMAT_ASTC_8x6_SRGB_BLOCK = VK_FORMAT_ASTC_8x6_SRGB_BLOCK,
+ VULKAN_FORMAT_ASTC_8x8_UNORM_BLOCK = VK_FORMAT_ASTC_8x8_UNORM_BLOCK,
+ VULKAN_FORMAT_ASTC_8x8_SRGB_BLOCK = VK_FORMAT_ASTC_8x8_SRGB_BLOCK,
+ VULKAN_FORMAT_ASTC_10x5_UNORM_BLOCK = VK_FORMAT_ASTC_10x5_UNORM_BLOCK,
+ VULKAN_FORMAT_ASTC_10x5_SRGB_BLOCK = VK_FORMAT_ASTC_10x5_SRGB_BLOCK,
+ VULKAN_FORMAT_ASTC_10x6_UNORM_BLOCK = VK_FORMAT_ASTC_10x6_UNORM_BLOCK,
+ VULKAN_FORMAT_ASTC_10x6_SRGB_BLOCK = VK_FORMAT_ASTC_10x6_SRGB_BLOCK,
+ VULKAN_FORMAT_ASTC_10x8_UNORM_BLOCK = VK_FORMAT_ASTC_10x8_UNORM_BLOCK,
+ VULKAN_FORMAT_ASTC_10x8_SRGB_BLOCK = VK_FORMAT_ASTC_10x8_SRGB_BLOCK,
+ VULKAN_FORMAT_ASTC_10x10_UNORM_BLOCK = VK_FORMAT_ASTC_10x10_UNORM_BLOCK,
+ VULKAN_FORMAT_ASTC_10x10_SRGB_BLOCK = VK_FORMAT_ASTC_10x10_SRGB_BLOCK,
+ VULKAN_FORMAT_ASTC_12x10_UNORM_BLOCK = VK_FORMAT_ASTC_12x10_UNORM_BLOCK,
+ VULKAN_FORMAT_ASTC_12x10_SRGB_BLOCK = VK_FORMAT_ASTC_12x10_SRGB_BLOCK,
+ VULKAN_FORMAT_ASTC_12x12_UNORM_BLOCK = VK_FORMAT_ASTC_12x12_UNORM_BLOCK,
+ VULKAN_FORMAT_ASTC_12x12_SRGB_BLOCK = VK_FORMAT_ASTC_12x12_SRGB_BLOCK,
+};
+
+// Image layouts: aliases for four VK_IMAGE_LAYOUT_* values (UNDEFINED,
+// GENERAL, TRANSFER_SRC_OPTIMAL, TRANSFER_DST_OPTIMAL). No other Vulkan
+// layout has a wrapper name here.
+enum VulkanImageLayout
+{
+ VULKAN_IMAGE_LAYOUT_UNDEFINED = VK_IMAGE_LAYOUT_UNDEFINED,
+ VULKAN_IMAGE_LAYOUT_GENERAL = VK_IMAGE_LAYOUT_GENERAL,
+ VULKAN_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL =
+ VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL,
+ VULKAN_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL =
+ VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL,
+};
+
+// Image usage flags: single-bit aliases for the VK_IMAGE_USAGE_*_BIT values,
+// followed by three named combinations.
+enum VulkanImageUsage
+{
+ VULKAN_IMAGE_USAGE_TRANSFER_SRC = VK_IMAGE_USAGE_TRANSFER_SRC_BIT,
+ VULKAN_IMAGE_USAGE_TRANSFER_DST = VK_IMAGE_USAGE_TRANSFER_DST_BIT,
+ VULKAN_IMAGE_USAGE_SAMPLED = VK_IMAGE_USAGE_SAMPLED_BIT,
+ VULKAN_IMAGE_USAGE_STORAGE = VK_IMAGE_USAGE_STORAGE_BIT,
+ VULKAN_IMAGE_USAGE_COLOR_ATTACHMENT = VK_IMAGE_USAGE_COLOR_ATTACHMENT_BIT,
+ VULKAN_IMAGE_USAGE_DEPTH_STENCIL_ATTACHMENT =
+ VK_IMAGE_USAGE_DEPTH_STENCIL_ATTACHMENT_BIT,
+ VULKAN_IMAGE_USAGE_TRANSIENT_ATTACHMENT =
+ VK_IMAGE_USAGE_TRANSIENT_ATTACHMENT_BIT,
+ VULKAN_IMAGE_USAGE_INPUT_ATTACHMENT = VK_IMAGE_USAGE_INPUT_ATTACHMENT_BIT,
+ // Combinations. Each is spelled with the raw VK_ bits only, the same way
+ // VulkanBufferUsage and VulkanImageCreateFlag build their combined
+ // values; the resulting values equal the OR of the single-bit
+ // enumerators named in each identifier.
+ VULKAN_IMAGE_USAGE_TRANSFER_SRC_DST =
+ VK_IMAGE_USAGE_TRANSFER_SRC_BIT | VK_IMAGE_USAGE_TRANSFER_DST_BIT,
+ VULKAN_IMAGE_USAGE_STORAGE_TRANSFER_SRC_DST = VK_IMAGE_USAGE_STORAGE_BIT
+ | VK_IMAGE_USAGE_TRANSFER_SRC_BIT | VK_IMAGE_USAGE_TRANSFER_DST_BIT,
+ VULKAN_IMAGE_USAGE_SAMPLED_STORAGE_TRANSFER_SRC_DST =
+ VK_IMAGE_USAGE_SAMPLED_BIT | VK_IMAGE_USAGE_STORAGE_BIT
+ | VK_IMAGE_USAGE_TRANSFER_SRC_BIT | VK_IMAGE_USAGE_TRANSFER_DST_BIT
+};
+
+// Image tiling modes: aliases for VK_IMAGE_TILING_OPTIMAL and
+// VK_IMAGE_TILING_LINEAR.
+enum VulkanImageTiling
+{
+ VULKAN_IMAGE_TILING_OPTIMAL = VK_IMAGE_TILING_OPTIMAL,
+ VULKAN_IMAGE_TILING_LINEAR = VK_IMAGE_TILING_LINEAR
+};
+
+// Image creation flags: NONE (0), the MUTABLE_FORMAT and CUBE_COMPATIBLE
+// VK_IMAGE_CREATE_*_BIT values, and one named value that ORs both bits.
+enum VulkanImageCreateFlag
+{
+ VULKAN_IMAGE_CREATE_FLAG_NONE = 0,
+ VULKAN_IMAGE_CREATE_FLAG_MUTABLE_FORMAT =
+ VK_IMAGE_CREATE_MUTABLE_FORMAT_BIT,
+ VULKAN_IMAGE_CREATE_FLAG_CUBE_COMPATIBLE =
+ VK_IMAGE_CREATE_CUBE_COMPATIBLE_BIT,
+ VULKAN_IMAGE_CREATE_FLAG_CUBE_COMPATIBLE_MUTABLE_FORMAT =
+ VK_IMAGE_CREATE_CUBE_COMPATIBLE_BIT | VK_IMAGE_CREATE_MUTABLE_FORMAT_BIT
+};
+
+// Image view types: one-to-one aliases for the seven VK_IMAGE_VIEW_TYPE_*
+// values (1D, 2D, 3D, CUBE and the 1D/2D/CUBE array variants).
+enum VulkanImageViewType
+{
+ VULKAN_IMAGE_VIEW_TYPE_1D = VK_IMAGE_VIEW_TYPE_1D,
+ VULKAN_IMAGE_VIEW_TYPE_2D = VK_IMAGE_VIEW_TYPE_2D,
+ VULKAN_IMAGE_VIEW_TYPE_3D = VK_IMAGE_VIEW_TYPE_3D,
+ VULKAN_IMAGE_VIEW_TYPE_CUBE = VK_IMAGE_VIEW_TYPE_CUBE,
+ VULKAN_IMAGE_VIEW_TYPE_1D_ARRAY = VK_IMAGE_VIEW_TYPE_1D_ARRAY,
+ VULKAN_IMAGE_VIEW_TYPE_2D_ARRAY = VK_IMAGE_VIEW_TYPE_2D_ARRAY,
+ VULKAN_IMAGE_VIEW_TYPE_CUBE_ARRAY = VK_IMAGE_VIEW_TYPE_CUBE_ARRAY,
+};
+
+#endif // _vulkan_wrapper_types_hpp_
diff --git a/test_conformance/workgroups/CMakeLists.txt b/test_conformance/workgroups/CMakeLists.txt index 08886086..0c004b32 100644 --- a/test_conformance/workgroups/CMakeLists.txt +++ b/test_conformance/workgroups/CMakeLists.txt @@ -5,15 +5,8 @@ set(${MODULE_NAME}_SOURCES test_wg_all.cpp test_wg_any.cpp test_wg_broadcast.cpp - test_wg_reduce.cpp - test_wg_reduce_max.cpp - test_wg_reduce_min.cpp - test_wg_scan_exclusive_add.cpp - test_wg_scan_exclusive_min.cpp - test_wg_scan_exclusive_max.cpp - test_wg_scan_inclusive_add.cpp - test_wg_scan_inclusive_min.cpp - test_wg_scan_inclusive_max.cpp + test_wg_scan_reduce.cpp + test_wg_suggested_local_work_size.cpp ) include(../CMakeCommon.txt) diff --git a/test_conformance/workgroups/main.cpp b/test_conformance/workgroups/main.cpp index 41ffa741..abb1145b 100644 --- a/test_conformance/workgroups/main.cpp +++ b/test_conformance/workgroups/main.cpp @@ -24,27 +24,30 @@ #endif test_definition test_list[] = { - ADD_TEST(work_group_all), - ADD_TEST(work_group_any), - ADD_TEST(work_group_reduce_add), - ADD_TEST(work_group_reduce_min), - ADD_TEST(work_group_reduce_max), - ADD_TEST(work_group_scan_inclusive_add), - ADD_TEST(work_group_scan_inclusive_min), - ADD_TEST(work_group_scan_inclusive_max), - ADD_TEST(work_group_scan_exclusive_add), - ADD_TEST(work_group_scan_exclusive_min), - ADD_TEST(work_group_scan_exclusive_max), - ADD_TEST(work_group_broadcast_1D), - ADD_TEST(work_group_broadcast_2D), - ADD_TEST(work_group_broadcast_3D), + ADD_TEST_VERSION(work_group_all, Version(2, 0)), + ADD_TEST_VERSION(work_group_any, Version(2, 0)), + ADD_TEST_VERSION(work_group_reduce_add, Version(2, 0)), + ADD_TEST_VERSION(work_group_reduce_min, Version(2, 0)), + ADD_TEST_VERSION(work_group_reduce_max, Version(2, 0)), + ADD_TEST_VERSION(work_group_scan_inclusive_add, Version(2, 0)), + ADD_TEST_VERSION(work_group_scan_inclusive_min, Version(2, 0)), + ADD_TEST_VERSION(work_group_scan_inclusive_max, Version(2, 0)), + 
ADD_TEST_VERSION(work_group_scan_exclusive_add, Version(2, 0)), + ADD_TEST_VERSION(work_group_scan_exclusive_min, Version(2, 0)), + ADD_TEST_VERSION(work_group_scan_exclusive_max, Version(2, 0)), + ADD_TEST_VERSION(work_group_broadcast_1D, Version(2, 0)), + ADD_TEST_VERSION(work_group_broadcast_2D, Version(2, 0)), + ADD_TEST_VERSION(work_group_broadcast_3D, Version(2, 0)), + ADD_TEST(work_group_suggested_local_size_1D), + ADD_TEST(work_group_suggested_local_size_2D), + ADD_TEST(work_group_suggested_local_size_3D) }; const int test_num = ARRAY_SIZE(test_list); test_status InitCL(cl_device_id device) { auto version = get_device_cl_version(device); - auto expected_min_version = Version(2, 0); + auto expected_min_version = Version(1, 2); if (version < expected_min_version) { version_expected_info("Test", "OpenCL", diff --git a/test_conformance/workgroups/procs.h b/test_conformance/workgroups/procs.h index 2e6e79e2..6143d525 100644 --- a/test_conformance/workgroups/procs.h +++ b/test_conformance/workgroups/procs.h @@ -1,6 +1,6 @@ // -// Copyright (c) 2017 The Khronos Group Inc. -// +// Copyright (c) 2017, 2021 The Khronos Group Inc. +// // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
// You may obtain a copy of the License at @@ -16,6 +16,7 @@ #include "harness/testHarness.h" #include "harness/kernelHelpers.h" #include "harness/errorHelpers.h" +#include "harness/typeWrappers.h" #include "harness/conversions.h" #include "harness/mt19937.h" @@ -36,3 +37,16 @@ extern int test_work_group_scan_exclusive_max(cl_device_id deviceID, cl_context extern int test_work_group_scan_inclusive_add(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements); extern int test_work_group_scan_inclusive_min(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements); extern int test_work_group_scan_inclusive_max(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements); + +extern int test_work_group_suggested_local_size_1D(cl_device_id device, + cl_context context, + cl_command_queue queue, + int n_elems); +extern int test_work_group_suggested_local_size_2D(cl_device_id device, + cl_context context, + cl_command_queue queue, + int n_elems); +extern int test_work_group_suggested_local_size_3D(cl_device_id device, + cl_context context, + cl_command_queue queue, + int n_elems); diff --git a/test_conformance/workgroups/test_wg_broadcast.cpp b/test_conformance/workgroups/test_wg_broadcast.cpp index 35559476..29380211 100644 --- a/test_conformance/workgroups/test_wg_broadcast.cpp +++ b/test_conformance/workgroups/test_wg_broadcast.cpp @@ -20,6 +20,8 @@ #include <sys/types.h> #include <sys/stat.h> +#include <algorithm> + #include "procs.h" @@ -310,7 +312,7 @@ test_work_group_broadcast_2D(cl_device_id device, cl_context context, cl_command localsize[0] = localsize[1] = 1; } - num_workgroups = MAX(n_elems/wg_size[0], 16); + num_workgroups = std::max(n_elems / wg_size[0], (size_t)16); globalsize[0] = num_workgroups * localsize[0]; globalsize[1] = num_workgroups * localsize[1]; num_elements = globalsize[0] * globalsize[1]; @@ -437,7 +439,7 @@ test_work_group_broadcast_3D(cl_device_id device, 
cl_context context, cl_command localsize[0] = localsize[1] = localsize[2] = 1; } - num_workgroups = MAX(n_elems/wg_size[0], 8); + num_workgroups = std::max(n_elems / wg_size[0], (size_t)8); globalsize[0] = num_workgroups * localsize[0]; globalsize[1] = num_workgroups * localsize[1]; globalsize[2] = num_workgroups * localsize[2]; diff --git a/test_conformance/workgroups/test_wg_reduce.cpp b/test_conformance/workgroups/test_wg_reduce.cpp deleted file mode 100644 index eb26f498..00000000 --- a/test_conformance/workgroups/test_wg_reduce.cpp +++ /dev/null @@ -1,596 +0,0 @@ -// -// Copyright (c) 2017 The Khronos Group Inc. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// -#include "harness/compat.h" - -#include <stdio.h> -#include <string.h> -#include <sys/types.h> -#include <sys/stat.h> - -#include "procs.h" - - -const char *wg_reduce_add_kernel_code_int = -"__kernel void test_wg_reduce_add_int(global int *input, global int *output)\n" -"{\n" -" int tid = get_global_id(0);\n" -"\n" -" int result = work_group_reduce_add(input[tid]);\n" -" output[tid] = result;\n" -"}\n"; - - -const char *wg_reduce_add_kernel_code_uint = -"__kernel void test_wg_reduce_add_uint(global uint *input, global uint *output)\n" -"{\n" -" int tid = get_global_id(0);\n" -"\n" -" uint result = work_group_reduce_add(input[tid]);\n" -" output[tid] = result;\n" -"}\n"; - -const char *wg_reduce_add_kernel_code_long = -"__kernel void test_wg_reduce_add_long(global long *input, global long *output)\n" -"{\n" -" int tid = get_global_id(0);\n" -"\n" -" long result = work_group_reduce_add(input[tid]);\n" -" output[tid] = result;\n" -"}\n"; - - -const char *wg_reduce_add_kernel_code_ulong = -"__kernel void test_wg_reduce_add_ulong(global ulong *input, global ulong *output)\n" -"{\n" -" int tid = get_global_id(0);\n" -"\n" -" ulong result = work_group_reduce_add(input[tid]);\n" -" output[tid] = result;\n" -"}\n"; - - -static int -verify_wg_reduce_add_int(int *inptr, int *outptr, size_t n, size_t wg_size) -{ - size_t i, j; - - for (i=0; i<n; i+=wg_size) - { - int sum = 0; - for (j=0; j<((n-i) > wg_size ? wg_size : (n-i)); j++) - sum += inptr[i+j]; - - for (j=0; j<((n-i) > wg_size ? wg_size : (n-i)); j++) - { - if ( sum != outptr[i+j] ) - { - log_info("work_group_reduce_add int: Error at %u: expected = %d, got = %d\n", i+j, sum, outptr[i+j]); - return -1; - } - } - } - - return 0; -} - -static int -verify_wg_reduce_add_uint(unsigned int *inptr, unsigned int *outptr, size_t n, size_t wg_size) -{ - size_t i, j; - - for (i=0; i<n; i+=wg_size) - { - unsigned int sum = 0; - for (j=0; j<((n-i) > wg_size ? 
wg_size : (n-i)); j++) - sum += inptr[i+j]; - - for (j=0; j<((n-i) > wg_size ? wg_size : (n-i)); j++) - { - if ( sum != outptr[i+j] ) - { - log_info("work_group_reduce_add uint: Error at %u: expected = %d, got = %d\n", i+j, sum, outptr[i+j]); - return -1; - } - } - } - - return 0; -} - -static int -verify_wg_reduce_add_long(cl_long *inptr, cl_long *outptr, size_t n, size_t wg_size) -{ - size_t i, j; - - for (i=0; i<n; i+=wg_size) - { - cl_long sum = 0; - for (j=0; j<((n-i) > wg_size ? wg_size : (n-i)); j++) - sum += inptr[i+j]; - - for (j=0; j<((n-i) > wg_size ? wg_size : (n-i)); j++) - { - if ( sum != outptr[i+j] ) - { - log_info("work_group_reduce_add long: Error at %u: expected = %lld, got = %lld\n", i+j, sum, outptr[i+j]); - return -1; - } - } - } - - return 0; -} - -static int -verify_wg_reduce_add_ulong(cl_ulong *inptr, cl_ulong *outptr, size_t n, size_t wg_size) -{ - size_t i, j; - - for (i=0; i<n; i+=wg_size) - { - cl_ulong sum = 0; - for (j=0; j<((n-i) > wg_size ? wg_size : (n-i)); j++) - sum += inptr[i+j]; - - for (j=0; j<((n-i) > wg_size ? wg_size : (n-i)); j++) - { - if ( sum != outptr[i+j] ) - { - log_info("work_group_reduce_add ulong: Error at %u: expected = %llu, got = %llu\n", i+j, sum, outptr[i+j]); - return -1; - } - } - } - - return 0; -} - - - -int -test_work_group_reduce_add_int(cl_device_id device, cl_context context, cl_command_queue queue, int n_elems) -{ - cl_mem streams[2]; - cl_int *input_ptr[1], *p; - cl_int *output_ptr; - cl_program program; - cl_kernel kernel; - void *values[2]; - size_t threads[1]; - size_t wg_size[1]; - size_t num_elements; - int err; - int i; - MTdata d; - - err = create_single_kernel_helper(context, &program, &kernel, 1, - &wg_reduce_add_kernel_code_int, - "test_wg_reduce_add_int"); - if (err) - return -1; - - // "wg_size" is limited to that of the first dimension as only a 1DRange is executed. 
- err = get_max_allowed_1d_work_group_size_on_device(device, kernel, wg_size); - test_error(err, "get_max_allowed_1d_work_group_size_on_device failed"); - - num_elements = n_elems; - - input_ptr[0] = (cl_int*)malloc(sizeof(cl_int) * num_elements); - output_ptr = (cl_int*)malloc(sizeof(cl_int) * num_elements); - streams[0] = clCreateBuffer(context, CL_MEM_READ_WRITE, - sizeof(cl_int) * num_elements, NULL, NULL); - if (!streams[0]) - { - log_error("clCreateBuffer failed\n"); - return -1; - } - - streams[1] = clCreateBuffer(context, CL_MEM_READ_WRITE, - sizeof(cl_int) * num_elements, NULL, NULL); - if (!streams[1]) - { - log_error("clCreateBuffer failed\n"); - return -1; - } - - p = input_ptr[0]; - d = init_genrand( gRandomSeed ); - for (i=0; i<num_elements; i++) - p[i] = genrand_int32(d); - free_mtdata(d); d = NULL; - - err = clEnqueueWriteBuffer( queue, streams[0], true, 0, sizeof(cl_int) * num_elements, (void *)input_ptr[0], 0, NULL, NULL ); - if (err != CL_SUCCESS) - { - log_error("clWriteArray failed\n"); - return -1; - } - - values[0] = streams[0]; - values[1] = streams[1]; - err = clSetKernelArg(kernel, 0, sizeof streams[0], &streams[0] ); - err |= clSetKernelArg(kernel, 1, sizeof streams[1], &streams[1] ); - if (err != CL_SUCCESS) - { - log_error("clSetKernelArgs failed\n"); - return -1; - } - - // Line below is troublesome... 
- threads[0] = (size_t)num_elements; - err = clEnqueueNDRangeKernel( queue, kernel, 1, NULL, threads, wg_size, 0, NULL, NULL ); - if (err != CL_SUCCESS) - { - log_error("clEnqueueNDRangeKernel failed\n"); - return -1; - } - - cl_uint dead = 0xdeaddead; - memset_pattern4(output_ptr, &dead, sizeof(cl_int)*num_elements); - err = clEnqueueReadBuffer( queue, streams[1], true, 0, sizeof(cl_int)*num_elements, (void *)output_ptr, 0, NULL, NULL ); - if (err != CL_SUCCESS) - { - log_error("clEnqueueReadBuffer failed\n"); - return -1; - } - - if (verify_wg_reduce_add_int(input_ptr[0], output_ptr, num_elements, wg_size[0])) - { - log_error("work_group_reduce_add int failed\n"); - return -1; - } - log_info("work_group_reduce_add int passed\n"); - - clReleaseMemObject(streams[0]); - clReleaseMemObject(streams[1]); - clReleaseKernel(kernel); - clReleaseProgram(program); - free(input_ptr[0]); - free(output_ptr); - - return err; -} - - -int -test_work_group_reduce_add_uint(cl_device_id device, cl_context context, cl_command_queue queue, int n_elems) -{ - cl_mem streams[2]; - cl_uint *input_ptr[1], *p; - cl_uint *output_ptr; - cl_program program; - cl_kernel kernel; - void *values[2]; - size_t threads[1]; - size_t wg_size[1]; - size_t num_elements; - int err; - int i; - MTdata d; - - err = create_single_kernel_helper(context, &program, &kernel, 1, - &wg_reduce_add_kernel_code_uint, - "test_wg_reduce_add_uint"); - if (err) - return -1; - - // "wg_size" is limited to that of the first dimension as only a 1DRange is executed. 
- err = get_max_allowed_1d_work_group_size_on_device(device, kernel, wg_size); - test_error(err, "get_max_allowed_1d_work_group_size_on_device failed"); - - num_elements = n_elems; - - input_ptr[0] = (cl_uint*)malloc(sizeof(cl_uint) * num_elements); - output_ptr = (cl_uint*)malloc(sizeof(cl_uint) * num_elements); - streams[0] = clCreateBuffer(context, CL_MEM_READ_WRITE, - sizeof(cl_uint) * num_elements, NULL, NULL); - if (!streams[0]) - { - log_error("clCreateBuffer failed\n"); - return -1; - } - - streams[1] = clCreateBuffer(context, CL_MEM_READ_WRITE, - sizeof(cl_uint) * num_elements, NULL, NULL); - if (!streams[1]) - { - log_error("clCreateBuffer failed\n"); - return -1; - } - - p = input_ptr[0]; - d = init_genrand( gRandomSeed ); - for (i=0; i<num_elements; i++) - p[i] = genrand_int32(d); - free_mtdata(d); d = NULL; - - err = clEnqueueWriteBuffer( queue, streams[0], true, 0, sizeof(cl_uint)*num_elements, (void *)input_ptr[0], 0, NULL, NULL ); - if (err != CL_SUCCESS) - { - log_error("clWriteArray failed\n"); - return -1; - } - - values[0] = streams[0]; - values[1] = streams[1]; - err = clSetKernelArg(kernel, 0, sizeof streams[0], &streams[0] ); - err |= clSetKernelArg(kernel, 1, sizeof streams[1], &streams[1] ); - if (err != CL_SUCCESS) - { - log_error("clSetKernelArgs failed\n"); - return -1; - } - - // Line below is troublesome... 
- threads[0] = (size_t)n_elems; - err = clEnqueueNDRangeKernel( queue, kernel, 1, NULL, threads, wg_size, 0, NULL, NULL ); - if (err != CL_SUCCESS) - { - log_error("clEnqueueNDRangeKernel failed\n"); - return -1; - } - - cl_uint dead = 0xdeaddead; - memset_pattern4(output_ptr, &dead, sizeof(cl_uint)*num_elements); - err = clEnqueueReadBuffer( queue, streams[1], true, 0, sizeof(cl_uint)*num_elements, (void *)output_ptr, 0, NULL, NULL ); - if (err != CL_SUCCESS) - { - log_error("clEnqueueReadBuffer failed\n"); - return -1; - } - - if (verify_wg_reduce_add_uint(input_ptr[0], output_ptr, num_elements, wg_size[0])) - { - log_error("work_group_reduce_add uint failed\n"); - return -1; - } - log_info("work_group_reduce_add uint passed\n"); - - clReleaseMemObject(streams[0]); - clReleaseMemObject(streams[1]); - clReleaseKernel(kernel); - clReleaseProgram(program); - free(input_ptr[0]); - free(output_ptr); - - return err; -} - -int -test_work_group_reduce_add_long(cl_device_id device, cl_context context, cl_command_queue queue, int n_elems) -{ - cl_mem streams[2]; - cl_long *input_ptr[1], *p; - cl_long *output_ptr; - cl_program program; - cl_kernel kernel; - void *values[2]; - size_t threads[1]; - size_t wg_size[1]; - size_t num_elements; - int err; - int i; - MTdata d; - - err = create_single_kernel_helper(context, &program, &kernel, 1, - &wg_reduce_add_kernel_code_long, - "test_wg_reduce_add_long"); - if (err) - return -1; - - // "wg_size" is limited to that of the first dimension as only a 1DRange is executed. 
- err = get_max_allowed_1d_work_group_size_on_device(device, kernel, wg_size); - test_error(err, "get_max_allowed_1d_work_group_size_on_device failed"); - - num_elements = n_elems; - - input_ptr[0] = (cl_long*)malloc(sizeof(cl_long) * num_elements); - output_ptr = (cl_long*)malloc(sizeof(cl_long) * num_elements); - streams[0] = clCreateBuffer(context, CL_MEM_READ_WRITE, - sizeof(cl_long) * num_elements, NULL, NULL); - if (!streams[0]) - { - log_error("clCreateBuffer failed\n"); - return -1; - } - - streams[1] = clCreateBuffer(context, CL_MEM_READ_WRITE, - sizeof(cl_long) * num_elements, NULL, NULL); - if (!streams[1]) - { - log_error("clCreateBuffer failed\n"); - return -1; - } - - p = input_ptr[0]; - d = init_genrand( gRandomSeed ); - for (i=0; i<num_elements; i++) - p[i] = genrand_int64(d); - free_mtdata(d); d = NULL; - - err = clEnqueueWriteBuffer( queue, streams[0], true, 0, sizeof(cl_long)*num_elements, (void *)input_ptr[0], 0, NULL, NULL ); - if (err != CL_SUCCESS) - { - log_error("clWriteArray failed\n"); - return -1; - } - - values[0] = streams[0]; - values[1] = streams[1]; - err = clSetKernelArg(kernel, 0, sizeof streams[0], &streams[0] ); - err |= clSetKernelArg(kernel, 1, sizeof streams[1], &streams[1] ); - if (err != CL_SUCCESS) - { - log_error("clSetKernelArgs failed\n"); - return -1; - } - - // Line below is troublesome... 
- threads[0] = (size_t)n_elems; - err = clEnqueueNDRangeKernel( queue, kernel, 1, NULL, threads, wg_size, 0, NULL, NULL ); - if (err != CL_SUCCESS) - { - log_error("clEnqueueNDRangeKernel failed\n"); - return -1; - } - - cl_uint dead = 0xdeaddead; - memset_pattern4(output_ptr, &dead, sizeof(cl_long)*num_elements); - err = clEnqueueReadBuffer( queue, streams[1], true, 0, sizeof(cl_long)*num_elements, (void *)output_ptr, 0, NULL, NULL ); - if (err != CL_SUCCESS) - { - log_error("clEnqueueReadBuffer failed\n"); - return -1; - } - - if (verify_wg_reduce_add_long(input_ptr[0], output_ptr, num_elements, wg_size[0])) - { - log_error("work_group_reduce_add long failed\n"); - return -1; - } - log_info("work_group_reduce_add long passed\n"); - - clReleaseMemObject(streams[0]); - clReleaseMemObject(streams[1]); - clReleaseKernel(kernel); - clReleaseProgram(program); - free(input_ptr[0]); - free(output_ptr); - - return err; -} - - -int -test_work_group_reduce_add_ulong(cl_device_id device, cl_context context, cl_command_queue queue, int n_elems) -{ - cl_mem streams[2]; - cl_ulong *input_ptr[1], *p; - cl_ulong *output_ptr; - cl_program program; - cl_kernel kernel; - void *values[2]; - size_t threads[1]; - size_t wg_size[1]; - size_t num_elements; - int err; - int i; - MTdata d; - - err = create_single_kernel_helper(context, &program, &kernel, 1, - &wg_reduce_add_kernel_code_ulong, - "test_wg_reduce_add_ulong"); - if (err) - return -1; - - // "wg_size" is limited to that of the first dimension as only a 1DRange is executed. 
- err = get_max_allowed_1d_work_group_size_on_device(device, kernel, wg_size); - test_error(err, "get_max_allowed_1d_work_group_size_on_device failed"); - - num_elements = n_elems; - - input_ptr[0] = (cl_ulong*)malloc(sizeof(cl_ulong) * num_elements); - output_ptr = (cl_ulong*)malloc(sizeof(cl_ulong) * num_elements); - streams[0] = clCreateBuffer(context, CL_MEM_READ_WRITE, - sizeof(cl_ulong) * num_elements, NULL, NULL); - if (!streams[0]) - { - log_error("clCreateBuffer failed\n"); - return -1; - } - - streams[1] = clCreateBuffer(context, CL_MEM_READ_WRITE, - sizeof(cl_ulong) * num_elements, NULL, NULL); - if (!streams[1]) - { - log_error("clCreateBuffer failed\n"); - return -1; - } - - p = input_ptr[0]; - d = init_genrand( gRandomSeed ); - for (i=0; i<num_elements; i++) - p[i] = genrand_int64(d); - free_mtdata(d); d = NULL; - - err = clEnqueueWriteBuffer( queue, streams[0], true, 0, sizeof(cl_ulong)*num_elements, (void *)input_ptr[0], 0, NULL, NULL ); - if (err != CL_SUCCESS) - { - log_error("clWriteArray failed\n"); - return -1; - } - - values[0] = streams[0]; - values[1] = streams[1]; - err = clSetKernelArg(kernel, 0, sizeof streams[0], &streams[0] ); - err |= clSetKernelArg(kernel, 1, sizeof streams[1], &streams[1] ); - if (err != CL_SUCCESS) - { - log_error("clSetKernelArgs failed\n"); - return -1; - } - - // Line below is troublesome... 
- threads[0] = (size_t)n_elems; - err = clEnqueueNDRangeKernel( queue, kernel, 1, NULL, threads, wg_size, 0, NULL, NULL ); - if (err != CL_SUCCESS) - { - log_error("clEnqueueNDRangeKernel failed\n"); - return -1; - } - - cl_uint dead = 0xdeaddead; - memset_pattern4(output_ptr, &dead, sizeof(cl_ulong)*num_elements); - err = clEnqueueReadBuffer( queue, streams[1], true, 0, sizeof(cl_ulong)*num_elements, (void *)output_ptr, 0, NULL, NULL ); - if (err != CL_SUCCESS) - { - log_error("clEnqueueReadBuffer failed\n"); - return -1; - } - - if (verify_wg_reduce_add_ulong(input_ptr[0], output_ptr, num_elements, wg_size[0])) - { - log_error("work_group_reduce_add ulong failed\n"); - return -1; - } - log_info("work_group_reduce_add ulong passed\n"); - - clReleaseMemObject(streams[0]); - clReleaseMemObject(streams[1]); - clReleaseKernel(kernel); - clReleaseProgram(program); - free(input_ptr[0]); - free(output_ptr); - - return err; -} - - -int -test_work_group_reduce_add(cl_device_id device, cl_context context, cl_command_queue queue, int n_elems) -{ - int err; - - err = test_work_group_reduce_add_int(device, context, queue, n_elems); - if (err) return err; - err = test_work_group_reduce_add_uint(device, context, queue, n_elems); - if (err) return err; - err = test_work_group_reduce_add_long(device, context, queue, n_elems); - if (err) return err; - err = test_work_group_reduce_add_ulong(device, context, queue, n_elems); - return err; -} - diff --git a/test_conformance/workgroups/test_wg_reduce_max.cpp b/test_conformance/workgroups/test_wg_reduce_max.cpp deleted file mode 100644 index 3bbd3f25..00000000 --- a/test_conformance/workgroups/test_wg_reduce_max.cpp +++ /dev/null @@ -1,632 +0,0 @@ -// -// Copyright (c) 2017 The Khronos Group Inc. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// -#include "harness/compat.h" - -#include <stdio.h> -#include <string.h> -#include <sys/types.h> -#include <sys/stat.h> - -#include "procs.h" - - -const char *wg_reduce_max_kernel_code_int = -"__kernel void test_wg_reduce_max_int(global int *input, global int *output)\n" -"{\n" -" int tid = get_global_id(0);\n" -"\n" -" int result = work_group_reduce_max(input[tid]);\n" -" output[tid] = result;\n" -"}\n"; - - -const char *wg_reduce_max_kernel_code_uint = -"__kernel void test_wg_reduce_max_uint(global uint *input, global uint *output)\n" -"{\n" -" int tid = get_global_id(0);\n" -"\n" -" uint result = work_group_reduce_max(input[tid]);\n" -" output[tid] = result;\n" -"}\n"; - -const char *wg_reduce_max_kernel_code_long = -"__kernel void test_wg_reduce_max_long(global long *input, global long *output)\n" -"{\n" -" int tid = get_global_id(0);\n" -"\n" -" long result = work_group_reduce_max(input[tid]);\n" -" output[tid] = result;\n" -"}\n"; - - -const char *wg_reduce_max_kernel_code_ulong = -"__kernel void test_wg_reduce_max_ulong(global ulong *input, global ulong *output)\n" -"{\n" -" int tid = get_global_id(0);\n" -"\n" -" ulong result = work_group_reduce_max(input[tid]);\n" -" output[tid] = result;\n" -"}\n"; - - -static int -verify_wg_reduce_max_int(int *inptr, int *outptr, size_t n, size_t wg_size) -{ - size_t i, j; - - for (i=0; i<n; i+=wg_size) - { - int max = CL_INT_MIN; - for (j=0; j<((n-i) > wg_size ? wg_size : (n-i)); j++) - max = (max > inptr[i+j]) ? max : inptr[i+j]; - - for (j=0; j<((n-i) > wg_size ? 
wg_size : (n-i)); j++) - { - if ( max != outptr[i+j] ) - { - log_info("work_group_reduce_max int: Error at %u: expected = %d, got = %d\n", i+j, max, outptr[i+j]); - return -1; - } - } - } - - return 0; -} - -static int -verify_wg_reduce_max_uint(unsigned int *inptr, unsigned int *outptr, size_t n, size_t wg_size) -{ - size_t i, j; - - for (i=0; i<n; i+=wg_size) - { - unsigned int max = 0; - for (j=0; j<((n-i) > wg_size ? wg_size : (n-i)); j++) - max = (max > inptr[i+j]) ? max : inptr[i+j]; - - for (j=0; j<((n-i) > wg_size ? wg_size : (n-i)); j++) - { - if ( max != outptr[i+j] ) - { - log_info("work_group_reduce_max uint: Error at %u: expected = %d, got = %d\n", i+j, max, outptr[i+j]); - return -1; - } - } - } - - return 0; -} - -static int -verify_wg_reduce_max_long(cl_long *inptr, cl_long *outptr, size_t n, size_t wg_size) -{ - size_t i, j; - - for (i=0; i<n; i+=wg_size) - { - cl_long max = CL_LONG_MIN; - for (j=0; j<((n-i) > wg_size ? wg_size : (n-i)); j++) - max = (max > inptr[i+j]) ? max : inptr[i+j]; - - for (j=0; j<((n-i) > wg_size ? wg_size : (n-i)); j++) - { - if ( max != outptr[i+j] ) - { - log_info("work_group_reduce_max long: Error at %u: expected = %lld, got = %lld\n", i+j, max, outptr[i+j]); - return -1; - } - } - } - - return 0; -} - -static int -verify_wg_reduce_max_ulong(cl_ulong *inptr, cl_ulong *outptr, size_t n, size_t wg_size) -{ - size_t i, j; - - for (i=0; i<n; i+=wg_size) - { - cl_ulong max = 0; - for (j=0; j<((n-i) > wg_size ? wg_size : (n-i)); j++) - max = (max > inptr[i+j]) ? max : inptr[i+j]; - - for (j=0; j<((n-i) > wg_size ? 
wg_size : (n-i)); j++) - { - if ( max != outptr[i+j] ) - { - log_info("work_group_reduce_max ulong: Error at %u: expected = %llu, got = %llu\n", i+j, max, outptr[i+j]); - return -1; - } - } - } - - return 0; -} - - - -int -test_work_group_reduce_max_int(cl_device_id device, cl_context context, cl_command_queue queue, int n_elems) -{ - cl_mem streams[2]; - cl_int *input_ptr[1], *p; - cl_int *output_ptr; - cl_program program; - cl_kernel kernel; - void *values[2]; - size_t threads[1]; - size_t wg_size[1]; - size_t wg_sizes_per_dimension[3]; - size_t num_elements; - int err; - int i; - MTdata d; - - err = create_single_kernel_helper(context, &program, &kernel, 1, - &wg_reduce_max_kernel_code_int, - "test_wg_reduce_max_int"); - if (err) - return -1; - - err = clGetKernelWorkGroupInfo( kernel, device, CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), wg_size, NULL); - if (err) - return -1; - - err = clGetDeviceInfo(device, CL_DEVICE_MAX_WORK_ITEM_SIZES, sizeof(size_t) * 3, wg_sizes_per_dimension, NULL); - if (err) - return -1; - if(wg_sizes_per_dimension[0] < wg_size[0]) - { - wg_size[0] = wg_sizes_per_dimension[0]; - } - - num_elements = n_elems; - - input_ptr[0] = (cl_int*)malloc(sizeof(cl_int) * num_elements); - output_ptr = (cl_int*)malloc(sizeof(cl_int) * num_elements); - streams[0] = clCreateBuffer(context, CL_MEM_READ_WRITE, - sizeof(cl_int) * num_elements, NULL, NULL); - if (!streams[0]) - { - log_error("clCreateBuffer failed\n"); - return -1; - } - - streams[1] = clCreateBuffer(context, CL_MEM_READ_WRITE, - sizeof(cl_int) * num_elements, NULL, NULL); - if (!streams[1]) - { - log_error("clCreateBuffer failed\n"); - return -1; - } - - p = input_ptr[0]; - d = init_genrand( gRandomSeed ); - for (i=0; i<num_elements; i++) - p[i] = genrand_int32(d); - free_mtdata(d); d = NULL; - - err = clEnqueueWriteBuffer( queue, streams[0], true, 0, sizeof(cl_int) * num_elements, (void *)input_ptr[0], 0, NULL, NULL ); - if (err != CL_SUCCESS) - { - log_error("clWriteArray failed\n"); - 
return -1; - } - - values[0] = streams[0]; - values[1] = streams[1]; - err = clSetKernelArg(kernel, 0, sizeof streams[0], &streams[0] ); - err |= clSetKernelArg(kernel, 1, sizeof streams[1], &streams[1] ); - if (err != CL_SUCCESS) - { - log_error("clSetKernelArgs failed\n"); - return -1; - } - - // Line below is troublesome... - threads[0] = (size_t)num_elements; - err = clEnqueueNDRangeKernel( queue, kernel, 1, NULL, threads, wg_size, 0, NULL, NULL ); - if (err != CL_SUCCESS) - { - log_error("clEnqueueNDRangeKernel failed\n"); - return -1; - } - - cl_uint dead = 0xdeaddead; - memset_pattern4(output_ptr, &dead, sizeof(cl_int)*num_elements); - err = clEnqueueReadBuffer( queue, streams[1], true, 0, sizeof(cl_int)*num_elements, (void *)output_ptr, 0, NULL, NULL ); - if (err != CL_SUCCESS) - { - log_error("clEnqueueReadBuffer failed\n"); - return -1; - } - - if (verify_wg_reduce_max_int(input_ptr[0], output_ptr, num_elements, wg_size[0])) - { - log_error("work_group_reduce_max int failed\n"); - return -1; - } - log_info("work_group_reduce_max int passed\n"); - - clReleaseMemObject(streams[0]); - clReleaseMemObject(streams[1]); - clReleaseKernel(kernel); - clReleaseProgram(program); - free(input_ptr[0]); - free(output_ptr); - - return err; -} - - -int -test_work_group_reduce_max_uint(cl_device_id device, cl_context context, cl_command_queue queue, int n_elems) -{ - cl_mem streams[2]; - cl_uint *input_ptr[1], *p; - cl_uint *output_ptr; - cl_program program; - cl_kernel kernel; - void *values[2]; - size_t threads[1]; - size_t wg_size[1]; - size_t wg_sizes_per_dimension[3]; - size_t num_elements; - int err; - int i; - MTdata d; - - err = create_single_kernel_helper(context, &program, &kernel, 1, - &wg_reduce_max_kernel_code_uint, - "test_wg_reduce_max_uint"); - if (err) - return -1; - - err = clGetKernelWorkGroupInfo( kernel, device, CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), wg_size, NULL); - if (err) - return -1; - - err = clGetDeviceInfo(device, 
CL_DEVICE_MAX_WORK_ITEM_SIZES, sizeof(size_t) * 3, wg_sizes_per_dimension, NULL); - if (err) - return -1; - if(wg_sizes_per_dimension[0] < wg_size[0]) - { - wg_size[0] = wg_sizes_per_dimension[0]; - } - - num_elements = n_elems; - - input_ptr[0] = (cl_uint*)malloc(sizeof(cl_uint) * num_elements); - output_ptr = (cl_uint*)malloc(sizeof(cl_uint) * num_elements); - streams[0] = clCreateBuffer(context, CL_MEM_READ_WRITE, - sizeof(cl_uint) * num_elements, NULL, NULL); - if (!streams[0]) - { - log_error("clCreateBuffer failed\n"); - return -1; - } - - streams[1] = clCreateBuffer(context, CL_MEM_READ_WRITE, - sizeof(cl_uint) * num_elements, NULL, NULL); - if (!streams[1]) - { - log_error("clCreateBuffer failed\n"); - return -1; - } - - p = input_ptr[0]; - d = init_genrand( gRandomSeed ); - for (i=0; i<num_elements; i++) - p[i] = genrand_int32(d); - free_mtdata(d); d = NULL; - - err = clEnqueueWriteBuffer( queue, streams[0], true, 0, sizeof(cl_uint)*num_elements, (void *)input_ptr[0], 0, NULL, NULL ); - if (err != CL_SUCCESS) - { - log_error("clWriteArray failed\n"); - return -1; - } - - values[0] = streams[0]; - values[1] = streams[1]; - err = clSetKernelArg(kernel, 0, sizeof streams[0], &streams[0] ); - err |= clSetKernelArg(kernel, 1, sizeof streams[1], &streams[1] ); - if (err != CL_SUCCESS) - { - log_error("clSetKernelArgs failed\n"); - return -1; - } - - // Line below is troublesome... 
- threads[0] = (size_t)n_elems; - err = clEnqueueNDRangeKernel( queue, kernel, 1, NULL, threads, wg_size, 0, NULL, NULL ); - if (err != CL_SUCCESS) - { - log_error("clEnqueueNDRangeKernel failed\n"); - return -1; - } - - cl_uint dead = 0xdeaddead; - memset_pattern4(output_ptr, &dead, sizeof(cl_uint)*num_elements); - err = clEnqueueReadBuffer( queue, streams[1], true, 0, sizeof(cl_uint)*num_elements, (void *)output_ptr, 0, NULL, NULL ); - if (err != CL_SUCCESS) - { - log_error("clEnqueueReadBuffer failed\n"); - return -1; - } - - if (verify_wg_reduce_max_uint(input_ptr[0], output_ptr, num_elements, wg_size[0])) - { - log_error("work_group_reduce_max uint failed\n"); - return -1; - } - log_info("work_group_reduce_max uint passed\n"); - - clReleaseMemObject(streams[0]); - clReleaseMemObject(streams[1]); - clReleaseKernel(kernel); - clReleaseProgram(program); - free(input_ptr[0]); - free(output_ptr); - - return err; -} - -int -test_work_group_reduce_max_long(cl_device_id device, cl_context context, cl_command_queue queue, int n_elems) -{ - cl_mem streams[2]; - cl_long *input_ptr[1], *p; - cl_long *output_ptr; - cl_program program; - cl_kernel kernel; - void *values[2]; - size_t threads[1]; - size_t wg_size[1]; - size_t wg_sizes_per_dimension[3]; - size_t num_elements; - int err; - int i; - MTdata d; - - err = create_single_kernel_helper(context, &program, &kernel, 1, - &wg_reduce_max_kernel_code_long, - "test_wg_reduce_max_long"); - if (err) - return -1; - - err = clGetKernelWorkGroupInfo( kernel, device, CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), wg_size, NULL); - if (err) - return -1; - - err = clGetDeviceInfo(device, CL_DEVICE_MAX_WORK_ITEM_SIZES, sizeof(size_t) * 3, wg_sizes_per_dimension, NULL); - if (err) - return -1; - if(wg_sizes_per_dimension[0] < wg_size[0]) - { - wg_size[0] = wg_sizes_per_dimension[0]; - } - - num_elements = n_elems; - - input_ptr[0] = (cl_long*)malloc(sizeof(cl_long) * num_elements); - output_ptr = (cl_long*)malloc(sizeof(cl_long) * 
num_elements); - streams[0] = clCreateBuffer(context, CL_MEM_READ_WRITE, - sizeof(cl_long) * num_elements, NULL, NULL); - if (!streams[0]) - { - log_error("clCreateBuffer failed\n"); - return -1; - } - - streams[1] = clCreateBuffer(context, CL_MEM_READ_WRITE, - sizeof(cl_long) * num_elements, NULL, NULL); - if (!streams[1]) - { - log_error("clCreateBuffer failed\n"); - return -1; - } - - p = input_ptr[0]; - d = init_genrand( gRandomSeed ); - for (i=0; i<num_elements; i++) - p[i] = genrand_int64(d); - free_mtdata(d); d = NULL; - - err = clEnqueueWriteBuffer( queue, streams[0], true, 0, sizeof(cl_long)*num_elements, (void *)input_ptr[0], 0, NULL, NULL ); - if (err != CL_SUCCESS) - { - log_error("clWriteArray failed\n"); - return -1; - } - - values[0] = streams[0]; - values[1] = streams[1]; - err = clSetKernelArg(kernel, 0, sizeof streams[0], &streams[0] ); - err |= clSetKernelArg(kernel, 1, sizeof streams[1], &streams[1] ); - if (err != CL_SUCCESS) - { - log_error("clSetKernelArgs failed\n"); - return -1; - } - - // Line below is troublesome... 
- threads[0] = (size_t)n_elems; - err = clEnqueueNDRangeKernel( queue, kernel, 1, NULL, threads, wg_size, 0, NULL, NULL ); - if (err != CL_SUCCESS) - { - log_error("clEnqueueNDRangeKernel failed\n"); - return -1; - } - - cl_uint dead = 0xdeaddead; - memset_pattern4(output_ptr, &dead, sizeof(cl_long)*num_elements); - err = clEnqueueReadBuffer( queue, streams[1], true, 0, sizeof(cl_long)*num_elements, (void *)output_ptr, 0, NULL, NULL ); - if (err != CL_SUCCESS) - { - log_error("clEnqueueReadBuffer failed\n"); - return -1; - } - - if (verify_wg_reduce_max_long(input_ptr[0], output_ptr, num_elements, wg_size[0])) - { - log_error("work_group_reduce_max long failed\n"); - return -1; - } - log_info("work_group_reduce_max long passed\n"); - - clReleaseMemObject(streams[0]); - clReleaseMemObject(streams[1]); - clReleaseKernel(kernel); - clReleaseProgram(program); - free(input_ptr[0]); - free(output_ptr); - - return err; -} - - -int -test_work_group_reduce_max_ulong(cl_device_id device, cl_context context, cl_command_queue queue, int n_elems) -{ - cl_mem streams[2]; - cl_ulong *input_ptr[1], *p; - cl_ulong *output_ptr; - cl_program program; - cl_kernel kernel; - void *values[2]; - size_t threads[1]; - size_t wg_size[1]; - size_t wg_sizes_per_dimension[3]; - size_t num_elements; - int err; - int i; - MTdata d; - - err = create_single_kernel_helper(context, &program, &kernel, 1, - &wg_reduce_max_kernel_code_ulong, - "test_wg_reduce_max_ulong"); - if (err) - return -1; - - err = clGetKernelWorkGroupInfo( kernel, device, CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), wg_size, NULL); - if (err) - return -1; - - err = clGetDeviceInfo(device, CL_DEVICE_MAX_WORK_ITEM_SIZES, sizeof(size_t) * 3, wg_sizes_per_dimension, NULL); - if (err) - return -1; - if(wg_sizes_per_dimension[0] < wg_size[0]) - { - wg_size[0] = wg_sizes_per_dimension[0]; - } - - num_elements = n_elems; - - input_ptr[0] = (cl_ulong*)malloc(sizeof(cl_ulong) * num_elements); - output_ptr = 
(cl_ulong*)malloc(sizeof(cl_ulong) * num_elements); - streams[0] = clCreateBuffer(context, CL_MEM_READ_WRITE, - sizeof(cl_ulong) * num_elements, NULL, NULL); - if (!streams[0]) - { - log_error("clCreateBuffer failed\n"); - return -1; - } - - streams[1] = clCreateBuffer(context, CL_MEM_READ_WRITE, - sizeof(cl_ulong) * num_elements, NULL, NULL); - if (!streams[1]) - { - log_error("clCreateBuffer failed\n"); - return -1; - } - - p = input_ptr[0]; - d = init_genrand( gRandomSeed ); - for (i=0; i<num_elements; i++) - p[i] = genrand_int64(d); - free_mtdata(d); d = NULL; - - err = clEnqueueWriteBuffer( queue, streams[0], true, 0, sizeof(cl_ulong)*num_elements, (void *)input_ptr[0], 0, NULL, NULL ); - if (err != CL_SUCCESS) - { - log_error("clWriteArray failed\n"); - return -1; - } - - values[0] = streams[0]; - values[1] = streams[1]; - err = clSetKernelArg(kernel, 0, sizeof streams[0], &streams[0] ); - err |= clSetKernelArg(kernel, 1, sizeof streams[1], &streams[1] ); - if (err != CL_SUCCESS) - { - log_error("clSetKernelArgs failed\n"); - return -1; - } - - // Line below is troublesome... 
- threads[0] = (size_t)n_elems; - err = clEnqueueNDRangeKernel( queue, kernel, 1, NULL, threads, wg_size, 0, NULL, NULL ); - if (err != CL_SUCCESS) - { - log_error("clEnqueueNDRangeKernel failed\n"); - return -1; - } - - cl_uint dead = 0xdeaddead; - memset_pattern4(output_ptr, &dead, sizeof(cl_ulong)*num_elements); - err = clEnqueueReadBuffer( queue, streams[1], true, 0, sizeof(cl_ulong)*num_elements, (void *)output_ptr, 0, NULL, NULL ); - if (err != CL_SUCCESS) - { - log_error("clEnqueueReadBuffer failed\n"); - return -1; - } - - if (verify_wg_reduce_max_ulong(input_ptr[0], output_ptr, num_elements, wg_size[0])) - { - log_error("work_group_reduce_max ulong failed\n"); - return -1; - } - log_info("work_group_reduce_max ulong passed\n"); - - clReleaseMemObject(streams[0]); - clReleaseMemObject(streams[1]); - clReleaseKernel(kernel); - clReleaseProgram(program); - free(input_ptr[0]); - free(output_ptr); - - return err; -} - - -int -test_work_group_reduce_max(cl_device_id device, cl_context context, cl_command_queue queue, int n_elems) -{ - int err; - - err = test_work_group_reduce_max_int(device, context, queue, n_elems); - if (err) return err; - err = test_work_group_reduce_max_uint(device, context, queue, n_elems); - if (err) return err; - err = test_work_group_reduce_max_long(device, context, queue, n_elems); - if (err) return err; - err = test_work_group_reduce_max_ulong(device, context, queue, n_elems); - return err; -} - diff --git a/test_conformance/workgroups/test_wg_reduce_min.cpp b/test_conformance/workgroups/test_wg_reduce_min.cpp deleted file mode 100644 index 7b1b22e8..00000000 --- a/test_conformance/workgroups/test_wg_reduce_min.cpp +++ /dev/null @@ -1,632 +0,0 @@ -// -// Copyright (c) 2017 The Khronos Group Inc. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// -#include "harness/compat.h" - -#include <stdio.h> -#include <string.h> -#include <sys/types.h> -#include <sys/stat.h> - -#include "procs.h" - - -const char *wg_reduce_min_kernel_code_int = -"__kernel void test_wg_reduce_min_int(global int *input, global int *output)\n" -"{\n" -" int tid = get_global_id(0);\n" -"\n" -" int result = work_group_reduce_min(input[tid]);\n" -" output[tid] = result;\n" -"}\n"; - - -const char *wg_reduce_min_kernel_code_uint = -"__kernel void test_wg_reduce_min_uint(global uint *input, global uint *output)\n" -"{\n" -" int tid = get_global_id(0);\n" -"\n" -" uint result = work_group_reduce_min(input[tid]);\n" -" output[tid] = result;\n" -"}\n"; - -const char *wg_reduce_min_kernel_code_long = -"__kernel void test_wg_reduce_min_long(global long *input, global long *output)\n" -"{\n" -" int tid = get_global_id(0);\n" -"\n" -" long result = work_group_reduce_min(input[tid]);\n" -" output[tid] = result;\n" -"}\n"; - - -const char *wg_reduce_min_kernel_code_ulong = -"__kernel void test_wg_reduce_min_ulong(global ulong *input, global ulong *output)\n" -"{\n" -" int tid = get_global_id(0);\n" -"\n" -" ulong result = work_group_reduce_min(input[tid]);\n" -" output[tid] = result;\n" -"}\n"; - - -static int -verify_wg_reduce_min_int(int *inptr, int *outptr, size_t n, size_t wg_size) -{ - size_t i, j; - - for (i=0; i<n; i+=wg_size) - { - int min = CL_INT_MAX; - for (j=0; j<((n-i) > wg_size ? wg_size : (n-i)); j++) - min = (min < inptr[i+j]) ? min : inptr[i+j]; - - for (j=0; j<((n-i) > wg_size ? 
wg_size : (n-i)); j++) - { - if ( min != outptr[i+j] ) - { - log_info("work_group_reduce_min int: Error at %u: expected = %d, got = %d\n", i+j, min, outptr[i+j]); - return -1; - } - } - } - - return 0; -} - -static int -verify_wg_reduce_min_uint(unsigned int *inptr, unsigned int *outptr, size_t n, size_t wg_size) -{ - size_t i, j; - - for (i=0; i<n; i+=wg_size) - { - unsigned int min = CL_UINT_MAX; - for (j=0; j<((n-i) > wg_size ? wg_size : (n-i)); j++) - min = (min < inptr[i+j]) ? min : inptr[i+j]; - - for (j=0; j<((n-i) > wg_size ? wg_size : (n-i)); j++) - { - if ( min != outptr[i+j] ) - { - log_info("work_group_reduce_min uint: Error at %u: expected = %d, got = %d\n", i+j, min, outptr[i+j]); - return -1; - } - } - } - - return 0; -} - -static int -verify_wg_reduce_min_long(cl_long *inptr, cl_long *outptr, size_t n, size_t wg_size) -{ - size_t i, j; - - for (i=0; i<n; i+=wg_size) - { - cl_long min = CL_ULONG_MAX; - for (j=0; j<((n-i) > wg_size ? wg_size : (n-i)); j++) - min = (min < inptr[i+j]) ? min : inptr[i+j]; - - for (j=0; j<((n-i) > wg_size ? wg_size : (n-i)); j++) - { - if ( min != outptr[i+j] ) - { - log_info("work_group_reduce_min long: Error at %u: expected = %lld, got = %lld\n", i+j, min, outptr[i+j]); - return -1; - } - } - } - - return 0; -} - -static int -verify_wg_reduce_min_ulong(cl_ulong *inptr, cl_ulong *outptr, size_t n, size_t wg_size) -{ - size_t i, j; - - for (i=0; i<n; i+=wg_size) - { - cl_ulong min = CL_ULONG_MAX; - for (j=0; j<((n-i) > wg_size ? wg_size : (n-i)); j++) - min = (min < inptr[i+j]) ? min : inptr[i+j]; - - for (j=0; j<((n-i) > wg_size ? 
wg_size : (n-i)); j++) - { - if ( min != outptr[i+j] ) - { - log_info("work_group_reduce_min ulong: Error at %u: expected = %llu, got = %llu\n", i+j, min, outptr[i+j]); - return -1; - } - } - } - - return 0; -} - - - -int -test_work_group_reduce_min_int(cl_device_id device, cl_context context, cl_command_queue queue, int n_elems) -{ - cl_mem streams[2]; - cl_int *input_ptr[1], *p; - cl_int *output_ptr; - cl_program program; - cl_kernel kernel; - void *values[2]; - size_t threads[1]; - size_t wg_size[1]; - size_t wg_sizes_per_dimension[3]; - size_t num_elements; - int err; - int i; - MTdata d; - - err = create_single_kernel_helper(context, &program, &kernel, 1, - &wg_reduce_min_kernel_code_int, - "test_wg_reduce_min_int"); - if (err) - return -1; - - err = clGetKernelWorkGroupInfo( kernel, device, CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), wg_size, NULL); - if (err) - return -1; - - err = clGetDeviceInfo(device, CL_DEVICE_MAX_WORK_ITEM_SIZES, sizeof(size_t) * 3, wg_sizes_per_dimension, NULL); - if (err) - return -1; - if(wg_sizes_per_dimension[0] < wg_size[0]) - { - wg_size[0] = wg_sizes_per_dimension[0]; - } - - num_elements = n_elems; - - input_ptr[0] = (cl_int*)malloc(sizeof(cl_int) * num_elements); - output_ptr = (cl_int*)malloc(sizeof(cl_int) * num_elements); - streams[0] = clCreateBuffer(context, CL_MEM_READ_WRITE, - sizeof(cl_int) * num_elements, NULL, NULL); - if (!streams[0]) - { - log_error("clCreateBuffer failed\n"); - return -1; - } - - streams[1] = clCreateBuffer(context, CL_MEM_READ_WRITE, - sizeof(cl_int) * num_elements, NULL, NULL); - if (!streams[1]) - { - log_error("clCreateBuffer failed\n"); - return -1; - } - - p = input_ptr[0]; - d = init_genrand( gRandomSeed ); - for (i=0; i<num_elements; i++) - p[i] = genrand_int32(d); - free_mtdata(d); d = NULL; - - err = clEnqueueWriteBuffer( queue, streams[0], true, 0, sizeof(cl_int) * num_elements, (void *)input_ptr[0], 0, NULL, NULL ); - if (err != CL_SUCCESS) - { - log_error("clWriteArray failed\n"); - 
return -1; - } - - values[0] = streams[0]; - values[1] = streams[1]; - err = clSetKernelArg(kernel, 0, sizeof streams[0], &streams[0] ); - err |= clSetKernelArg(kernel, 1, sizeof streams[1], &streams[1] ); - if (err != CL_SUCCESS) - { - log_error("clSetKernelArgs failed\n"); - return -1; - } - - // Line below is troublesome... - threads[0] = (size_t)num_elements; - err = clEnqueueNDRangeKernel( queue, kernel, 1, NULL, threads, wg_size, 0, NULL, NULL ); - if (err != CL_SUCCESS) - { - log_error("clEnqueueNDRangeKernel failed\n"); - return -1; - } - - cl_uint dead = 0xdeaddead; - memset_pattern4(output_ptr, &dead, sizeof(cl_int)*num_elements); - err = clEnqueueReadBuffer( queue, streams[1], true, 0, sizeof(cl_int)*num_elements, (void *)output_ptr, 0, NULL, NULL ); - if (err != CL_SUCCESS) - { - log_error("clEnqueueReadBuffer failed\n"); - return -1; - } - - if (verify_wg_reduce_min_int(input_ptr[0], output_ptr, num_elements, wg_size[0])) - { - log_error("work_group_reduce_min int failed\n"); - return -1; - } - log_info("work_group_reduce_min int passed\n"); - - clReleaseMemObject(streams[0]); - clReleaseMemObject(streams[1]); - clReleaseKernel(kernel); - clReleaseProgram(program); - free(input_ptr[0]); - free(output_ptr); - - return err; -} - - -int -test_work_group_reduce_min_uint(cl_device_id device, cl_context context, cl_command_queue queue, int n_elems) -{ - cl_mem streams[2]; - cl_uint *input_ptr[1], *p; - cl_uint *output_ptr; - cl_program program; - cl_kernel kernel; - void *values[2]; - size_t threads[1]; - size_t wg_size[1]; - size_t wg_sizes_per_dimension[3]; - size_t num_elements; - int err; - int i; - MTdata d; - - err = create_single_kernel_helper(context, &program, &kernel, 1, - &wg_reduce_min_kernel_code_uint, - "test_wg_reduce_min_uint"); - if (err) - return -1; - - err = clGetKernelWorkGroupInfo( kernel, device, CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), wg_size, NULL); - if (err) - return -1; - - err = clGetDeviceInfo(device, 
CL_DEVICE_MAX_WORK_ITEM_SIZES, sizeof(size_t) * 3, wg_sizes_per_dimension, NULL); - if (err) - return -1; - if(wg_sizes_per_dimension[0] < wg_size[0]) - { - wg_size[0] = wg_sizes_per_dimension[0]; - } - - num_elements = n_elems; - - input_ptr[0] = (cl_uint*)malloc(sizeof(cl_uint) * num_elements); - output_ptr = (cl_uint*)malloc(sizeof(cl_uint) * num_elements); - streams[0] = clCreateBuffer(context, CL_MEM_READ_WRITE, - sizeof(cl_uint) * num_elements, NULL, NULL); - if (!streams[0]) - { - log_error("clCreateBuffer failed\n"); - return -1; - } - - streams[1] = clCreateBuffer(context, CL_MEM_READ_WRITE, - sizeof(cl_uint) * num_elements, NULL, NULL); - if (!streams[1]) - { - log_error("clCreateBuffer failed\n"); - return -1; - } - - p = input_ptr[0]; - d = init_genrand( gRandomSeed ); - for (i=0; i<num_elements; i++) - p[i] = genrand_int32(d); - free_mtdata(d); d = NULL; - - err = clEnqueueWriteBuffer( queue, streams[0], true, 0, sizeof(cl_uint)*num_elements, (void *)input_ptr[0], 0, NULL, NULL ); - if (err != CL_SUCCESS) - { - log_error("clWriteArray failed\n"); - return -1; - } - - values[0] = streams[0]; - values[1] = streams[1]; - err = clSetKernelArg(kernel, 0, sizeof streams[0], &streams[0] ); - err |= clSetKernelArg(kernel, 1, sizeof streams[1], &streams[1] ); - if (err != CL_SUCCESS) - { - log_error("clSetKernelArgs failed\n"); - return -1; - } - - // Line below is troublesome... 
- threads[0] = (size_t)n_elems; - err = clEnqueueNDRangeKernel( queue, kernel, 1, NULL, threads, wg_size, 0, NULL, NULL ); - if (err != CL_SUCCESS) - { - log_error("clEnqueueNDRangeKernel failed\n"); - return -1; - } - - cl_uint dead = 0xdeaddead; - memset_pattern4(output_ptr, &dead, sizeof(cl_uint)*num_elements); - err = clEnqueueReadBuffer( queue, streams[1], true, 0, sizeof(cl_uint)*num_elements, (void *)output_ptr, 0, NULL, NULL ); - if (err != CL_SUCCESS) - { - log_error("clEnqueueReadBuffer failed\n"); - return -1; - } - - if (verify_wg_reduce_min_uint(input_ptr[0], output_ptr, num_elements, wg_size[0])) - { - log_error("work_group_reduce_min uint failed\n"); - return -1; - } - log_info("work_group_reduce_min uint passed\n"); - - clReleaseMemObject(streams[0]); - clReleaseMemObject(streams[1]); - clReleaseKernel(kernel); - clReleaseProgram(program); - free(input_ptr[0]); - free(output_ptr); - - return err; -} - -int -test_work_group_reduce_min_long(cl_device_id device, cl_context context, cl_command_queue queue, int n_elems) -{ - cl_mem streams[2]; - cl_long *input_ptr[1], *p; - cl_long *output_ptr; - cl_program program; - cl_kernel kernel; - void *values[2]; - size_t threads[1]; - size_t wg_size[1]; - size_t wg_sizes_per_dimension[3]; - size_t num_elements; - int err; - int i; - MTdata d; - - err = create_single_kernel_helper(context, &program, &kernel, 1, - &wg_reduce_min_kernel_code_long, - "test_wg_reduce_min_long"); - if (err) - return -1; - - err = clGetKernelWorkGroupInfo( kernel, device, CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), wg_size, NULL); - if (err) - return -1; - - err = clGetDeviceInfo(device, CL_DEVICE_MAX_WORK_ITEM_SIZES, sizeof(size_t) * 3, wg_sizes_per_dimension, NULL); - if (err) - return -1; - if(wg_sizes_per_dimension[0] < wg_size[0]) - { - wg_size[0] = wg_sizes_per_dimension[0]; - } - - num_elements = n_elems; - - input_ptr[0] = (cl_long*)malloc(sizeof(cl_long) * num_elements); - output_ptr = (cl_long*)malloc(sizeof(cl_long) * 
num_elements); - streams[0] = clCreateBuffer(context, CL_MEM_READ_WRITE, - sizeof(cl_long) * num_elements, NULL, NULL); - if (!streams[0]) - { - log_error("clCreateBuffer failed\n"); - return -1; - } - - streams[1] = clCreateBuffer(context, CL_MEM_READ_WRITE, - sizeof(cl_long) * num_elements, NULL, NULL); - if (!streams[1]) - { - log_error("clCreateBuffer failed\n"); - return -1; - } - - p = input_ptr[0]; - d = init_genrand( gRandomSeed ); - for (i=0; i<num_elements; i++) - p[i] = genrand_int64(d); - free_mtdata(d); d = NULL; - - err = clEnqueueWriteBuffer( queue, streams[0], true, 0, sizeof(cl_long)*num_elements, (void *)input_ptr[0], 0, NULL, NULL ); - if (err != CL_SUCCESS) - { - log_error("clWriteArray failed\n"); - return -1; - } - - values[0] = streams[0]; - values[1] = streams[1]; - err = clSetKernelArg(kernel, 0, sizeof streams[0], &streams[0] ); - err |= clSetKernelArg(kernel, 1, sizeof streams[1], &streams[1] ); - if (err != CL_SUCCESS) - { - log_error("clSetKernelArgs failed\n"); - return -1; - } - - // Line below is troublesome... 
- threads[0] = (size_t)n_elems; - err = clEnqueueNDRangeKernel( queue, kernel, 1, NULL, threads, wg_size, 0, NULL, NULL ); - if (err != CL_SUCCESS) - { - log_error("clEnqueueNDRangeKernel failed\n"); - return -1; - } - - cl_uint dead = 0xdeaddead; - memset_pattern4(output_ptr, &dead, sizeof(cl_long)*num_elements); - err = clEnqueueReadBuffer( queue, streams[1], true, 0, sizeof(cl_long)*num_elements, (void *)output_ptr, 0, NULL, NULL ); - if (err != CL_SUCCESS) - { - log_error("clEnqueueReadBuffer failed\n"); - return -1; - } - - if (verify_wg_reduce_min_long(input_ptr[0], output_ptr, num_elements, wg_size[0])) - { - log_error("work_group_reduce_min long failed\n"); - return -1; - } - log_info("work_group_reduce_min long passed\n"); - - clReleaseMemObject(streams[0]); - clReleaseMemObject(streams[1]); - clReleaseKernel(kernel); - clReleaseProgram(program); - free(input_ptr[0]); - free(output_ptr); - - return err; -} - - -int -test_work_group_reduce_min_ulong(cl_device_id device, cl_context context, cl_command_queue queue, int n_elems) -{ - cl_mem streams[2]; - cl_ulong *input_ptr[1], *p; - cl_ulong *output_ptr; - cl_program program; - cl_kernel kernel; - void *values[2]; - size_t threads[1]; - size_t wg_size[1]; - size_t wg_sizes_per_dimension[3]; - size_t num_elements; - int err; - int i; - MTdata d; - - err = create_single_kernel_helper(context, &program, &kernel, 1, - &wg_reduce_min_kernel_code_ulong, - "test_wg_reduce_min_ulong"); - if (err) - return -1; - - err = clGetKernelWorkGroupInfo( kernel, device, CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), wg_size, NULL); - if (err) - return -1; - - err = clGetDeviceInfo(device, CL_DEVICE_MAX_WORK_ITEM_SIZES, sizeof(size_t) * 3, wg_sizes_per_dimension, NULL); - if (err) - return -1; - if(wg_sizes_per_dimension[0] < wg_size[0]) - { - wg_size[0] = wg_sizes_per_dimension[0]; - } - - num_elements = n_elems; - - input_ptr[0] = (cl_ulong*)malloc(sizeof(cl_ulong) * num_elements); - output_ptr = 
(cl_ulong*)malloc(sizeof(cl_ulong) * num_elements); - streams[0] = clCreateBuffer(context, CL_MEM_READ_WRITE, - sizeof(cl_ulong) * num_elements, NULL, NULL); - if (!streams[0]) - { - log_error("clCreateBuffer failed\n"); - return -1; - } - - streams[1] = clCreateBuffer(context, CL_MEM_READ_WRITE, - sizeof(cl_ulong) * num_elements, NULL, NULL); - if (!streams[1]) - { - log_error("clCreateBuffer failed\n"); - return -1; - } - - p = input_ptr[0]; - d = init_genrand( gRandomSeed ); - for (i=0; i<num_elements; i++) - p[i] = genrand_int64(d); - free_mtdata(d); d = NULL; - - err = clEnqueueWriteBuffer( queue, streams[0], true, 0, sizeof(cl_ulong)*num_elements, (void *)input_ptr[0], 0, NULL, NULL ); - if (err != CL_SUCCESS) - { - log_error("clWriteArray failed\n"); - return -1; - } - - values[0] = streams[0]; - values[1] = streams[1]; - err = clSetKernelArg(kernel, 0, sizeof streams[0], &streams[0] ); - err |= clSetKernelArg(kernel, 1, sizeof streams[1], &streams[1] ); - if (err != CL_SUCCESS) - { - log_error("clSetKernelArgs failed\n"); - return -1; - } - - // Line below is troublesome... 
- threads[0] = (size_t)n_elems; - err = clEnqueueNDRangeKernel( queue, kernel, 1, NULL, threads, wg_size, 0, NULL, NULL ); - if (err != CL_SUCCESS) - { - log_error("clEnqueueNDRangeKernel failed\n"); - return -1; - } - - cl_uint dead = 0xdeaddead; - memset_pattern4(output_ptr, &dead, sizeof(cl_ulong)*num_elements); - err = clEnqueueReadBuffer( queue, streams[1], true, 0, sizeof(cl_ulong)*num_elements, (void *)output_ptr, 0, NULL, NULL ); - if (err != CL_SUCCESS) - { - log_error("clEnqueueReadBuffer failed\n"); - return -1; - } - - if (verify_wg_reduce_min_ulong(input_ptr[0], output_ptr, num_elements, wg_size[0])) - { - log_error("work_group_reduce_min ulong failed\n"); - return -1; - } - log_info("work_group_reduce_min ulong passed\n"); - - clReleaseMemObject(streams[0]); - clReleaseMemObject(streams[1]); - clReleaseKernel(kernel); - clReleaseProgram(program); - free(input_ptr[0]); - free(output_ptr); - - return err; -} - - -int -test_work_group_reduce_min(cl_device_id device, cl_context context, cl_command_queue queue, int n_elems) -{ - int err; - - err = test_work_group_reduce_min_int(device, context, queue, n_elems); - if (err) return err; - err = test_work_group_reduce_min_uint(device, context, queue, n_elems); - if (err) return err; - err = test_work_group_reduce_min_long(device, context, queue, n_elems); - if (err) return err; - err = test_work_group_reduce_min_ulong(device, context, queue, n_elems); - return err; -} - diff --git a/test_conformance/workgroups/test_wg_scan_exclusive_add.cpp b/test_conformance/workgroups/test_wg_scan_exclusive_add.cpp deleted file mode 100644 index e695a165..00000000 --- a/test_conformance/workgroups/test_wg_scan_exclusive_add.cpp +++ /dev/null @@ -1,604 +0,0 @@ -// -// Copyright (c) 2017 The Khronos Group Inc. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// -#include "harness/compat.h" - -#include <stdio.h> -#include <string.h> -#include <sys/types.h> -#include <sys/stat.h> - -#include "procs.h" - - -const char *wg_scan_exclusive_add_kernel_code_int = -"__kernel void test_wg_scan_exclusive_add_int(global int *input, global int *output)\n" -"{\n" -" int tid = get_global_id(0);\n" -"\n" -" int result = work_group_scan_exclusive_add(input[tid]);\n" -" output[tid] = result;\n" -"}\n"; - - -const char *wg_scan_exclusive_add_kernel_code_uint = -"__kernel void test_wg_scan_exclusive_add_uint(global uint *input, global uint *output)\n" -"{\n" -" int tid = get_global_id(0);\n" -"\n" -" uint result = work_group_scan_exclusive_add(input[tid]);\n" -" output[tid] = result;\n" -"}\n"; - -const char *wg_scan_exclusive_add_kernel_code_long = -"__kernel void test_wg_scan_exclusive_add_long(global long *input, global long *output)\n" -"{\n" -" int tid = get_global_id(0);\n" -"\n" -" long result = work_group_scan_exclusive_add(input[tid]);\n" -" output[tid] = result;\n" -"}\n"; - - -const char *wg_scan_exclusive_add_kernel_code_ulong = -"__kernel void test_wg_scan_exclusive_add_ulong(global ulong *input, global ulong *output)\n" -"{\n" -" int tid = get_global_id(0);\n" -"\n" -" ulong result = work_group_scan_exclusive_add(input[tid]);\n" -" output[tid] = result;\n" -"}\n"; - - -static int -verify_wg_scan_exclusive_add_int(int *inptr, int *outptr, size_t n, size_t wg_size) { - - size_t i, j, m; - int s, lasts; - - - - for (j = 0; j < n; j += wg_size) { - m = n - j; - if (m > wg_size) m = wg_size; - - s = 0; - lasts = 
0; - for (i = 0; i < m; ++i) { - s += inptr[j + i]; - if (outptr[j + i] != lasts) { - log_info("work_group_scan_exclusive_add int: Error at %u: expected = %d, got = %d\n", - (unsigned int)(j + i), lasts, outptr[j + i]); - return -1; - } - lasts = s; - } - } - return 0; -} - -static int -verify_wg_scan_exclusive_add_uint(unsigned int *inptr, unsigned int *outptr, size_t n, size_t wg_size) { - - size_t i, j, m; - unsigned int s, lasts; - - for (j = 0; j < n; j += wg_size) { - m = n - j; - if (m > wg_size) m = wg_size; - s = 0; - lasts = 0; - for (i = 0; i < m; ++i) { - s += inptr[j + i]; - if (outptr[j + i] != lasts) { - log_info("work_group_scan_exclusive_add uint: Error at %u: expected = %u, got = %u\n", - (unsigned int)(j + i), lasts, outptr[j + i]); - return -1; - } - lasts = s; - } - } - - return 0; -} - -static int -verify_wg_scan_exclusive_add_long(cl_long *inptr, cl_long *outptr, size_t n, size_t wg_size) { - - size_t i, j, m; - cl_long s, lasts; - - for (j = 0; j < n; j += wg_size) { - m = n - j; - if (m > wg_size) m = wg_size; - s = 0; - - lasts = 0; - for (i = 0; i < m; ++i) { - s += inptr[j + i]; - - if (outptr[j + i] != lasts) { - log_info("work_group_scan_exclusive_add long: Error at %u: expected = %lld, got = %lld\n", - (unsigned int)(j + i), (long long)lasts, (long long)outptr[j + i]); - return -1; - } - lasts = s; - } - } - - return 0; -} - -static int -verify_wg_scan_exclusive_add_ulong(cl_ulong *inptr, cl_ulong *outptr, size_t n, size_t wg_size) { - - size_t i, j, m; - - cl_ulong s, lasts; - - for (j = 0; j < n; j += wg_size) { - m = n - j; - if (m > wg_size) m = wg_size; - - s = 0; - lasts = 0; - for (i = 0; i < m; ++i) { - s += inptr[j + i]; - if (outptr[j + i] != lasts) { - log_info("work_group_scan_exclusive_add ulong: Error at %u: expected = %llu, got = %llu\n", - (unsigned int)(j + i), (unsigned long long)lasts, (unsigned long long)outptr[j + i]); - return -1; - } - lasts = s; - } - } - return 0; -} - - -int 
-test_work_group_scan_exclusive_add_int(cl_device_id device, cl_context context, cl_command_queue queue, int n_elems) -{ - cl_mem streams[2]; - cl_int *input_ptr[1], *p; - cl_int *output_ptr; - cl_program program; - cl_kernel kernel; - void *values[2]; - size_t threads[1]; - size_t wg_size[1]; - size_t num_elements; - int err; - int i; - MTdata d; - - err = create_single_kernel_helper(context, &program, &kernel, 1, - &wg_scan_exclusive_add_kernel_code_int, - "test_wg_scan_exclusive_add_int"); - if (err) - return -1; - - // "wg_size" is limited to that of the first dimension as only a 1DRange is executed. - err = get_max_allowed_1d_work_group_size_on_device(device, kernel, wg_size); - test_error(err, "get_max_allowed_1d_work_group_size_on_device failed"); - - num_elements = n_elems; - - input_ptr[0] = (cl_int*)malloc(sizeof(cl_int) * num_elements); - output_ptr = (cl_int*)malloc(sizeof(cl_int) * num_elements); - streams[0] = clCreateBuffer(context, CL_MEM_READ_WRITE, - sizeof(cl_int) * num_elements, NULL, NULL); - if (!streams[0]) - { - log_error("clCreateBuffer failed\n"); - return -1; - } - - streams[1] = clCreateBuffer(context, CL_MEM_READ_WRITE, - sizeof(cl_int) * num_elements, NULL, NULL); - if (!streams[1]) - { - log_error("clCreateBuffer failed\n"); - return -1; - } - - p = input_ptr[0]; - d = init_genrand( gRandomSeed ); - for (i=0; i<num_elements; i++) - p[i] = genrand_int32(d); - free_mtdata(d); d = NULL; - - err = clEnqueueWriteBuffer( queue, streams[0], true, 0, sizeof(cl_int) * num_elements, (void *)input_ptr[0], 0, NULL, NULL ); - if (err != CL_SUCCESS) - { - log_error("clWriteArray failed\n"); - return -1; - } - - values[0] = streams[0]; - values[1] = streams[1]; - err = clSetKernelArg(kernel, 0, sizeof streams[0], &streams[0] ); - err |= clSetKernelArg(kernel, 1, sizeof streams[1], &streams[1] ); - if (err != CL_SUCCESS) - { - log_error("clSetKernelArgs failed\n"); - return -1; - } - - // Line below is troublesome... 
- threads[0] = (size_t)num_elements; - err = clEnqueueNDRangeKernel( queue, kernel, 1, NULL, threads, wg_size, 0, NULL, NULL ); - if (err != CL_SUCCESS) - { - log_error("clEnqueueNDRangeKernel failed\n"); - return -1; - } - - cl_uint dead = 0xdeaddead; - memset_pattern4(output_ptr, &dead, sizeof(cl_int)*num_elements); - err = clEnqueueReadBuffer( queue, streams[1], true, 0, sizeof(cl_int)*num_elements, (void *)output_ptr, 0, NULL, NULL ); - if (err != CL_SUCCESS) - { - log_error("clEnqueueReadBuffer failed\n"); - return -1; - } - - if (verify_wg_scan_exclusive_add_int(input_ptr[0], output_ptr, num_elements, wg_size[0])) - { - log_error("work_group_scan_exclusive_add int failed\n"); - return -1; - } - log_info("work_group_scan_exclusive_add int passed\n"); - - clReleaseMemObject(streams[0]); - clReleaseMemObject(streams[1]); - clReleaseKernel(kernel); - clReleaseProgram(program); - free(input_ptr[0]); - free(output_ptr); - - return err; -} - - -int -test_work_group_scan_exclusive_add_uint(cl_device_id device, cl_context context, cl_command_queue queue, int n_elems) -{ - cl_mem streams[2]; - cl_uint *input_ptr[1], *p; - cl_uint *output_ptr; - cl_program program; - cl_kernel kernel; - void *values[2]; - size_t threads[1]; - size_t wg_size[1]; - size_t num_elements; - int err; - int i; - MTdata d; - - err = create_single_kernel_helper(context, &program, &kernel, 1, - &wg_scan_exclusive_add_kernel_code_uint, - "test_wg_scan_exclusive_add_uint"); - if (err) - return -1; - - // "wg_size" is limited to that of the first dimension as only a 1DRange is executed. 
- err = get_max_allowed_1d_work_group_size_on_device(device, kernel, wg_size); - test_error(err, "get_max_allowed_1d_work_group_size_on_device failed"); - - num_elements = n_elems; - - input_ptr[0] = (cl_uint*)malloc(sizeof(cl_uint) * num_elements); - output_ptr = (cl_uint*)malloc(sizeof(cl_uint) * num_elements); - streams[0] = clCreateBuffer(context, CL_MEM_READ_WRITE, - sizeof(cl_uint) * num_elements, NULL, NULL); - if (!streams[0]) - { - log_error("clCreateBuffer failed\n"); - return -1; - } - - streams[1] = clCreateBuffer(context, CL_MEM_READ_WRITE, - sizeof(cl_uint) * num_elements, NULL, NULL); - if (!streams[1]) - { - log_error("clCreateBuffer failed\n"); - return -1; - } - - p = input_ptr[0]; - d = init_genrand( gRandomSeed ); - for (i=0; i<num_elements; i++) - p[i] = genrand_int32(d); - free_mtdata(d); d = NULL; - - err = clEnqueueWriteBuffer( queue, streams[0], true, 0, sizeof(cl_uint)*num_elements, (void *)input_ptr[0], 0, NULL, NULL ); - if (err != CL_SUCCESS) - { - log_error("clWriteArray failed\n"); - return -1; - } - - values[0] = streams[0]; - values[1] = streams[1]; - err = clSetKernelArg(kernel, 0, sizeof streams[0], &streams[0] ); - err |= clSetKernelArg(kernel, 1, sizeof streams[1], &streams[1] ); - if (err != CL_SUCCESS) - { - log_error("clSetKernelArgs failed\n"); - return -1; - } - - // Line below is troublesome... 
- threads[0] = (size_t)n_elems; - err = clEnqueueNDRangeKernel( queue, kernel, 1, NULL, threads, wg_size, 0, NULL, NULL ); - if (err != CL_SUCCESS) - { - log_error("clEnqueueNDRangeKernel failed\n"); - return -1; - } - - cl_uint dead = 0xdeaddead; - memset_pattern4(output_ptr, &dead, sizeof(cl_uint)*num_elements); - err = clEnqueueReadBuffer( queue, streams[1], true, 0, sizeof(cl_uint)*num_elements, (void *)output_ptr, 0, NULL, NULL ); - if (err != CL_SUCCESS) - { - log_error("clEnqueueReadBuffer failed\n"); - return -1; - } - - if (verify_wg_scan_exclusive_add_uint(input_ptr[0], output_ptr, num_elements, wg_size[0])) - { - log_error("work_group_scan_exclusive_add uint failed\n"); - return -1; - } - log_info("work_group_scan_exclusive_add uint passed\n"); - - clReleaseMemObject(streams[0]); - clReleaseMemObject(streams[1]); - clReleaseKernel(kernel); - clReleaseProgram(program); - free(input_ptr[0]); - free(output_ptr); - - return err; -} - -int -test_work_group_scan_exclusive_add_long(cl_device_id device, cl_context context, cl_command_queue queue, int n_elems) -{ - cl_mem streams[2]; - cl_long *input_ptr[1], *p; - cl_long *output_ptr; - cl_program program; - cl_kernel kernel; - void *values[2]; - size_t threads[1]; - size_t wg_size[1]; - size_t num_elements; - int err; - int i; - MTdata d; - - err = create_single_kernel_helper(context, &program, &kernel, 1, - &wg_scan_exclusive_add_kernel_code_long, - "test_wg_scan_exclusive_add_long"); - if (err) - return -1; - - // "wg_size" is limited to that of the first dimension as only a 1DRange is executed. 
- err = get_max_allowed_1d_work_group_size_on_device(device, kernel, wg_size); - test_error(err, "get_max_allowed_1d_work_group_size_on_device failed"); - - num_elements = n_elems; - - input_ptr[0] = (cl_long*)malloc(sizeof(cl_long) * num_elements); - output_ptr = (cl_long*)malloc(sizeof(cl_long) * num_elements); - streams[0] = clCreateBuffer(context, CL_MEM_READ_WRITE, - sizeof(cl_long) * num_elements, NULL, NULL); - if (!streams[0]) - { - log_error("clCreateBuffer failed\n"); - return -1; - } - - streams[1] = clCreateBuffer(context, CL_MEM_READ_WRITE, - sizeof(cl_long) * num_elements, NULL, NULL); - if (!streams[1]) - { - log_error("clCreateBuffer failed\n"); - return -1; - } - - p = input_ptr[0]; - d = init_genrand( gRandomSeed ); - for (i=0; i<num_elements; i++) - p[i] = genrand_int64(d); - free_mtdata(d); d = NULL; - - err = clEnqueueWriteBuffer( queue, streams[0], true, 0, sizeof(cl_long)*num_elements, (void *)input_ptr[0], 0, NULL, NULL ); - if (err != CL_SUCCESS) - { - log_error("clWriteArray failed\n"); - return -1; - } - - values[0] = streams[0]; - values[1] = streams[1]; - err = clSetKernelArg(kernel, 0, sizeof streams[0], &streams[0] ); - err |= clSetKernelArg(kernel, 1, sizeof streams[1], &streams[1] ); - if (err != CL_SUCCESS) - { - log_error("clSetKernelArgs failed\n"); - return -1; - } - - // Line below is troublesome... 
- threads[0] = (size_t)n_elems; - err = clEnqueueNDRangeKernel( queue, kernel, 1, NULL, threads, wg_size, 0, NULL, NULL ); - if (err != CL_SUCCESS) - { - log_error("clEnqueueNDRangeKernel failed\n"); - return -1; - } - - cl_uint dead = 0xdeaddead; - memset_pattern4(output_ptr, &dead, sizeof(cl_long)*num_elements); - err = clEnqueueReadBuffer( queue, streams[1], true, 0, sizeof(cl_long)*num_elements, (void *)output_ptr, 0, NULL, NULL ); - if (err != CL_SUCCESS) - { - log_error("clEnqueueReadBuffer failed\n"); - return -1; - } - - if (verify_wg_scan_exclusive_add_long(input_ptr[0], output_ptr, num_elements, wg_size[0])) - { - log_error("work_group_scan_exclusive_add long failed\n"); - return -1; - } - log_info("work_group_scan_exclusive_add long passed\n"); - - clReleaseMemObject(streams[0]); - clReleaseMemObject(streams[1]); - clReleaseKernel(kernel); - clReleaseProgram(program); - free(input_ptr[0]); - free(output_ptr); - - return err; -} - - -int -test_work_group_scan_exclusive_add_ulong(cl_device_id device, cl_context context, cl_command_queue queue, int n_elems) -{ - cl_mem streams[2]; - cl_ulong *input_ptr[1], *p; - cl_ulong *output_ptr; - cl_program program; - cl_kernel kernel; - void *values[2]; - size_t threads[1]; - size_t wg_size[1]; - size_t num_elements; - int err; - int i; - MTdata d; - - err = create_single_kernel_helper(context, &program, &kernel, 1, - &wg_scan_exclusive_add_kernel_code_ulong, - "test_wg_scan_exclusive_add_ulong"); - if (err) - return -1; - - // "wg_size" is limited to that of the first dimension as only a 1DRange is executed. 
- err = get_max_allowed_1d_work_group_size_on_device(device, kernel, wg_size); - test_error(err, "get_max_allowed_1d_work_group_size_on_device failed"); - - num_elements = n_elems; - - input_ptr[0] = (cl_ulong*)malloc(sizeof(cl_ulong) * num_elements); - output_ptr = (cl_ulong*)malloc(sizeof(cl_ulong) * num_elements); - streams[0] = clCreateBuffer(context, CL_MEM_READ_WRITE, - sizeof(cl_ulong) * num_elements, NULL, NULL); - if (!streams[0]) - { - log_error("clCreateBuffer failed\n"); - return -1; - } - - streams[1] = clCreateBuffer(context, CL_MEM_READ_WRITE, - sizeof(cl_ulong) * num_elements, NULL, NULL); - if (!streams[1]) - { - log_error("clCreateBuffer failed\n"); - return -1; - } - - p = input_ptr[0]; - d = init_genrand( gRandomSeed ); - for (i=0; i<num_elements; i++) - p[i] = genrand_int64(d); - free_mtdata(d); d = NULL; - - err = clEnqueueWriteBuffer( queue, streams[0], true, 0, sizeof(cl_ulong)*num_elements, (void *)input_ptr[0], 0, NULL, NULL ); - if (err != CL_SUCCESS) - { - log_error("clWriteArray failed\n"); - return -1; - } - - values[0] = streams[0]; - values[1] = streams[1]; - err = clSetKernelArg(kernel, 0, sizeof streams[0], &streams[0] ); - err |= clSetKernelArg(kernel, 1, sizeof streams[1], &streams[1] ); - if (err != CL_SUCCESS) - { - log_error("clSetKernelArgs failed\n"); - return -1; - } - - // Line below is troublesome... 
- threads[0] = (size_t)n_elems; - err = clEnqueueNDRangeKernel( queue, kernel, 1, NULL, threads, wg_size, 0, NULL, NULL ); - if (err != CL_SUCCESS) - { - log_error("clEnqueueNDRangeKernel failed\n"); - return -1; - } - - cl_uint dead = 0xdeaddead; - memset_pattern4(output_ptr, &dead, sizeof(cl_ulong)*num_elements); - err = clEnqueueReadBuffer( queue, streams[1], true, 0, sizeof(cl_ulong)*num_elements, (void *)output_ptr, 0, NULL, NULL ); - if (err != CL_SUCCESS) - { - log_error("clEnqueueReadBuffer failed\n"); - return -1; - } - - if (verify_wg_scan_exclusive_add_ulong(input_ptr[0], output_ptr, num_elements, wg_size[0])) - { - log_error("work_group_scan_exclusiveadd ulong failed\n"); - return -1; - } - log_info("work_group_scan_exclusive_add ulong passed\n"); - - clReleaseMemObject(streams[0]); - clReleaseMemObject(streams[1]); - clReleaseKernel(kernel); - clReleaseProgram(program); - free(input_ptr[0]); - free(output_ptr); - - return err; -} - - -int -test_work_group_scan_exclusive_add(cl_device_id device, cl_context context, cl_command_queue queue, int n_elems) -{ - int err; - - err = test_work_group_scan_exclusive_add_int(device, context, queue, n_elems); - if (err) return err; - err = test_work_group_scan_exclusive_add_uint(device, context, queue, n_elems); - if (err) return err; - err = test_work_group_scan_exclusive_add_long(device, context, queue, n_elems); - if (err) return err; - err = test_work_group_scan_exclusive_add_ulong(device, context, queue, n_elems); - return err; -} - diff --git a/test_conformance/workgroups/test_wg_scan_exclusive_max.cpp b/test_conformance/workgroups/test_wg_scan_exclusive_max.cpp deleted file mode 100644 index 12338b68..00000000 --- a/test_conformance/workgroups/test_wg_scan_exclusive_max.cpp +++ /dev/null @@ -1,631 +0,0 @@ -// -// Copyright (c) 2017 The Khronos Group Inc. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// -#include "harness/compat.h" - -#include <stdio.h> -#include <string.h> -#include <sys/types.h> -#include <sys/stat.h> - -#include "procs.h" - - -const char *wg_scan_exclusive_max_kernel_code_int = -"__kernel void test_wg_scan_exclusive_max_int(global int *input, global int *output)\n" -"{\n" -" int tid = get_global_id(0);\n" -"\n" -" int result = work_group_scan_exclusive_max(input[tid]);\n" -" output[tid] = result;\n" -"}\n"; - - -const char *wg_scan_exclusive_max_kernel_code_uint = -"__kernel void test_wg_scan_exclusive_max_uint(global uint *input, global uint *output)\n" -"{\n" -" int tid = get_global_id(0);\n" -"\n" -" uint result = work_group_scan_exclusive_max(input[tid]);\n" -" output[tid] = result;\n" -"}\n"; - -const char *wg_scan_exclusive_max_kernel_code_long = -"__kernel void test_wg_scan_exclusive_max_long(global long *input, global long *output)\n" -"{\n" -" int tid = get_global_id(0);\n" -"\n" -" long result = work_group_scan_exclusive_max(input[tid]);\n" -" output[tid] = result;\n" -"}\n"; - - -const char *wg_scan_exclusive_max_kernel_code_ulong = -"__kernel void test_wg_scan_exclusive_max_ulong(global ulong *input, global ulong *output)\n" -"{\n" -" int tid = get_global_id(0);\n" -"\n" -" ulong result = work_group_scan_exclusive_max(input[tid]);\n" -" output[tid] = result;\n" -"}\n"; - - -static int -verify_wg_scan_exclusive_max_int(int *inptr, int *outptr, size_t n, size_t wg_size) { - - size_t i, j, m; - - for (j=0; j<n; j+=wg_size) { - int max_ = 0x80000000; - - m = n - j; - if (m > wg_size) - m = wg_size; - - for (i = 0; i 
< m; ++i) { - if (outptr[j+i] != max_) { - log_info("work_group_scan_exclusive_max int: Error at %u: expected = %d, got = %d\n", (unsigned int)(j+i), max_, outptr[j+i]); - return -1; - } - max_ = MAX(inptr[j+i], max_); - } - } - - return 0; -} - -static int -verify_wg_scan_exclusive_max_uint(unsigned int *inptr, unsigned int *outptr, size_t n, size_t wg_size) { - - size_t i, j, m; - - for (j=0; j<n; j+=wg_size) { - unsigned int max_ = 0x0; - - m = n - j; - if (m > wg_size) - m = wg_size; - - for (i = 0; i < m; ++i) { - if (outptr[j+i] != max_) { - log_info("work_group_scan_exclusive_max int: Error at %u: expected = %u, got = %u\n", (unsigned int)(j+i), max_, outptr[j+i]); - return -1; - } - max_ = MAX(inptr[j+i], max_); - } - } - - return 0; -} - -static int -verify_wg_scan_exclusive_max_long(cl_long *inptr, cl_long *outptr, size_t n, size_t wg_size) { - - size_t i, j, m; - - for (j=0; j<n; j+=wg_size) { - cl_long max_ = 0x8000000000000000ULL; - - m = n - j; - if (m > wg_size) - m = wg_size; - - for (i = 0; i < m; ++i) { - if (outptr[j+i] != max_) { - log_info("work_group_scan_exclusive_max long: Error at %u: expected = %lld, got = %lld\n", (unsigned int)(j+i), max_, outptr[j+i]); - return -1; - } - max_ = MAX(inptr[j+i], max_); - } - } - - return 0; -} - -static int -verify_wg_scan_exclusive_max_ulong(cl_ulong *inptr, cl_ulong *outptr, size_t n, size_t wg_size) { - - size_t i, j, m; - - for (j=0; j<n; j+=wg_size) { - cl_ulong max_ = 0x0; - - m = n - j; - if (m > wg_size) - m = wg_size; - - for (i = 0; i < m; ++i) { - if (outptr[j+i] != max_) { - log_info("work_group_scan_exclusive_max ulong: Error at %u: expected = %llu, got = %llu\n", (unsigned int)(j+i), max_, outptr[j+i]); - return -1; - } - max_ = MAX(inptr[j+i], max_); - } - } - - return 0; -} - - -int -test_work_group_scan_exclusive_max_int(cl_device_id device, cl_context context, cl_command_queue queue, int n_elems) -{ - cl_mem streams[2]; - cl_int *input_ptr[1], *p; - cl_int *output_ptr; - cl_program 
program; - cl_kernel kernel; - void *values[2]; - size_t threads[1]; - size_t wg_size[1]; - size_t wg_sizes_per_dimension[3]; - size_t num_elements; - int err; - int i; - MTdata d; - - err = create_single_kernel_helper(context, &program, &kernel, 1, - &wg_scan_exclusive_max_kernel_code_int, - "test_wg_scan_exclusive_max_int"); - if (err) - return -1; - - err = clGetKernelWorkGroupInfo( kernel, device, CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), wg_size, NULL); - if (err) - return -1; - - err = clGetDeviceInfo(device, CL_DEVICE_MAX_WORK_ITEM_SIZES, sizeof(size_t) * 3, wg_sizes_per_dimension, NULL); - if (err) - return -1; - if(wg_sizes_per_dimension[0] < wg_size[0]) - { - wg_size[0] = wg_sizes_per_dimension[0]; - } - - num_elements = n_elems; - - input_ptr[0] = (cl_int*)malloc(sizeof(cl_int) * num_elements); - output_ptr = (cl_int*)malloc(sizeof(cl_int) * num_elements); - streams[0] = clCreateBuffer(context, CL_MEM_READ_WRITE, - sizeof(cl_int) * num_elements, NULL, NULL); - if (!streams[0]) - { - log_error("clCreateBuffer failed\n"); - return -1; - } - - streams[1] = clCreateBuffer(context, CL_MEM_READ_WRITE, - sizeof(cl_int) * num_elements, NULL, NULL); - if (!streams[1]) - { - log_error("clCreateBuffer failed\n"); - return -1; - } - - p = input_ptr[0]; - d = init_genrand( gRandomSeed ); - for (i=0; i<num_elements; i++) - p[i] = genrand_int32(d); - free_mtdata(d); d = NULL; - - err = clEnqueueWriteBuffer( queue, streams[0], true, 0, sizeof(cl_int) * num_elements, (void *)input_ptr[0], 0, NULL, NULL ); - if (err != CL_SUCCESS) - { - log_error("clWriteArray failed\n"); - return -1; - } - - values[0] = streams[0]; - values[1] = streams[1]; - err = clSetKernelArg(kernel, 0, sizeof streams[0], &streams[0] ); - err |= clSetKernelArg(kernel, 1, sizeof streams[1], &streams[1] ); - if (err != CL_SUCCESS) - { - log_error("clSetKernelArgs failed\n"); - return -1; - } - - // Line below is troublesome... 
- threads[0] = (size_t)num_elements; - err = clEnqueueNDRangeKernel( queue, kernel, 1, NULL, threads, wg_size, 0, NULL, NULL ); - if (err != CL_SUCCESS) - { - log_error("clEnqueueNDRangeKernel failed\n"); - return -1; - } - - cl_uint dead = 0xdeaddead; - memset_pattern4(output_ptr, &dead, sizeof(cl_int)*num_elements); - err = clEnqueueReadBuffer( queue, streams[1], true, 0, sizeof(cl_int)*num_elements, (void *)output_ptr, 0, NULL, NULL ); - if (err != CL_SUCCESS) - { - log_error("clEnqueueReadBuffer failed\n"); - return -1; - } - - if (verify_wg_scan_exclusive_max_int(input_ptr[0], output_ptr, num_elements, wg_size[0])) - { - log_error("work_group_scan_exclusive_max int failed\n"); - return -1; - } - log_info("work_group_scan_exclusive_max int passed\n"); - - clReleaseMemObject(streams[0]); - clReleaseMemObject(streams[1]); - clReleaseKernel(kernel); - clReleaseProgram(program); - free(input_ptr[0]); - free(output_ptr); - - return err; -} - - -int -test_work_group_scan_exclusive_max_uint(cl_device_id device, cl_context context, cl_command_queue queue, int n_elems) -{ - cl_mem streams[2]; - cl_uint *input_ptr[1], *p; - cl_uint *output_ptr; - cl_program program; - cl_kernel kernel; - void *values[2]; - size_t threads[1]; - size_t wg_size[1]; - size_t wg_sizes_per_dimension[3]; - size_t num_elements; - int err; - int i; - MTdata d; - - err = create_single_kernel_helper(context, &program, &kernel, 1, - &wg_scan_exclusive_max_kernel_code_uint, - "test_wg_scan_exclusive_max_uint"); - if (err) - return -1; - - err = clGetKernelWorkGroupInfo( kernel, device, CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), wg_size, NULL); - if (err) - return -1; - - err = clGetDeviceInfo(device, CL_DEVICE_MAX_WORK_ITEM_SIZES, sizeof(size_t) * 3, wg_sizes_per_dimension, NULL); - if (err) - return -1; - if(wg_sizes_per_dimension[0] < wg_size[0]) - { - wg_size[0] = wg_sizes_per_dimension[0]; - } - - num_elements = n_elems; - - input_ptr[0] = (cl_uint*)malloc(sizeof(cl_uint) * num_elements); - 
output_ptr = (cl_uint*)malloc(sizeof(cl_uint) * num_elements); - streams[0] = clCreateBuffer(context, CL_MEM_READ_WRITE, - sizeof(cl_uint) * num_elements, NULL, NULL); - if (!streams[0]) - { - log_error("clCreateBuffer failed\n"); - return -1; - } - - streams[1] = clCreateBuffer(context, CL_MEM_READ_WRITE, - sizeof(cl_uint) * num_elements, NULL, NULL); - if (!streams[1]) - { - log_error("clCreateBuffer failed\n"); - return -1; - } - - p = input_ptr[0]; - d = init_genrand( gRandomSeed ); - for (i=0; i<num_elements; i++) - p[i] = genrand_int32(d); - free_mtdata(d); d = NULL; - - err = clEnqueueWriteBuffer( queue, streams[0], true, 0, sizeof(cl_uint)*num_elements, (void *)input_ptr[0], 0, NULL, NULL ); - if (err != CL_SUCCESS) - { - log_error("clWriteArray failed\n"); - return -1; - } - - values[0] = streams[0]; - values[1] = streams[1]; - err = clSetKernelArg(kernel, 0, sizeof streams[0], &streams[0] ); - err |= clSetKernelArg(kernel, 1, sizeof streams[1], &streams[1] ); - if (err != CL_SUCCESS) - { - log_error("clSetKernelArgs failed\n"); - return -1; - } - - // Line below is troublesome... 
- threads[0] = (size_t)n_elems; - err = clEnqueueNDRangeKernel( queue, kernel, 1, NULL, threads, wg_size, 0, NULL, NULL ); - if (err != CL_SUCCESS) - { - log_error("clEnqueueNDRangeKernel failed\n"); - return -1; - } - - cl_uint dead = 0xdeaddead; - memset_pattern4(output_ptr, &dead, sizeof(cl_uint)*num_elements); - err = clEnqueueReadBuffer( queue, streams[1], true, 0, sizeof(cl_uint)*num_elements, (void *)output_ptr, 0, NULL, NULL ); - if (err != CL_SUCCESS) - { - log_error("clEnqueueReadBuffer failed\n"); - return -1; - } - - if (verify_wg_scan_exclusive_max_uint(input_ptr[0], output_ptr, num_elements, wg_size[0])) - { - log_error("work_group_scan_exclusive_max uint failed\n"); - return -1; - } - log_info("work_group_scan_exclusive_max uint passed\n"); - - clReleaseMemObject(streams[0]); - clReleaseMemObject(streams[1]); - clReleaseKernel(kernel); - clReleaseProgram(program); - free(input_ptr[0]); - free(output_ptr); - - return err; -} - -int -test_work_group_scan_exclusive_max_long(cl_device_id device, cl_context context, cl_command_queue queue, int n_elems) -{ - cl_mem streams[2]; - cl_long *input_ptr[1], *p; - cl_long *output_ptr; - cl_program program; - cl_kernel kernel; - void *values[2]; - size_t threads[1]; - size_t wg_size[1]; - size_t wg_sizes_per_dimension[3]; - size_t num_elements; - int err; - int i; - MTdata d; - - err = create_single_kernel_helper(context, &program, &kernel, 1, - &wg_scan_exclusive_max_kernel_code_long, - "test_wg_scan_exclusive_max_long"); - if (err) - return -1; - - err = clGetKernelWorkGroupInfo( kernel, device, CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), wg_size, NULL); - if (err) - return -1; - - err = clGetDeviceInfo(device, CL_DEVICE_MAX_WORK_ITEM_SIZES, sizeof(size_t) * 3, wg_sizes_per_dimension, NULL); - if (err) - return -1; - if(wg_sizes_per_dimension[0] < wg_size[0]) - { - wg_size[0] = wg_sizes_per_dimension[0]; - } - - num_elements = n_elems; - - input_ptr[0] = (cl_long*)malloc(sizeof(cl_long) * num_elements); - 
output_ptr = (cl_long*)malloc(sizeof(cl_long) * num_elements); - streams[0] = clCreateBuffer(context, CL_MEM_READ_WRITE, - sizeof(cl_long) * num_elements, NULL, NULL); - if (!streams[0]) - { - log_error("clCreateBuffer failed\n"); - return -1; - } - - streams[1] = clCreateBuffer(context, CL_MEM_READ_WRITE, - sizeof(cl_long) * num_elements, NULL, NULL); - if (!streams[1]) - { - log_error("clCreateBuffer failed\n"); - return -1; - } - - p = input_ptr[0]; - d = init_genrand( gRandomSeed ); - for (i=0; i<num_elements; i++) - p[i] = genrand_int64(d); - free_mtdata(d); d = NULL; - - err = clEnqueueWriteBuffer( queue, streams[0], true, 0, sizeof(cl_long)*num_elements, (void *)input_ptr[0], 0, NULL, NULL ); - if (err != CL_SUCCESS) - { - log_error("clWriteArray failed\n"); - return -1; - } - - values[0] = streams[0]; - values[1] = streams[1]; - err = clSetKernelArg(kernel, 0, sizeof streams[0], &streams[0] ); - err |= clSetKernelArg(kernel, 1, sizeof streams[1], &streams[1] ); - if (err != CL_SUCCESS) - { - log_error("clSetKernelArgs failed\n"); - return -1; - } - - // Line below is troublesome... 
- threads[0] = (size_t)n_elems; - err = clEnqueueNDRangeKernel( queue, kernel, 1, NULL, threads, wg_size, 0, NULL, NULL ); - if (err != CL_SUCCESS) - { - log_error("clEnqueueNDRangeKernel failed\n"); - return -1; - } - - cl_uint dead = 0xdeaddead; - memset_pattern4(output_ptr, &dead, sizeof(cl_long)*num_elements); - err = clEnqueueReadBuffer( queue, streams[1], true, 0, sizeof(cl_long)*num_elements, (void *)output_ptr, 0, NULL, NULL ); - if (err != CL_SUCCESS) - { - log_error("clEnqueueReadBuffer failed\n"); - return -1; - } - - if (verify_wg_scan_exclusive_max_long(input_ptr[0], output_ptr, num_elements, wg_size[0])) - { - log_error("work_group_scan_exclusive_max long failed\n"); - return -1; - } - log_info("work_group_scan_exclusive_max long passed\n"); - - clReleaseMemObject(streams[0]); - clReleaseMemObject(streams[1]); - clReleaseKernel(kernel); - clReleaseProgram(program); - free(input_ptr[0]); - free(output_ptr); - - return err; -} - - -int -test_work_group_scan_exclusive_max_ulong(cl_device_id device, cl_context context, cl_command_queue queue, int n_elems) -{ - cl_mem streams[2]; - cl_ulong *input_ptr[1], *p; - cl_ulong *output_ptr; - cl_program program; - cl_kernel kernel; - void *values[2]; - size_t threads[1]; - size_t wg_size[1]; - size_t wg_sizes_per_dimension[3]; - size_t num_elements; - int err; - int i; - MTdata d; - - err = create_single_kernel_helper(context, &program, &kernel, 1, - &wg_scan_exclusive_max_kernel_code_ulong, - "test_wg_scan_exclusive_max_ulong"); - if (err) - return -1; - - err = clGetKernelWorkGroupInfo( kernel, device, CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), wg_size, NULL); - if (err) - return -1; - - err = clGetDeviceInfo(device, CL_DEVICE_MAX_WORK_ITEM_SIZES, sizeof(size_t) * 3, wg_sizes_per_dimension, NULL); - if (err) - return -1; - if(wg_sizes_per_dimension[0] < wg_size[0]) - { - wg_size[0] = wg_sizes_per_dimension[0]; - } - - num_elements = n_elems; - - input_ptr[0] = (cl_ulong*)malloc(sizeof(cl_ulong) * num_elements); 
- output_ptr = (cl_ulong*)malloc(sizeof(cl_ulong) * num_elements); - streams[0] = clCreateBuffer(context, CL_MEM_READ_WRITE, - sizeof(cl_ulong) * num_elements, NULL, NULL); - if (!streams[0]) - { - log_error("clCreateBuffer failed\n"); - return -1; - } - - streams[1] = clCreateBuffer(context, CL_MEM_READ_WRITE, - sizeof(cl_ulong) * num_elements, NULL, NULL); - if (!streams[1]) - { - log_error("clCreateBuffer failed\n"); - return -1; - } - - p = input_ptr[0]; - d = init_genrand( gRandomSeed ); - for (i=0; i<num_elements; i++) - p[i] = genrand_int64(d); - free_mtdata(d); d = NULL; - - err = clEnqueueWriteBuffer( queue, streams[0], true, 0, sizeof(cl_ulong)*num_elements, (void *)input_ptr[0], 0, NULL, NULL ); - if (err != CL_SUCCESS) - { - log_error("clWriteArray failed\n"); - return -1; - } - - values[0] = streams[0]; - values[1] = streams[1]; - err = clSetKernelArg(kernel, 0, sizeof streams[0], &streams[0] ); - err |= clSetKernelArg(kernel, 1, sizeof streams[1], &streams[1] ); - if (err != CL_SUCCESS) - { - log_error("clSetKernelArgs failed\n"); - return -1; - } - - // Line below is troublesome... 
- threads[0] = (size_t)n_elems; - err = clEnqueueNDRangeKernel( queue, kernel, 1, NULL, threads, wg_size, 0, NULL, NULL ); - if (err != CL_SUCCESS) - { - log_error("clEnqueueNDRangeKernel failed\n"); - return -1; - } - - cl_uint dead = 0xdeaddead; - memset_pattern4(output_ptr, &dead, sizeof(cl_ulong)*num_elements); - err = clEnqueueReadBuffer( queue, streams[1], true, 0, sizeof(cl_ulong)*num_elements, (void *)output_ptr, 0, NULL, NULL ); - if (err != CL_SUCCESS) - { - log_error("clEnqueueReadBuffer failed\n"); - return -1; - } - - if (verify_wg_scan_exclusive_max_ulong(input_ptr[0], output_ptr, num_elements, wg_size[0])) - { - log_error("work_group_scan_exclusiveadd ulong failed\n"); - return -1; - } - log_info("work_group_scan_exclusive_max ulong passed\n"); - - clReleaseMemObject(streams[0]); - clReleaseMemObject(streams[1]); - clReleaseKernel(kernel); - clReleaseProgram(program); - free(input_ptr[0]); - free(output_ptr); - - return err; -} - - -int -test_work_group_scan_exclusive_max(cl_device_id device, cl_context context, cl_command_queue queue, int n_elems) -{ - int err; - - err = test_work_group_scan_exclusive_max_int(device, context, queue, n_elems); - if (err) return err; - err = test_work_group_scan_exclusive_max_uint(device, context, queue, n_elems); - if (err) return err; - err = test_work_group_scan_exclusive_max_long(device, context, queue, n_elems); - if (err) return err; - err = test_work_group_scan_exclusive_max_ulong(device, context, queue, n_elems); - return err; -} - diff --git a/test_conformance/workgroups/test_wg_scan_exclusive_min.cpp b/test_conformance/workgroups/test_wg_scan_exclusive_min.cpp deleted file mode 100644 index f4e6bf97..00000000 --- a/test_conformance/workgroups/test_wg_scan_exclusive_min.cpp +++ /dev/null @@ -1,632 +0,0 @@ -// -// Copyright (c) 2017 The Khronos Group Inc. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// -#include "harness/compat.h" - -#include <stdio.h> -#include <string.h> -#include <sys/types.h> -#include <sys/stat.h> - -#include "procs.h" - - -const char *wg_scan_exclusive_min_kernel_code_int = -"__kernel void test_wg_scan_exclusive_min_int(global int *input, global int *output)\n" -"{\n" -" int tid = get_global_id(0);\n" -"\n" -" int result = work_group_scan_exclusive_min(input[tid]);\n" -" output[tid] = result;\n" -"}\n"; - - -const char *wg_scan_exclusive_min_kernel_code_uint = -"__kernel void test_wg_scan_exclusive_min_uint(global uint *input, global uint *output)\n" -"{\n" -" int tid = get_global_id(0);\n" -"\n" -" uint result = work_group_scan_exclusive_min(input[tid]);\n" -" output[tid] = result;\n" -"}\n"; - -const char *wg_scan_exclusive_min_kernel_code_long = -"__kernel void test_wg_scan_exclusive_min_long(global long *input, global long *output)\n" -"{\n" -" int tid = get_global_id(0);\n" -"\n" -" long result = work_group_scan_exclusive_min(input[tid]);\n" -" output[tid] = result;\n" -"}\n"; - - -const char *wg_scan_exclusive_min_kernel_code_ulong = -"__kernel void test_wg_scan_exclusive_min_ulong(global ulong *input, global ulong *output)\n" -"{\n" -" int tid = get_global_id(0);\n" -"\n" -" ulong result = work_group_scan_exclusive_min(input[tid]);\n" -" output[tid] = result;\n" -"}\n"; - - - -static int -verify_wg_scan_exclusive_min_int(int *inptr, int *outptr, size_t n, size_t wg_size) { - - size_t i, j, m; - - for (j=0; j<n; j+=wg_size) { - int min_ = 0x7fffffff; - - m = n - j; - if (m > wg_size) - m = wg_size; - - for (i = 0; 
i < m; ++i) { - if (outptr[j+i] != min_) { - log_info("work_group_scan_exclusive_min int: Error at %u: expected = %d, got = %d\n", (unsigned int)(j+i), min_, outptr[j+i]); - return -1; - } - min_ = MIN(inptr[j+i], min_); - } - } - - return 0; -} - -static int -verify_wg_scan_exclusive_min_uint(unsigned int *inptr, unsigned int *outptr, size_t n, size_t wg_size) { - - size_t i, j, m; - - for (j=0; j<n; j+=wg_size) { - unsigned int min_ = 0xffffffff; - - m = n - j; - if (m > wg_size) - m = wg_size; - - for (i = 0; i < m; ++i) { - if (outptr[j+i] != min_) { - log_info("work_group_scan_exclusive_min int: Error at %u: expected = %u, got = %u\n", j+i, min_, outptr[j+i]); - return -1; - } - min_ = MIN(inptr[j+i], min_); - } - } - - return 0; -} - -static int -verify_wg_scan_exclusive_min_long(cl_long *inptr, cl_long *outptr, size_t n, size_t wg_size) { - - size_t i, j, m; - - for (j=0; j<n; j+=wg_size) { - cl_long min_ = 0x7fffffffffffffffULL; - - m = n - j; - if (m > wg_size) - m = wg_size; - - for (i = 0; i < m; ++i) { - if (outptr[j+i] != min_) { - log_info("work_group_scan_exclusive_min long: Error at %u: expected = %lld, got = %lld\n", (unsigned int)(j+i), min_, outptr[j+i]); - return -1; - } - min_ = MIN(inptr[j+i], min_); - } - } - - return 0; -} - -static int -verify_wg_scan_exclusive_min_ulong(cl_ulong *inptr, cl_ulong *outptr, size_t n, size_t wg_size) { - - size_t i, j, m; - - for (j=0; j<n; j+=wg_size) { - cl_ulong min_ = 0xffffffffffffffffULL; - - m = n - j; - if (m > wg_size) - m = wg_size; - - for (i = 0; i < m; ++i) { - if (outptr[j+i] != min_) { - log_info("work_group_scan_exclusive_min ulong: Error at %u: expected = %llu, got = %llu\n", (unsigned int)(j+i), min_, outptr[j+i]); - return -1; - } - min_ = MIN(inptr[j+i], min_); - } - } - - return 0; -} - - -int -test_work_group_scan_exclusive_min_int(cl_device_id device, cl_context context, cl_command_queue queue, int n_elems) -{ - cl_mem streams[2]; - cl_int *input_ptr[1], *p; - cl_int *output_ptr; - 
cl_program program; - cl_kernel kernel; - void *values[2]; - size_t threads[1]; - size_t wg_size[1]; - size_t wg_sizes_per_dimension[3]; - size_t num_elements; - int err; - int i; - MTdata d; - - err = create_single_kernel_helper(context, &program, &kernel, 1, - &wg_scan_exclusive_min_kernel_code_int, - "test_wg_scan_exclusive_min_int"); - if (err) - return -1; - - err = clGetKernelWorkGroupInfo( kernel, device, CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), wg_size, NULL); - if (err) - return -1; - - err = clGetDeviceInfo(device, CL_DEVICE_MAX_WORK_ITEM_SIZES, sizeof(size_t) * 3, wg_sizes_per_dimension, NULL); - if (err) - return -1; - if(wg_sizes_per_dimension[0] < wg_size[0]) - { - wg_size[0] = wg_sizes_per_dimension[0]; - } - - num_elements = n_elems; - - input_ptr[0] = (cl_int*)malloc(sizeof(cl_int) * num_elements); - output_ptr = (cl_int*)malloc(sizeof(cl_int) * num_elements); - streams[0] = clCreateBuffer(context, CL_MEM_READ_WRITE, - sizeof(cl_int) * num_elements, NULL, NULL); - if (!streams[0]) - { - log_error("clCreateBuffer failed\n"); - return -1; - } - - streams[1] = clCreateBuffer(context, CL_MEM_READ_WRITE, - sizeof(cl_int) * num_elements, NULL, NULL); - if (!streams[1]) - { - log_error("clCreateBuffer failed\n"); - return -1; - } - - p = input_ptr[0]; - d = init_genrand( gRandomSeed ); - for (i=0; i<num_elements; i++) - p[i] = genrand_int32(d); - free_mtdata(d); d = NULL; - - err = clEnqueueWriteBuffer( queue, streams[0], true, 0, sizeof(cl_int) * num_elements, (void *)input_ptr[0], 0, NULL, NULL ); - if (err != CL_SUCCESS) - { - log_error("clWriteArray failed\n"); - return -1; - } - - values[0] = streams[0]; - values[1] = streams[1]; - err = clSetKernelArg(kernel, 0, sizeof streams[0], &streams[0] ); - err |= clSetKernelArg(kernel, 1, sizeof streams[1], &streams[1] ); - if (err != CL_SUCCESS) - { - log_error("clSetKernelArgs failed\n"); - return -1; - } - - // Line below is troublesome... 
- threads[0] = (size_t)num_elements; - err = clEnqueueNDRangeKernel( queue, kernel, 1, NULL, threads, wg_size, 0, NULL, NULL ); - if (err != CL_SUCCESS) - { - log_error("clEnqueueNDRangeKernel failed\n"); - return -1; - } - - cl_uint dead = 0xdeaddead; - memset_pattern4(output_ptr, &dead, sizeof(cl_int)*num_elements); - err = clEnqueueReadBuffer( queue, streams[1], true, 0, sizeof(cl_int)*num_elements, (void *)output_ptr, 0, NULL, NULL ); - if (err != CL_SUCCESS) - { - log_error("clEnqueueReadBuffer failed\n"); - return -1; - } - - if (verify_wg_scan_exclusive_min_int(input_ptr[0], output_ptr, num_elements, wg_size[0])) - { - log_error("work_group_scan_exclusive_min int failed\n"); - return -1; - } - log_info("work_group_scan_exclusive_min int passed\n"); - - clReleaseMemObject(streams[0]); - clReleaseMemObject(streams[1]); - clReleaseKernel(kernel); - clReleaseProgram(program); - free(input_ptr[0]); - free(output_ptr); - - return err; -} - - -int -test_work_group_scan_exclusive_min_uint(cl_device_id device, cl_context context, cl_command_queue queue, int n_elems) -{ - cl_mem streams[2]; - cl_uint *input_ptr[1], *p; - cl_uint *output_ptr; - cl_program program; - cl_kernel kernel; - void *values[2]; - size_t threads[1]; - size_t wg_size[1]; - size_t wg_sizes_per_dimension[3]; - size_t num_elements; - int err; - int i; - MTdata d; - - err = create_single_kernel_helper(context, &program, &kernel, 1, - &wg_scan_exclusive_min_kernel_code_uint, - "test_wg_scan_exclusive_min_uint"); - if (err) - return -1; - - err = clGetKernelWorkGroupInfo( kernel, device, CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), wg_size, NULL); - if (err) - return -1; - - err = clGetDeviceInfo(device, CL_DEVICE_MAX_WORK_ITEM_SIZES, sizeof(size_t) * 3, wg_sizes_per_dimension, NULL); - if (err) - return -1; - if(wg_sizes_per_dimension[0] < wg_size[0]) - { - wg_size[0] = wg_sizes_per_dimension[0]; - } - - num_elements = n_elems; - - input_ptr[0] = (cl_uint*)malloc(sizeof(cl_uint) * num_elements); - 
output_ptr = (cl_uint*)malloc(sizeof(cl_uint) * num_elements); - streams[0] = clCreateBuffer(context, CL_MEM_READ_WRITE, - sizeof(cl_uint) * num_elements, NULL, NULL); - if (!streams[0]) - { - log_error("clCreateBuffer failed\n"); - return -1; - } - - streams[1] = clCreateBuffer(context, CL_MEM_READ_WRITE, - sizeof(cl_uint) * num_elements, NULL, NULL); - if (!streams[1]) - { - log_error("clCreateBuffer failed\n"); - return -1; - } - - p = input_ptr[0]; - d = init_genrand( gRandomSeed ); - for (i=0; i<num_elements; i++) - p[i] = genrand_int32(d); - free_mtdata(d); d = NULL; - - err = clEnqueueWriteBuffer( queue, streams[0], true, 0, sizeof(cl_uint)*num_elements, (void *)input_ptr[0], 0, NULL, NULL ); - if (err != CL_SUCCESS) - { - log_error("clWriteArray failed\n"); - return -1; - } - - values[0] = streams[0]; - values[1] = streams[1]; - err = clSetKernelArg(kernel, 0, sizeof streams[0], &streams[0] ); - err |= clSetKernelArg(kernel, 1, sizeof streams[1], &streams[1] ); - if (err != CL_SUCCESS) - { - log_error("clSetKernelArgs failed\n"); - return -1; - } - - // Line below is troublesome... 
- threads[0] = (size_t)n_elems; - err = clEnqueueNDRangeKernel( queue, kernel, 1, NULL, threads, wg_size, 0, NULL, NULL ); - if (err != CL_SUCCESS) - { - log_error("clEnqueueNDRangeKernel failed\n"); - return -1; - } - - cl_uint dead = 0xdeaddead; - memset_pattern4(output_ptr, &dead, sizeof(cl_uint)*num_elements); - err = clEnqueueReadBuffer( queue, streams[1], true, 0, sizeof(cl_uint)*num_elements, (void *)output_ptr, 0, NULL, NULL ); - if (err != CL_SUCCESS) - { - log_error("clEnqueueReadBuffer failed\n"); - return -1; - } - - if (verify_wg_scan_exclusive_min_uint(input_ptr[0], output_ptr, num_elements, wg_size[0])) - { - log_error("work_group_scan_exclusive_min uint failed\n"); - return -1; - } - log_info("work_group_scan_exclusive_min uint passed\n"); - - clReleaseMemObject(streams[0]); - clReleaseMemObject(streams[1]); - clReleaseKernel(kernel); - clReleaseProgram(program); - free(input_ptr[0]); - free(output_ptr); - - return err; -} - -int -test_work_group_scan_exclusive_min_long(cl_device_id device, cl_context context, cl_command_queue queue, int n_elems) -{ - cl_mem streams[2]; - cl_long *input_ptr[1], *p; - cl_long *output_ptr; - cl_program program; - cl_kernel kernel; - void *values[2]; - size_t threads[1]; - size_t wg_size[1]; - size_t wg_sizes_per_dimension[3]; - size_t num_elements; - int err; - int i; - MTdata d; - - err = create_single_kernel_helper(context, &program, &kernel, 1, - &wg_scan_exclusive_min_kernel_code_long, - "test_wg_scan_exclusive_min_long"); - if (err) - return -1; - - err = clGetKernelWorkGroupInfo( kernel, device, CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), wg_size, NULL); - if (err) - return -1; - - err = clGetDeviceInfo(device, CL_DEVICE_MAX_WORK_ITEM_SIZES, sizeof(size_t) * 3, wg_sizes_per_dimension, NULL); - if (err) - return -1; - if(wg_sizes_per_dimension[0] < wg_size[0]) - { - wg_size[0] = wg_sizes_per_dimension[0]; - } - - num_elements = n_elems; - - input_ptr[0] = (cl_long*)malloc(sizeof(cl_long) * num_elements); - 
output_ptr = (cl_long*)malloc(sizeof(cl_long) * num_elements); - streams[0] = clCreateBuffer(context, CL_MEM_READ_WRITE, - sizeof(cl_long) * num_elements, NULL, NULL); - if (!streams[0]) - { - log_error("clCreateBuffer failed\n"); - return -1; - } - - streams[1] = clCreateBuffer(context, CL_MEM_READ_WRITE, - sizeof(cl_long) * num_elements, NULL, NULL); - if (!streams[1]) - { - log_error("clCreateBuffer failed\n"); - return -1; - } - - p = input_ptr[0]; - d = init_genrand( gRandomSeed ); - for (i=0; i<num_elements; i++) - p[i] = genrand_int64(d); - free_mtdata(d); d = NULL; - - err = clEnqueueWriteBuffer( queue, streams[0], true, 0, sizeof(cl_long)*num_elements, (void *)input_ptr[0], 0, NULL, NULL ); - if (err != CL_SUCCESS) - { - log_error("clWriteArray failed\n"); - return -1; - } - - values[0] = streams[0]; - values[1] = streams[1]; - err = clSetKernelArg(kernel, 0, sizeof streams[0], &streams[0] ); - err |= clSetKernelArg(kernel, 1, sizeof streams[1], &streams[1] ); - if (err != CL_SUCCESS) - { - log_error("clSetKernelArgs failed\n"); - return -1; - } - - // Line below is troublesome... 
- threads[0] = (size_t)n_elems; - err = clEnqueueNDRangeKernel( queue, kernel, 1, NULL, threads, wg_size, 0, NULL, NULL ); - if (err != CL_SUCCESS) - { - log_error("clEnqueueNDRangeKernel failed\n"); - return -1; - } - - cl_uint dead = 0xdeaddead; - memset_pattern4(output_ptr, &dead, sizeof(cl_long)*num_elements); - err = clEnqueueReadBuffer( queue, streams[1], true, 0, sizeof(cl_long)*num_elements, (void *)output_ptr, 0, NULL, NULL ); - if (err != CL_SUCCESS) - { - log_error("clEnqueueReadBuffer failed\n"); - return -1; - } - - if (verify_wg_scan_exclusive_min_long(input_ptr[0], output_ptr, num_elements, wg_size[0])) - { - log_error("work_group_scan_exclusive_min long failed\n"); - return -1; - } - log_info("work_group_scan_exclusive_min long passed\n"); - - clReleaseMemObject(streams[0]); - clReleaseMemObject(streams[1]); - clReleaseKernel(kernel); - clReleaseProgram(program); - free(input_ptr[0]); - free(output_ptr); - - return err; -} - - -int -test_work_group_scan_exclusive_min_ulong(cl_device_id device, cl_context context, cl_command_queue queue, int n_elems) -{ - cl_mem streams[2]; - cl_ulong *input_ptr[1], *p; - cl_ulong *output_ptr; - cl_program program; - cl_kernel kernel; - void *values[2]; - size_t threads[1]; - size_t wg_size[1]; - size_t wg_sizes_per_dimension[3]; - size_t num_elements; - int err; - int i; - MTdata d; - - err = create_single_kernel_helper(context, &program, &kernel, 1, - &wg_scan_exclusive_min_kernel_code_ulong, - "test_wg_scan_exclusive_min_ulong"); - if (err) - return -1; - - err = clGetKernelWorkGroupInfo( kernel, device, CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), wg_size, NULL); - if (err) - return -1; - - err = clGetDeviceInfo(device, CL_DEVICE_MAX_WORK_ITEM_SIZES, sizeof(size_t) * 3, wg_sizes_per_dimension, NULL); - if (err) - return -1; - if(wg_sizes_per_dimension[0] < wg_size[0]) - { - wg_size[0] = wg_sizes_per_dimension[0]; - } - - num_elements = n_elems; - - input_ptr[0] = (cl_ulong*)malloc(sizeof(cl_ulong) * num_elements); 
- output_ptr = (cl_ulong*)malloc(sizeof(cl_ulong) * num_elements); - streams[0] = clCreateBuffer(context, CL_MEM_READ_WRITE, - sizeof(cl_ulong) * num_elements, NULL, NULL); - if (!streams[0]) - { - log_error("clCreateBuffer failed\n"); - return -1; - } - - streams[1] = clCreateBuffer(context, CL_MEM_READ_WRITE, - sizeof(cl_ulong) * num_elements, NULL, NULL); - if (!streams[1]) - { - log_error("clCreateBuffer failed\n"); - return -1; - } - - p = input_ptr[0]; - d = init_genrand( gRandomSeed ); - for (i=0; i<num_elements; i++) - p[i] = genrand_int64(d); - free_mtdata(d); d = NULL; - - err = clEnqueueWriteBuffer( queue, streams[0], true, 0, sizeof(cl_ulong)*num_elements, (void *)input_ptr[0], 0, NULL, NULL ); - if (err != CL_SUCCESS) - { - log_error("clWriteArray failed\n"); - return -1; - } - - values[0] = streams[0]; - values[1] = streams[1]; - err = clSetKernelArg(kernel, 0, sizeof streams[0], &streams[0] ); - err |= clSetKernelArg(kernel, 1, sizeof streams[1], &streams[1] ); - if (err != CL_SUCCESS) - { - log_error("clSetKernelArgs failed\n"); - return -1; - } - - // Line below is troublesome... 
- threads[0] = (size_t)n_elems; - err = clEnqueueNDRangeKernel( queue, kernel, 1, NULL, threads, wg_size, 0, NULL, NULL ); - if (err != CL_SUCCESS) - { - log_error("clEnqueueNDRangeKernel failed\n"); - return -1; - } - - cl_uint dead = 0xdeaddead; - memset_pattern4(output_ptr, &dead, sizeof(cl_ulong)*num_elements); - err = clEnqueueReadBuffer( queue, streams[1], true, 0, sizeof(cl_ulong)*num_elements, (void *)output_ptr, 0, NULL, NULL ); - if (err != CL_SUCCESS) - { - log_error("clEnqueueReadBuffer failed\n"); - return -1; - } - - if (verify_wg_scan_exclusive_min_ulong(input_ptr[0], output_ptr, num_elements, wg_size[0])) - { - log_error("work_group_scan_exclusiveadd ulong failed\n"); - return -1; - } - log_info("work_group_scan_exclusive_min ulong passed\n"); - - clReleaseMemObject(streams[0]); - clReleaseMemObject(streams[1]); - clReleaseKernel(kernel); - clReleaseProgram(program); - free(input_ptr[0]); - free(output_ptr); - - return err; -} - - -int -test_work_group_scan_exclusive_min(cl_device_id device, cl_context context, cl_command_queue queue, int n_elems) -{ - int err; - - err = test_work_group_scan_exclusive_min_int(device, context, queue, n_elems); - if (err) return err; - err = test_work_group_scan_exclusive_min_uint(device, context, queue, n_elems); - if (err) return err; - err = test_work_group_scan_exclusive_min_long(device, context, queue, n_elems); - if (err) return err; - err = test_work_group_scan_exclusive_min_ulong(device, context, queue, n_elems); - return err; -} - diff --git a/test_conformance/workgroups/test_wg_scan_inclusive_add.cpp b/test_conformance/workgroups/test_wg_scan_inclusive_add.cpp deleted file mode 100644 index 51c98a4e..00000000 --- a/test_conformance/workgroups/test_wg_scan_inclusive_add.cpp +++ /dev/null @@ -1,593 +0,0 @@ -// -// Copyright (c) 2017 The Khronos Group Inc. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// -#include "harness/compat.h" - -#include <stdio.h> -#include <string.h> -#include <sys/types.h> -#include <sys/stat.h> - -#include "procs.h" - - -const char *wg_scan_inclusive_add_kernel_code_int = -"__kernel void test_wg_scan_inclusive_add_int(global int *input, global int *output)\n" -"{\n" -" int tid = get_global_id(0);\n" -"\n" -" int result = work_group_scan_inclusive_add(input[tid]);\n" -" output[tid] = result;\n" -"}\n"; - - -const char *wg_scan_inclusive_add_kernel_code_uint = -"__kernel void test_wg_scan_inclusive_add_uint(global uint *input, global uint *output)\n" -"{\n" -" int tid = get_global_id(0);\n" -"\n" -" uint result = work_group_scan_inclusive_add(input[tid]);\n" -" output[tid] = result;\n" -"}\n"; - -const char *wg_scan_inclusive_add_kernel_code_long = -"__kernel void test_wg_scan_inclusive_add_long(global long *input, global long *output)\n" -"{\n" -" int tid = get_global_id(0);\n" -"\n" -" long result = work_group_scan_inclusive_add(input[tid]);\n" -" output[tid] = result;\n" -"}\n"; - - -const char *wg_scan_inclusive_add_kernel_code_ulong = -"__kernel void test_wg_scan_inclusive_add_ulong(global ulong *input, global ulong *output)\n" -"{\n" -" int tid = get_global_id(0);\n" -"\n" -" ulong result = work_group_scan_inclusive_add(input[tid]);\n" -" output[tid] = result;\n" -"}\n"; - - -static int -verify_wg_scan_inclusive_add_int(int *inptr, int *outptr, size_t n, size_t wg_size) -{ - size_t i, j, m; - int s; - - for (j=0; j<n; j+=wg_size) { - m = n - j; - if (m > wg_size) - m = wg_size; - - s = 0; - for (i=0; i<m; ++i) { - 
s += inptr[j+i]; - if (outptr[j+i] != s) { - log_info("work_group_scan_inclusive_add int: Error at %u: expected = %d, got = %d\n", (unsigned int)(j+i), s, outptr[j+i]); - return -1; - } - } - } - return 0; -} - -static int -verify_wg_scan_inclusive_add_uint(unsigned int *inptr, unsigned int *outptr, size_t n, size_t wg_size) -{ - size_t i, j, m; - unsigned int s; - - for (j=0; j<n; j+=wg_size) { - m = n - j; - if (m > wg_size) - m = wg_size; - - s = 0; - for (i=0; i<m; ++i) { - s += inptr[j+i]; - if (outptr[j+i] != s) { - log_info("work_group_scan_inclusive_add uint: Error at %u: expected = %u, got = %u\n", (unsigned int)(j+i), s, outptr[j+i]); - return -1; - } - } - } - return 0; -} - -static int -verify_wg_scan_inclusive_add_long(cl_long *inptr, cl_long *outptr, size_t n, size_t wg_size) -{ - size_t i, j, m; - cl_long s; - - for (j=0; j<n; j+=wg_size) { - m = n - j; - if (m > wg_size) - m = wg_size; - - s = 0; - for (i=0; i<m; ++i) { - s += inptr[j+i]; - if (outptr[j+i] != s) { - log_info("work_group_scan_inclusive_add long: Error at %u: expected = %lld, got = %lld\n", - (unsigned int)(j+i), (long long)s, (long long)outptr[j+i]); - return -1; - } - } - } - return 0; -} - -static int -verify_wg_scan_inclusive_add_ulong(cl_ulong *inptr, cl_ulong *outptr, size_t n, size_t wg_size) -{ - size_t i, j, m; - cl_ulong s; - - for (j=0; j<n; j+=wg_size) { - m = n - j; - if (m > wg_size) - m = wg_size; - - s = 0; - for (i=0; i<m; ++i) { - s += inptr[j+i]; - if (outptr[j+i] != s) { - log_info("work_group_scan_inclusive_add int: Error at %u: expected = %llu, got = %llu\n", - (unsigned int)(j+i), (unsigned long long)s, (unsigned long long)outptr[j+i]); - return -1; - } - } - } - return 0; -} - - -int -test_work_group_scan_inclusive_add_int(cl_device_id device, cl_context context, cl_command_queue queue, int n_elems) -{ - cl_mem streams[2]; - cl_int *input_ptr[1], *p; - cl_int *output_ptr; - cl_program program; - cl_kernel kernel; - void *values[2]; - size_t threads[1]; - size_t 
wg_size[1]; - size_t num_elements; - int err; - int i; - MTdata d; - - err = create_single_kernel_helper(context, &program, &kernel, 1, - &wg_scan_inclusive_add_kernel_code_int, - "test_wg_scan_inclusive_add_int"); - if (err) - return -1; - - // "wg_size" is limited to that of the first dimension as only a 1DRange is executed. - err = get_max_allowed_1d_work_group_size_on_device(device, kernel, wg_size); - test_error(err, "get_max_allowed_1d_work_group_size_on_device failed"); - - num_elements = n_elems; - - input_ptr[0] = (cl_int*)malloc(sizeof(cl_int) * num_elements); - output_ptr = (cl_int*)malloc(sizeof(cl_int) * num_elements); - streams[0] = clCreateBuffer(context, CL_MEM_READ_WRITE, - sizeof(cl_int) * num_elements, NULL, NULL); - if (!streams[0]) - { - log_error("clCreateBuffer failed\n"); - return -1; - } - - streams[1] = clCreateBuffer(context, CL_MEM_READ_WRITE, - sizeof(cl_int) * num_elements, NULL, NULL); - if (!streams[1]) - { - log_error("clCreateBuffer failed\n"); - return -1; - } - - p = input_ptr[0]; - d = init_genrand( gRandomSeed ); - for (i=0; i<num_elements; i++) - p[i] = genrand_int32(d); - free_mtdata(d); d = NULL; - - err = clEnqueueWriteBuffer( queue, streams[0], true, 0, sizeof(cl_int) * num_elements, (void *)input_ptr[0], 0, NULL, NULL ); - if (err != CL_SUCCESS) - { - log_error("clWriteArray failed\n"); - return -1; - } - - values[0] = streams[0]; - values[1] = streams[1]; - err = clSetKernelArg(kernel, 0, sizeof streams[0], &streams[0] ); - err |= clSetKernelArg(kernel, 1, sizeof streams[1], &streams[1] ); - if (err != CL_SUCCESS) - { - log_error("clSetKernelArgs failed\n"); - return -1; - } - - // Line below is troublesome... 
- threads[0] = (size_t)num_elements; - err = clEnqueueNDRangeKernel( queue, kernel, 1, NULL, threads, wg_size, 0, NULL, NULL ); - if (err != CL_SUCCESS) - { - log_error("clEnqueueNDRangeKernel failed\n"); - return -1; - } - - cl_uint dead = 0xdeaddead; - memset_pattern4(output_ptr, &dead, sizeof(cl_int)*num_elements); - err = clEnqueueReadBuffer( queue, streams[1], true, 0, sizeof(cl_int)*num_elements, (void *)output_ptr, 0, NULL, NULL ); - if (err != CL_SUCCESS) - { - log_error("clEnqueueReadBuffer failed\n"); - return -1; - } - - if (verify_wg_scan_inclusive_add_int(input_ptr[0], output_ptr, num_elements, wg_size[0])) - { - log_error("work_group_scan_inclusive_add int failed\n"); - return -1; - } - log_info("work_group_scan_inclusive_add int passed\n"); - - clReleaseMemObject(streams[0]); - clReleaseMemObject(streams[1]); - clReleaseKernel(kernel); - clReleaseProgram(program); - free(input_ptr[0]); - free(output_ptr); - - return err; -} - - -int -test_work_group_scan_inclusive_add_uint(cl_device_id device, cl_context context, cl_command_queue queue, int n_elems) -{ - cl_mem streams[2]; - cl_uint *input_ptr[1], *p; - cl_uint *output_ptr; - cl_program program; - cl_kernel kernel; - void *values[2]; - size_t threads[1]; - size_t wg_size[1]; - size_t num_elements; - int err; - int i; - MTdata d; - - err = create_single_kernel_helper(context, &program, &kernel, 1, - &wg_scan_inclusive_add_kernel_code_uint, - "test_wg_scan_inclusive_add_uint"); - if (err) - return -1; - - // "wg_size" is limited to that of the first dimension as only a 1DRange is executed. 
- err = get_max_allowed_1d_work_group_size_on_device(device, kernel, wg_size); - test_error(err, "get_max_allowed_1d_work_group_size_on_device failed"); - - num_elements = n_elems; - - input_ptr[0] = (cl_uint*)malloc(sizeof(cl_uint) * num_elements); - output_ptr = (cl_uint*)malloc(sizeof(cl_uint) * num_elements); - streams[0] = clCreateBuffer(context, CL_MEM_READ_WRITE, - sizeof(cl_uint) * num_elements, NULL, NULL); - if (!streams[0]) - { - log_error("clCreateBuffer failed\n"); - return -1; - } - - streams[1] = clCreateBuffer(context, CL_MEM_READ_WRITE, - sizeof(cl_uint) * num_elements, NULL, NULL); - if (!streams[1]) - { - log_error("clCreateBuffer failed\n"); - return -1; - } - - p = input_ptr[0]; - d = init_genrand( gRandomSeed ); - for (i=0; i<num_elements; i++) - p[i] = genrand_int32(d); - free_mtdata(d); d = NULL; - - err = clEnqueueWriteBuffer( queue, streams[0], true, 0, sizeof(cl_uint)*num_elements, (void *)input_ptr[0], 0, NULL, NULL ); - if (err != CL_SUCCESS) - { - log_error("clWriteArray failed\n"); - return -1; - } - - values[0] = streams[0]; - values[1] = streams[1]; - err = clSetKernelArg(kernel, 0, sizeof streams[0], &streams[0] ); - err |= clSetKernelArg(kernel, 1, sizeof streams[1], &streams[1] ); - if (err != CL_SUCCESS) - { - log_error("clSetKernelArgs failed\n"); - return -1; - } - - // Line below is troublesome... 
- threads[0] = (size_t)n_elems; - err = clEnqueueNDRangeKernel( queue, kernel, 1, NULL, threads, wg_size, 0, NULL, NULL ); - if (err != CL_SUCCESS) - { - log_error("clEnqueueNDRangeKernel failed\n"); - return -1; - } - - cl_uint dead = 0xdeaddead; - memset_pattern4(output_ptr, &dead, sizeof(cl_uint)*num_elements); - err = clEnqueueReadBuffer( queue, streams[1], true, 0, sizeof(cl_uint)*num_elements, (void *)output_ptr, 0, NULL, NULL ); - if (err != CL_SUCCESS) - { - log_error("clEnqueueReadBuffer failed\n"); - return -1; - } - - if (verify_wg_scan_inclusive_add_uint(input_ptr[0], output_ptr, num_elements, wg_size[0])) - { - log_error("work_group_scan_inclusive_add uint failed\n"); - return -1; - } - log_info("work_group_scan_inclusive_add uint passed\n"); - - clReleaseMemObject(streams[0]); - clReleaseMemObject(streams[1]); - clReleaseKernel(kernel); - clReleaseProgram(program); - free(input_ptr[0]); - free(output_ptr); - - return err; -} - -int -test_work_group_scan_inclusive_add_long(cl_device_id device, cl_context context, cl_command_queue queue, int n_elems) -{ - cl_mem streams[2]; - cl_long *input_ptr[1], *p; - cl_long *output_ptr; - cl_program program; - cl_kernel kernel; - void *values[2]; - size_t threads[1]; - size_t wg_size[1]; - size_t num_elements; - int err; - int i; - MTdata d; - - err = create_single_kernel_helper(context, &program, &kernel, 1, - &wg_scan_inclusive_add_kernel_code_long, - "test_wg_scan_inclusive_add_long"); - if (err) - return -1; - - // "wg_size" is limited to that of the first dimension as only a 1DRange is executed. 
- err = get_max_allowed_1d_work_group_size_on_device(device, kernel, wg_size); - test_error(err, "get_max_allowed_1d_work_group_size_on_device failed"); - - num_elements = n_elems; - - input_ptr[0] = (cl_long*)malloc(sizeof(cl_long) * num_elements); - output_ptr = (cl_long*)malloc(sizeof(cl_long) * num_elements); - streams[0] = clCreateBuffer(context, CL_MEM_READ_WRITE, - sizeof(cl_long) * num_elements, NULL, NULL); - if (!streams[0]) - { - log_error("clCreateBuffer failed\n"); - return -1; - } - - streams[1] = clCreateBuffer(context, CL_MEM_READ_WRITE, - sizeof(cl_long) * num_elements, NULL, NULL); - if (!streams[1]) - { - log_error("clCreateBuffer failed\n"); - return -1; - } - - p = input_ptr[0]; - d = init_genrand( gRandomSeed ); - for (i=0; i<num_elements; i++) - p[i] = genrand_int64(d); - free_mtdata(d); d = NULL; - - err = clEnqueueWriteBuffer( queue, streams[0], true, 0, sizeof(cl_long)*num_elements, (void *)input_ptr[0], 0, NULL, NULL ); - if (err != CL_SUCCESS) - { - log_error("clWriteArray failed\n"); - return -1; - } - - values[0] = streams[0]; - values[1] = streams[1]; - err = clSetKernelArg(kernel, 0, sizeof streams[0], &streams[0] ); - err |= clSetKernelArg(kernel, 1, sizeof streams[1], &streams[1] ); - if (err != CL_SUCCESS) - { - log_error("clSetKernelArgs failed\n"); - return -1; - } - - // Line below is troublesome... 
- threads[0] = (size_t)n_elems; - err = clEnqueueNDRangeKernel( queue, kernel, 1, NULL, threads, wg_size, 0, NULL, NULL ); - if (err != CL_SUCCESS) - { - log_error("clEnqueueNDRangeKernel failed\n"); - return -1; - } - - cl_uint dead = 0xdeaddead; - memset_pattern4(output_ptr, &dead, sizeof(cl_long)*num_elements); - err = clEnqueueReadBuffer( queue, streams[1], true, 0, sizeof(cl_long)*num_elements, (void *)output_ptr, 0, NULL, NULL ); - if (err != CL_SUCCESS) - { - log_error("clEnqueueReadBuffer failed\n"); - return -1; - } - - if (verify_wg_scan_inclusive_add_long(input_ptr[0], output_ptr, num_elements, wg_size[0])) - { - log_error("work_group_scan_inclusive_add long failed\n"); - return -1; - } - log_info("work_group_scan_inclusive_add long passed\n"); - - clReleaseMemObject(streams[0]); - clReleaseMemObject(streams[1]); - clReleaseKernel(kernel); - clReleaseProgram(program); - free(input_ptr[0]); - free(output_ptr); - - return err; -} - - -int -test_work_group_scan_inclusive_add_ulong(cl_device_id device, cl_context context, cl_command_queue queue, int n_elems) -{ - cl_mem streams[2]; - cl_ulong *input_ptr[1], *p; - cl_ulong *output_ptr; - cl_program program; - cl_kernel kernel; - void *values[2]; - size_t threads[1]; - size_t wg_size[1]; - size_t num_elements; - int err; - int i; - MTdata d; - - err = create_single_kernel_helper(context, &program, &kernel, 1, - &wg_scan_inclusive_add_kernel_code_ulong, - "test_wg_scan_inclusive_add_ulong"); - if (err) - return -1; - - // "wg_size" is limited to that of the first dimension as only a 1DRange is executed. 
- err = get_max_allowed_1d_work_group_size_on_device(device, kernel, wg_size); - test_error(err, "get_max_allowed_1d_work_group_size_on_device failed"); - - num_elements = n_elems; - - input_ptr[0] = (cl_ulong*)malloc(sizeof(cl_ulong) * num_elements); - output_ptr = (cl_ulong*)malloc(sizeof(cl_ulong) * num_elements); - streams[0] = clCreateBuffer(context, CL_MEM_READ_WRITE, - sizeof(cl_ulong) * num_elements, NULL, NULL); - if (!streams[0]) - { - log_error("clCreateBuffer failed\n"); - return -1; - } - - streams[1] = clCreateBuffer(context, CL_MEM_READ_WRITE, - sizeof(cl_ulong) * num_elements, NULL, NULL); - if (!streams[1]) - { - log_error("clCreateBuffer failed\n"); - return -1; - } - - p = input_ptr[0]; - d = init_genrand( gRandomSeed ); - for (i=0; i<num_elements; i++) - p[i] = genrand_int64(d); - free_mtdata(d); d = NULL; - - err = clEnqueueWriteBuffer( queue, streams[0], true, 0, sizeof(cl_ulong)*num_elements, (void *)input_ptr[0], 0, NULL, NULL ); - if (err != CL_SUCCESS) - { - log_error("clWriteArray failed\n"); - return -1; - } - - values[0] = streams[0]; - values[1] = streams[1]; - err = clSetKernelArg(kernel, 0, sizeof streams[0], &streams[0] ); - err |= clSetKernelArg(kernel, 1, sizeof streams[1], &streams[1] ); - if (err != CL_SUCCESS) - { - log_error("clSetKernelArgs failed\n"); - return -1; - } - - // Line below is troublesome... 
- threads[0] = (size_t)n_elems; - err = clEnqueueNDRangeKernel( queue, kernel, 1, NULL, threads, wg_size, 0, NULL, NULL ); - if (err != CL_SUCCESS) - { - log_error("clEnqueueNDRangeKernel failed\n"); - return -1; - } - - cl_uint dead = 0xdeaddead; - memset_pattern4(output_ptr, &dead, sizeof(cl_ulong)*num_elements); - err = clEnqueueReadBuffer( queue, streams[1], true, 0, sizeof(cl_ulong)*num_elements, (void *)output_ptr, 0, NULL, NULL ); - if (err != CL_SUCCESS) - { - log_error("clEnqueueReadBuffer failed\n"); - return -1; - } - - if (verify_wg_scan_inclusive_add_ulong(input_ptr[0], output_ptr, num_elements, wg_size[0])) - { - log_error("work_group_scan_inclusiveadd ulong failed\n"); - return -1; - } - log_info("work_group_scan_inclusive_add ulong passed\n"); - - clReleaseMemObject(streams[0]); - clReleaseMemObject(streams[1]); - clReleaseKernel(kernel); - clReleaseProgram(program); - free(input_ptr[0]); - free(output_ptr); - - return err; -} - - -int -test_work_group_scan_inclusive_add(cl_device_id device, cl_context context, cl_command_queue queue, int n_elems) -{ - int err; - - err = test_work_group_scan_inclusive_add_int(device, context, queue, n_elems); - if (err) return err; - err = test_work_group_scan_inclusive_add_uint(device, context, queue, n_elems); - if (err) return err; - err = test_work_group_scan_inclusive_add_long(device, context, queue, n_elems); - if (err) return err; - err = test_work_group_scan_inclusive_add_ulong(device, context, queue, n_elems); - return err; -} - diff --git a/test_conformance/workgroups/test_wg_scan_inclusive_max.cpp b/test_conformance/workgroups/test_wg_scan_inclusive_max.cpp deleted file mode 100644 index 44ebf805..00000000 --- a/test_conformance/workgroups/test_wg_scan_inclusive_max.cpp +++ /dev/null @@ -1,595 +0,0 @@ -// -// Copyright (c) 2017 The Khronos Group Inc. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// -#include "harness/compat.h" - -#include <stdio.h> -#include <string.h> -#include <sys/types.h> -#include <sys/stat.h> - -#include "procs.h" - - -const char *wg_scan_inclusive_max_kernel_code_int = -"__kernel void test_wg_scan_inclusive_max_int(global int *input, global int *output)\n" -"{\n" -" int tid = get_global_id(0);\n" -"\n" -" int result = work_group_scan_inclusive_max(input[tid]);\n" -" output[tid] = result;\n" -"}\n"; - - -const char *wg_scan_inclusive_max_kernel_code_uint = -"__kernel void test_wg_scan_inclusive_max_uint(global uint *input, global uint *output)\n" -"{\n" -" int tid = get_global_id(0);\n" -"\n" -" uint result = work_group_scan_inclusive_max(input[tid]);\n" -" output[tid] = result;\n" -"}\n"; - -const char *wg_scan_inclusive_max_kernel_code_long = -"__kernel void test_wg_scan_inclusive_max_long(global long *input, global long *output)\n" -"{\n" -" int tid = get_global_id(0);\n" -"\n" -" long result = work_group_scan_inclusive_max(input[tid]);\n" -" output[tid] = result;\n" -"}\n"; - - -const char *wg_scan_inclusive_max_kernel_code_ulong = -"__kernel void test_wg_scan_inclusive_max_ulong(global ulong *input, global ulong *output)\n" -"{\n" -" int tid = get_global_id(0);\n" -"\n" -" ulong result = work_group_scan_inclusive_max(input[tid]);\n" -" output[tid] = result;\n" -"}\n"; - - -static int -verify_wg_scan_inclusive_max_int(int *inptr, int *outptr, size_t n, size_t wg_size) { - - size_t i, j, m; - - for (j=0; j<n; j+=wg_size) { - int max_ = 0x80000000; - - m = n - j; - if (m > wg_size) - m = wg_size; - - for (i = 0; i 
< m; ++i) { - max_ = MAX(inptr[j+i], max_); - if (outptr[j+i] != max_) { - log_info("work_group_scan_inclusive_max int: Error at %u: expected = %d, got = %d\n", (unsigned int)(j+i), max_, outptr[j+i]); - return -1; - } - } - } - - return 0; -} - -static int -verify_wg_scan_inclusive_max_uint(unsigned int *inptr, unsigned int *outptr, size_t n, size_t wg_size) { - - size_t i, j, m; - - for (j=0; j<n; j+=wg_size) { - unsigned int max_ = 0x0; - - m = n - j; - if (m > wg_size) - m = wg_size; - - for (i = 0; i < m; ++i) { - max_ = MAX(inptr[j+i], max_); - if (outptr[j+i] != max_) { - log_info("work_group_scan_inclusive_max int: Error at %lu: expected = %u, got = %u\n", (unsigned long)(j+i), max_, outptr[j+i]); - return -1; - } - } - } - - return 0; -} - -static int -verify_wg_scan_inclusive_max_long(cl_long *inptr, cl_long *outptr, size_t n, size_t wg_size) { - - size_t i, j, m; - - for (j=0; j<n; j+=wg_size) { - cl_long max_ = 0x8000000000000000ULL; - - m = n - j; - if (m > wg_size) - m = wg_size; - - for (i = 0; i < m; ++i) { - max_ = MAX(inptr[j+i], max_); - if (outptr[j+i] != max_) { - log_info("work_group_scan_inclusive_max long: Error at %u: expected = %lld, got = %lld\n", (unsigned int)(j+i), max_, outptr[j+i]); - return -1; - } - } - } - - return 0; -} - -static int -verify_wg_scan_inclusive_max_ulong(cl_ulong *inptr, cl_ulong *outptr, size_t n, size_t wg_size) { - - size_t i, j, m; - - for (j=0; j<n; j+=wg_size) { - cl_ulong max_ = 0x0; - - m = n - j; - if (m > wg_size) - m = wg_size; - - for (i = 0; i < m; ++i) { - max_ = MAX(inptr[j+i], max_); - if (outptr[j+i] != max_) { - log_info("work_group_scan_inclusive_max ulong: Error at %u: expected = %llu, got = %llu\n", (unsigned int)(j+i), max_, outptr[j+i]); - return -1; - } - } - } - - return 0; -} - - -int -test_work_group_scan_inclusive_max_int(cl_device_id device, cl_context context, cl_command_queue queue, int n_elems) -{ - cl_mem streams[2]; - cl_int *input_ptr[1], *p; - cl_int *output_ptr; - cl_program 
program; - cl_kernel kernel; - void *values[2]; - size_t threads[1]; - size_t wg_size[1]; - size_t num_elements; - int err; - int i; - MTdata d; - - err = create_single_kernel_helper(context, &program, &kernel, 1, - &wg_scan_inclusive_max_kernel_code_int, - "test_wg_scan_inclusive_max_int"); - if (err) - return -1; - - // "wg_size" is limited to that of the first dimension as only a 1DRange is executed. - err = get_max_allowed_1d_work_group_size_on_device(device, kernel, wg_size); - test_error(err, "get_max_allowed_1d_work_group_size_on_device failed"); - - num_elements = n_elems; - - input_ptr[0] = (cl_int*)malloc(sizeof(cl_int) * num_elements); - output_ptr = (cl_int*)malloc(sizeof(cl_int) * num_elements); - streams[0] = clCreateBuffer(context, CL_MEM_READ_WRITE, - sizeof(cl_int) * num_elements, NULL, NULL); - if (!streams[0]) - { - log_error("clCreateBuffer failed\n"); - return -1; - } - - streams[1] = clCreateBuffer(context, CL_MEM_READ_WRITE, - sizeof(cl_int) * num_elements, NULL, NULL); - if (!streams[1]) - { - log_error("clCreateBuffer failed\n"); - return -1; - } - - p = input_ptr[0]; - d = init_genrand( gRandomSeed ); - for (i=0; i<num_elements; i++) - p[i] = genrand_int32(d); - free_mtdata(d); d = NULL; - - err = clEnqueueWriteBuffer( queue, streams[0], true, 0, sizeof(cl_int) * num_elements, (void *)input_ptr[0], 0, NULL, NULL ); - if (err != CL_SUCCESS) - { - log_error("clWriteArray failed\n"); - return -1; - } - - values[0] = streams[0]; - values[1] = streams[1]; - err = clSetKernelArg(kernel, 0, sizeof streams[0], &streams[0] ); - err |= clSetKernelArg(kernel, 1, sizeof streams[1], &streams[1] ); - if (err != CL_SUCCESS) - { - log_error("clSetKernelArgs failed\n"); - return -1; - } - - // Line below is troublesome... 
- threads[0] = (size_t)num_elements; - err = clEnqueueNDRangeKernel( queue, kernel, 1, NULL, threads, wg_size, 0, NULL, NULL ); - if (err != CL_SUCCESS) - { - log_error("clEnqueueNDRangeKernel failed\n"); - return -1; - } - - cl_uint dead = 0xdeaddead; - memset_pattern4(output_ptr, &dead, sizeof(cl_int)*num_elements); - err = clEnqueueReadBuffer( queue, streams[1], true, 0, sizeof(cl_int)*num_elements, (void *)output_ptr, 0, NULL, NULL ); - if (err != CL_SUCCESS) - { - log_error("clEnqueueReadBuffer failed\n"); - return -1; - } - - if (verify_wg_scan_inclusive_max_int(input_ptr[0], output_ptr, num_elements, wg_size[0])) - { - log_error("work_group_scan_inclusive_max int failed\n"); - return -1; - } - log_info("work_group_scan_inclusive_max int passed\n"); - - clReleaseMemObject(streams[0]); - clReleaseMemObject(streams[1]); - clReleaseKernel(kernel); - clReleaseProgram(program); - free(input_ptr[0]); - free(output_ptr); - - return err; -} - - -int -test_work_group_scan_inclusive_max_uint(cl_device_id device, cl_context context, cl_command_queue queue, int n_elems) -{ - cl_mem streams[2]; - cl_uint *input_ptr[1], *p; - cl_uint *output_ptr; - cl_program program; - cl_kernel kernel; - void *values[2]; - size_t threads[1]; - size_t wg_size[1]; - size_t num_elements; - int err; - int i; - MTdata d; - - err = create_single_kernel_helper(context, &program, &kernel, 1, - &wg_scan_inclusive_max_kernel_code_uint, - "test_wg_scan_inclusive_max_uint"); - if (err) - return -1; - - // "wg_size" is limited to that of the first dimension as only a 1DRange is executed. 
- err = get_max_allowed_1d_work_group_size_on_device(device, kernel, wg_size); - test_error(err, "get_max_allowed_1d_work_group_size_on_device failed"); - - num_elements = n_elems; - - input_ptr[0] = (cl_uint*)malloc(sizeof(cl_uint) * num_elements); - output_ptr = (cl_uint*)malloc(sizeof(cl_uint) * num_elements); - streams[0] = clCreateBuffer(context, CL_MEM_READ_WRITE, - sizeof(cl_uint) * num_elements, NULL, NULL); - if (!streams[0]) - { - log_error("clCreateBuffer failed\n"); - return -1; - } - - streams[1] = clCreateBuffer(context, CL_MEM_READ_WRITE, - sizeof(cl_uint) * num_elements, NULL, NULL); - if (!streams[1]) - { - log_error("clCreateBuffer failed\n"); - return -1; - } - - p = input_ptr[0]; - d = init_genrand( gRandomSeed ); - for (i=0; i<num_elements; i++) - p[i] = genrand_int32(d); - free_mtdata(d); d = NULL; - - err = clEnqueueWriteBuffer( queue, streams[0], true, 0, sizeof(cl_uint)*num_elements, (void *)input_ptr[0], 0, NULL, NULL ); - if (err != CL_SUCCESS) - { - log_error("clWriteArray failed\n"); - return -1; - } - - values[0] = streams[0]; - values[1] = streams[1]; - err = clSetKernelArg(kernel, 0, sizeof streams[0], &streams[0] ); - err |= clSetKernelArg(kernel, 1, sizeof streams[1], &streams[1] ); - if (err != CL_SUCCESS) - { - log_error("clSetKernelArgs failed\n"); - return -1; - } - - // Line below is troublesome... 
- threads[0] = (size_t)n_elems; - err = clEnqueueNDRangeKernel( queue, kernel, 1, NULL, threads, wg_size, 0, NULL, NULL ); - if (err != CL_SUCCESS) - { - log_error("clEnqueueNDRangeKernel failed\n"); - return -1; - } - - cl_uint dead = 0xdeaddead; - memset_pattern4(output_ptr, &dead, sizeof(cl_uint)*num_elements); - err = clEnqueueReadBuffer( queue, streams[1], true, 0, sizeof(cl_uint)*num_elements, (void *)output_ptr, 0, NULL, NULL ); - if (err != CL_SUCCESS) - { - log_error("clEnqueueReadBuffer failed\n"); - return -1; - } - - if (verify_wg_scan_inclusive_max_uint(input_ptr[0], output_ptr, num_elements, wg_size[0])) - { - log_error("work_group_scan_inclusive_max uint failed\n"); - return -1; - } - log_info("work_group_scan_inclusive_max uint passed\n"); - - clReleaseMemObject(streams[0]); - clReleaseMemObject(streams[1]); - clReleaseKernel(kernel); - clReleaseProgram(program); - free(input_ptr[0]); - free(output_ptr); - - return err; -} - -int -test_work_group_scan_inclusive_max_long(cl_device_id device, cl_context context, cl_command_queue queue, int n_elems) -{ - cl_mem streams[2]; - cl_long *input_ptr[1], *p; - cl_long *output_ptr; - cl_program program; - cl_kernel kernel; - void *values[2]; - size_t threads[1]; - size_t wg_size[1]; - size_t num_elements; - int err; - int i; - MTdata d; - - err = create_single_kernel_helper(context, &program, &kernel, 1, - &wg_scan_inclusive_max_kernel_code_long, - "test_wg_scan_inclusive_max_long"); - if (err) - return -1; - - // "wg_size" is limited to that of the first dimension as only a 1DRange is executed. 
- err = get_max_allowed_1d_work_group_size_on_device(device, kernel, wg_size); - test_error(err, "get_max_allowed_1d_work_group_size_on_device failed"); - - num_elements = n_elems; - - input_ptr[0] = (cl_long*)malloc(sizeof(cl_long) * num_elements); - output_ptr = (cl_long*)malloc(sizeof(cl_long) * num_elements); - streams[0] = clCreateBuffer(context, CL_MEM_READ_WRITE, - sizeof(cl_long) * num_elements, NULL, NULL); - if (!streams[0]) - { - log_error("clCreateBuffer failed\n"); - return -1; - } - - streams[1] = clCreateBuffer(context, CL_MEM_READ_WRITE, - sizeof(cl_long) * num_elements, NULL, NULL); - if (!streams[1]) - { - log_error("clCreateBuffer failed\n"); - return -1; - } - - p = input_ptr[0]; - d = init_genrand( gRandomSeed ); - for (i=0; i<num_elements; i++) - p[i] = genrand_int64(d); - free_mtdata(d); d = NULL; - - err = clEnqueueWriteBuffer( queue, streams[0], true, 0, sizeof(cl_long)*num_elements, (void *)input_ptr[0], 0, NULL, NULL ); - if (err != CL_SUCCESS) - { - log_error("clWriteArray failed\n"); - return -1; - } - - values[0] = streams[0]; - values[1] = streams[1]; - err = clSetKernelArg(kernel, 0, sizeof streams[0], &streams[0] ); - err |= clSetKernelArg(kernel, 1, sizeof streams[1], &streams[1] ); - if (err != CL_SUCCESS) - { - log_error("clSetKernelArgs failed\n"); - return -1; - } - - // Line below is troublesome... 
- threads[0] = (size_t)n_elems; - err = clEnqueueNDRangeKernel( queue, kernel, 1, NULL, threads, wg_size, 0, NULL, NULL ); - if (err != CL_SUCCESS) - { - log_error("clEnqueueNDRangeKernel failed\n"); - return -1; - } - - cl_uint dead = 0xdeaddead; - memset_pattern4(output_ptr, &dead, sizeof(cl_long)*num_elements); - err = clEnqueueReadBuffer( queue, streams[1], true, 0, sizeof(cl_long)*num_elements, (void *)output_ptr, 0, NULL, NULL ); - if (err != CL_SUCCESS) - { - log_error("clEnqueueReadBuffer failed\n"); - return -1; - } - - if (verify_wg_scan_inclusive_max_long(input_ptr[0], output_ptr, num_elements, wg_size[0])) - { - log_error("work_group_scan_inclusive_max long failed\n"); - return -1; - } - log_info("work_group_scan_inclusive_max long passed\n"); - - clReleaseMemObject(streams[0]); - clReleaseMemObject(streams[1]); - clReleaseKernel(kernel); - clReleaseProgram(program); - free(input_ptr[0]); - free(output_ptr); - - return err; -} - - -int -test_work_group_scan_inclusive_max_ulong(cl_device_id device, cl_context context, cl_command_queue queue, int n_elems) -{ - cl_mem streams[2]; - cl_ulong *input_ptr[1], *p; - cl_ulong *output_ptr; - cl_program program; - cl_kernel kernel; - void *values[2]; - size_t threads[1]; - size_t wg_size[1]; - size_t num_elements; - int err; - int i; - MTdata d; - - err = create_single_kernel_helper(context, &program, &kernel, 1, - &wg_scan_inclusive_max_kernel_code_ulong, - "test_wg_scan_inclusive_max_ulong"); - if (err) - return -1; - - // "wg_size" is limited to that of the first dimension as only a 1DRange is executed. 
- err = get_max_allowed_1d_work_group_size_on_device(device, kernel, wg_size); - test_error(err, "get_max_allowed_1d_work_group_size_on_device failed"); - - num_elements = n_elems; - - input_ptr[0] = (cl_ulong*)malloc(sizeof(cl_ulong) * num_elements); - output_ptr = (cl_ulong*)malloc(sizeof(cl_ulong) * num_elements); - streams[0] = clCreateBuffer(context, CL_MEM_READ_WRITE, - sizeof(cl_ulong) * num_elements, NULL, NULL); - if (!streams[0]) - { - log_error("clCreateBuffer failed\n"); - return -1; - } - - streams[1] = clCreateBuffer(context, CL_MEM_READ_WRITE, - sizeof(cl_ulong) * num_elements, NULL, NULL); - if (!streams[1]) - { - log_error("clCreateBuffer failed\n"); - return -1; - } - - p = input_ptr[0]; - d = init_genrand( gRandomSeed ); - for (i=0; i<num_elements; i++) - p[i] = genrand_int64(d); - free_mtdata(d); d = NULL; - - err = clEnqueueWriteBuffer( queue, streams[0], true, 0, sizeof(cl_ulong)*num_elements, (void *)input_ptr[0], 0, NULL, NULL ); - if (err != CL_SUCCESS) - { - log_error("clWriteArray failed\n"); - return -1; - } - - values[0] = streams[0]; - values[1] = streams[1]; - err = clSetKernelArg(kernel, 0, sizeof streams[0], &streams[0] ); - err |= clSetKernelArg(kernel, 1, sizeof streams[1], &streams[1] ); - if (err != CL_SUCCESS) - { - log_error("clSetKernelArgs failed\n"); - return -1; - } - - // Line below is troublesome... 
- threads[0] = (size_t)n_elems; - err = clEnqueueNDRangeKernel( queue, kernel, 1, NULL, threads, wg_size, 0, NULL, NULL ); - if (err != CL_SUCCESS) - { - log_error("clEnqueueNDRangeKernel failed\n"); - return -1; - } - - cl_uint dead = 0xdeaddead; - memset_pattern4(output_ptr, &dead, sizeof(cl_ulong)*num_elements); - err = clEnqueueReadBuffer( queue, streams[1], true, 0, sizeof(cl_ulong)*num_elements, (void *)output_ptr, 0, NULL, NULL ); - if (err != CL_SUCCESS) - { - log_error("clEnqueueReadBuffer failed\n"); - return -1; - } - - if (verify_wg_scan_inclusive_max_ulong(input_ptr[0], output_ptr, num_elements, wg_size[0])) - { - log_error("work_group_scan_inclusiveadd ulong failed\n"); - return -1; - } - log_info("work_group_scan_inclusive_max ulong passed\n"); - - clReleaseMemObject(streams[0]); - clReleaseMemObject(streams[1]); - clReleaseKernel(kernel); - clReleaseProgram(program); - free(input_ptr[0]); - free(output_ptr); - - return err; -} - - -int -test_work_group_scan_inclusive_max(cl_device_id device, cl_context context, cl_command_queue queue, int n_elems) -{ - int err; - - err = test_work_group_scan_inclusive_max_int(device, context, queue, n_elems); - if (err) return err; - err = test_work_group_scan_inclusive_max_uint(device, context, queue, n_elems); - if (err) return err; - err = test_work_group_scan_inclusive_max_long(device, context, queue, n_elems); - if (err) return err; - err = test_work_group_scan_inclusive_max_ulong(device, context, queue, n_elems); - return err; -} - diff --git a/test_conformance/workgroups/test_wg_scan_inclusive_min.cpp b/test_conformance/workgroups/test_wg_scan_inclusive_min.cpp deleted file mode 100644 index f2f05788..00000000 --- a/test_conformance/workgroups/test_wg_scan_inclusive_min.cpp +++ /dev/null @@ -1,595 +0,0 @@ -// -// Copyright (c) 2017 The Khronos Group Inc. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// -#include "harness/compat.h" - -#include <stdio.h> -#include <string.h> -#include <sys/types.h> -#include <sys/stat.h> - -#include "procs.h" - - -const char *wg_scan_inclusive_min_kernel_code_int = -"__kernel void test_wg_scan_inclusive_min_int(global int *input, global int *output)\n" -"{\n" -" int tid = get_global_id(0);\n" -"\n" -" int result = work_group_scan_inclusive_min(input[tid]);\n" -" output[tid] = result;\n" -"}\n"; - - -const char *wg_scan_inclusive_min_kernel_code_uint = -"__kernel void test_wg_scan_inclusive_min_uint(global uint *input, global uint *output)\n" -"{\n" -" int tid = get_global_id(0);\n" -"\n" -" uint result = work_group_scan_inclusive_min(input[tid]);\n" -" output[tid] = result;\n" -"}\n"; - -const char *wg_scan_inclusive_min_kernel_code_long = -"__kernel void test_wg_scan_inclusive_min_long(global long *input, global long *output)\n" -"{\n" -" int tid = get_global_id(0);\n" -"\n" -" long result = work_group_scan_inclusive_min(input[tid]);\n" -" output[tid] = result;\n" -"}\n"; - - -const char *wg_scan_inclusive_min_kernel_code_ulong = -"__kernel void test_wg_scan_inclusive_min_ulong(global ulong *input, global ulong *output)\n" -"{\n" -" int tid = get_global_id(0);\n" -"\n" -" ulong result = work_group_scan_inclusive_min(input[tid]);\n" -" output[tid] = result;\n" -"}\n"; - - -static int -verify_wg_scan_inclusive_min_int(int *inptr, int *outptr, size_t n, size_t wg_size) { - - size_t i, j, m; - - for (j=0; j<n; j+=wg_size) { - int min_ = 0x7fffffff; - - m = n - j; - if (m > wg_size) - m = wg_size; - - for (i = 0; i 
< m; ++i) { - min_ = MIN(inptr[j+i], min_); - if (outptr[j+i] != min_) { - log_info("work_group_scan_inclusive_min int: Error at %u: expected = %d, got = %d\n", (unsigned int)(j+i), min_, outptr[j+i]); - return -1; - } - } - } - - return 0; -} - -static int -verify_wg_scan_inclusive_min_uint(unsigned int *inptr, unsigned int *outptr, size_t n, size_t wg_size) { - - size_t i, j, m; - - for (j=0; j<n; j+=wg_size) { - unsigned int min_ = 0xffffffff; - - m = n - j; - if (m > wg_size) - m = wg_size; - - for (i = 0; i < m; ++i) { - min_ = MIN(inptr[j+i], min_); - if (outptr[j+i] != min_) { - log_info("work_group_scan_inclusive_min int: Error at %u: expected = %u, got = %u\n", (unsigned int)(j+i), min_, outptr[j+i]); - return -1; - } - } - } - - return 0; -} - -static int -verify_wg_scan_inclusive_min_long(cl_long *inptr, cl_long *outptr, size_t n, size_t wg_size) { - - size_t i, j, m; - - for (j=0; j<n; j+=wg_size) { - cl_long min_ = 0x7fffffffffffffffULL; - - m = n - j; - if (m > wg_size) - m = wg_size; - - for (i = 0; i < m; ++i) { - min_ = MIN(inptr[j+i], min_); - if (outptr[j+i] != min_) { - log_info("work_group_scan_inclusive_min long: Error at %u: expected = %lld, got = %lld\n", (unsigned int)(j+i), min_, outptr[j+i]); - return -1; - } - } - } - - return 0; -} - -static int -verify_wg_scan_inclusive_min_ulong(cl_ulong *inptr, cl_ulong *outptr, size_t n, size_t wg_size) { - - size_t i, j, m; - - for (j=0; j<n; j+=wg_size) { - cl_ulong min_ = 0xffffffffffffffffULL; - - m = n - j; - if (m > wg_size) - m = wg_size; - - for (i = 0; i < m; ++i) { - min_ = MIN(inptr[j+i], min_); - if (outptr[j+i] != min_) { - log_info("work_group_scan_inclusive_min ulong: Error at %u: expected = %llu, got = %llu\n", (unsigned int)(j+i), min_, outptr[j+i]); - return -1; - } - } - } - - return 0; -} - - -int -test_work_group_scan_inclusive_min_int(cl_device_id device, cl_context context, cl_command_queue queue, int n_elems) -{ - cl_mem streams[2]; - cl_int *input_ptr[1], *p; - cl_int 
*output_ptr; - cl_program program; - cl_kernel kernel; - void *values[2]; - size_t threads[1]; - size_t wg_size[1]; - size_t num_elements; - int err; - int i; - MTdata d; - - err = create_single_kernel_helper(context, &program, &kernel, 1, - &wg_scan_inclusive_min_kernel_code_int, - "test_wg_scan_inclusive_min_int"); - if (err) - return -1; - - // "wg_size" is limited to that of the first dimension as only a 1DRange is executed. - err = get_max_allowed_1d_work_group_size_on_device(device, kernel, wg_size); - test_error(err, "get_max_allowed_1d_work_group_size_on_device failed"); - - num_elements = n_elems; - - input_ptr[0] = (cl_int*)malloc(sizeof(cl_int) * num_elements); - output_ptr = (cl_int*)malloc(sizeof(cl_int) * num_elements); - streams[0] = clCreateBuffer(context, CL_MEM_READ_WRITE, - sizeof(cl_int) * num_elements, NULL, NULL); - if (!streams[0]) - { - log_error("clCreateBuffer failed\n"); - return -1; - } - - streams[1] = clCreateBuffer(context, CL_MEM_READ_WRITE, - sizeof(cl_int) * num_elements, NULL, NULL); - if (!streams[1]) - { - log_error("clCreateBuffer failed\n"); - return -1; - } - - p = input_ptr[0]; - d = init_genrand( gRandomSeed ); - for (i=0; i<num_elements; i++) - p[i] = genrand_int32(d); - free_mtdata(d); d = NULL; - - err = clEnqueueWriteBuffer( queue, streams[0], true, 0, sizeof(cl_int) * num_elements, (void *)input_ptr[0], 0, NULL, NULL ); - if (err != CL_SUCCESS) - { - log_error("clWriteArray failed\n"); - return -1; - } - - values[0] = streams[0]; - values[1] = streams[1]; - err = clSetKernelArg(kernel, 0, sizeof streams[0], &streams[0] ); - err |= clSetKernelArg(kernel, 1, sizeof streams[1], &streams[1] ); - if (err != CL_SUCCESS) - { - log_error("clSetKernelArgs failed\n"); - return -1; - } - - // Line below is troublesome... 
- threads[0] = (size_t)num_elements; - err = clEnqueueNDRangeKernel( queue, kernel, 1, NULL, threads, wg_size, 0, NULL, NULL ); - if (err != CL_SUCCESS) - { - log_error("clEnqueueNDRangeKernel failed\n"); - return -1; - } - - cl_uint dead = 0xdeaddead; - memset_pattern4(output_ptr, &dead, sizeof(cl_int)*num_elements); - err = clEnqueueReadBuffer( queue, streams[1], true, 0, sizeof(cl_int)*num_elements, (void *)output_ptr, 0, NULL, NULL ); - if (err != CL_SUCCESS) - { - log_error("clEnqueueReadBuffer failed\n"); - return -1; - } - - if (verify_wg_scan_inclusive_min_int(input_ptr[0], output_ptr, num_elements, wg_size[0])) - { - log_error("work_group_scan_inclusive_min int failed\n"); - return -1; - } - log_info("work_group_scan_inclusive_min int passed\n"); - - clReleaseMemObject(streams[0]); - clReleaseMemObject(streams[1]); - clReleaseKernel(kernel); - clReleaseProgram(program); - free(input_ptr[0]); - free(output_ptr); - - return err; -} - - -int -test_work_group_scan_inclusive_min_uint(cl_device_id device, cl_context context, cl_command_queue queue, int n_elems) -{ - cl_mem streams[2]; - cl_uint *input_ptr[1], *p; - cl_uint *output_ptr; - cl_program program; - cl_kernel kernel; - void *values[2]; - size_t threads[1]; - size_t wg_size[1]; - size_t num_elements; - int err; - int i; - MTdata d; - - err = create_single_kernel_helper(context, &program, &kernel, 1, - &wg_scan_inclusive_min_kernel_code_uint, - "test_wg_scan_inclusive_min_uint"); - if (err) - return -1; - - // "wg_size" is limited to that of the first dimension as only a 1DRange is executed. 
- err = get_max_allowed_1d_work_group_size_on_device(device, kernel, wg_size); - test_error(err, "get_max_allowed_1d_work_group_size_on_device failed"); - - num_elements = n_elems; - - input_ptr[0] = (cl_uint*)malloc(sizeof(cl_uint) * num_elements); - output_ptr = (cl_uint*)malloc(sizeof(cl_uint) * num_elements); - streams[0] = clCreateBuffer(context, CL_MEM_READ_WRITE, - sizeof(cl_uint) * num_elements, NULL, NULL); - if (!streams[0]) - { - log_error("clCreateBuffer failed\n"); - return -1; - } - - streams[1] = clCreateBuffer(context, CL_MEM_READ_WRITE, - sizeof(cl_uint) * num_elements, NULL, NULL); - if (!streams[1]) - { - log_error("clCreateBuffer failed\n"); - return -1; - } - - p = input_ptr[0]; - d = init_genrand( gRandomSeed ); - for (i=0; i<num_elements; i++) - p[i] = genrand_int32(d); - free_mtdata(d); d = NULL; - - err = clEnqueueWriteBuffer( queue, streams[0], true, 0, sizeof(cl_uint)*num_elements, (void *)input_ptr[0], 0, NULL, NULL ); - if (err != CL_SUCCESS) - { - log_error("clWriteArray failed\n"); - return -1; - } - - values[0] = streams[0]; - values[1] = streams[1]; - err = clSetKernelArg(kernel, 0, sizeof streams[0], &streams[0] ); - err |= clSetKernelArg(kernel, 1, sizeof streams[1], &streams[1] ); - if (err != CL_SUCCESS) - { - log_error("clSetKernelArgs failed\n"); - return -1; - } - - // Line below is troublesome... 
- threads[0] = (size_t)n_elems; - err = clEnqueueNDRangeKernel( queue, kernel, 1, NULL, threads, wg_size, 0, NULL, NULL ); - if (err != CL_SUCCESS) - { - log_error("clEnqueueNDRangeKernel failed\n"); - return -1; - } - - cl_uint dead = 0xdeaddead; - memset_pattern4(output_ptr, &dead, sizeof(cl_uint)*num_elements); - err = clEnqueueReadBuffer( queue, streams[1], true, 0, sizeof(cl_uint)*num_elements, (void *)output_ptr, 0, NULL, NULL ); - if (err != CL_SUCCESS) - { - log_error("clEnqueueReadBuffer failed\n"); - return -1; - } - - if (verify_wg_scan_inclusive_min_uint(input_ptr[0], output_ptr, num_elements, wg_size[0])) - { - log_error("work_group_scan_inclusive_min uint failed\n"); - return -1; - } - log_info("work_group_scan_inclusive_min uint passed\n"); - - clReleaseMemObject(streams[0]); - clReleaseMemObject(streams[1]); - clReleaseKernel(kernel); - clReleaseProgram(program); - free(input_ptr[0]); - free(output_ptr); - - return err; -} - -int -test_work_group_scan_inclusive_min_long(cl_device_id device, cl_context context, cl_command_queue queue, int n_elems) -{ - cl_mem streams[2]; - cl_long *input_ptr[1], *p; - cl_long *output_ptr; - cl_program program; - cl_kernel kernel; - void *values[2]; - size_t threads[1]; - size_t wg_size[1]; - size_t num_elements; - int err; - int i; - MTdata d; - - err = create_single_kernel_helper(context, &program, &kernel, 1, - &wg_scan_inclusive_min_kernel_code_long, - "test_wg_scan_inclusive_min_long"); - if (err) - return -1; - - // "wg_size" is limited to that of the first dimension as only a 1DRange is executed. 
- err = get_max_allowed_1d_work_group_size_on_device(device, kernel, wg_size); - test_error(err, "get_max_allowed_1d_work_group_size_on_device failed"); - - num_elements = n_elems; - - input_ptr[0] = (cl_long*)malloc(sizeof(cl_long) * num_elements); - output_ptr = (cl_long*)malloc(sizeof(cl_long) * num_elements); - streams[0] = clCreateBuffer(context, CL_MEM_READ_WRITE, - sizeof(cl_long) * num_elements, NULL, NULL); - if (!streams[0]) - { - log_error("clCreateBuffer failed\n"); - return -1; - } - - streams[1] = clCreateBuffer(context, CL_MEM_READ_WRITE, - sizeof(cl_long) * num_elements, NULL, NULL); - if (!streams[1]) - { - log_error("clCreateBuffer failed\n"); - return -1; - } - - p = input_ptr[0]; - d = init_genrand( gRandomSeed ); - for (i=0; i<num_elements; i++) - p[i] = genrand_int64(d); - free_mtdata(d); d = NULL; - - err = clEnqueueWriteBuffer( queue, streams[0], true, 0, sizeof(cl_long)*num_elements, (void *)input_ptr[0], 0, NULL, NULL ); - if (err != CL_SUCCESS) - { - log_error("clWriteArray failed\n"); - return -1; - } - - values[0] = streams[0]; - values[1] = streams[1]; - err = clSetKernelArg(kernel, 0, sizeof streams[0], &streams[0] ); - err |= clSetKernelArg(kernel, 1, sizeof streams[1], &streams[1] ); - if (err != CL_SUCCESS) - { - log_error("clSetKernelArgs failed\n"); - return -1; - } - - // Line below is troublesome... 
- threads[0] = (size_t)n_elems; - err = clEnqueueNDRangeKernel( queue, kernel, 1, NULL, threads, wg_size, 0, NULL, NULL ); - if (err != CL_SUCCESS) - { - log_error("clEnqueueNDRangeKernel failed\n"); - return -1; - } - - cl_uint dead = 0xdeaddead; - memset_pattern4(output_ptr, &dead, sizeof(cl_long)*num_elements); - err = clEnqueueReadBuffer( queue, streams[1], true, 0, sizeof(cl_long)*num_elements, (void *)output_ptr, 0, NULL, NULL ); - if (err != CL_SUCCESS) - { - log_error("clEnqueueReadBuffer failed\n"); - return -1; - } - - if (verify_wg_scan_inclusive_min_long(input_ptr[0], output_ptr, num_elements, wg_size[0])) - { - log_error("work_group_scan_inclusive_min long failed\n"); - return -1; - } - log_info("work_group_scan_inclusive_min long passed\n"); - - clReleaseMemObject(streams[0]); - clReleaseMemObject(streams[1]); - clReleaseKernel(kernel); - clReleaseProgram(program); - free(input_ptr[0]); - free(output_ptr); - - return err; -} - - -int -test_work_group_scan_inclusive_min_ulong(cl_device_id device, cl_context context, cl_command_queue queue, int n_elems) -{ - cl_mem streams[2]; - cl_ulong *input_ptr[1], *p; - cl_ulong *output_ptr; - cl_program program; - cl_kernel kernel; - void *values[2]; - size_t threads[1]; - size_t wg_size[1]; - size_t num_elements; - int err; - int i; - MTdata d; - - err = create_single_kernel_helper(context, &program, &kernel, 1, - &wg_scan_inclusive_min_kernel_code_ulong, - "test_wg_scan_inclusive_min_ulong"); - if (err) - return -1; - - // "wg_size" is limited to that of the first dimension as only a 1DRange is executed. 
- err = get_max_allowed_1d_work_group_size_on_device(device, kernel, wg_size); - test_error(err, "get_max_allowed_1d_work_group_size_on_device failed"); - - num_elements = n_elems; - - input_ptr[0] = (cl_ulong*)malloc(sizeof(cl_ulong) * num_elements); - output_ptr = (cl_ulong*)malloc(sizeof(cl_ulong) * num_elements); - streams[0] = clCreateBuffer(context, CL_MEM_READ_WRITE, - sizeof(cl_ulong) * num_elements, NULL, NULL); - if (!streams[0]) - { - log_error("clCreateBuffer failed\n"); - return -1; - } - - streams[1] = clCreateBuffer(context, CL_MEM_READ_WRITE, - sizeof(cl_ulong) * num_elements, NULL, NULL); - if (!streams[1]) - { - log_error("clCreateBuffer failed\n"); - return -1; - } - - p = input_ptr[0]; - d = init_genrand( gRandomSeed ); - for (i=0; i<num_elements; i++) - p[i] = genrand_int64(d); - free_mtdata(d); d = NULL; - - err = clEnqueueWriteBuffer( queue, streams[0], true, 0, sizeof(cl_ulong)*num_elements, (void *)input_ptr[0], 0, NULL, NULL ); - if (err != CL_SUCCESS) - { - log_error("clWriteArray failed\n"); - return -1; - } - - values[0] = streams[0]; - values[1] = streams[1]; - err = clSetKernelArg(kernel, 0, sizeof streams[0], &streams[0] ); - err |= clSetKernelArg(kernel, 1, sizeof streams[1], &streams[1] ); - if (err != CL_SUCCESS) - { - log_error("clSetKernelArgs failed\n"); - return -1; - } - - // Line below is troublesome... 
- threads[0] = (size_t)n_elems; - err = clEnqueueNDRangeKernel( queue, kernel, 1, NULL, threads, wg_size, 0, NULL, NULL ); - if (err != CL_SUCCESS) - { - log_error("clEnqueueNDRangeKernel failed\n"); - return -1; - } - - cl_uint dead = 0xdeaddead; - memset_pattern4(output_ptr, &dead, sizeof(cl_ulong)*num_elements); - err = clEnqueueReadBuffer( queue, streams[1], true, 0, sizeof(cl_ulong)*num_elements, (void *)output_ptr, 0, NULL, NULL ); - if (err != CL_SUCCESS) - { - log_error("clEnqueueReadBuffer failed\n"); - return -1; - } - - if (verify_wg_scan_inclusive_min_ulong(input_ptr[0], output_ptr, num_elements, wg_size[0])) - { - log_error("work_group_scan_inclusiveadd ulong failed\n"); - return -1; - } - log_info("work_group_scan_inclusive_min ulong passed\n"); - - clReleaseMemObject(streams[0]); - clReleaseMemObject(streams[1]); - clReleaseKernel(kernel); - clReleaseProgram(program); - free(input_ptr[0]); - free(output_ptr); - - return err; -} - - -int -test_work_group_scan_inclusive_min(cl_device_id device, cl_context context, cl_command_queue queue, int n_elems) -{ - int err; - - err = test_work_group_scan_inclusive_min_int(device, context, queue, n_elems); - if (err) return err; - err = test_work_group_scan_inclusive_min_uint(device, context, queue, n_elems); - if (err) return err; - err = test_work_group_scan_inclusive_min_long(device, context, queue, n_elems); - if (err) return err; - err = test_work_group_scan_inclusive_min_ulong(device, context, queue, n_elems); - return err; -} - diff --git a/test_conformance/workgroups/test_wg_scan_reduce.cpp b/test_conformance/workgroups/test_wg_scan_reduce.cpp new file mode 100644 index 00000000..bf4dc89e --- /dev/null +++ b/test_conformance/workgroups/test_wg_scan_reduce.cpp @@ -0,0 +1,456 @@ +// +// Copyright (c) 2017-2022 The Khronos Group Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +#include "harness/compat.h" + +#include <algorithm> +#include <limits> +#include <vector> + +#include "procs.h" + +static std::string make_kernel_string(const std::string &type, + const std::string &kernelName, + const std::string &func) +{ + // Build a kernel string of the form: + // __kernel void KERNEL_NAME(global TYPE *input, global TYPE *output) { + // int tid = get_global_id(0); + // output[tid] = FUNC(input[tid]); + // } + + std::ostringstream os; + os << "__kernel void " << kernelName << "(global " << type + << " *input, global " << type << " *output) {\n"; + os << " int tid = get_global_id(0);\n"; + os << " output[tid] = " << func << "(input[tid]);\n"; + os << "}\n"; + return os.str(); +} + +template <typename T> struct TestTypeInfo +{ +}; + +template <> struct TestTypeInfo<cl_int> +{ + static constexpr const char *deviceName = "int"; +}; + +template <> struct TestTypeInfo<cl_uint> +{ + static constexpr const char *deviceName = "uint"; +}; + +template <> struct TestTypeInfo<cl_long> +{ + static constexpr const char *deviceName = "long"; +}; + +template <> struct TestTypeInfo<cl_ulong> +{ + static constexpr const char *deviceName = "ulong"; +}; + +template <typename T> struct Add +{ + using Type = T; + static constexpr const char *opName = "add"; + static constexpr T identityValue = 0; + static T combine(T a, T b) { return a + b; } +}; + +template <typename T> struct Max +{ + using Type = T; + static constexpr const char *opName = "max"; + static constexpr T identityValue = std::numeric_limits<T>::min(); + static T combine(T a, T b) { 
return std::max(a, b); } +}; + +template <typename T> struct Min +{ + using Type = T; + static constexpr const char *opName = "min"; + static constexpr T identityValue = std::numeric_limits<T>::max(); + static T combine(T a, T b) { return std::min(a, b); } +}; + +template <typename C> struct Reduce +{ + using Type = typename C::Type; + + static constexpr const char *testName = "work_group_reduce"; + static constexpr const char *testOpName = C::opName; + static constexpr const char *deviceTypeName = + TestTypeInfo<Type>::deviceName; + static constexpr const char *kernelName = "test_wg_reduce"; + static int verify(Type *inptr, Type *outptr, size_t n_elems, + size_t max_wg_size) + { + for (size_t i = 0; i < n_elems; i += max_wg_size) + { + size_t wg_size = std::min(max_wg_size, n_elems - i); + + Type result = C::identityValue; + for (size_t j = 0; j < wg_size; j++) + { + result = C::combine(result, inptr[i + j]); + } + + for (size_t j = 0; j < wg_size; j++) + { + if (result != outptr[i + j]) + { + log_info("%s_%s: Error at %zu\n", testName, testOpName, + i + j); + return -1; + } + } + } + return 0; + } +}; + +template <typename C> struct ScanInclusive +{ + using Type = typename C::Type; + + static constexpr const char *testName = "work_group_scan_inclusive"; + static constexpr const char *testOpName = C::opName; + static constexpr const char *deviceTypeName = + TestTypeInfo<Type>::deviceName; + static constexpr const char *kernelName = "test_wg_scan_inclusive"; + static int verify(Type *inptr, Type *outptr, size_t n_elems, + size_t max_wg_size) + { + for (size_t i = 0; i < n_elems; i += max_wg_size) + { + size_t wg_size = std::min(max_wg_size, n_elems - i); + + Type result = C::identityValue; + for (size_t j = 0; j < wg_size; ++j) + { + result = C::combine(result, inptr[i + j]); + if (result != outptr[i + j]) + { + log_info("%s_%s: Error at %zu\n", testName, testOpName, + i + j); + return -1; + } + } + } + return 0; + } +}; + +template <typename C> struct 
ScanExclusive +{ + using Type = typename C::Type; + + static constexpr const char *testName = "work_group_scan_exclusive"; + static constexpr const char *testOpName = C::opName; + static constexpr const char *deviceTypeName = + TestTypeInfo<Type>::deviceName; + static constexpr const char *kernelName = "test_wg_scan_exclusive"; + static int verify(Type *inptr, Type *outptr, size_t n_elems, + size_t max_wg_size) + { + for (size_t i = 0; i < n_elems; i += max_wg_size) + { + size_t wg_size = std::min(max_wg_size, n_elems - i); + + Type result = C::identityValue; + for (size_t j = 0; j < wg_size; ++j) + { + if (result != outptr[i + j]) + { + log_info("%s_%s: Error at %zu\n", testName, testOpName, + i + j); + return -1; + } + result = C::combine(result, inptr[i + j]); + } + } + return 0; + } +}; + +template <typename TestInfo> +static int run_test(cl_device_id device, cl_context context, + cl_command_queue queue, int n_elems) +{ + using T = typename TestInfo::Type; + + cl_int err = CL_SUCCESS; + + clProgramWrapper program; + clKernelWrapper kernel; + + std::string funcName = TestInfo::testName; + funcName += "_"; + funcName += TestInfo::testOpName; + + std::string kernelName = TestInfo::kernelName; + kernelName += "_"; + kernelName += TestInfo::testOpName; + kernelName += "_"; + kernelName += TestInfo::deviceTypeName; + + std::string kernelString = + make_kernel_string(TestInfo::deviceTypeName, kernelName, funcName); + + const char *kernel_source = kernelString.c_str(); + err = create_single_kernel_helper(context, &program, &kernel, 1, + &kernel_source, kernelName.c_str()); + test_error(err, "Unable to create test kernel"); + + size_t wg_size[1]; + err = get_max_allowed_1d_work_group_size_on_device(device, kernel, wg_size); + test_error(err, "get_max_allowed_1d_work_group_size_on_device failed"); + + clMemWrapper src = clCreateBuffer(context, CL_MEM_READ_WRITE, + sizeof(T) * n_elems, NULL, &err); + test_error(err, "Unable to create source buffer"); + + clMemWrapper dst 
= clCreateBuffer(context, CL_MEM_READ_WRITE, + sizeof(T) * n_elems, NULL, &err); + test_error(err, "Unable to create destination buffer"); + + std::vector<T> input_ptr(n_elems); + + MTdataHolder d(gRandomSeed); + for (int i = 0; i < n_elems; i++) + { + input_ptr[i] = (T)genrand_int64(d); + } + + err = clEnqueueWriteBuffer(queue, src, CL_TRUE, 0, sizeof(T) * n_elems, + input_ptr.data(), 0, NULL, NULL); + test_error(err, "clWriteBuffer to initialize src buffer failed"); + + err = clSetKernelArg(kernel, 0, sizeof(src), &src); + test_error(err, "Unable to set src buffer kernel arg"); + err |= clSetKernelArg(kernel, 1, sizeof(dst), &dst); + test_error(err, "Unable to set dst buffer kernel arg"); + + size_t global_work_size[] = { (size_t)n_elems }; + err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, global_work_size, + wg_size, 0, NULL, NULL); + test_error(err, "Unable to enqueue test kernel"); + + std::vector<T> output_ptr(n_elems); + + cl_uint dead = 0xdeaddead; + memset_pattern4(output_ptr.data(), &dead, sizeof(T) * n_elems); + err = clEnqueueReadBuffer(queue, dst, CL_TRUE, 0, sizeof(T) * n_elems, + output_ptr.data(), 0, NULL, NULL); + test_error(err, "clEnqueueReadBuffer to read read dst buffer failed"); + + if (TestInfo::verify(input_ptr.data(), output_ptr.data(), n_elems, + wg_size[0])) + { + log_error("%s_%s %s failed\n", TestInfo::testName, TestInfo::testOpName, + TestInfo::deviceTypeName); + return TEST_FAIL; + } + + log_info("%s_%s %s passed\n", TestInfo::testName, TestInfo::testOpName, + TestInfo::deviceTypeName); + return TEST_PASS; +} + +int test_work_group_reduce_add(cl_device_id device, cl_context context, + cl_command_queue queue, int n_elems) +{ + int result = TEST_PASS; + + result |= run_test<Reduce<Add<cl_int>>>(device, context, queue, n_elems); + result |= run_test<Reduce<Add<cl_uint>>>(device, context, queue, n_elems); + + if (gHasLong) + { + result |= + run_test<Reduce<Add<cl_long>>>(device, context, queue, n_elems); + result |= + 
run_test<Reduce<Add<cl_ulong>>>(device, context, queue, n_elems); + } + + return result; +} + +int test_work_group_reduce_max(cl_device_id device, cl_context context, + cl_command_queue queue, int n_elems) +{ + int result = TEST_PASS; + + result |= run_test<Reduce<Max<cl_int>>>(device, context, queue, n_elems); + result |= run_test<Reduce<Max<cl_uint>>>(device, context, queue, n_elems); + + if (gHasLong) + { + result |= + run_test<Reduce<Max<cl_long>>>(device, context, queue, n_elems); + result |= + run_test<Reduce<Max<cl_ulong>>>(device, context, queue, n_elems); + } + + return result; +} + +int test_work_group_reduce_min(cl_device_id device, cl_context context, + cl_command_queue queue, int n_elems) +{ + int result = TEST_PASS; + + result |= run_test<Reduce<Min<cl_int>>>(device, context, queue, n_elems); + result |= run_test<Reduce<Min<cl_uint>>>(device, context, queue, n_elems); + + if (gHasLong) + { + result |= + run_test<Reduce<Min<cl_long>>>(device, context, queue, n_elems); + result |= + run_test<Reduce<Min<cl_ulong>>>(device, context, queue, n_elems); + } + + return result; +} + +int test_work_group_scan_inclusive_add(cl_device_id device, cl_context context, + cl_command_queue queue, int n_elems) +{ + int result = TEST_PASS; + + result |= + run_test<ScanInclusive<Add<cl_int>>>(device, context, queue, n_elems); + result |= + run_test<ScanInclusive<Add<cl_uint>>>(device, context, queue, n_elems); + + if (gHasLong) + { + result |= run_test<ScanInclusive<Add<cl_long>>>(device, context, queue, + n_elems); + result |= run_test<ScanInclusive<Add<cl_ulong>>>(device, context, queue, + n_elems); + } + + return result; +} + +int test_work_group_scan_inclusive_max(cl_device_id device, cl_context context, + cl_command_queue queue, int n_elems) +{ + int result = TEST_PASS; + + result |= + run_test<ScanInclusive<Max<cl_int>>>(device, context, queue, n_elems); + result |= + run_test<ScanInclusive<Max<cl_uint>>>(device, context, queue, n_elems); + + if (gHasLong) + { + 
result |= run_test<ScanInclusive<Max<cl_long>>>(device, context, queue, + n_elems); + result |= run_test<ScanInclusive<Max<cl_ulong>>>(device, context, queue, + n_elems); + } + + return result; +} + +int test_work_group_scan_inclusive_min(cl_device_id device, cl_context context, + cl_command_queue queue, int n_elems) +{ + int result = TEST_PASS; + + result |= + run_test<ScanInclusive<Min<cl_int>>>(device, context, queue, n_elems); + result |= + run_test<ScanInclusive<Min<cl_uint>>>(device, context, queue, n_elems); + + if (gHasLong) + { + result |= run_test<ScanInclusive<Min<cl_long>>>(device, context, queue, + n_elems); + result |= run_test<ScanInclusive<Min<cl_ulong>>>(device, context, queue, + n_elems); + } + + return result; +} + +int test_work_group_scan_exclusive_add(cl_device_id device, cl_context context, + cl_command_queue queue, int n_elems) +{ + int result = TEST_PASS; + + result |= + run_test<ScanExclusive<Add<cl_int>>>(device, context, queue, n_elems); + result |= + run_test<ScanExclusive<Add<cl_uint>>>(device, context, queue, n_elems); + + if (gHasLong) + { + result |= run_test<ScanExclusive<Add<cl_long>>>(device, context, queue, + n_elems); + result |= run_test<ScanExclusive<Add<cl_ulong>>>(device, context, queue, + n_elems); + } + + return result; +} + +int test_work_group_scan_exclusive_max(cl_device_id device, cl_context context, + cl_command_queue queue, int n_elems) +{ + int result = TEST_PASS; + + result |= + run_test<ScanExclusive<Max<cl_int>>>(device, context, queue, n_elems); + result |= + run_test<ScanExclusive<Max<cl_uint>>>(device, context, queue, n_elems); + + if (gHasLong) + { + result |= run_test<ScanExclusive<Max<cl_long>>>(device, context, queue, + n_elems); + result |= run_test<ScanExclusive<Max<cl_ulong>>>(device, context, queue, + n_elems); + } + + return result; +} + +int test_work_group_scan_exclusive_min(cl_device_id device, cl_context context, + cl_command_queue queue, int n_elems) +{ + int result = TEST_PASS; + + result |= + 
run_test<ScanExclusive<Min<cl_int>>>(device, context, queue, n_elems); + result |= + run_test<ScanExclusive<Min<cl_uint>>>(device, context, queue, n_elems); + + if (gHasLong) + { + result |= run_test<ScanExclusive<Min<cl_long>>>(device, context, queue, + n_elems); + result |= run_test<ScanExclusive<Min<cl_ulong>>>(device, context, queue, + n_elems); + } + + return result; +} diff --git a/test_conformance/workgroups/test_wg_suggested_local_work_size.cpp b/test_conformance/workgroups/test_wg_suggested_local_work_size.cpp new file mode 100644 index 00000000..aa02391c --- /dev/null +++ b/test_conformance/workgroups/test_wg_suggested_local_work_size.cpp @@ -0,0 +1,611 @@ +// +// Copyright (c) 2021 The Khronos Group Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+// +#include "harness/compat.h" + +#include <stdio.h> +#include <iostream> +#include <string.h> +#include <sys/types.h> +#include <sys/stat.h> + +#include "procs.h" +#include <CL/cl_ext.h> + +/** @brief Gets the number of elements of type s in a fixed length array of s */ +#define NELEMS(s) (sizeof(s) / sizeof((s)[0])) +#define test_error_ret_and_free(errCode, msg, retValue, ptr) \ + { \ + auto errCodeResult = errCode; \ + if (errCodeResult != CL_SUCCESS) \ + { \ + print_error(errCodeResult, msg); \ + free(ptr); \ + return retValue; \ + } \ + } + +const char* wg_scan_local_work_group_size = R"( + bool is_zero_linear_id() + { + size_t linear_id; +#if __OPENCL_VERSION__ < CL_VERSION_2_0 + linear_id = ((get_global_id(2) - get_global_offset(2)) * get_global_size(1) * get_global_size(0)) + + ((get_global_id(1) - get_global_offset(1)) * get_global_size(0)) + + (get_global_id(0) - get_global_offset(0)); +#else + linear_id = get_global_linear_id(); +#endif + return linear_id == 0; + } + + uint get_l_size(size_t dim) + { +#if __OPENCL_VERSION__ < CL_VERSION_2_0 + return get_local_size(dim); +#else + return get_enqueued_local_size(dim); +#endif + } + + __kernel void test_wg_scan_local_work_group_size(global uint *output) + { + if(!is_zero_linear_id()) return; + for (uint i = 0; i < 3; i++) + { + output[i] = get_l_size(i); + } + } + __kernel void test_wg_scan_local_work_group_size_static_local( + global uint *output) + { + __local char c[LOCAL_MEM_SIZE]; + + if(!is_zero_linear_id()) return; + for (uint i = 0; i < 3; i++) + { + output[i] = get_l_size(i); + } + } + __kernel void test_wg_scan_local_work_group_size_dynlocal( + global uint *output, + __local char * c) + { + if(!is_zero_linear_id()) return; + for (uint i = 0; i < 3; i++) + { + output[i] = get_l_size(i); + } + };)"; + +bool is_prime(size_t a) +{ + size_t c; + + for (c = 2; c < a; c++) + { + if (a % c == 0) return false; + } + return true; +} + +bool is_not_prime(size_t a) { return !is_prime(a); } + +bool 
is_not_even(size_t a) { return (is_prime(a) || (a % 2 == 1)); } + +bool is_not_odd(size_t a) { return (is_prime(a) || (a % 2 == 0)); } + +#define NELEMS(s) (sizeof(s) / sizeof((s)[0])) +/* The numbers we chose in the value_range are to be used for the second and + third dimension of the global work group size. The numbers below cover many + different cases: 1024 is a power of 2, 3 is an odd and small prime number, 12 + is a multiple of 4 but not a power of 2, 1031 is a large odd and prime number + and 1 is to test the lack of this dimension if the others are present */ +const size_t value_range[] = { 1024, 3, 12, 1031, 1 }; +/* The value_range_nD contains numbers to be used for the experiments with 2D + and 3D global work sizes. This is because we need smaller numbers so that the + resulting number of work items is meaningful and does not become too large. + The cases here are: 64 that is a power of 2, 3 is an odd and small prime + number, 12 is a multiple of 4 but not a power of 2, 113 is a large prime + number + and 1 is to test the lack of this dimension if the others are present */ +const size_t value_range_nD[] = { 64, 3, 12, 113, 1 }; +const size_t basic_increment = 16; +const size_t primes_increment = 1; +enum num_dims +{ + _1D = 1, + _2D = 2, + _3D = 3 +}; + +int do_test(cl_device_id device, cl_context context, cl_command_queue queue, + cl_kernel scan_kernel, int work_dim, size_t global_work_offset[3], + size_t test_values[3], size_t dyn_mem_size) +{ + size_t local_work_size[] = { 1, 1, 1 }; + size_t suggested_total_size; + size_t workgroupinfo_size; + cl_uint kernel_work_size[3] = { 0 }; + clMemWrapper buffer; + cl_platform_id platform; + + int err = clGetDeviceInfo(device, CL_DEVICE_PLATFORM, sizeof(platform), + &platform, NULL); + test_error_ret(err, "clGetDeviceInfo failed", -1); + clGetKernelSuggestedLocalWorkSizeKHR_fn + clGetKernelSuggestedLocalWorkSizeKHR = + (clGetKernelSuggestedLocalWorkSizeKHR_fn) + clGetExtensionFunctionAddressForPlatform( + 
platform, "clGetKernelSuggestedLocalWorkSizeKHR"); + + if (clGetKernelSuggestedLocalWorkSizeKHR == NULL) + { + log_info("Extension 'cl_khr_suggested_local_work_size' could not be " + "found.\n"); + return TEST_FAIL; + } + + /* Create the actual buffer, using local_buffer as the host pointer, and ask + * to copy that into the buffer */ + buffer = clCreateBuffer(context, CL_MEM_READ_WRITE, + sizeof(kernel_work_size), NULL, &err); + test_error_ret(err, "clCreateBuffer failed", -1); + err = clSetKernelArg(scan_kernel, 0, sizeof(buffer), &buffer); + test_error_ret(err, "clSetKernelArg failed", -1); + if (dyn_mem_size) + { + err = clSetKernelArg(scan_kernel, 1, dyn_mem_size, NULL); + test_error_ret(err, "clSetKernelArg failed", -1); + } + err = clGetKernelSuggestedLocalWorkSizeKHR(queue, scan_kernel, work_dim, + global_work_offset, test_values, + local_work_size); + test_error_ret(err, "clGetKernelSuggestedLocalWorkSizeKHR failed", -1); + suggested_total_size = + local_work_size[0] * local_work_size[1] * local_work_size[2]; + err = clGetKernelWorkGroupInfo( + scan_kernel, device, CL_KERNEL_WORK_GROUP_SIZE, + sizeof(workgroupinfo_size), &workgroupinfo_size, NULL); + test_error_ret(err, "clGetKernelWorkGroupInfo failed", -1); + if (suggested_total_size > workgroupinfo_size) + { + std::cout << "The suggested work group size consist of " + << suggested_total_size << " work items.\n" + << "Work items are limited by " << workgroupinfo_size + << std::endl; + std::cout << "Size from clGetKernelWorkGroupInfo: " + << workgroupinfo_size; + std::cout << "\nSize from clGetKernelSuggestedLocalWorkSizeKHR: " + << local_work_size[0] * local_work_size[1] + * local_work_size[2] + << std::endl; + return -1; + } + + err = + clEnqueueNDRangeKernel(queue, scan_kernel, work_dim, global_work_offset, + test_values, // global work size + NULL, 0, NULL, NULL); + test_error_ret(err, "clEnqueueNDRangeKernel failed", -1); + err = clEnqueueReadBuffer(queue, buffer, CL_NON_BLOCKING, 0, + 
sizeof(kernel_work_size), kernel_work_size, 0, + NULL, NULL); + test_error_ret(err, "clEnqueueReadBuffer failed", -1); + err = clFinish(queue); + test_error_ret(err, "clFinish failed", -1); + + if (kernel_work_size[0] != local_work_size[0] + || kernel_work_size[1] != local_work_size[1] + || kernel_work_size[2] != local_work_size[2]) + { + std::cout + << "Kernel work size differs from local work size suggested:\n" + << "Kernel work size: (" << kernel_work_size[0] << ", " + << kernel_work_size[1] << ", " << kernel_work_size[2] << ")" + << "Local work size: (" << local_work_size[0] << ", " + << local_work_size[1] << ", " << local_work_size[2] << ")\n"; + return -1; + } + return err; +} + +int do_test_work_group_suggested_local_size( + cl_device_id device, cl_context context, cl_command_queue queue, + bool (*skip_cond)(size_t), size_t start, size_t end, size_t incr, + cl_long max_local_mem_size, size_t global_work_offset[], num_dims dim) +{ + clProgramWrapper scan_program; + clKernelWrapper scan_kernel; + int err; + size_t test_values[] = { 1, 1, 1 }; + std::string kernel_names[6] = { + "test_wg_scan_local_work_group_size", + "test_wg_scan_local_work_group_size_static_local", + "test_wg_scan_local_work_group_size_static_local", + "test_wg_scan_local_work_group_size_static_local", + "test_wg_scan_local_work_group_size_static_local", + "test_wg_scan_local_work_group_size_dynlocal" + }; + std::string str_local_mem_size[6] = { + "-DLOCAL_MEM_SIZE=1", "-DLOCAL_MEM_SIZE=1024", + "-DLOCAL_MEM_SIZE=4096", "-DLOCAL_MEM_SIZE=16384", + "-DLOCAL_MEM_SIZE=32768", "-DLOCAL_MEM_SIZE=1" + }; + size_t local_mem_size[6] = { 1, 1024, 4096, 16384, 32768, 1 }; + size_t dyn_mem_size[6] = { 0, 0, 0, 0, 0, 1024 }; + cl_ulong kernel_local_mem_size; + for (int kernel_num = 0; kernel_num < 6; kernel_num++) + { + if (max_local_mem_size < local_mem_size[kernel_num]) continue; + // Create the kernel + err = create_single_kernel_helper( + context, &scan_program, &scan_kernel, 1, + 
&wg_scan_local_work_group_size, (kernel_names[kernel_num]).c_str(), + (str_local_mem_size[kernel_num]).c_str()); + test_error_ret(err, + ("create_single_kernel_helper failed for kernel " + + kernel_names[kernel_num]) + .c_str(), + -1); + + // Check if the local memory used by the kernel is going to exceed the + // max_local_mem_size + err = clGetKernelWorkGroupInfo( + scan_kernel, device, CL_KERNEL_LOCAL_MEM_SIZE, + sizeof(kernel_local_mem_size), &kernel_local_mem_size, NULL); + test_error_ret(err, "clGetKernelWorkGroupInfo failed", -1); + if (kernel_local_mem_size > max_local_mem_size) continue; + // return error if no number is found due to the skip condition + err = -1; + unsigned int j = 0; + size_t num_elems = NELEMS(value_range); + for (size_t i = start; i < end; i += incr) + { + if (skip_cond(i)) continue; + err = 0; + test_values[0] = i; + if (dim == _2D) test_values[1] = value_range_nD[j++ % num_elems]; + if (dim == _3D) + { + test_values[1] = value_range_nD[j++ % num_elems]; + test_values[2] = value_range_nD[rand() % num_elems]; + } + err |= do_test(device, context, queue, scan_kernel, dim, + global_work_offset, test_values, + dyn_mem_size[kernel_num]); + test_error_ret( + err, + ("do_test failed for kernel " + kernel_names[kernel_num]) + .c_str(), + -1); + } + } + return err; +} + +int test_work_group_suggested_local_size_1D(cl_device_id device, + cl_context context, + cl_command_queue queue, int n_elems) +{ + if (!is_extension_available(device, "cl_khr_suggested_local_work_size")) + { + log_info("Device does not support 'cl_khr_suggested_local_work_size'. 
" + "Skipping the test.\n"); + return TEST_SKIPPED_ITSELF; + } + cl_long max_local_mem_size; + cl_int err = + clGetDeviceInfo(device, CL_DEVICE_LOCAL_MEM_SIZE, + sizeof(max_local_mem_size), &max_local_mem_size, NULL); + test_error_ret(err, "clGetDeviceInfo for CL_DEVICE_LOCAL_MEM_SIZE failed.", + -1); + + size_t start, end, incr; + size_t global_work_offset[] = { 0, 0, 0 }; + size_t max_work_items = 0; + clGetDeviceInfo(device, CL_DEVICE_MAX_WORK_GROUP_SIZE, + sizeof(max_work_items), &max_work_items, NULL); + + // odds + start = 1; + end = max_work_items; + incr = basic_increment; + err = do_test_work_group_suggested_local_size( + device, context, queue, is_not_odd, start, end, incr, + max_local_mem_size, global_work_offset, _1D); + test_error_ret( + err, "test_work_group_suggested_local_size_1D for odds failed.", -1); + log_info("test_work_group_suggested_local_size_1D odds passed\n"); + + // evens + start = 2; + end = max_work_items; + incr = basic_increment; + err = do_test_work_group_suggested_local_size( + device, context, queue, is_not_even, start, end, incr, + max_local_mem_size, global_work_offset, _1D); + test_error_ret( + err, "test_work_group_suggested_local_size_1D for evens failed.", -1); + log_info("test_work_group_suggested_local_size_1D evens passed\n"); + + // primes + start = max_work_items + 1; + end = 2 * max_work_items; + incr = primes_increment; + err = do_test_work_group_suggested_local_size( + device, context, queue, is_not_prime, start, end, incr, + max_local_mem_size, global_work_offset, _1D); + test_error_ret( + err, "test_work_group_suggested_local_size_1D for primes failed.", -1); + log_info("test_work_group_suggested_local_size_1D primes passed\n"); + + global_work_offset[0] = 10; + global_work_offset[1] = 10; + global_work_offset[2] = 10; + // odds + start = 1; + end = max_work_items; + incr = basic_increment; + err = do_test_work_group_suggested_local_size( + device, context, queue, is_not_odd, start, end, incr, + max_local_mem_size, 
global_work_offset, _1D); + test_error_ret(err, + "test_work_group_suggested_local_size_1D for odds with " + "global_work_offset failed.", + -1); + log_info("test_work_group_suggested_local_size_1D odds with " + "global_work_offset passed\n"); + + // evens + start = 2; + end = max_work_items; + incr = basic_increment; + err = do_test_work_group_suggested_local_size( + device, context, queue, is_not_even, start, end, incr, + max_local_mem_size, global_work_offset, _1D); + test_error_ret(err, + "test_work_group_suggested_local_size_1D for evens with " + "global_work_offset failed.", + -1); + log_info("test_work_group_suggested_local_size_1D evens with " + "global_work_offset passed\n"); + + // primes + start = max_work_items + 1; + end = 2 * max_work_items; + incr = primes_increment; + err = do_test_work_group_suggested_local_size( + device, context, queue, is_not_prime, start, end, incr, + max_local_mem_size, global_work_offset, _1D); + test_error_ret(err, + "test_work_group_suggested_local_size_1D for primes with " + "global_work_offset failed.", + -1); + log_info("test_work_group_suggested_local_size_1D primes with " + "global_work_offset passed\n"); + + return err; +} + +int test_work_group_suggested_local_size_2D(cl_device_id device, + cl_context context, + cl_command_queue queue, int n_elems) +{ + if (!is_extension_available(device, "cl_khr_suggested_local_work_size")) + { + log_info("Device does not support 'cl_khr_suggested_local_work_size'. 
" + "Skipping the test.\n"); + return TEST_SKIPPED_ITSELF; + } + cl_long max_local_mem_size; + cl_int err = + clGetDeviceInfo(device, CL_DEVICE_LOCAL_MEM_SIZE, + sizeof(max_local_mem_size), &max_local_mem_size, NULL); + test_error_ret(err, "clGetDeviceInfo for CL_DEVICE_LOCAL_MEM_SIZE failed.", + -1); + + size_t start, end, incr; + size_t global_work_offset[] = { 0, 0, 0 }; + size_t max_work_items = 0; + clGetDeviceInfo(device, CL_DEVICE_MAX_WORK_GROUP_SIZE, + sizeof(max_work_items), &max_work_items, NULL); + + // odds + start = 1; + end = max_work_items; + incr = basic_increment; + err = do_test_work_group_suggested_local_size( + device, context, queue, is_not_odd, start, end, incr, + max_local_mem_size, global_work_offset, _2D); + test_error_ret( + err, "test_work_group_suggested_local_size_2D for odds failed.", -1); + log_info("test_work_group_suggested_local_size_2D odds passed\n"); + + // evens + start = 2; + end = max_work_items; + incr = basic_increment; + err = do_test_work_group_suggested_local_size( + device, context, queue, is_not_even, start, end, incr, + max_local_mem_size, global_work_offset, _2D); + test_error_ret( + err, "test_work_group_suggested_local_size_2D for evens failed.", -1); + log_info("test_work_group_suggested_local_size_2D evens passed\n"); + + // primes + start = max_work_items + 1; + end = max_work_items + max_work_items / 4; + incr = primes_increment; + err = do_test_work_group_suggested_local_size( + device, context, queue, is_not_prime, start, end, incr, + max_local_mem_size, global_work_offset, _2D); + test_error_ret( + err, "test_work_group_suggested_local_size_2D for primes failed.", -1); + log_info("test_work_group_suggested_local_size_2D primes passed\n"); + + global_work_offset[0] = 10; + global_work_offset[1] = 10; + global_work_offset[2] = 10; + + // odds + start = 1; + end = max_work_items; + incr = basic_increment; + err = do_test_work_group_suggested_local_size( + device, context, queue, is_not_odd, start, end, incr, + 
max_local_mem_size, global_work_offset, _2D); + test_error_ret(err, + "test_work_group_suggested_local_size_2D for odds with " + "global_work_offset failed.", + -1); + log_info("test_work_group_suggested_local_size_2D odds with " + "global_work_offset passed\n"); + + // evens + start = 2; + end = max_work_items; + incr = basic_increment; + err = do_test_work_group_suggested_local_size( + device, context, queue, is_not_even, start, end, incr, + max_local_mem_size, global_work_offset, _2D); + test_error_ret(err, + "test_work_group_suggested_local_size_2D for evens with " + "global_work_offset failed.", + -1); + log_info("test_work_group_suggested_local_size_2D evens with " + "global_work_offset passed\n"); + + // primes + start = max_work_items + 1; + end = max_work_items + max_work_items / 4; + incr = primes_increment; + err = do_test_work_group_suggested_local_size( + device, context, queue, is_not_prime, start, end, incr, + max_local_mem_size, global_work_offset, _2D); + test_error_ret(err, + "test_work_group_suggested_local_size_2D for primes with " + "global_work_offset failed.", + -1); + log_info("test_work_group_suggested_local_size_2D primes with " + "global_work_offset passed\n"); + + return err; +} + +int test_work_group_suggested_local_size_3D(cl_device_id device, + cl_context context, + cl_command_queue queue, int n_elems) +{ + if (!is_extension_available(device, "cl_khr_suggested_local_work_size")) + { + log_info("Device does not support 'cl_khr_suggested_local_work_size'. 
" + "Skipping the test.\n"); + return TEST_SKIPPED_ITSELF; + } + cl_long max_local_mem_size; + cl_int err = + clGetDeviceInfo(device, CL_DEVICE_LOCAL_MEM_SIZE, + sizeof(max_local_mem_size), &max_local_mem_size, NULL); + test_error_ret(err, "clGetDeviceInfo for CL_DEVICE_LOCAL_MEM_SIZE failed.", + -1); + + size_t start, end, incr; + size_t global_work_offset[] = { 0, 0, 0 }; + size_t max_work_items = 0; + clGetDeviceInfo(device, CL_DEVICE_MAX_WORK_GROUP_SIZE, + sizeof(max_work_items), &max_work_items, NULL); + + // odds + start = 1; + end = max_work_items / 2; + incr = basic_increment; + err = do_test_work_group_suggested_local_size( + device, context, queue, is_not_odd, start, end, incr, + max_local_mem_size, global_work_offset, _3D); + test_error_ret( + err, "test_work_group_suggested_local_size_3D for odds failed.", -1); + log_info("test_work_group_suggested_local_size_3D odds passed\n"); + + // evens + start = 2; + end = max_work_items / 2; + incr = basic_increment; + err = do_test_work_group_suggested_local_size( + device, context, queue, is_not_even, start, end, incr, + max_local_mem_size, global_work_offset, _3D); + test_error_ret( + err, "test_work_group_suggested_local_size_3D for evens failed.", -1); + log_info("test_work_group_suggested_local_size_3D evens passed\n"); + + // primes + start = max_work_items + 1; + end = max_work_items + max_work_items / 4; + incr = primes_increment; + err = do_test_work_group_suggested_local_size( + device, context, queue, is_not_prime, start, end, incr, + max_local_mem_size, global_work_offset, _3D); + test_error_ret( + err, "test_work_group_suggested_local_size_3D for primes failed.", -1); + log_info("test_work_group_suggested_local_size_3D primes passed\n"); + + global_work_offset[0] = 10; + global_work_offset[1] = 10; + global_work_offset[2] = 10; + + // odds + start = 1; + end = max_work_items / 2; + incr = basic_increment; + err = do_test_work_group_suggested_local_size( + device, context, queue, is_not_odd, start, 
end, incr, + max_local_mem_size, global_work_offset, _3D); + test_error_ret(err, + "test_work_group_suggested_local_size_3D for odds with " + "global_work_offset failed.", + -1); + log_info("test_work_group_suggested_local_size_3D odds with " + "global_work_offset passed\n"); + + // evens + start = 2; + end = max_work_items / 2; + incr = basic_increment; + err = do_test_work_group_suggested_local_size( + device, context, queue, is_not_even, start, end, incr, + max_local_mem_size, global_work_offset, _3D); + test_error_ret(err, + "test_work_group_suggested_local_size_3D for evens with " + "global_work_offset failed.", + -1); + log_info("test_work_group_suggested_local_size_3D evens with " + "global_work_offset passed\n"); + + // primes + start = max_work_items + 1; + end = max_work_items + max_work_items / 4; + incr = primes_increment; + err = do_test_work_group_suggested_local_size( + device, context, queue, is_not_prime, start, end, incr, + max_local_mem_size, global_work_offset, _3D); + test_error_ret(err, + "test_work_group_suggested_local_size_3D for primes with " + "global_work_offset failed.", + -1); + log_info("test_work_group_suggested_local_size_3D primes with " + "global_work_offset passed\n"); + + return err; +} |