'OpenCL: Is 64 bit global_id() not supported?

I'm an OpenCL newbie and I cannot return 64 bit values from the compiled kernel. What do I wrong?

I have an Intel(R) HD Graphics 520 graphics card and I wanted to write an algorithm which process 64 bit values. But when the global id exceeded 4e12 (more precisely 2^32-1) it seems it is overflown. It seems everything is build in x64. I compile with Visual Studio 2019, target: x64. I have installed the latest Intel graphics device driver (30.0.101.1660). It can build the kernel and it works, except that it is using 32 bits and not 64 bits!

Could anyone help me what do I wrong?

Here is my code. Sorry, it is a bit longish... I have tried to be as short as it can be. I know, it has some glitches (e.g., not atomic write is used) but this is just a POC code, which not really works as I expected. :(

#include <cstdio>
#include <cassert>
#include <iostream>
using namespace std;

#include <CL/opencl.h>

int runCL(const cl_ulong n) {
    cl_int err = 0;
    cl_uint num_platforms;
    cl_platform_id platforms[16]; // Can be on stack!
    err = clGetPlatformIDs(16, platforms, &num_platforms);
    assert(err == 0);
    assert(num_platforms);

    cl_uint num_devices;
    cl_device_id devices[16]; // Can be on stack!
    err = clGetDeviceIDs(platforms[0], CL_DEVICE_TYPE_GPU, 16, devices, &num_devices);
    assert(err == 0);
    assert(num_devices);

#define PR_DEV_INFO(name, type) invoke([devices]()->type { type wrk; \
    cl_uint err = clGetDeviceInfo(devices[0], name, sizeof(wrk), (void*)&wrk, NULL);\
    assert(err == 0); cout << #name << ": " << wrk << endl; return wrk;})
#define PR_DEV_INFO_CHAR(name) invoke([devices]()->string { size_t size; \
    cl_uint err = clGetDeviceInfo(devices[0], name, 0, NULL, &size);\
    assert(err == 0); char* wrk = new char[size];\
    err = clGetDeviceInfo(devices[0], name, size, (void*)wrk, NULL);\
    assert(err == 0); string s(wrk); delete[] wrk;\
    cout << #name << " [" << size << "]: " << s << endl; return s;})
#define PR_DEV_INFO_ARR(name, type, len) invoke([devices](size_t arr_len)->void { \
    type *wrk = new type[arr_len]; \
    cl_uint err = clGetDeviceInfo(devices[0], name, sizeof(type)*arr_len, (void*)wrk, NULL);\
    assert(err == 0); cout << #name << ":";\
    for(int i=0; i<arr_len;++i) cout << ' ' << wrk[i]; cout << endl; delete[] wrk;}, len)

    PR_DEV_INFO_CHAR(CL_DEVICE_NAME);
    PR_DEV_INFO_CHAR(CL_DEVICE_VERSION);
    PR_DEV_INFO_CHAR(CL_DRIVER_VERSION);
    PR_DEV_INFO_CHAR(CL_DEVICE_EXTENSIONS);
    PR_DEV_INFO(CL_DEVICE_ADDRESS_BITS, cl_uint);
    PR_DEV_INFO(CL_DEVICE_MAX_COMPUTE_UNITS, cl_uint);
    const size_t max_item_dim = 
        PR_DEV_INFO(CL_DEVICE_MAX_WORK_GROUP_SIZE, size_t);
    cl_uint dims = PR_DEV_INFO(CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS, cl_uint);
    PR_DEV_INFO_ARR(CL_DEVICE_MAX_WORK_ITEM_SIZES, size_t, dims);

    cl_context context = clCreateContext(NULL, num_devices, devices, NULL, NULL, &err);
    assert(err == 0);

    string kernel_txt(
        "#pragma OPENCL EXTENSION cl_khr_int64_base_atomics : enable\n"
        "#pragma OPENCL EXTENSION cl_khr_int64_extended_atomics : enable\n"
        "#pragma OPENCL EXTENSION cles_khr_int64 : enable\n"
        "__kernel void render(__global ulong * out) {\n"
        "  size_t gid = get_global_id(0);\n"
        "  size_t lid = get_local_id(0);\n"
        "  ulong val = out[lid];\n"
        "  out[lid] = val < gid ? gid : val;\n" // Not atomic!
        "  if (lid == 255) out[lid] = sizeof(size_t) * 1000 + sizeof(ulong);\n"
        "}\n");
    const char* kernel_mem = kernel_txt.c_str();
    // kernel_mem cannot be on stack
    cl_program program = clCreateProgramWithSource(context, 1, &kernel_mem, NULL, &err);
    assert(err == 0);

    //https://www.khronos.org/registry/OpenCL/specs/2.2/html/OpenCL_API.html#compiler-options
    const char* options = "-w -Werror -cl-std=CL3.0";
    err = clBuildProgram(program, num_devices, devices, options, NULL, NULL);
    if (err) {
        cerr << "Build error: " << err << endl;
        size_t size = 0;
        // Just get log size first, then read it again to the proper log
        cl_int err2 = clGetProgramBuildInfo(program, devices[0], CL_PROGRAM_BUILD_LOG, 0, NULL, &size);
        char* plog = new char[size];
        err2 = clGetProgramBuildInfo(program, devices[0], CL_PROGRAM_BUILD_LOG, size, plog, &size);
        cerr << "Build log (size: " << size << "): '" << plog << "' [err:" << err2 << "d]" << endl;
        delete[] plog;
        exit(1);
    }
    cl_kernel kernel = clCreateKernel(program, "render", &err);
    assert(err == 0);

    cl_ulong* host_image = new cl_ulong[max_item_dim](); // cannot be on stack!
    size_t buffer_size = sizeof(cl_ulong) * max_item_dim;

    cl_mem image = clCreateBuffer(context, CL_MEM_READ_WRITE, buffer_size, NULL, &err);
    assert(err == 0);

    err = clSetKernelArg(kernel, 0, sizeof(cl_mem), &image);
    assert(err == 0);

    cl_command_queue cmd_queue = clCreateCommandQueueWithProperties(context, devices[0], NULL, &err);
    assert(err == 0);

    size_t dev_wrk_size[1] = { n };
    size_t dev_wrk_offs[1] = { 0 };
    size_t loc_wrk_size[1] = { (size_t)max_item_dim };
    // https://www.khronos.org/registry/OpenCL/sdk/1.0/docs/man/xhtml/clEnqueueNDRangeKernel.html
    err = clEnqueueNDRangeKernel(cmd_queue, kernel, 1, dev_wrk_offs, dev_wrk_size, loc_wrk_size, 0, NULL, NULL);
    assert(err == 0);

    // Non-blocking read, so we can continue queuing up more kernels
    err = clEnqueueReadBuffer(cmd_queue, image, CL_FALSE, 0, buffer_size, host_image, 0, NULL, NULL);
    assert(err == 0);

    err = clFinish(cmd_queue);
    assert(err == 0);

    for (int i = 0; i < 256; ++i) cout << '[' << i << ':' << host_image[i] << "]";
    cout << '{' << n << '}' << endl;
    for (int i = 0; i < 256; ++i) printf("[%d:%zd]", i, host_image[i]);
    printf("{%zd}\nsize_t:%zd, cl_ulong:%zd\n", n, sizeof(size_t), sizeof(cl_ulong));

    clReleaseMemObject(image);
    clReleaseKernel(kernel);
    clReleaseCommandQueue(cmd_queue);
    clReleaseContext(context);
    delete[] host_image;

    return CL_SUCCESS;
}

int main() {
    runCL(10'000'000'000ULL);
    return 0;
}

In the kernel the last returned item (out[255]) contains the combination of size of ulong and size_t as 8008, which seems to be ok as both are 8 bytes long.

And the output (I cut the repeated lines):

CL_DEVICE_NAME [25]: Intel(R) HD Graphics 520
CL_DEVICE_VERSION [16]: OpenCL 3.0 NEO
CL_DRIVER_VERSION [14]: 30.0.101.1660
CL_DEVICE_EXTENSIONS [1654]: cl_khr_byte_addressable_store cl_khr_fp16 cl_khr_global_int32_base_atomics cl_khr_global_int32_extended_atomics cl_khr_icd cl_khr_local_int32_base_atomics cl_khr_local_int32_extended_atomics cl_intel_command_queue_families cl_intel_subgroups cl_intel_required_subgroup_size cl_intel_subgroups_short cl_khr_spir cl_intel_accelerator cl_intel_driver_diagnostics cl_khr_priority_hints cl_khr_throttle_hints cl_khr_create_command_queue cl_intel_subgroups_char cl_intel_subgroups_long cl_khr_il_program cl_intel_mem_force_host_memory cl_khr_subgroup_extended_types cl_khr_subgroup_non_uniform_vote cl_khr_subgroup_ballot cl_khr_subgroup_non_uniform_arithmetic cl_khr_subgroup_shuffle cl_khr_subgroup_shuffle_relative cl_khr_subgroup_clustered_reduce cl_intel_device_attribute_query cl_khr_suggested_local_work_size cl_khr_fp64 cl_khr_subgroups cl_intel_spirv_device_side_avc_motion_estimation cl_intel_spirv_media_block_io cl_intel_spirv_subgroups cl_khr_spirv_no_integer_wrap_decoration cl_intel_unified_shared_memory_preview cl_khr_mipmap_image cl_khr_mipmap_image_writes cl_intel_planar_yuv cl_intel_packed_yuv cl_intel_motion_estimation cl_intel_device_side_avc_motion_estimation cl_intel_advanced_motion_estimation cl_khr_int64_base_atomics cl_khr_int64_extended_atomics cl_khr_image2d_from_buffer cl_khr_depth_images cl_khr_3d_image_writes cl_intel_media_block_io cl_khr_gl_sharing cl_khr_gl_depth_images cl_khr_gl_event cl_khr_gl_msaa_sharing cl_intel_dx9_media_sharing cl_khr_dx9_media_sharing cl_khr_d3d10_sharing cl_khr_d3d11_sharing cl_intel_d3d11_nv12_media_sharing cl_intel_sharing_format_query cl_khr_pci_bus_info cl_intel_simultaneous_sharing
CL_DEVICE_ADDRESS_BITS: 64
CL_DEVICE_MAX_COMPUTE_UNITS: 24
CL_DEVICE_MAX_WORK_GROUP_SIZE: 256
CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS: 3
CL_DEVICE_MAX_WORK_ITEM_SIZES: 256 256 256
[0:4294965760][1:4294965761]<...>[253:4294966781][254:4294966782][255:8008]{10000000000}
[0:4294965760][1:4294965761]<...>[253:4294966781][254:4294966782][255:8008]{10000000000}
size_t:8, cl_ulong:8

I printed the result with cout and printf (%zd) to be sure that not cout cause the problem. :)

Should I turn on something in clBuildProgram or in the kernel code (#pragma) to be able to use 64 bit on the kernel side?

UPDATE

I did a slight modification in the kernel code to count the number of bits of global_id(0) and it seems it is always 32, not above!

"  int i = 0; for(; i<64 && gid;++i, gid>>=1);"
"  out[lid] = val < i ? i : val;\n" // Not atomic!

So, it seems that global_id(0) returns a 32 bit value!

UPDATE2

I modified the kernel code to size_t gid = get_local_id(0)+get_local_size(0)*get_group_id(0); instead of size_t gid = get_global_id(0);.

The result become:

[0:9999999744][1:9999999745]<...>[253:9999998973][254:9999998974][255:8008]{10000000000}
size_t:8, cl_ulong:8

Which looks much better!

I also did a test to avoid race condition using atomic compare and exchange to be more pedantic:

"  size_t gid = get_global_id(0);\n"
"  size_t lid = get_local_id(0);\n"
"  //out[lid] = gid;\n" // Not atomic!
"  ulong val_new, val_org = out[lid];\n"
"  do {\n"
"    val_new = val_org > gid ? val_org : gid;\n"
"  } while (!atomic_compare_exchange_strong(out + lid, &val_org, val_new));\n"

Result is the same (bad):

[0:4294967040][1:4294967041]<...>[253:4294967293][254:4294967294][255:8008]{10000000000}

c++gpu 64-bit opencl intel

Solution 1:^[1]

I have reported the issue to Intel. They answered >here<. The answer in short:

some of our hardware counters that feed into the global ID calculation are limited to 32 bits, specifically the work-group ID. ...

Workaround, if global size is divisible with local size.

// If you need the global offset:
size_t global_id_0 = get_group_id(0) * get_local_size(0) + 
get_global_offset(0) + get_local_id(0);

// If you do not need the global offset:
size_t global_id_0 = get_group_id(0) * get_local_size(0) + 
get_local_id(0);

If not divisible, instead of get_local_size use get_enqueued_local_size.

Solution 2:^[2]

In short: 64-bit addressing is supported, as indicated by CL_DEVICE_ADDRESS_BITS: 64. Generally all OpenCL devices support 64-bit integer (unsigned long long int in C++, ulong in OpenCL C). The Intel HD 520 even supports FP64 double precision.

The issue is that you have a race condition in your kernel because you are not using atomics. Lots of threads try to write to out[lid] at the same time, and it is completely random which thread wins.

Here is the output for an Nvidia GPU and an Intel GPU. For the Intel GPU, the behaviour is random for each execution, but I occasionally do get values larger than 4294966784.

CL_DEVICE_NAME [24]: NVIDIA GeForce GTX 960M
CL_DEVICE_VERSION [16]: OpenCL 3.0 CUDA
CL_DRIVER_VERSION [7]: 511.79
CL_DEVICE_EXTENSIONS [606]: cl_khr_global_int32_base_atomics cl_khr_global_int32_extended_atomics cl_khr_local_int32_base_atomics cl_khr_local_int32_extended_atomics cl_khr_fp64 cl_khr_3d_image_writes cl_khr_byte_addressable_store cl_khr_icd cl_khr_gl_sharing cl_nv_compiler_options cl_nv_device_attribute_query cl_nv_pragma_unroll cl_nv_d3d10_sharing cl_khr_d3d10_sharing cl_nv_d3d11_sharing cl_nv_copy_opts cl_nv_create_buffer cl_khr_int64_base_atomics cl_khr_int64_extended_atomics cl_khr_device_uuid cl_khr_pci_bus_info cl_khr_external_semaphore cl_khr_external_memory cl_khr_external_semaphore_win32 cl_khr_external_memory_win32
CL_DEVICE_ADDRESS_BITS: 64
CL_DEVICE_MAX_COMPUTE_UNITS: 5
CL_DEVICE_MAX_WORK_GROUP_SIZE: 1024
CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS: 3
CL_DEVICE_MAX_WORK_ITEM_SIZES: 1024 1024 64
[0:18446744073709546496][1:18446744073709546497]...[254:18446744073709549822][255:8008]{10000000000}
[0:-5120][1:-5119]...[254:-1794][255:8008]{10000000000}
size_t:8, cl_ulong:8

CL_DEVICE_NAME [26]: Intel(R) HD Graphics 4600
CL_DEVICE_VERSION [12]: OpenCL 1.2
CL_DRIVER_VERSION [14]: 20.19.15.4624
CL_DEVICE_EXTENSIONS [616]: cl_intel_accelerator cl_intel_advanced_motion_estimation cl_intel_ctz cl_intel_d3d11_nv12_media_sharing cl_intel_dx9_media_sharing cl_intel_motion_estimation cl_intel_simultaneous_sharing cl_intel_subgroups cl_khr_3d_image_writes cl_khr_byte_addressable_store cl_khr_d3d10_sharing cl_khr_d3d11_sharing cl_khr_depth_images cl_khr_dx9_media_sharing cl_khr_gl_depth_images cl_khr_gl_event cl_khr_gl_msaa_sharing cl_khr_global_int32_base_atomics cl_khr_global_int32_extended_atomics cl_khr_gl_sharing cl_khr_icd cl_khr_image2d_from_buffer cl_khr_local_int32_base_atomics cl_khr_local_int32_extended_atomics cl_khr_spir
CL_DEVICE_ADDRESS_BITS: 64
CL_DEVICE_MAX_COMPUTE_UNITS: 20
CL_DEVICE_MAX_WORK_GROUP_SIZE: 512
CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS: 3
CL_DEVICE_MAX_WORK_ITEM_SIZES: 512 512 512
[0:4294966784][1:2154343490416]...[254:4294966526][255:8008]{10000000000}
[0:4294966784][1:2154343490416]...[254:4294966526][255:8008]{10000000000}
size_t:8, cl_ulong:8

To ease OpenCL development, consider this OpenCL-Wrapper. With this, your code (not fixing the race condition error) is significantly shorter and more readable:

int main() {
    const ulong N = 10000000000ull;

    Device device(select_device_with_most_flops()); // compile OpenCL C code for the fastest available device
    Memory<ulong> image(device, 64u); // allocate memory on both host and device
    Kernel kernel(device, N, "render", image); // kernel that runs on the device

    kernel.run(); // run add_kernel on the device
    image.read_from_device(); // copy data from device memory to host memory

    for(int i=0; i<256; i++) print("["+to_string(i)+":"+to_string(image[i])+"]");
    println("{"+to_string(N)+"}");
    println("size_t:"+to_string(sizeof(size_t))+", cl_ulong:"+to_string(sizeof(cl_ulong)));

    wait();
    return 0;
}

#include "kernel.hpp" // note: unbalanced round brackets () are not allowed and string literals can't be arbitrarily long, so periodically interrupt with )+R(
string opencl_c_container() { return R( // ########################## begin of OpenCL C code ####################################################################

__kernel void render(__global ulong * out) {
  size_t gid = get_global_id(0);
  size_t lid = get_local_id(0);
  ulong val = out[lid];
  out[lid] = val < gid ? gid : val; // RACE CONDITION here
  if (lid == 255) out[lid] = sizeof(size_t) * 1000 + sizeof(ulong); // another race condition here
}

);} // ############################################################### end of OpenCL C code #####################################################################

Sources

This article follows the attribution requirements of Stack Overflow and is licensed under CC BY-SA 3.0.

Source: Stack Overflow

Solution	Source
Solution 1
Solution 2	ProjectPhysX

'OpenCL: Is 64 bit global_id() not supported?

Solution 1:[1]

Solution 2:[2]

Sources

Related Questions

Solution 1:^[1]

Solution 2:^[2]