CUDA到OpenCL内核转换
以下展示向量加法内核的转换过程，包含关键差异点的代码注释。

### CUDA原始版本

```cpp
__global__ void vecAdd(float* A, float* B, float* C, int N) {
    int i = blockDim.x * blockIdx.x + threadIdx.x;
    if (i < N) C[i] = A[i] + B[i];
}
```

### OpenCL重写版本

```c
__kernel void vecAdd(__global float* A, __global float* B, __global float* C, int N) {
    int i = get_global_id(0);
    if (i < N) C[i] = A[i] + B[i];
}
```

## 核心概念转换对照表

| CUDA概念 | OpenCL对应 | 转换说明 |
| --- | --- | --- |
| `__global__` | `__kernel` | 内核函数声明前缀 |
| `threadIdx.x` | `get_local_id(0)` | 工作组内线程索引 |
| `blockIdx.x` | `get_group_id(0)` | 工作组索引 |
| `blockDim.x` | `get_local_size(0)` | 工作组维度 |
| `cudaMalloc` | `clCreateBuffer` | 设备内存分配 |
| `__shared__` | `__local` | 共享/局部内存声明 |

## 内存管理代码对比

### CUDA内存操作

```cpp
float *d_A;
cudaMalloc(&d_A, size);
cudaMemcpy(d_A, h_A, size, cudaMemcpyHostToDevice);
```

### OpenCL内存操作

```c
cl_mem d_A = clCreateBuffer(context, CL_MEM_READ_ONLY, size, NULL, &err);
clEnqueueWriteBuffer(queue, d_A, CL_TRUE, 0, size, h_A, 0, NULL, NULL);
```

## 执行配置差异

### CUDA启动配置

```cpp
dim3 blocks(256);
dim3 grids((N + blocks.x - 1) / blocks.x);
vecAdd<<<grids, blocks>>>(d_A, d_B, d_C, N);
```

### OpenCL执行配置

```c
size_t globalSize = N;
size_t localSize = 256;
clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &globalSize, &localSize, 0, NULL, NULL);
```

## 关键注意事项

- 全局工作项总数必须能被工作组大小整除（OpenCL 1.x 的要求）。若 N 不是 localSize 的整数倍，应先将 globalSize 向上取整为 localSize 的倍数，多出的工作项由内核中的 `if (i < N)` 判断负责越界保护。
- OpenCL需要显式指定内存标志（如 `CL_MEM_READ_WRITE`）。
- 同步操作需使用 `barrier(CLK_LOCAL_MEM_FENCE)` 替代 `__syncthreads()`。

## 性能优化建议

### 查询设备最大工作组尺寸

```c
clGetDeviceInfo(device, CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof(size_t), &maxSize, NULL);
```

### 使用向量化加载提升带宽

```c
__global float4* A = (__global float4*)a;
float4 vec = A[i/4]; // 一次加载4个float
```

## 调试技巧

### 验证内核参数正确性

```c
clGetKernelWorkGroupInfo(kernel, device, CL_KERNEL_WORK_GROUP_SIZE, ...);
```

### 使用事件分析执行耗时

```c
cl_event event;
clEnqueueNDRangeKernel(..., &event);
clWaitForEvents(1, &event);
clGetEventProfilingInfo(event, ...);
```