Concept |
OpenCL |
CUDA |
GPU function, executed in parallel |
__kernel |
__global__ |
Calling a kernel |
clEnqueueNDRangeKernel |
mykernel<<<blocks,threads>>>(...); |
Size of a kernel |
local work size chunks of global work. |
blocks of threads by number of blocks. |
GPU SM-local shared memory |
__local |
__shared__ |
GPU memory pointer |
CPU: cl_mem GPU: __global float * |
CPU: float * GPU: float * |
Work allocated to one SM |
local workgroup size |
thread block size |
Moving data between device and host |
clEnqueueReadBuffer |
cudaMemcpy |
/**
"simple" OpenCL example program
Dr. Orion Sky Lawlor, lawlor@alaska.edu, 2011-05-29 (Public Domain)
*/
#include <iostream>
#include <stdio.h>
#include <assert.h>
#include "CL/cl.h"
#define CL_CHECK_ERROR(err) do{if (err) {printf("FATAL ERROR %d at " __FILE__ ":%d\n",err,__LINE__); exit(1); } } while(0)
cl_device_id theDev;
cl_context theCtx;
cl_command_queue theQueue;
cl_kernel theKrnl;
cl_program theProg;
static const char *theSource="/* Lots more code here! */\n"
"__kernel void writeArr(__global float *arr,float v) {\n"
" int i=get_global_id(0);\n"
" arr[i]=v;\n"
"}\n";
int main()
{
cl_int err;
// Set up OpenCL
// Get the platform
enum {MAX_PLAT=8, MAX_DEVS=8};
cl_platform_id platforms[MAX_PLAT];
cl_uint num_platforms=MAX_PLAT;
err= clGetPlatformIDs(MAX_PLAT,platforms,&num_platforms);
CL_CHECK_ERROR(err);
cl_platform_id cpPlatform=platforms[0];
//Get the devices
cl_device_id cdDevices[MAX_DEVS];
err=clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_GPU, MAX_DEVS, cdDevices, NULL);
theDev=cdDevices[0];
CL_CHECK_ERROR(err);
// now get the context
theCtx = clCreateContext(NULL, 1, &theDev, NULL, NULL, &err);
CL_CHECK_ERROR(err);
// get a queue
theQueue = clCreateCommandQueue(theCtx, theDev, CL_QUEUE_PROFILING_ENABLE,
&err);
CL_CHECK_ERROR(err);
// Create the program...
theProg = clCreateProgramWithSource(theCtx, 1, &theSource, NULL, &err);
CL_CHECK_ERROR(err);
// ...and build it
const char * args = " -cl-mad-enable -cl-fast-relaxed-math ";
err = clBuildProgram(theProg, 0, NULL, args, NULL, NULL);
if (err != CL_SUCCESS) {
char* log = NULL;
size_t bytesRequired = 0;
err = clGetProgramBuildInfo(theProg,
theDev,
CL_PROGRAM_BUILD_LOG,
0,
NULL,
&bytesRequired );
log = (char*)malloc( bytesRequired + 1 );
err = clGetProgramBuildInfo(theProg,
theDev,
CL_PROGRAM_BUILD_LOG,
bytesRequired,
log,
NULL );
std::cout << "Build error: "<< log << std::endl;
free( log );
exit(-1);
}
// Set up input memory
int n=64; int bytes=n*sizeof(float);
cl_mem devP = clCreateBuffer(theCtx, CL_MEM_READ_WRITE, bytes,
NULL, &err);
CL_CHECK_ERROR(err);
float f=1.2345;
err = clEnqueueWriteBuffer(theQueue, devP, CL_TRUE,
0, sizeof(float), &f, 0, NULL, NULL);
CL_CHECK_ERROR(err);
// Create kernel
theKrnl = clCreateKernel(theProg, "writeArr", &err);
CL_CHECK_ERROR(err);
// Call the kernel
err=clSetKernelArg(theKrnl, 0, sizeof(cl_mem), &devP);
CL_CHECK_ERROR(err);
float addThis=1000;
err=clSetKernelArg(theKrnl, 1, sizeof(float), &addThis);
CL_CHECK_ERROR(err);
size_t localsz = 32;
size_t globalsz = n;
err = clEnqueueNDRangeKernel(theQueue, theKrnl, 1, NULL,
&globalsz, &localsz, 0,
NULL, NULL);
CL_CHECK_ERROR(err);
// Read back the results
for (int i=0;i<n;i+=4) {
err = clEnqueueReadBuffer(theQueue, devP, CL_TRUE,
i*sizeof(float), sizeof(float), &f, 0, NULL, NULL);
CL_CHECK_ERROR(err);
std::cout<<"arr["<<i<<"]="<<f<<"\n";
}
// Cleanup
clReleaseMemObject(devP);
clReleaseKernel(theKrnl);
clReleaseProgram(theProg);
clReleaseCommandQueue(theQueue);
clReleaseContext(theCtx);
return 0;
}
#include "epgpu.h"Note that GPU_FILLKERNEL is a macro that does most of the hard work: it stringifies the given code, and builds a C++ function object that will issue the appropriate OpenCL calls to register and pass arguments to the new kernel.
#include "epgpu.cpp"
#include <iostream>
GPU_FILLKERNEL(float, /* return type */
write_data,(float value), /* function name and parameters */
{
result=value; /* GPU code */
}
)
int main() {
int n=64;
gpu_vector<float> arr(n);
arr=write_data(1000);
std::vector<float> cpu_arr(n);
arr.read(&cpu_arr[0]);
for (int i=0;i<n;i+=4) std::cout<<"arr["<<i<<"]="<<cpu_arr[i]<<"\n";
return 0;
}
#include "epgpu.h"
#include "epgpu.cpp"
#include <iostream>
#include <fstream>
GPU_FILLKERNEL_2D(char,
mandelbrot,(),
float scale=2.0/1000;
float cr=i*scale-1.0; float ci=j*scale;
float zr=cr; float zi=ci;
int count;
enum {max_count=255};
for (count=0;count<max_count;count++) {
if ((zr*zr+zi*zi)>4.0) break;
float tzr=zr*zr-zi*zi+cr;
zi=2.0f*zr*zi+ci;
zr=tzr;
}
result=count;
)
int main() {
std::cout<<"Starting up\n"; std::cout.flush();
int w=1000,h=1000;
gpu_array2d<char> arr(w,h);
std::cout<<"Array created\n"; std::cout.flush();
arr=mandelbrot();
std::cout<<"Rendered\n"; std::cout.flush();
char arrCPU[w*h];
arr.read(arrCPU);
std::ofstream of("out.ppm");
of<<"P5\n"<<w<<" "<<h<<" 255\n"; of.write(arrCPU,w*h);
return 0;
}