GPU programming with OpenCL and EPGPU

CS 441/641 Lecture, Dr. Lawlor

The current state of the art in GPU programming languages is a little annoying, but overall OpenCL is quite similar to CUDA:
Concept
OpenCL
CUDA
GPU function, executed in parallel
__kernel
__global__
Calling a kernel
clEnqueueNDRangeKernel
mykernel<<<blocks,threads>>>(...);
Size of a kernel
global work size, divided into
local work size chunks.
number of blocks, each running
a block of threads.
GPU SM-local shared memory
__local
__shared__
GPU memory pointer
CPU: cl_mem
GPU: __global float *
CPU: float *
GPU: float *
Work allocated to one SM
local workgroup size
thread block size
Moving data between device and host
clEnqueueReadBuffer
clEnqueueWriteBuffer
cudaMemcpy

For example, here's what I think is a minimum viable OpenCL program.  Note how much boilerplate is needed just for setup and error checking.
Here's the source code:
/**
"simple" OpenCL example program

Dr. Orion Sky Lawlor, lawlor@alaska.edu, 2011-05-29 (Public Domain)
*/
#include <iostream>
#include <stdio.h>
#include <assert.h>
#include "CL/cl.h"
#define CL_CHECK_ERROR(err) do{if (err) {printf("FATAL ERROR %d at " __FILE__ ":%d\n",err,__LINE__); exit(1); } } while(0)

// Global OpenCL handles, shared by the whole demo program.
cl_device_id theDev;        // the GPU device we run on
cl_context theCtx;          // context owning our buffers and programs
cl_command_queue theQueue;  // command queue feeding work to theDev
cl_kernel theKrnl;          // "writeArr" kernel entry point (set up in main)
cl_program theProg;         // program built at runtime from theSource

// OpenCL C source code, compiled at runtime by clBuildProgram below.
// The writeArr kernel stores the value v into arr[i], one element per
// global work-item.
static const char *theSource="/* Lots more code here! */\n"
"__kernel void writeArr(__global float *arr,float v) {\n"
" int i=get_global_id(0);\n"
" arr[i]=v;\n"
"}\n";


int main()
{
cl_int err;

// Set up OpenCL
// Get the platform
enum {MAX_PLAT=8, MAX_DEVS=8};
cl_platform_id platforms[MAX_PLAT];
cl_uint num_platforms=MAX_PLAT;
err= clGetPlatformIDs(MAX_PLAT,platforms,&num_platforms);
CL_CHECK_ERROR(err);
cl_platform_id cpPlatform=platforms[0];

//Get the devices
cl_device_id cdDevices[MAX_DEVS];
err=clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_GPU, MAX_DEVS, cdDevices, NULL);
theDev=cdDevices[0];
CL_CHECK_ERROR(err);

// now get the context
theCtx = clCreateContext(NULL, 1, &theDev, NULL, NULL, &err);
CL_CHECK_ERROR(err);

// get a queue
theQueue = clCreateCommandQueue(theCtx, theDev, CL_QUEUE_PROFILING_ENABLE,
&err);
CL_CHECK_ERROR(err);

// Create the program...
theProg = clCreateProgramWithSource(theCtx, 1, &theSource, NULL, &err);
CL_CHECK_ERROR(err);

// ...and build it
const char * args = " -cl-mad-enable -cl-fast-relaxed-math ";

err = clBuildProgram(theProg, 0, NULL, args, NULL, NULL);
if (err != CL_SUCCESS) {
char* log = NULL;
size_t bytesRequired = 0;
err = clGetProgramBuildInfo(theProg,
theDev,
CL_PROGRAM_BUILD_LOG,
0,
NULL,
&bytesRequired );
log = (char*)malloc( bytesRequired + 1 );
err = clGetProgramBuildInfo(theProg,
theDev,
CL_PROGRAM_BUILD_LOG,
bytesRequired,
log,
NULL );
std::cout << "Build error: "<< log << std::endl;
free( log );
exit(-1);
}

// Set up input memory
int n=64; int bytes=n*sizeof(float);
cl_mem devP = clCreateBuffer(theCtx, CL_MEM_READ_WRITE, bytes,
NULL, &err);
CL_CHECK_ERROR(err);

float f=1.2345;
err = clEnqueueWriteBuffer(theQueue, devP, CL_TRUE,
0, sizeof(float), &f, 0, NULL, NULL);
CL_CHECK_ERROR(err);


// Create kernel
theKrnl = clCreateKernel(theProg, "writeArr", &err);
CL_CHECK_ERROR(err);

// Call the kernel
err=clSetKernelArg(theKrnl, 0, sizeof(cl_mem), &devP);
CL_CHECK_ERROR(err);

float addThis=1000;
err=clSetKernelArg(theKrnl, 1, sizeof(float), &addThis);
CL_CHECK_ERROR(err);

size_t localsz = 32;
size_t globalsz = n;
err = clEnqueueNDRangeKernel(theQueue, theKrnl, 1, NULL,
&globalsz, &localsz, 0,
NULL, NULL);
CL_CHECK_ERROR(err);

// Read back the results
for (int i=0;i<n;i+=4) {
err = clEnqueueReadBuffer(theQueue, devP, CL_TRUE,
i*sizeof(float), sizeof(float), &f, 0, NULL, NULL);
CL_CHECK_ERROR(err);

std::cout<<"arr["<<i<<"]="<<f<<"\n";
}

// Cleanup
clReleaseMemObject(devP);
clReleaseKernel(theKrnl);
clReleaseProgram(theProg);
clReleaseCommandQueue(theQueue);
clReleaseContext(theCtx);

return 0;
}

(Try this in NetRun now!)

OpenCL Wrappers

OpenCL is pretty painful to use to get anything done.  But being a library, at least you can wrap it in something simpler. 

Last year I designed a small OpenCL wrapper library called EPGPU to simplify OpenCL.  Here's the above program in EPGPU.
#include "epgpu.h"
#include "epgpu.cpp"
#include <iostream>

/* Define an EPGPU "fill" kernel named write_data: every element of the
   destination array is set to the same value.  Per the notes below, the
   GPU_FILLKERNEL macro stringifies the body and builds a C++ function
   object that registers the kernel and passes its arguments via OpenCL. */
GPU_FILLKERNEL(float, /* element (return) type */
write_data,(float value), /* function name and parameters */
{
result=value; /* GPU code: store the argument into this element */
}
)

int main() {
int n=64;
gpu_vector<float> arr(n);
arr=write_data(1000);

std::vector<float> cpu_arr(n);
arr.read(&cpu_arr[0]);
for (int i=0;i<n;i+=4) std::cout<<"arr["<<i<<"]="<<cpu_arr[i]<<"\n";

return 0;
}

(Try this in NetRun now!)

Note that GPU_FILLKERNEL is a macro that does most of the hard work: it stringifies the given code, and builds a C++ function object that will issue the appropriate OpenCL calls to register and pass arguments to the new kernel.

Here's a slightly more complex Mandelbrot set rendering example:
#include "epgpu.h"
#include "epgpu.cpp"
#include <iostream>
#include <fstream>
#include <vector>

/* EPGPU 2D fill kernel: render one Mandelbrot-set pixel per work-item.
   NOTE(review): i and j are presumably the 2D element indices supplied by
   the GPU_FILLKERNEL_2D macro, and "result" the output element at (i,j) --
   confirm against epgpu.h.  The comments here vanish when the macro
   stringifies the body (comments are removed before macro expansion). */
GPU_FILLKERNEL_2D(char, /* element type: one grayscale byte per pixel */
mandelbrot,(), /* kernel name; no extra parameters */
/* Map the pixel to the complex plane; scale assumes a 1000-pixel image. */
float scale=2.0/1000;
float cr=i*scale-1.0; float ci=j*scale;
/* z starts at c. */
float zr=cr; float zi=ci;
int count;
enum {max_count=255};
/* Iterate z = z^2 + c until |z|^2 > 4 (escape) or we hit max_count. */
for (count=0;count<max_count;count++) {
if ((zr*zr+zi*zi)>4.0) break;
float tzr=zr*zr-zi*zi+cr;
zi=2.0f*zr*zi+ci;
zr=tzr;
}
/* The iteration count becomes the pixel brightness. */
result=count;
)

/* Render a 1000x1000 Mandelbrot image on the GPU using the mandelbrot
   kernel above, then write it out as a binary grayscale PPM file. */
int main() {
    std::cout << "Starting up\n"; std::cout.flush();

    // Image size, in pixels.
    int w = 1000, h = 1000;

    // GPU-side 2D array of grayscale bytes, one per pixel.
    gpu_array2d<char> arr(w, h);
    std::cout << "Array created\n"; std::cout.flush();

    // Run the mandelbrot kernel once per pixel.
    arr = mandelbrot();
    std::cout << "Rendered\n"; std::cout.flush();

    // Copy the image back to the host.  Use a heap-allocated vector:
    // the original "char arrCPU[w*h]" was a non-standard variable-length
    // array, and ~1MB is too large to put on the stack anyway.
    std::vector<char> arrCPU(w * h);
    arr.read(&arrCPU[0]);

    // Write a binary grayscale PPM ("P5") image file.
    std::ofstream of("out.ppm");
    of << "P5\n" << w << " " << h << " 255\n";
    of.write(&arrCPU[0], w * h);
    return 0;
}

(Try this in NetRun now!)