GPU programming with OpenCL and EPGPU

CS 441/641 Lecture, Dr. Lawlor

The current state of the art in GPU programming languages is a little annoying, but overall OpenCL is quite similar to CUDA:
Concept
OpenCL
CUDA
GPU function, executed in parallel
__kernel
__global__
Calling a kernel
clEnqueueNDRangeKernel
mykernel<<<blocks,threads>>>(...);
Size of a kernel
global work size, divided into
local work size chunks.
number of blocks, each running
a block of threads.
GPU SM-local shared memory
__local
__shared__
GPU memory pointer
CPU: cl_mem
GPU: __global float *
CPU: float *
GPU: float *
Work allocated to one SM
local workgroup size
thread block size
Moving data between device and host
clEnqueueReadBuffer
clEnqueueWriteBuffer
cudaMemcpy

For example, here's what I think is a minimum viable OpenCL program.  Note how much boilerplate is needed just for setup and error checking.
Here's the source code:
/**
"simple" OpenCL example program

Dr. Orion Sky Lawlor, lawlor@alaska.edu, 2011-05-29 (Public Domain)
*/
#include <iostream>
#include <stdio.h>
#include <assert.h>
#include "CL/cl.h"
#define CL_CHECK_ERROR(err) do{if (err) {printf("FATAL ERROR %d at " __FILE__ ":%d\n",err,__LINE__); exit(1); } } while(0)

// Global OpenCL handles, shared by the whole demo program.
cl_device_id theDev;        // the GPU device we run on
cl_context theCtx;          // context owning our buffers and programs
cl_command_queue theQueue;  // command queue feeding work to theDev
cl_kernel theKrnl;          // "writeArr" kernel entry point (set up in main)
cl_program theProg;         // program built at runtime from theSource

// OpenCL C source code, compiled at runtime by clBuildProgram below.
// The writeArr kernel stores the value v into arr[i], one element per
// global work-item.
static const char *theSource="/* Lots more code here! */\n"
"__kernel void writeArr(__global float *arr,float v) {\n"
" int i=get_global_id(0);\n"
" arr[i]=v;\n"
"}\n";


int main()
{
cl_int err;

// Set up OpenCL
// Get the platform
enum {MAX_PLAT=8, MAX_DEVS=8};
cl_platform_id platforms[MAX_PLAT];
cl_uint num_platforms=MAX_PLAT;
err= clGetPlatformIDs(MAX_PLAT,platforms,&num_platforms);
CL_CHECK_ERROR(err);
cl_platform_id cpPlatform=platforms[0];

//Get the devices
cl_device_id cdDevices[MAX_DEVS];
err=clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_GPU, MAX_DEVS, cdDevices, NULL);
theDev=cdDevices[0];
CL_CHECK_ERROR(err);

// now get the context
theCtx = clCreateContext(NULL, 1, &theDev, NULL, NULL, &err);
CL_CHECK_ERROR(err);

// get a queue
theQueue = clCreateCommandQueue(theCtx, theDev, CL_QUEUE_PROFILING_ENABLE,
&err);
CL_CHECK_ERROR(err);

// Create the program...
theProg = clCreateProgramWithSource(theCtx, 1, &theSource, NULL, &err);
CL_CHECK_ERROR(err);

// ...and build it
const char * args = " -cl-mad-enable -cl-fast-relaxed-math ";

err = clBuildProgram(theProg, 0, NULL, args, NULL, NULL);
if (err != CL_SUCCESS) {
char* log = NULL;
size_t bytesRequired = 0;
err = clGetProgramBuildInfo(theProg,
theDev,
CL_PROGRAM_BUILD_LOG,
0,
NULL,
&bytesRequired );
log = (char*)malloc( bytesRequired + 1 );
err = clGetProgramBuildInfo(theProg,
theDev,
CL_PROGRAM_BUILD_LOG,
bytesRequired,
log,
NULL );
std::cout << "Build error: "<< log << std::endl;
free( log );
exit(-1);
}

// Set up input memory
int n=64; int bytes=n*sizeof(float);
cl_mem devP = clCreateBuffer(theCtx, CL_MEM_READ_WRITE, bytes,
NULL, &err);
CL_CHECK_ERROR(err);

float f=1.2345;
err = clEnqueueWriteBuffer(theQueue, devP, CL_TRUE,
0, sizeof(float), &f, 0, NULL, NULL);
CL_CHECK_ERROR(err);


// Create kernel
theKrnl = clCreateKernel(theProg, "writeArr", &err);
CL_CHECK_ERROR(err);

// Call the kernel
err=clSetKernelArg(theKrnl, 0, sizeof(cl_mem), &devP);
CL_CHECK_ERROR(err);

float addThis=1000;
err=clSetKernelArg(theKrnl, 1, sizeof(float), &addThis);
CL_CHECK_ERROR(err);

size_t localsz = 32;
size_t globalsz = n;
err = clEnqueueNDRangeKernel(theQueue, theKrnl, 1, NULL,
&globalsz, &localsz, 0,
NULL, NULL);
CL_CHECK_ERROR(err);

// Read back the results
for (int i=0;i<n;i+=4) {
err = clEnqueueReadBuffer(theQueue, devP, CL_TRUE,
i*sizeof(float), sizeof(float), &f, 0, NULL, NULL);
CL_CHECK_ERROR(err);

std::cout<<"arr["<<i<<"]="<<f<<"\n";
}

// Cleanup
clReleaseMemObject(devP);
clReleaseKernel(theKrnl);
clReleaseProgram(theProg);
clReleaseCommandQueue(theQueue);
clReleaseContext(theCtx);

return 0;
}

(Try this in NetRun now!)

OpenCL Wrappers

OpenCL is pretty painful to use to get anything done.  But being a library, at least you can wrap it in something simpler. 

Last year I designed a small OpenCL wrapper library called EPGPU to simplify OpenCL.  Here's the above program in EPGPU.
#include "epgpu.h"
#include "epgpu.cpp"
#include <iostream>

/* Define an EPGPU "fill" kernel named write_data: every element of the
   destination array is set to the same value.  Per the notes below, the
   GPU_FILLKERNEL macro stringifies the body and builds a C++ function
   object that registers the kernel and passes its arguments via OpenCL. */
GPU_FILLKERNEL(float, /* element (return) type */
write_data,(float value), /* function name and parameters */
{
result=value; /* GPU code: store the argument into this element */
}
)

int main() {
int n=64;
gpu_vector<float> arr(n);
arr=write_data(1000);

std::vector<float> cpu_arr(n);
arr.read(&cpu_arr[0]);
for (int i=0;i<n;i+=4) std::cout<<"arr["<<i<<"]="<<cpu_arr[i]<<"\n";

return 0;
}

(Try this in NetRun now!)

Note that GPU_FILLKERNEL is a macro that does most of the hard work: it stringifies the given code, and builds a C++ function object that will issue the appropriate OpenCL calls to register and pass arguments to the new kernel.

Here's a slightly more complex Mandelbrot set rendering example:
#include "epgpu.h"
#include "epgpu.cpp"
#include <iostream>
#include <fstream>
#include <vector>

/* EPGPU 2D fill kernel: render one Mandelbrot-set pixel per work-item.
   NOTE(review): i and j are presumably the 2D element indices supplied by
   the GPU_FILLKERNEL_2D macro, and "result" the output element at (i,j) --
   confirm against epgpu.h.  The comments here vanish when the macro
   stringifies the body (comments are removed before macro expansion). */
GPU_FILLKERNEL_2D(char, /* element type: one grayscale byte per pixel */
mandelbrot,(), /* kernel name; no extra parameters */
/* Map the pixel to the complex plane; scale assumes a 1000-pixel image. */
float scale=2.0/1000;
float cr=i*scale-1.0; float ci=j*scale;
/* z starts at c. */
float zr=cr; float zi=ci;
int count;
enum {max_count=255};
/* Iterate z = z^2 + c until |z|^2 > 4 (escape) or we hit max_count. */
for (count=0;count<max_count;count++) {
if ((zr*zr+zi*zi)>4.0) break;
float tzr=zr*zr-zi*zi+cr;
zi=2.0f*zr*zi+ci;
zr=tzr;
}
/* The iteration count becomes the pixel brightness. */
result=count;
)

/* Render a 1000x1000 Mandelbrot image on the GPU using the mandelbrot
   kernel above, then write it out as a binary grayscale PPM file. */
int main() {
    std::cout << "Starting up\n"; std::cout.flush();

    // Image size, in pixels.
    int w = 1000, h = 1000;

    // GPU-side 2D array of grayscale bytes, one per pixel.
    gpu_array2d<char> arr(w, h);
    std::cout << "Array created\n"; std::cout.flush();

    // Run the mandelbrot kernel once per pixel.
    arr = mandelbrot();
    std::cout << "Rendered\n"; std::cout.flush();

    // Copy the image back to the host.  Use a heap-allocated vector:
    // the original "char arrCPU[w*h]" was a non-standard variable-length
    // array, and ~1MB is too large to put on the stack anyway.
    std::vector<char> arrCPU(w * h);
    arr.read(&arrCPU[0]);

    // Write a binary grayscale PPM ("P5") image file.
    std::ofstream of("out.ppm");
    of << "P5\n" << w << " " << h << " 255\n";
    of.write(&arrCPU[0], w * h);
    return 0;
}

(Try this in NetRun now!)