#include "epgpu.h"
#include "epgpu.cpp"
#include <iostream>
/* Demo kernel: each work item stores `param` at array[i] and the constant 7
   at array[10+i].
   `i` is not declared in this block; presumably it is the per-work-item
   index supplied by GPU_KERNEL (confirm in epgpu.h).
   The +10 offset is hard-coded, so the array must be at least 10 elements
   longer than the largest `i` that runs. */
GPU_KERNEL(
scatter_floats,(__global<float *> array,float param),
{
array[i]=param;
array[10+i]=7;
}
)
int main() {
int n=20;
gpu_array<float> arr(n);
scatter_floats(arr,1.234).run(n/2);
for (int i=0;i<n;i++) std::cout<<"["<<i<<"]= "<<arr[i]<<"\n";
return 0;
}
(Try this in NetRun now!)
FILL and non-FILL kernels are enough to write quite a few simple
applications, as I explore in the technical paper on the EPGPU download
page here:
#include "epgpu.h"
#include "epgpu.cpp"
#include "lib/inc.c" /* netrun, for timing */
#include <iostream>
/* Fill kernel: each work item stores `start` into arr[i].
   `i` is not declared in this block; presumably it is the per-work-item
   index supplied by GPU_KERNEL (confirm in epgpu.h).
   There is no bounds check, so the launch size must not exceed the
   array length. */
GPU_KERNEL(
writeFloats,(__global<float *> arr,float start),
{
arr[i]=start;
}
)
/* One pass of the pairwise summation driven by totalArray.
   Work item i owns arr[step*i] and adds to it the element half a step
   further along, provided that source index lies inside the array.
   `i` is presumably the per-work-item index supplied by GPU_KERNEL
   (confirm in epgpu.h). */
GPU_KERNEL(
totalFloats,(__global<float *> arr,int length,int step),
{
	int destIdx=step*i;
	int partnerIdx=destIdx+step/2;
	if (partnerIdx<length) {
		arr[destIdx]=arr[destIdx]+arr[partnerIdx];
	}
}
)
/* Sum the first n values of arr into arr[0], overwriting other elements
   of arr with partial sums along the way.
   Runs totalFloats repeatedly with step = 2, 4, 8, ... for as long as
   step/2 < n, i.e. roughly log2(n) kernel invocations. */
void totalArray(gpu_array<float> &arr,int n)
{
	int stride=2;
	while (stride/2<n) {
		/* One work item per stride-sized group, rounding up. */
		int groups=(n+stride-1)/stride;
		totalFloats(arr,n,stride).run(groups);
		stride*=2;
	}
}
/* Sequential reference sum: a single work item walks the whole array and
   accumulates it into `result`. Simple to write, but the original author
   measured it as extraordinarily slow (250ns/float!).
   `result` is not declared here; presumably GPU_FILLKERNEL provides it as
   the value stored into the destination array (confirm in epgpu.h). */
GPU_FILLKERNEL(float,
linearSum,(__global<float *> arr,int length),
{
	result=0;
	int pos=0;
	while (pos<length) {
		result+=arr[pos];
		pos++;
	}
}
)
int main() {
int n=8000;
gpu_array<float> arr(n);
float start=1000.23;
writeFloats(arr,start).run(n);
double t=time_in_seconds();
gpu_array<float> total(1);
total=linearSum(arr,n);
float result=total[0];
std::cout<<"Linear total="<<result<<" ("<<(time_in_seconds()-t)*1.0e6<<"us)\n";
t=time_in_seconds();
totalArray(arr,n);
result=arr[0];
std::cout<<"Recursive total="<<result<<" ("<<(time_in_seconds()-t)*1.0e6<<"us)\n";
std::cout<<"Analytic total="<<start*n<<"\n";
return 0;
}
(Try this in NetRun now!)
On the NetRun GTX 280, this gives: