// Number of floats processed by each benchmark loop below.
enum { n = 1000 };

// Input array. Only the first four entries are given explicitly; the
// remaining entries are zero-initialized.
float src[n] = { 1.0f, 5.0f, 3.0f, 4.0f };

// Output array written by the loops; zero-initialized at program start.
float dest[n];
// Scalar reference loop: for each of the n elements of src, stores twice the
// value into dest when the value is below 4.0, and 17.0 otherwise.
// Always returns 0.
int serial_loop(void) {
	for (int idx = 0; idx < n; ++idx) {
		const float value = src[idx];
		// Kept as an explicit data-dependent if/else: this loop is timed on
		// both random and sorted input by the driver.
		if (value < 4.0) {
			dest[idx] = value * 2.0;
		} else {
			dest[idx] = 17.0;
		}
	}
	return 0;
}
int sse_loop(void) {
for (int i=0;i<n;i+=4) {
fourfloats s(&src[i]);
fourfloats d=(s<4.0).if_then_else(s+s,17.0);
d.store(&dest[i]);
}
return 0;
}
// Sorts all n elements of src in place, using std::sort's default ordering
// (ascending by operator<).
// Always returns 0.
int sort_loop(void) {
	float *const first = src;
	float *const last = src + n; // one past the final element
	std::sort(first, last);
	return 0;
}
// Benchmark driver: fills src with pseudo-random values, then hands each loop
// to print_time (defined elsewhere; presumably times the function and reports
// it under the given label -- confirm). The serial and SSE loops are run once
// on the random data and once more after src has been sorted.
// Always returns 0.
int foo(void) {
	// rand() % 8 yields integers 0..7, stored as floats.
	int k = 0;
	while (k < n) {
		src[k] = rand() % 8;
		++k;
	}

	// Unsorted input.
	print_time("serial(rand)", serial_loop);
	print_time("SSE(rand)", sse_loop);

	// sort_loop sorts src in place, so everything after this sees sorted input.
	print_time("sort", sort_loop);

	// Sorted input.
	print_time("serial(post-sort)", serial_loop);
	print_time("SSE(post-sort)", sse_loop);

	//farray_print(dest,4); // <- for debugging
	return 0;
}
The performance of this code on our various NetRun machines is summarized here, in nanoseconds/float:
Machine | Serial (rand) | Serial (sorted) | SSE (rand) | SSE (sorted) | Sort time |
Q6600 | 7.5 | 1.1 | 1.2 | 1.2 | 16.7 |
Core2 | 9.4 | 1.9 | 1.6 | 1.6 | 21.7 |
Pentium 4 | 12.1 | 2.8 | 1.4 | 1.4 | 31.9 |
Pentium III | 13.0 | 7.2 | 3.0 | 3.0 | 42.9 |