SSE floating-point instructions end in a two-letter suffix: the first letter is "s" for a single (serial) operation or "p" for a packed (parallel) one, and the second letter gives the precision.

                             single / serial    packed / parallel
single-precision "float"     ss                 ps (4 floats)
double-precision "double"    sd                 pd (2 doubles)
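As a concrete illustration (mine, not from the original page), the SSE compiler intrinsics expose exactly these four variants of the add instruction:

#include <emmintrin.h> /* SSE2 intrinsics; pulls in SSE1's xmmintrin.h too */

/* One add intrinsic per suffix: addss, addps, addsd, addpd. */
void suffix_demo(void) {
	__m128  s=_mm_add_ss(_mm_set_ss(1.0f),_mm_set_ss(2.0f));   /* ss: 1 float   */
	__m128  p=_mm_add_ps(_mm_set1_ps(1.0f),_mm_set1_ps(2.0f)); /* ps: 4 floats  */
	__m128d d=_mm_add_sd(_mm_set_sd(1.0),_mm_set_sd(2.0));     /* sd: 1 double  */
	__m128d q=_mm_add_pd(_mm_set1_pd(1.0),_mm_set1_pd(2.0));   /* pd: 2 doubles */
	(void)s;(void)p;(void)d;(void)q; /* silence unused-variable warnings */
}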
Consider this simple loop over an array of floats:

enum {n=1024};
float a[n];
for (int i=0;i<n;i++) a[i]=3.4;
for (int i=0;i<n;i++) a[i]+=1.2;
return a[0];

Staring at the assembly language, there are a number of "cvtss2sd" conversions and back again, because the double-precision constants keep getting mixed with the single-precision data.
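In C terms, each conversion pair means the add statement behaves as if it had been written with explicit casts, roughly like this:

for (int i=0;i<n;i++)
	a[i]=(float)((double)a[i]+1.2); /* cvtss2sd, addsd, then cvtsd2ss to store */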
We can get a good speedup, to 1.4 ns/float, just by making the constants single-precision "float" (note the "f" suffix):

enum {n=1024};
float a[n];
for (int i=0;i<n;i++) a[i]=3.4f;
for (int i=0;i<n;i++) a[i]+=1.2f;
return a[0];

We can run a *lot* faster by using SSE parallel instructions. I'm going to do this the "hard way," writing separate assembly language functions to do the computation:
extern "C" void init_array(float *arr,int n);Here are the two assembly language functions called above. Together, we're down to under 0.5ns/float!
extern "C" void add_array(float *arr,int n);
int foo(void) {
enum {n=1024};
float a[n];
init_array(a,n);
add_array(a,n);
return a[0]*1000;
}
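The ns/float figures here come from NetRun's built-in timing. If you're running elsewhere, a simple std::chrono harness, sketched below with my own rep count, gives comparable numbers:

#include <chrono>
#include <cstdio>

int foo(void); /* the test function above */

int main(void) {
	enum {n=1024, reps=100000};
	auto t0=std::chrono::high_resolution_clock::now();
	for (int r=0;r<reps;r++) foo(); /* repeat enough times to swamp timer noise */
	auto t1=std::chrono::high_resolution_clock::now();
	double sec=std::chrono::duration<double>(t1-t0).count();
	printf("%.2f ns/float\n",sec*1.0e9/((double)reps*n));
	return 0;
}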
; extern "C" void init_array(float *arr,int n);0.5ns/float is pretty impressive performance for this code, since:
;for (int i=0;i<n;i+=4) {
; a[i]=3.4f;
; a[i+1]=3.4f;
; a[i+2]=3.4f;
; a[i+3]=3.4f;
;}
global init_array
init_array:
; rdi points to arr
; rsi is n, the array length
mov rcx,0 ; i
movaps xmm1,[constant3_4]
jmp loopcompare
loopstart:
movaps [rdi+4*rcx],xmm1 ; store 16 bytes: fills a[i] through a[i+3] at once
add rcx,4
loopcompare:
cmp rcx,rsi
jl loopstart
ret
section .data
align 16
constant3_4:
dd 3.4,3.4,3.4,3.4 ; four floats, 16-byte aligned so movaps works
section .text
; extern "C" void add_array(float *arr,int n);
;for (int i=0;i<n;i++) a[i]+=1.2f;
global add_array
add_array:
; rdi points to arr
; rsi is n, the array length
mov rcx,0 ; i
movaps xmm1,[constant1_2]
jmp loopcompare2
loopstart2:
movaps xmm0,[rdi+4*rcx] ; loads arr[i] through arr[i+3]
addps xmm0,xmm1
movaps [rdi+4*rcx],xmm0
add rcx,4
loopcompare2:
cmp rcx,rsi
jl loopstart2
ret
section .data
align 16
constant1_2:
dd 1.2,1.2,1.2,1.2 ; four floats, 16-byte aligned so movaps works
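As a footnote (my sketch, not part of the original assembly approach): the same packed operations can be written in C++ with SSE intrinsics, which lets the compiler handle registers and loop bookkeeping. Like the assembly, this assumes n is a multiple of 4 and arr is 16-byte aligned:

#include <xmmintrin.h> /* SSE intrinsics */

/* Drop-in replacements for the assembly functions above. */
extern "C" void init_array(float *arr,int n) {
	__m128 three4=_mm_set1_ps(3.4f); /* four copies of 3.4f, like constant3_4 */
	for (int i=0;i<n;i+=4)
		_mm_store_ps(&arr[i],three4); /* aligned store, like movaps */
}

extern "C" void add_array(float *arr,int n) {
	__m128 one2=_mm_set1_ps(1.2f);
	for (int i=0;i<n;i+=4) {
		__m128 v=_mm_load_ps(&arr[i]); /* loads arr[i] through arr[i+3] */
		v=_mm_add_ps(v,one2);          /* addps: four adds in one instruction */
		_mm_store_ps(&arr[i],v);       /* store the four sums back */
	}
}

With optimization on, compilers typically turn these into essentially the same movaps/addps inner loops as the hand-written assembly.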