; foo -- 32-bit x86, NASM syntax.
; Adds the four packed single-precision floats at thing2 to themselves
; (i.e. doubles them), stores the result in thing1, then prints thing1
; via the external routine farray_print.
; Clobbers xmm1, plus whatever farray_print clobbers.
global foo
extern farray_print

foo:
        movups  xmm1, [thing2]          ; xmm1 = the four floats at thing2 (unaligned load)
        addps   xmm1, xmm1              ; each float is added to itself
        movups  [thing1], xmm1          ; thing1[0..3] = the four sums (unaligned store)

        ; Print the result. Arguments go on the stack, last argument first;
        ; presumably farray_print(float *arr, int count) -- confirm against its declaration.
        push    4                       ; number of floats to print
        push    thing1                  ; pointer to the array of floats
        call    farray_print
        add     esp, 8                  ; pop the two 4-byte arguments back off
        ret
section .data
; Destination array: four single-precision floats, all initially 0.0.
; foo overwrites these with its result.
thing1:
        times 4 dd 0.0
; Source array: the four single-precision constants that foo loads.
thing2:
        dd 10.2, 100.2, 1000.2, 10000.2
__m128 _mm_load_ps(float *src) | Load 4 floats from a 16-byte aligned address. |
__m128 _mm_loadu_ps(float *src) | Load from an unaligned address (4x slower!) |
__m128 _mm_load1_ps(float *src) | Load 1 float into all 4 fields of an __m128 (also spelled _mm_load_ps1) |
__m128 _mm_setr_ps(float a,float b,float c,float d) | Load 4 floats from parameters into an __m128 (a lands in element 0) |
void _mm_store_ps(float *dest,__m128 src) | Store 4 floats to an aligned address. |
void _mm_storeu_ps(float *dest,__m128 src) | Store 4 floats to unaligned address |
__m128 _mm_add_ps(__m128 a,__m128 b) | Add corresponding floats (also "sub") |
__m128 _mm_mul_ps(__m128 a,__m128 b) | Multiply corresponding floats (also "div", but it's slow) |
__m128 _mm_min_ps(__m128 a,__m128 b) | Take corresponding minimum (also "max") |
__m128 _mm_sqrt_ps(__m128 a) | Take square roots of 4 floats (12ns, slow like divide) |
__m128 _mm_rcp_ps(__m128 a) | Compute rough (12-bit accuracy) reciprocal of all 4 floats (as fast as an add!) |
__m128 _mm_rsqrt_ps(__m128 a) | Rough (12-bit) reciprocal-square-root of all 4 floats (fast) |
__m128 _mm_shuffle_ps(__m128 lo,__m128 hi, _MM_SHUFFLE(hi3,hi2,lo1,lo0)) | Interleave inputs into the low 2 floats and high 2 floats of the output. Basically out[0]=lo[lo0]; out[1]=lo[lo1]; out[2]=hi[hi2]; out[3]=hi[hi3]. For example, _mm_shuffle_ps(a,a,_MM_SHUFFLE(i,i,i,i)) copies the float a[i] into all 4 output floats. |
/* Scalar reference version: apply vals[i] = vals[i]*a + b to every element. */
for (int i=0;i<n_vals;i++) {
	vals[i]=vals[i]*a+b;
}
/* 4x unrolled version of vals[i] = vals[i]*a + b.
   The main loop only handles complete groups of 4, so it never touches
   elements at or beyond n_vals. */
for (int i=0;i+4<=n_vals;i+=4) {
	vals[i+0]=vals[i+0]*a+b;
	vals[i+1]=vals[i+1]*a+b;
	vals[i+2]=vals[i+2]*a+b;
	vals[i+3]=vals[i+3]*a+b;
}
/* Tail: the 0-3 leftover elements when n_vals is not a multiple of 4.
   n_vals-n_vals%4 is the first index the main loop did not process. */
for (int i=n_vals-n_vals%4;i<n_vals;i++) {
	vals[i]=vals[i]*a+b;
}
for (int i=0;i<n_vals;i+=4) {(executable NetRun link)
__m128 SSEa=_mm_load_ps1(&a); /* 4 copies of a, in an SSE register */
__m128 SSEb=_mm_load_ps1(&b);
__m128 v=_mm_load_ps(&vals[i]);
v=_mm_add_ps(_mm_mul_ps(v,SSEa),SSEb);
_mm_store_ps(&vals[i],v);
}