@ Scalar VFP demo: load one single-precision constant, add it to itself,
@ store the result on the stack, and hand it to farray_print
@ (r0 = address of the floats, r1 = how many floats to print).
@ (Note: ".syntax unified" was just added to NetRun's boilerplate code, so
@  you no longer need # in front of constants.)
    push     {r4, lr}           @ r4 is pushed only to keep the stack 8-byte aligned
    sub      sp, sp, 32         @ reserve plenty of scratch space on the stack
    adr      r0, .myfloats      @ r0 = address of the constant below
    vldr     s0, [r0]           @ s0 = the single-precision constant
    vadd.f32 s0, s0, s0         @ s0 = s0 + s0
    vstr     s0, [sp]           @ write the sum out to the stack
    mov      r0, sp             @ argument 0: location of floats to print
    mov      r1, 1              @ argument 1: number of floats to print
    bl       farray_print       @ print some floats (FAILS if stack is not 8-byte aligned!)
    add      sp, sp, 32         @ hand back the scratch space
    pop      {r4, pc}           @ restore r4 and return through the saved link register
.myfloats:                      @ read-only constant space (segfault on store!)
    .word    0x3F9E0419         @ floating point 1.2345
@ Generate constants like the one above via C++: "float x=10.0; return *(int *)&x;"
@ Short-vector VFP demo: load four single-precision constants, add each to
@ itself with a single vector-mode add, store the four sums on the stack,
@ and hand them to farray_print (r0 = address of the floats, r1 = count).
@
@ Vector mode is selected through the FPSCR LEN and STRIDE fields, and it is
@ switched back off before calling out, so the callee runs with scalar
@ floating-point behaviour.
    push     {r4, lr}               @ r4 is pushed only to keep the stack 8-byte aligned
    sub      sp, sp, 32             @ reserve plenty of scratch space on the stack

    @ Enter vector compute mode
    vmrs     r12, fpscr             @ r12 = current FPSCR
    bic      r12, r12, #0x00370000  @ clear the STRIDE and LEN fields
    orr      r12, r12, #0x00030000  @ LEN field = 3 -> vectors of 4; STRIDE field stays 0 -> stride 1
    vmsr     fpscr, r12             @ write r12 back into FPSCR

    adr      r0, .myfloats_vec      @ r0 = address of the four constants below
    vldmia   r0, {s8-s11}           @ load four single-precision floats
    vadd.f32 s8, s8, s8             @ adds *four* floats, s8..s11 (count comes from LEN above)
    vstmia   sp, {s8-s11}           @ store the four sums to the stack

    @ Leave vector compute mode
    bic      r12, r12, #0x00370000  @ STRIDE and LEN fields back to 0: stride 1, length 1 (scalar)
    vmsr     fpscr, r12             @ write r12 back into FPSCR

    mov      r0, sp                 @ argument 0: location of floats to print
    mov      r1, 4                  @ argument 1: number of floats to print
    bl       farray_print           @ print some floats (FAILS if stack is not 8-byte aligned!)
    add      sp, sp, 32             @ hand back the scratch space
    pop      {r4, pc}               @ restore r4 and return through the saved link register

    .balign  4                      @ adr and the multi-register load need a word-aligned table
.myfloats_vec:                      @ read-only constant space (segfault on store!)
    .word    0x3F9E0419             @ floating point 1.2345
    .word    0x42C80000             @ floating point 100.0
    .word    0x41200000             @ floating point 10.0
    .word    0x4048F5C3             @ floating point 3.14
@ Generate constants like the ones above via C++: "float x=10.0; return *(int *)&x;"
Generally, the vector operations seem to be quite fast, taking only a little longer than the scalar versions. In addition, unlike many chip designers, ARM publishes detailed execution information, including cycle counts, pipeline hazards and scoreboarding, so you have something to start with during optimization!