extern "C" float do_work(float x,float y);Here's the corresponding assembly:
int foo(void) {
float r=do_work(1.7,1000.0);
std::cout<<"Return value = "<<r<<"\n";
return 0;
}
global do_work
do_work:
addss xmm0,xmm1
ret
That assembly code is translated into these 5 bytes of machine code:
0000000000000000 <do_work>:
0: f3 0f 58 c1 addss xmm0,xmm1
4: c3 ret
Here's an idea: let's skip the assembler, and set up those bytes ourselves!
typedef float (*work_fn)(float x,float y); // function pointer prototype
const char work_mc[]={
0xf3,0x0f,0x58,0xc1, // addss xmm0,xmm1
0xc3 // ret
};
int foo(void) {
work_fn f=(work_fn)work_mc; // make machine code array into callable function
float r=f(1.7,1000.0);
std::cout<<"Return value = "<<r<<"\n";
return 0;
}
Note that "work_mc" starts out as just an array of bytes, but we
typecast it into a function pointer, and then run those bytes as
machine code!
0f 58 c0 addps xmm0,xmm0 |
#include <sys/mman.h>
typedef float (*do_work_fn)(float x,float y);
/* Make this region of memory executable, using "mprotect" */
void executable_region(void *start_ptr,void *end_ptr) {
long start=(long)start_ptr, end=(long)end_ptr;
start&=~0xfff; end+=0xfff; end&=~0xfff; /* round to page boundaries */
if (0!=mprotect((void *)start,end-start,PROT_READ|PROT_WRITE|PROT_EXEC))
std::cout<<"Unable to mprotect between pointers "<<start<<" and "<<end<<".\n";
}
// Lists x86 SSE instructions' machine code. Format is F3 0F <this byte> <modrm>
typedef enum {
addss=0x58, subss=0x5C,
mulss=0x59, divss=0x5E,
minss=0x5D, maxss=0x5F,
sqrtss=0x51, rsqrtss=0x52, rcpss=0x53
} sse_instruction_type;
int foo(void) {
std::vector<char> mc;
std::string line;
while (std::getline(std::cin,line)) {
int D=line[0]-'0', S=line[3]-'0';
mc.push_back(0xf3); // one float (ss)
mc.push_back(0x0f); // SSE
switch (line[1]) {
case '+': mc.push_back(addss); break;
case '-': mc.push_back(subss); break;
case '*': mc.push_back(mulss); break;
case 'm': mc.push_back(minss); break;
case 'M': mc.push_back(maxss); break;
default: std::cout<<"OHNO!!! Unrecognized line "<<line<<"\n";
}
mc.push_back(0xC0+(D<<3)+S); // ModR/M byte (see below) specifying xmmD,xmmS
}
mc.push_back(0xc3); // return
executable_region(&mc[0],&mc[mc.size()-1]);
do_work_fn f=(do_work_fn)&mc[0];
float r=f(1.7,10.0);
std::cout<<"Return value = "<<r<<"\n";
return 0;
}
The bottom line is that creating a function's machine code at runtime isn't actually that hard--just really weird!
mod |
reg/opcode |
r/m |
2 bits, selects memory or register access mode: 0: memory at register r/m 1: memory at register r/m+byte offset 2: memory at register r/m + 32-bit offset 3: register r/m itself (not memory) |
3 bits, usually a destination register number. For some instructions, this is actually extra opcode bits. |
3 bits, usually a source register number. Treated as a pointer for mod!=3, treated as an ordinary register for mod==3. If r/m==4, indicates the real memory source is a SIB byte. |
Number |
0 |
1 |
2 |
3 |
4 |
5 |
6 |
7 |
SSE Register |
xmm0 |
xmm1 |
xmm2 |
xmm3 |
xmm4 |
xmm5 |
xmm6 |
xmm7 |
Int Register |
eax |
ecx |
edx |
ebx |
esp |
ebp |
esi |
edi |
#include <sys/mman.h> /* Needed to work around "No eXecute" (NX) heap memory */
#include <xmmintrin.h> /* for SSE parallel float __m128's */
__m128 arr[8]; /* SSE temporary values */
/****************** Assembly support **************/
typedef void (*bar_fn)(void);
extern "C" void bar_asm(void);
bar_fn bar=bar_asm; /* function pointer pointing to current bar routine */
/* GCC-assembly version of bar routine, bar_asm */
__asm__(
".section \".text\"\n"
".globl foo\n"
".type bar_asm,@function\n"
"bar_asm:\n"
" addps %xmm1,%xmm0\n"
" mulps %xmm2,%xmm0\n"
" ret\n"
);
/* Copies values in "arr" into SSE registers, calls bar function pointer, and
copies the result in xmm0 back into arr[0]. */
int call_bar(void) {
__asm__(
" mov $arr,%ecx\n" // ecx points to "arr" array
" movaps 0x00(%ecx),%xmm0\n" // Fill SSE registers with inputs
" movaps 0x10(%ecx),%xmm1\n"
" movaps 0x20(%ecx),%xmm2\n"
" movaps 0x30(%ecx),%xmm3\n"
);
bar();
__asm__(
" movaps %xmm0,0x00(%ecx)\n" // Extract outputs
);
return 0;
}
/************** Interpreter support ************/
/* One instruction in the bar routine */
class bar_op {
public:
int i,o; /* input and output registers, 0-7 */
char op; /* operation to perform (+,-,*, etc.) */
bar_op(const char *line) { /* HACK: assumes line looks like "r1+=r0..." */
o=line[1]-'0';
i=line[5]-'0';
op=line[2];
}
};
/* Stored operations to make up the bar routine */
std::vector<bar_op> ops;
/* Interpreted bar routine: switch-and-execute */
int call_interp_bar(void) {
for (unsigned int i=0;i<ops.size();i++) {
bar_op &b=ops[i];
switch(b.op) {
case '+': arr[b.o]=_mm_add_ps(arr[b.o],arr[b.i]); break;
case '-': arr[b.o]=_mm_sub_ps(arr[b.o],arr[b.i]); break;
case '*': arr[b.o]=_mm_mul_ps(arr[b.o],arr[b.i]); break;
case '/': arr[b.o]=_mm_div_ps(arr[b.o],arr[b.i]); break;
default: printf("Unknown bar operation!\n"); exit(1);
};
}
return 0;
}
/* Dynamically assembled x86 bar routine: prepare machine code to run bar */
bar_fn bar_dynamic(void) {
typedef unsigned char byte;
std::vector<byte> *out=new std::vector<byte>; /* stores output machine code */
for (unsigned int i=0;i<ops.size();i++) {
out->push_back(0x0F); /* all SSE opcodes start with 0x0F */
bar_op &b=ops[i];
switch(b.op) {
case '+': out->push_back(0x58); break; /* SSE opcode for corresponding operation */
case '-': out->push_back(0x5C); break;
case '*': out->push_back(0x59); break;
case '/': out->push_back(0x5E); break;
default: printf("Unknown bar operation!\n"); exit(1);
};
/* Prepare ModR/M byte giving input and output registers */
int mod=3; /* register-register mode (octal 03ds) */
int destreg=b.o; /* output register */
int srcreg=b.i; /* source register */
int ModRM=(mod<<6)+(destreg<<3)+srcreg;
out->push_back((byte)ModRM);
}
out->push_back(0xc3); /* return instruction at end of routine */
/* Work around NX bit, by marking entire page as executable */
long mc_start=(long)&(*out)[0], mc_end=(long)&(*out)[out->size()-1];
mc_start&=~0xfff; mc_end=(mc_end+0xfff)&~0xfff;// page size!
mprotect((void *)mc_start,mc_end-mc_start,
PROT_READ+PROT_WRITE+PROT_EXEC);
return (bar_fn)(&(*out)[0]); /* typecast bytes to function pointer */
}
int foo(void) {
int i;
/* Read the user's input bar routine */
char line[100];
while (read_string(line)) {
if (line[0]=='r')
ops.push_back(bar_op(line));
}
/* Set up the (hardcoded) inputs */
const static float in[8][4]={
{1.0,2.0,3.0,4.0},
{10.0,20.0,30.0,40.0},
{100.0,200.0,300.0,400.0}
};
float out[4];
for (i=0;i<8;i++) arr[i]=_mm_loadu_ps(in[i]);
/* Dynamically assemble bar routine, and make a test run */
bar=bar_dynamic();
call_bar();
_mm_storeu_ps(out,arr[0]);
printf("%f %f %f %f\n",out[0],out[1],out[2],out[3]);
/* Do timings */
printf("Bar takes about %.2f ns/call in dynamic machine code\n",
time_function(call_bar)*1.0e9);
bar=bar_asm;
printf("Bar takes about %.2f ns/call in real asm\n",
time_function(call_bar)*1.0e9);
printf("Bar takes about %.2f ns/call interpreted\n",
time_function(call_interp_bar)*1.0e9);
return 0;
}