extern "C" float do_work(float x,float y);Here's the corresponding assembly:
int foo(void) {
float r=do_work(1.7,1000.0);
std::cout<<"Return value = "<<r<<"\n";
return 0;
}
global do_work
do_work:
addss xmm0,xmm1
ret
That assembly code is translated into these 5 bytes of machine code:
0000000000000000 <do_work>:
0: f3 0f 58 c1 addss xmm0,xmm1
4: c3 ret
Here's an idea: let's skip the assembler, and set up those bytes ourselves!
typedef float (*work_fn)(float x,float y); // function pointer prototype
const char work_mc[]={
0xf3,0x0f,0x58,0xc1, // addss xmm0,xmm1
0xc3 // ret
};
int foo(void) {
work_fn f=(work_fn)work_mc; // make machine code array into callable function
float r=f(1.7,1000.0);
std::cout<<"Return value = "<<r<<"\n";
return 0;
}
Note that "work_mc" starts out as just an array of bytes, but we
typecast it into a function pointer, and then run those bytes as
machine code!
Here's all the SSE "float" instructions. The ModR/M byte specifies the inputs to the instruction, as defined below.
F3 0F 58 ModR/M: addss
F3 0F 5C ModR/M: subss
F3 0F 59 ModR/M: mulss
F3 0F 5E ModR/M: divss
F3 0F 5D ModR/M: minss
F3 0F 5F ModR/M: maxss
F3 0F 51 ModR/M: sqrtss
F3 0F 52 ModR/M: rsqrtss
F3 0F 53 ModR/M: rcpss
This is a pretty simple table, so we can even put together a totally
new SSE function from these instructions, like the code we built in
class:
#include <sys/mman.h>
typedef float (*do_work_fn)(float x,float y);
/* Make this region of memory executable, using "mprotect" */
void executable_region(void *start_ptr,void *end_ptr) {
long start=(long)start_ptr, end=(long)end_ptr;
start&=~0xfff; end+=0xfff; end&=~0xfff; /* round to page boundaries */
if (0!=mprotect((void *)start,end-start,PROT_READ|PROT_WRITE|PROT_EXEC))
std::cout<<"Unable to mprotect between pointers "<<start<<" and "<<end<<".\n";
}
// Lists x86 SSE instructions' machine code. Format is F3 0F <this byte> <modrm>
typedef enum {
addss=0x58, subss=0x5C,
mulss=0x59, divss=0x5E,
minss=0x5D, maxss=0x5F,
sqrtss=0x51, rsqrtss=0x52, rcpss=0x53
} sse_instruction_type;
int foo(void) {
std::vector<char> mc;
std::string line;
while (std::getline(std::cin,line)) {
int D=line[0]-'0', S=line[3]-'0';
mc.push_back(0xf3); // float
mc.push_back(0x0f); // SSE
switch (line[1]) {
case '+': mc.push_back(addss); break;
case '-': mc.push_back(subss); break;
case '*': mc.push_back(mulss); break;
case 'm': mc.push_back(minss); break;
case 'M': mc.push_back(maxss); break;
default: std::cout<<"OHNO!!! Unrecognized line "<<line<<"\n";
}
mc.push_back(0xC0+(D<<3)+S); // xmmD,xmmS
}
mc.push_back(0xc3); // return
executable_region(&mc[0],&mc[mc.size()-1]);
do_work_fn f=(do_work_fn)&mc[0];
float r=f(1.7,10.0);
std::cout<<"Return value = "<<r<<"\n";
return 0;
}
Here's a more object-oriented example program:
typedef float (*work_fn)(float x,float y); // function pointer prototype
// Lists x86 SSE instructions' machine code. Format is F3 0F <this byte> <modrm>
typedef enum {
addss=0x58, subss=0x5C,
mulss=0x59, divss=0x5E,
minss=0x5D, maxss=0x5F,
sqrtss=0x51, rsqrtss=0x52, rcpss=0x53
} sse_instruction_type;
// Creates a new machine code function at runtime, instruction by instruction
class function_maker {
public:
function_maker() {has_ret=false;}
// Add this instruction to our function. src and dest are register numbers
void instruction(int dest,sse_instruction_type opcode,int src) {
mc.push_back(0xF3); mc.push_back(0x0F); // SSE-float prefix bytes
mc.push_back(opcode);
int modrm=(3<<6) | (dest<<3) | (src);
mc.push_back(modrm);
}
// Run the new function
float run(float x,float y) {
if (!has_ret) { // we need to add a return instruction before running
has_ret=true;
mc.push_back(0xc3); /* "ret" instruction */
}
unsigned char *mc_data=&mc[0]; // get a pointer to machine code
work_fn f=(work_fn)mc_data; // make executable
return f(x,y); // run the new function
}
private:
std::vector<unsigned char> mc; /* new function's machine code */
bool has_ret; /* did we add the "ret" instruction yet? */
};
int foo(void) {
function_maker f;
f.instruction(0,addss,1);
float x=f.run(1.7, 1000.0);
f.print();
std::cout<<"Function returned "<<x<<"\n";
return 0;
}
Depending on the machine's configuration, this may crash instead of running your new function, because std::vector's heap memory is not marked as being executable (the "No eXecute" or NX bit is set). We can change the heap's memory permission on a per-page basis using "mprotect", a function that annoyingly only works on "page aligned" pointers. You'll get many more details of this memory permissions process when you learn about page tables in CS 321.
#include <sys/mman.h> /* Needed to work around "No eXecute" (NX) heap memory */
...
/* Work around NX bit, by marking entire page as executable */
long mc_start=(long)&mc[0], mc_end=(long)&mc[mc.size()-1];
mc_start&=~0xfff; mc_end=(mc_end+0xfff)&~0xfff; /* round to page size */
mprotect((void *)mc_start,mc_end-mc_start,
PROT_READ+PROT_WRITE+PROT_EXEC);
...
Here's a way to use function_maker to translate a simple line-by-line script into machine code:
int foo(void) {
function_maker f;
std::string line;
while (std::getline(std::cin,line))
{ /* Format of lines:
rD?=rS; character
0123456 array index
*/
sse_instruction_type op; /* which operation to perform */
switch (line[2]) {
case '+': op=addss; break;
case '*': op=mulss; break;
case 'v': op=sqrtss; break;
default: std::cout<<"Unknown operation "<<line<<"!\n"; return 0;
}
f.instruction(line[1]-'0', op, line[5]-'0');
}
f.print();
std::cout<<"Running newly created function...\n";
float x=f.run(1.7, 10000.0);
std::cout<<"Function returned "<<x<<"\n";
return 0;
}
The bottom line is that creating a function's machine code at runtime isn't actually that hard!
mod |
reg/opcode |
r/m |
2 bits, selects memory or register access mode: 0: memory at register r/m 1: memory at register r/m+byte offset 2: memory at register r/m + 32-bit offset 3: register r/m itself (not memory) |
3 bits, usually a register number Usually the destination (but for dyslexic instructions, it's the source register; or occasionally even an extra opcode) |
3 bits, register number Usually defines the data source (except for dyslexic instructions) Treated as a pointer for mod!=3, treated as an ordinary register for mod==3. If r/m==4, indicates the real memory source is a SIB byte. |
Register |
eax |
ecx |
edx |
ebx |
esp |
ebp |
esi |
edi |
Number |
0 |
1 |
2 |
3 |
4 |
5 |
6 |
7 |
Machine Code |
Assembly |
C |
0x03 0312 |
add ecx,edx |
c+=d; |
0x03 0012 | add ecx,[edx] | c+=*d; |
0x01 0012 |
add [edx],ecx |
*d+=c; |
0x01 0312 |
add edx,ecx |
d+=c; |
0xf3, 0x0f, 0x58, 0301"0xf3 0x0f 0x58" is the "add SSE scalar single" opcode, "addss". "0xc1" is the ModR/M byte, which from the "ModR/M" table gives mod==3 (a register-register operation), reg/opcode==0 (the destination register is xmm0), and R/M==1 (the source register is xmm1). So all together this is "addss xmm0,xmm1".
#include <sys/mman.h> /* Needed to work around "No eXecute" (NX) heap memory */
#include <xmmintrin.h> /* for SSE parallel float __m128's */
__m128 arr[8]; /* SSE temporary values */
/****************** Assembly support **************/
typedef void (*bar_fn)(void);
extern "C" void bar_asm(void);
bar_fn bar=bar_asm; /* function pointer pointing to current bar routine */
/* GCC-assembly version of bar routine, bar_asm */
__asm__(
".section \".text\"\n"
".globl foo\n"
".type bar_asm,@function\n"
"bar_asm:\n"
" addps %xmm1,%xmm0\n"
" mulps %xmm2,%xmm0\n"
" ret\n"
);
/* Copies values in "arr" into SSE registers, calls bar function pointer, and
copies the result in xmm0 back into arr[0]. */
int call_bar(void) {
__asm__(
" mov $arr,%ecx\n" // ecx points to "arr" array
" movaps 0x00(%ecx),%xmm0\n" // Fill SSE registers with inputs
" movaps 0x10(%ecx),%xmm1\n"
" movaps 0x20(%ecx),%xmm2\n"
" movaps 0x30(%ecx),%xmm3\n"
);
bar();
__asm__(
" movaps %xmm0,0x00(%ecx)\n" // Extract outputs
);
return 0;
}
/************** Interpreter support ************/
/* One instruction in the bar routine */
class bar_op {
public:
int i,o; /* input and output registers, 0-7 */
char op; /* operation to perform (+,-,*, etc.) */
bar_op(const char *line) { /* HACK: assumes line looks like "r1+=r0..." */
o=line[1]-'0';
i=line[5]-'0';
op=line[2];
}
};
/* Stored operations to make up the bar routine */
std::vector<bar_op> ops;
/* Interpreted bar routine: switch-and-execute */
int call_interp_bar(void) {
for (unsigned int i=0;i<ops.size();i++) {
bar_op &b=ops[i];
switch(b.op) {
case '+': arr[b.o]=_mm_add_ps(arr[b.o],arr[b.i]); break;
case '-': arr[b.o]=_mm_sub_ps(arr[b.o],arr[b.i]); break;
case '*': arr[b.o]=_mm_mul_ps(arr[b.o],arr[b.i]); break;
case '/': arr[b.o]=_mm_div_ps(arr[b.o],arr[b.i]); break;
default: printf("Unknown bar operation!\n"); exit(1);
};
}
return 0;
}
/* Dynamically assembled x86 bar routine: prepare machine code to run bar */
bar_fn bar_dynamic(void) {
typedef unsigned char byte;
std::vector<byte> *out=new std::vector<byte>; /* stores output machine code */
for (unsigned int i=0;i<ops.size();i++) {
out->push_back(0x0F); /* all SSE opcodes start with 0x0F */
bar_op &b=ops[i];
switch(b.op) {
case '+': out->push_back(0x58); break; /* SSE opcode for corresponding operation */
case '-': out->push_back(0x5C); break;
case '*': out->push_back(0x59); break;
case '/': out->push_back(0x5E); break;
default: printf("Unknown bar operation!\n"); exit(1);
};
/* Prepare ModR/M byte giving input and output registers */
int mod=3; /* register-register mode (octal 03ds) */
int destreg=b.o; /* output register */
int srcreg=b.i; /* source register */
int ModRM=(mod<<6)+(destreg<<3)+srcreg;
out->push_back((byte)ModRM);
}
out->push_back(0xc3); /* return instruction at end of routine */
/* Work around NX bit, by marking entire page as executable */
long mc_start=(long)&(*out)[0], mc_end=(long)&(*out)[out->size()-1];
mc_start&=~0xfff; mc_end=(mc_end+0xfff)&~0xfff;// page size!
mprotect((void *)mc_start,mc_end-mc_start,
PROT_READ+PROT_WRITE+PROT_EXEC);
return (bar_fn)(&(*out)[0]); /* typecast bytes to function pointer */
}
int foo(void) {
int i;
/* Read the user's input bar routine */
char line[100];
while (read_string(line)) {
if (line[0]=='r')
ops.push_back(bar_op(line));
}
/* Set up the (hardcoded) inputs */
const static float in[8][4]={
{1.0,2.0,3.0,4.0},
{10.0,20.0,30.0,40.0},
{100.0,200.0,300.0,400.0}
};
float out[4];
for (i=0;i<8;i++) arr[i]=_mm_loadu_ps(in[i]);
/* Dynamically assemble bar routine, and make a test run */
bar=bar_dynamic();
call_bar();
_mm_storeu_ps(out,arr[0]);
printf("%f %f %f %f\n",out[0],out[1],out[2],out[3]);
/* Do timings */
printf("Bar takes about %.2f ns/call in dynamic machine code\n",
time_function(call_bar)*1.0e9);
bar=bar_asm;
printf("Bar takes about %.2f ns/call in real asm\n",
time_function(call_bar)*1.0e9);
printf("Bar takes about %.2f ns/call interpreted\n",
time_function(call_interp_bar)*1.0e9);
return 0;
}