/* vim: ts=4 sw=4 sts=4 et tw=78 * Copyright (c) 2011 James R. McKaskill. See license in ffi.h */ |.if X64 |.arch x64 |.else |.arch x86 |.endif |.actionlist build_actionlist |.globalnames globnames |.externnames extnames |.if not X64 |.define RET_H, edx // for int64_t returns |.define RET_L, eax |.endif |.if X64WIN | |.macro call_rrrp, func, arg0, arg1, arg2, arg3 | mov64 r9, arg3 | mov r8, arg2 | mov rdx, arg1 | mov rcx, arg0 | call func |.endmacro |.macro call_rrrr, func, arg0, arg1, arg2, arg3 | mov r9, arg3 | mov r8, arg2 | mov rdx, arg1 | mov rcx, arg0 | call func |.endmacro | |.macro call_rrp, func, arg0, arg1, arg2 | mov64 r8, arg2 | mov rdx, arg1 | mov rcx, arg0 | call func |.endmacro |.macro call_rrr, func, arg0, arg1, arg2 | mov r8, arg2 | mov rdx, arg1 | mov rcx, arg0 | call func |.endmacro | |.macro call_rp, func, arg0, arg1 | mov64 rdx, arg1 | mov rcx, arg0 | call func |.endmacro |.macro call_rr, func, arg0, arg1 | mov rdx, arg1 | mov rcx, arg0 | call func |.endmacro | |.macro call_r, func, arg0 | mov rcx, arg0 | call func |.endmacro | |.elif X64 | | // the 5 and 6 arg forms are only used on posix x64 |.macro call_rrrrrr, func, arg0, arg1, arg2, arg3, arg4, arg5 | mov r9, arg5 | mov r8, arg4 | mov rcx, arg3 | mov rdx, arg2 | mov rsi, arg1 | mov rdi, arg0 | call func |.endmacro |.macro call_rrrrr, func, arg0, arg1, arg2, arg3, arg4 | mov r8, arg4 | mov rcx, arg3 | mov rdx, arg2 | mov rsi, arg1 | mov rdi, arg0 | call func |.endmacro | |.macro call_rrrp, func, arg0, arg1, arg2, arg3 | mov64 rcx, arg3 | mov rdx, arg2 | mov rsi, arg1 | mov rdi, arg0 | call func |.endmacro |.macro call_rrrr, func, arg0, arg1, arg2, arg3 | mov rcx, arg3 | mov rdx, arg2 | mov rsi, arg1 | mov rdi, arg0 | call func |.endmacro | |.macro call_rrp, func, arg0, arg1, arg2 | mov64 rdx, arg2 | mov rsi, arg1 | mov rdi, arg0 | call func |.endmacro |.macro call_rrr, func, arg0, arg1, arg2 | mov rdx, arg2 | mov rsi, arg1 | mov rdi, arg0 | call func |.endmacro | |.macro call_rp, func, arg0, arg1 | mov64 rsi, arg1 | mov rdi, arg0 | call func |.endmacro |.macro call_rr, func, arg0, arg1 | mov rsi, arg1 | mov rdi, arg0 | call func |.endmacro | |.macro call_r, func, arg0 | mov rdi, arg0 | call func |.endmacro | |.else | // define the 64bit registers to the 32 bit counterparts, so the common | // code can use r*x for all pointers |.define rax, eax |.define rcx, ecx |.define rdx, edx |.define rsp, esp |.define rbp, ebp |.define rdi, edi |.define rsi, esi |.define mov64, mov | |.macro call_rrrr, func, arg0, arg1, arg2, arg3 | mov dword [rsp+12], arg3 | mov dword [rsp+8], arg2 | mov dword [rsp+4], arg1 | mov dword [rsp], arg0 | call func |.endmacro |.macro call_rrr, func, arg0, arg1, arg2 | mov dword [rsp+8], arg2 | mov dword [rsp+4], arg1 | mov dword [rsp], arg0 | call func |.endmacro |.macro call_rr, func, arg0, arg1 | mov dword [rsp+4], arg1 | mov dword [rsp], arg0 | call func |.endmacro |.macro call_r, func, arg0 | mov dword [rsp], arg0 | call func |.endmacro | |.define call_rrrp, call_rrrr |.define call_rrp, call_rrr |.define call_rp, call_rr | |.endif #if defined _WIN64 || defined __amd64__ #define JUMP_SIZE 14 #else #define JUMP_SIZE 4 #endif #define MIN_BRANCH INT32_MIN #define MAX_BRANCH INT32_MAX #define BRANCH_OFF 4 static void compile_extern_jump(struct jit* jit, lua_State* L, cfunction func, uint8_t* code) { /* The jump code is the function pointer followed by a stub to call the * function pointer. The stub exists in 64 bit so we can jump to functions * with an offset greater than 2 GB. * * Note we have to manually set this up since there are commands buffered * in the jit state and dynasm doesn't support rip relative addressing. * * eg on 64 bit: * 0-8: function ptr * 8-14: jmp aword [rip-14] * * for 32 bit we only set the function ptr as it can always fit in a 32 * bit displacement */ #if defined _WIN64 || defined __amd64__ *(cfunction*) code = func; code[8] = 0xFF; /* FF /4 operand for jmp */ code[9] = 0x25; /* RIP displacement */ *(int32_t*) &code[10] = -14; #else *(cfunction*) code = func; #endif } void compile_globals(struct jit* jit, lua_State* L) { struct jit* Dst = jit; int* perr = &jit->last_errno; dasm_setup(Dst, build_actionlist); /* Note: since the return code uses EBP to reset the stack pointer, we * don't have to track the amount of stack space used. It also means we * can handle stdcall and cdecl with the same code. */ /* Note the various call_* functions want 32 bytes of 16 byte aligned * stack */ |.if X64 |.define L_ARG, r12 |.define TOP, r13 |.else |.define L_ARG, rdi |.define TOP, rsi |.endif |.macro epilog |.if X64 | mov TOP, [rbp-16] | mov L_ARG, [rbp-8] |.else | mov TOP, [rbp-8] | mov L_ARG, [rbp-4] |.endif | mov rsp, rbp | pop rbp | ret |.endmacro |.macro get_errno // note trashes registers | call extern GetLastError | mov64 rcx, perr | mov dword [rcx], eax |.endmacro /* the general idea for the return functions is: * 1) Save return value on stack * 2) Call get_errno (this trashes the registers hence #1) * 3) Unpack return value from stack * 4) Call lua push function * 5) Set eax to number of returned args (0 or 1) * 6) Call return which pops our stack frame */ |->lua_return_arg: | mov eax, 1 | epilog |->lua_return_void: | get_errno | mov eax, 0 | epilog |->lua_return_double: |.if X64 | movq qword [rsp+32], xmm0 |.else | fstp qword [rsp+4] // note get_errno doesn't require any stack on x86 |.endif | | get_errno | |.if X64WIN | movq xmm1, qword [rsp+32] | mov rcx, L_ARG |.elif X64 | movq xmm0, qword [rsp+32] | mov rdi, L_ARG |.else | mov [rsp], L_ARG |.endif | call extern lua_pushnumber | jmp ->lua_return_arg |->lua_return_bool: | movzx eax, al | mov [rsp+32], eax | get_errno | mov eax, [rsp+32] | call_rr extern lua_pushboolean, L_ARG, rax | jmp ->lua_return_arg |->lua_return_int: | mov [rsp+32], eax | get_errno | mov eax, [rsp+32] | call_rr extern push_int, L_ARG, rax | jmp ->lua_return_arg |->lua_return_uint: | mov [rsp+32], eax | get_errno | mov eax, [rsp+32] | call_rr extern push_uint, L_ARG, rax | jmp ->lua_return_arg |->too_few_arguments: | mov ax, 0 | call_rp extern luaL_error, L_ARG, &"too few arguments" |->too_many_arguments: | mov ax, 0 | call_rp extern luaL_error, L_ARG, &"too many arguments" |->save_registers: | // use rbp relative so we store values in the outer stack frame |.if X64WIN | // use the provided shadow space for int registers above prev rbp and | // return address | mov [rbp+16], rcx | mov [rbp+24], rdx | mov [rbp+32], r8 | mov [rbp+40], r9 | // use the extra space we added for float registers | // -16 to store underneath previous value of L_ARG | movq qword [rbp-16], xmm0 | movq qword [rbp-24], xmm1 | movq qword [rbp-32], xmm2 | movq qword [rbp-40], xmm3 |.elif X64 | movq qword [rbp-16], xmm0 | movq qword [rbp-24], xmm1 | movq qword [rbp-32], xmm2 | movq qword [rbp-40], xmm3 | movq qword [rbp-48], xmm4 | movq qword [rbp-56], xmm5 | movq qword [rbp-64], xmm6 | movq qword [rbp-72], xmm7 | mov [rbp-80], rdi | mov [rbp-88], rsi | mov [rbp-96], rdx | mov [rbp-104], rcx | mov [rbp-112], r8 | mov [rbp-120], r9 |.else | // fastcall, -8 to store underneath previous value of L_ARG | mov [rbp-8], ecx | mov [rbp-12], edx |.endif | ret compile(Dst, L, NULL, LUA_NOREF); } int x86_return_size(lua_State* L, int usr, const struct ctype* ct) { int ret = 0; const struct ctype* mt; if (ct->calling_convention != C_CALL) { size_t i; size_t argn = lua_rawlen(L, usr); for (i = 1; i <= argn; i++) { lua_rawgeti(L, usr, (int) i); mt = (const struct ctype*) lua_touserdata(L, -1); if (mt->pointers) { ret += sizeof(void*); } else { switch (mt->type) { case DOUBLE_TYPE: case COMPLEX_FLOAT_TYPE: case INT64_TYPE: ret += 8; break; case COMPLEX_DOUBLE_TYPE: ret += 16; break; case INTPTR_TYPE: ret += sizeof(intptr_t); break; case FUNCTION_PTR_TYPE: ret += sizeof(cfunction); break; case BOOL_TYPE: case FLOAT_TYPE: case INT8_TYPE: case INT16_TYPE: case INT32_TYPE: case ENUM_TYPE: ret += 4; break; default: return luaL_error(L, "NYI - argument type"); } } lua_pop(L, 1); } } #if !defined _WIN64 && !defined __amd64__ lua_rawgeti(L, usr, 0); mt = (const struct ctype*) lua_touserdata(L, -1); if (!mt->pointers && mt->type == COMPLEX_DOUBLE_TYPE) { ret += sizeof(void*); } lua_pop(L, 1); #endif return ret; } #ifdef _WIN64 #define MAX_REGISTERS(ct) 4 /* rcx, rdx, r8, r9 */ #elif defined __amd64__ #define MAX_INT_REGISTERS(ct) 6 /* rdi, rsi, rdx, rcx, r8, r9 */ #define MAX_FLOAT_REGISTERS(ct) 8 /* xmm0-7 */ #else #define MAX_INT_REGISTERS(ct) ((ct)->calling_convention == FAST_CALL ? 2 /* ecx, edx */ : 0) #define MAX_FLOAT_REGISTERS(ct) 0 #endif struct reg_alloc { #ifdef _WIN64 int regs; int is_float[4]; int is_int[4]; #else int floats; int ints; #endif int off; }; #ifdef _WIN64 #define REGISTER_STACK_SPACE(ct) (4*8) #elif defined __amd64__ #define REGISTER_STACK_SPACE(ct) (14*8) #else #define REGISTER_STACK_SPACE(ct) ALIGN_UP(((ct)->calling_convention == FAST_CALL ? 2*4 : 0), 15) #endif /* Fastcall: * Uses ecx, edx as first two int registers * Everything else on stack (include 64bit ints) * No overflow stack space * Pops the stack before returning * Returns int in eax, float in ST0 * We use the same register allocation logic as posix x64 with 2 int regs and 0 float regs */ static void get_int(Dst_DECL, const struct ctype* ct, struct reg_alloc* reg, int is_int64) { /* grab the register from the shadow space */ #ifdef _WIN64 if (reg->regs < MAX_REGISTERS(ct)) { | mov rcx, [rbp + 16 + 8*reg->regs] reg->regs++; } #elif __amd64__ if (reg->ints < MAX_INT_REGISTERS(ct)) { | mov rcx, [rbp - 80 - 8*reg->ints] reg->ints++; } #else if (!is_int64 && reg->ints < MAX_INT_REGISTERS(ct)) { | mov ecx, [rbp - 8 - 4*reg->ints] reg->ints++; } #endif else if (is_int64) { |.if X64 | mov rcx, [rbp + reg->off] |.else | mov rcx, [rbp + reg->off] | mov rdx, [rbp + reg->off + 4] |.endif reg->off += 8; } else { | mov ecx, [rbp + reg->off] reg->off += 4; } } static void add_int(Dst_DECL, const struct ctype* ct, struct reg_alloc* reg, int is_int64) { #ifdef _WIN64 if (reg->regs < MAX_REGISTERS(ct)) { | mov [rsp + 32 + 8*(reg->regs)], rax reg->is_int[reg->regs++] = 1; } #elif __amd64__ if (reg->ints < MAX_INT_REGISTERS(ct)) { | mov [rsp + 32 + 8*reg->ints], rax reg->ints++; } #else if (!is_int64 && reg->ints < MAX_INT_REGISTERS(ct)) { | mov [rsp + 32 + 4*reg->ints], rax reg->ints++; } #endif else if (is_int64) { |.if X64 | mov [rsp + reg->off], rax |.else | mov [rsp + reg->off], RET_L | mov [rsp + reg->off + 4], RET_H |.endif reg->off += 8; } else { | mov [rsp+reg->off], eax reg->off += 4; } } static void get_float(Dst_DECL, const struct ctype* ct, struct reg_alloc* reg, int is_double) { #if !defined _WIN64 && !defined __amd64__ assert(MAX_FLOAT_REGISTERS(ct) == 0); if (is_double) { | fld qword [rbp + reg->off] reg->off += 8; } else { | fld dword [rbp + reg->off] reg->off += 4; } #else int off; #ifdef _WIN64 if (reg->regs < MAX_REGISTERS(ct)) { off = -16 - 8*reg->regs; reg->regs++; } #else if (reg->floats < MAX_FLOAT_REGISTERS(ct)) { off = -16 - 8*reg->floats; reg->floats++; } #endif else { off = reg->off; reg->off += is_double ? 8 : 4; } if (is_double) { | movq xmm0, qword [rbp + off] } else { | cvtss2sd xmm0, dword [rbp + off] } #endif } static void add_float(Dst_DECL, const struct ctype* ct, struct reg_alloc* reg, int is_double) { #if !defined _WIN64 && !defined __amd64__ assert(MAX_FLOAT_REGISTERS(ct) == 0); if (is_double) { | fstp qword [rsp + reg->off] reg->off += 8; } else { | fstp dword [rsp + reg->off] reg->off += 4; } #else #ifdef _WIN64 if (reg->regs < MAX_REGISTERS(ct)) { if (is_double) { | movq qword [rsp + 32 + 8*(reg->regs)], xmm0 } else { | cvtsd2ss xmm0, xmm0 | movq qword [rsp + 32 + 8*(reg->regs)], xmm0 } reg->is_float[reg->regs++] = 1; } #else if (reg->floats < MAX_FLOAT_REGISTERS(ct)) { if (is_double) { | movq qword [rsp + 32 + 8*(MAX_INT_REGISTERS(ct) + reg->floats)], xmm0 } else { | cvtsd2ss xmm0, xmm0 | movq qword [rsp + 32 + 8*(MAX_INT_REGISTERS(ct) + reg->floats)], xmm0 } reg->floats++; } #endif else if (is_double) { | movq qword [rsp + reg->off], xmm0 reg->off += 8; } else { | cvtsd2ss xmm0, xmm0 | movd dword [rsp + reg->off], xmm0 reg->off += 4; } #endif } #if defined _WIN64 || defined __amd64__ #define add_pointer(jit, ct, reg) add_int(jit, ct, reg, 1) #define get_pointer(jit, ct, reg) get_int(jit, ct, reg, 1) #else #define add_pointer(jit, ct, reg) add_int(jit, ct, reg, 0) #define get_pointer(jit, ct, reg) get_int(jit, ct, reg, 0) #endif cfunction compile_callback(lua_State* L, int fidx, int ct_usr, const struct ctype* ct) { int i, nargs; cfunction* pf; struct ctype ct2 = *ct; const struct ctype* mt; struct reg_alloc reg; int num_upvals = 0; int top = lua_gettop(L); struct jit* Dst = get_jit(L); int ref; int hidden_arg_off = 0; ct_usr = lua_absindex(L, ct_usr); fidx = lua_absindex(L, fidx); assert(lua_isnil(L, fidx) || lua_isfunction(L, fidx)); memset(®, 0, sizeof(reg)); #ifdef _WIN64 reg.off = 16 + REGISTER_STACK_SPACE(ct); /* stack registers are above the shadow space */ #elif __amd64__ reg.off = 16; #else reg.off = 8; #endif dasm_setup(Dst, build_actionlist); // add a table to store ctype and function upvalues // callback_set assumes the first value is the lua function nargs = (int) lua_rawlen(L, ct_usr); lua_newtable(L); lua_pushvalue(L, -1); ref = luaL_ref(L, LUA_REGISTRYINDEX); if (ct->has_var_arg) { luaL_error(L, "can't create callbacks with varargs"); } // setup a stack frame to hold args for the call into lua_call | push rbp | mov rbp, rsp | push L_ARG | // stack is 4 or 8 (mod 16) (L_ARG, rbp, rip) |.if X64 | // 8 to realign, 16 for return vars, 32 for local calls, rest to save registers | sub rsp, 8 + 16 + 32 + REGISTER_STACK_SPACE(ct) | call ->save_registers |.else | // 4 to realign, 16 for return vars, 32 for local calls, rest to save registers | sub rsp, 4 + 16 + 32 + REGISTER_STACK_SPACE(ct) if (ct->calling_convention == FAST_CALL) { | call ->save_registers } |.endif // hardcode the lua_State* value into the assembly | mov64 L_ARG, L /* get the upval table */ | call_rrr extern lua_rawgeti, L_ARG, LUA_REGISTRYINDEX, ref /* get the lua function */ lua_pushvalue(L, fidx); lua_rawseti(L, -2, ++num_upvals); assert(num_upvals == CALLBACK_FUNC_USR_IDX); | call_rrr extern lua_rawgeti, L_ARG, -1, num_upvals #if !defined _WIN64 && !defined __amd64__ lua_rawgeti(L, ct_usr, 0); mt = (const struct ctype*) lua_touserdata(L, -1); if (!mt->pointers && mt->type == COMPLEX_DOUBLE_TYPE) { hidden_arg_off = reg.off; reg.off += sizeof(void*); } lua_pop(L, 1); #else (void) hidden_arg_off; #endif for (i = 1; i <= nargs; i++) { lua_rawgeti(L, ct_usr, i); mt = (const struct ctype*) lua_touserdata(L, -1); if (mt->pointers) { lua_getuservalue(L, -1); lua_rawseti(L, -3, ++num_upvals); /* usr value */ lua_rawseti(L, -2, ++num_upvals); /* mt */ /* on the lua stack in the callback: * upval tbl, lua func, i-1 args */ | call_rrr extern lua_rawgeti, L_ARG, -i-1, num_upvals-1 | call_rrp extern push_cdata, L_ARG, -1, mt get_pointer(Dst, ct, ®); | mov [rax], rcx | call_rr, extern lua_remove, L_ARG, -2 } else { switch (mt->type) { case INT64_TYPE: lua_getuservalue(L, -1); lua_rawseti(L, -3, ++num_upvals); /* mt */ lua_pop(L, 1); | call_rrp extern push_cdata, L_ARG, 0, mt get_int(Dst, ct, ®, 1); |.if X64 | mov [rax], rcx |.else | mov [rax], ecx | mov [rax+4], edx |.endif break; case INTPTR_TYPE: lua_getuservalue(L, -1); lua_rawseti(L, -3, ++num_upvals); /* mt */ lua_pop(L, 1); | call_rrp extern push_cdata, L_ARG, 0, mt get_pointer(Dst, ct, ®); | mov [rax], rcx break; case COMPLEX_FLOAT_TYPE: lua_pop(L, 1); #if defined _WIN64 || defined __amd64__ /* complex floats are two floats packed into a double */ | call_rrp extern push_cdata, L_ARG, 0, mt get_float(Dst, ct, ®, 1); | movq qword [rax], xmm0 #else /* complex floats are real followed by imag on the stack */ | call_rrp extern push_cdata, L_ARG, 0, mt get_float(Dst, ct, ®, 0); | fstp dword [rax] get_float(Dst, ct, ®, 0); | fstp dword [rax+4] #endif break; case COMPLEX_DOUBLE_TYPE: lua_pop(L, 1); | call_rrp extern push_cdata, L_ARG, 0, mt /* real */ get_float(Dst, ct, ®, 1); |.if X64 | movq qword [rax], xmm0 |.else | fstp qword [rax] |.endif /* imag */ get_float(Dst, ct, ®, 1); |.if X64 | movq qword [rax+8], xmm0 |.else | fstp qword [rax+8] |.endif break; case FLOAT_TYPE: case DOUBLE_TYPE: lua_pop(L, 1); get_float(Dst, ct, ®, mt->type == DOUBLE_TYPE); |.if X64WIN | movq xmm1, xmm0 | mov rcx, L_ARG |.elif X64 | // for 64bit xmm0 is already set | mov rdi, L_ARG |.else | fstp qword [rsp+4] | mov [rsp], L_ARG |.endif | call extern lua_pushnumber break; case BOOL_TYPE: lua_pop(L, 1); get_int(Dst, ct, ®, 0); | movzx ecx, cl | call_rr extern lua_pushboolean, L_ARG, rcx break; case INT8_TYPE: lua_pop(L, 1); get_int(Dst, ct, ®, 0); if (mt->is_unsigned) { | movzx ecx, cl } else { | movsx ecx, cl } | call_rr extern push_int, L_ARG, rcx break; case INT16_TYPE: lua_pop(L, 1); get_int(Dst, ct, ®, 0); if (mt->is_unsigned) { | movzx ecx, cx } else { | movsx ecx, cx } | call_rr extern push_int, L_ARG, rcx break; case ENUM_TYPE: case INT32_TYPE: lua_pop(L, 1); get_int(Dst, ct, ®, 0); if (mt->is_unsigned) { | call_rr extern push_uint, L_ARG, rcx } else { | call_rr extern push_int, L_ARG, rcx } break; default: luaL_error(L, "NYI: callback arg type"); } } } lua_rawgeti(L, ct_usr, 0); mt = (const struct ctype*) lua_touserdata(L, -1); | call_rrrp extern lua_callk, L_ARG, nargs, (mt->pointers || mt->type != VOID_TYPE) ? 1 : 0, 0 // Unpack the return argument if not "void", also clean-up the lua stack // to remove the return argument and bind table. Use lua_settop rather // than lua_pop as lua_pop is implemented as a macro. if (mt->pointers) { lua_getuservalue(L, -1); lua_rawseti(L, -3, ++num_upvals); /* usr value */ lua_rawseti(L, -2, ++num_upvals); /* mt */ | call_rrr extern lua_rawgeti, L_ARG, -2, num_upvals-1 | call_rrrp extern check_typed_pointer, L_ARG, -2, -1, mt | mov [rsp+32], rax | call_rr extern lua_settop, L_ARG, -4 | mov rax, [rsp+32] } else { switch (mt->type) { case ENUM_TYPE: lua_getuservalue(L, -1); lua_rawseti(L, -3, ++num_upvals); /* usr value */ lua_rawseti(L, -2, ++num_upvals); /* mt */ | call_rrr extern lua_rawgeti, L_ARG, -2, num_upvals-1 | call_rrrp, extern check_enum, L_ARG, -2, -1, mt | mov [rsp+32], eax | call_rr extern lua_settop, L_ARG, -4 | mov eax, [rsp+32] break; case VOID_TYPE: lua_pop(L, 1); | call_rr extern lua_settop, L_ARG, -2 break; case BOOL_TYPE: case INT8_TYPE: case INT16_TYPE: case INT32_TYPE: lua_pop(L, 1); if (mt->is_unsigned) { | call_rr extern check_uint32, L_ARG, -1 } else { | call_rr extern check_int32, L_ARG, -1 } | mov [rsp+32], eax | call_rr extern lua_settop, L_ARG, -3 | mov eax, [rsp+32] break; case INT64_TYPE: lua_pop(L, 1); if (mt->is_unsigned) { | call_rr extern check_uint64, L_ARG, -1 } else { | call_rr extern check_int64, L_ARG, -1 } |.if X64 | mov [rsp+32], rax |.else | mov [rsp+32], RET_L | mov [rsp+36], RET_H |.endif | call_rr extern lua_settop, L_ARG, -3 |.if X64 | mov rax, [rsp+32] |.else | mov RET_L, [rsp+32] | mov RET_H, [rsp+36] |.endif break; case INTPTR_TYPE: lua_pop(L, 1); | call_rr extern check_uintptr, L_ARG, -1 | mov [rsp+32], rax | call_rr extern lua_settop, L_ARG, -3 | mov rax, [rsp+32] break; case FLOAT_TYPE: case DOUBLE_TYPE: lua_pop(L, 1); | call_rr extern check_double, L_ARG, -1 |.if X64 | movq qword [rsp+32], xmm0 | call_rr extern lua_settop, L_ARG, -3 if (mt->type == FLOAT_TYPE) { | cvtsd2ss xmm0, qword [rsp+32] } else { | movq xmm0, qword [rsp+32] } |.else | fstp qword [rsp+32] | call_rr extern lua_settop, L_ARG, -3 | fld qword [rsp+32] |.endif break; case COMPLEX_FLOAT_TYPE: lua_pop(L, 1); #if !defined HAVE_COMPLEX luaL_error(L, "ffi lib compiled without complex number support"); #endif /* on 64 bit complex floats are two floats packed into a double, * on 32 bit returned complex floats use eax and edx */ | call_rr extern check_complex_float, L_ARG, -1 | |.if X64 | movq qword [rsp+32], xmm0 |.else | mov [rsp+32], eax | mov [rsp+36], edx |.endif | | call_rr extern lua_settop, L_ARG, -3 | |.if X64 | movq xmm0, qword [rsp+32] |.else | mov eax, [rsp+32] | mov edx, [rsp+36] |.endif break; case COMPLEX_DOUBLE_TYPE: lua_pop(L, 1); #if !defined HAVE_COMPLEX luaL_error(L, "ffi lib compiled without complex number support"); #endif /* on 64 bit, returned complex doubles use xmm0, xmm1, on 32 bit * there is a hidden first parameter that points to 16 bytes where * the returned arg is stored which is popped by the called * function */ #if defined _WIN64 || defined __amd64__ | call_rr extern check_complex_double, L_ARG, -1 | movq qword [rsp+32], xmm0 | movq qword [rsp+40], xmm1 | call_rr extern lua_settop, L_ARG, -3 | movq xmm0, qword [rsp+32] | movq xmm1, qword [rsp+40] #else | mov rcx, [rbp + hidden_arg_off] | call_rrr extern check_complex_double, rcx, L_ARG, -1 | sub rsp, 4 // to realign from popped hidden arg | call_rr extern lua_settop, L_ARG, -3 #endif break; default: luaL_error(L, "NYI: callback return type"); } } |.if X64 | mov L_ARG, [rbp-8] |.else | mov L_ARG, [rbp-4] |.endif | mov rsp, rbp | pop rbp | ret x86_return_size(L, ct_usr, ct) lua_pop(L, 1); /* upval table - already in registry */ assert(lua_gettop(L) == top); ct2.is_jitted = 1; pf = (cfunction*) push_cdata(L, ct_usr, &ct2); *pf = compile(Dst, L, NULL, ref); assert(lua_gettop(L) == top + 1); return *pf; } void compile_function(lua_State* L, cfunction func, int ct_usr, const struct ctype* ct) { size_t i, nargs; int num_upvals; const struct ctype* mbr_ct; struct jit* Dst = get_jit(L); struct reg_alloc reg; void* p; int top = lua_gettop(L); int* perr = &Dst->last_errno; ct_usr = lua_absindex(L, ct_usr); memset(®, 0, sizeof(reg)); reg.off = 32 + REGISTER_STACK_SPACE(ct); dasm_setup(Dst, build_actionlist); p = push_cdata(L, ct_usr, ct); *(cfunction*) p = func; num_upvals = 1; nargs = lua_rawlen(L, ct_usr); if (ct->calling_convention != C_CALL && ct->has_var_arg) { luaL_error(L, "vararg is only allowed with the c calling convention"); } | push rbp | mov rbp, rsp | push L_ARG | push TOP | // stack is 0 (mod 16) (TOP, L_ARG, rbp, rip) | | // Get L from our arguments and allocate some stack for lua_gettop |.if X64WIN | mov L_ARG, rcx | sub rsp, 32 // shadow space |.elif X64 | mov L_ARG, rdi |.else | mov L_ARG, [rbp + 8] | sub rsp, 16 |.endif | | call_r extern lua_gettop, L_ARG | mov TOP, rax // no need for movzxd rax, eax - high word guarenteed to be zero by x86-64 | cmp rax, nargs | jl ->too_few_arguments if (!ct->has_var_arg) { | jg ->too_many_arguments } /* no need to zero extend eax returned by lua_gettop to rax as x86-64 * preguarentees that the upper 32 bits will be zero */ | shl rax, 4 // reserve 16 bytes per argument - this maintains the alignment mod 16 | sub rsp, rax | sub rsp, 32 + REGISTER_STACK_SPACE(ct) // reserve an extra 32 to call local functions #if !defined _WIN64 && !defined __amd64__ /* Returned complex doubles require a hidden first parameter where the * data is stored, which is popped by the calling code. */ lua_rawgeti(L, ct_usr, 0); mbr_ct = (const struct ctype*) lua_touserdata(L, -1); if (!mbr_ct->pointers && mbr_ct->type == COMPLEX_DOUBLE_TYPE) { /* we can allocate more space for arguments as long as no add_* * function has been called yet, mbr_ct will be added as an upvalue in * the return processing later */ | call_rrp extern push_cdata, L_ARG, 0, mbr_ct | sub rsp, 16 add_pointer(Dst, ct, ®); } lua_pop(L, 1); #endif for (i = 1; i <= nargs; i++) { lua_rawgeti(L, ct_usr, (int) i); mbr_ct = (const struct ctype*) lua_touserdata(L, -1); if (mbr_ct->pointers) { lua_getuservalue(L, -1); num_upvals += 2; | call_rrrp extern check_typed_pointer, L_ARG, i, lua_upvalueindex(num_upvals), mbr_ct add_pointer(Dst, ct, ®); } else { switch (mbr_ct->type) { case FUNCTION_PTR_TYPE: lua_getuservalue(L, -1); num_upvals += 2; | call_rrrp extern check_typed_cfunction, L_ARG, i, lua_upvalueindex(num_upvals), mbr_ct add_pointer(Dst, ct, ®); break; case ENUM_TYPE: lua_getuservalue(L, -1); num_upvals += 2; | call_rrrp, extern check_enum, L_ARG, i, lua_upvalueindex(num_upvals), mbr_ct add_int(Dst, ct, ®, 0); break; case INT8_TYPE: | call_rr extern check_int32, L_ARG, i if (mbr_ct->is_unsigned) { | movzx eax, al } else { | movsx eax, al } add_int(Dst, ct, ®, 0); lua_pop(L, 1); break; case INT16_TYPE: | call_rr extern check_int32, L_ARG, i if (mbr_ct->is_unsigned) { | movzx eax, ax } else { | movsx eax, ax } add_int(Dst, ct, ®, 0); lua_pop(L, 1); break; case BOOL_TYPE: | call_rr extern check_int32, L_ARG, i | cmp eax, 0 | setne al | movzx eax, al add_int(Dst, ct, ®, 0); lua_pop(L, 1); break; case INT32_TYPE: if (mbr_ct->is_unsigned) { | call_rr extern check_uint32, L_ARG, i } else { | call_rr extern check_int32, L_ARG, i } add_int(Dst, ct, ®, 0); lua_pop(L, 1); break; case INTPTR_TYPE: | call_rr extern check_uintptr, L_ARG, i add_pointer(Dst, ct, ®); lua_pop(L, 1); break; case INT64_TYPE: if (mbr_ct->is_unsigned) { | call_rr extern check_uint64, L_ARG, i } else { | call_rr extern check_int64, L_ARG, i } add_int(Dst, ct, ®, 1); lua_pop(L, 1); break; case DOUBLE_TYPE: | call_rr extern check_double, L_ARG, i add_float(Dst, ct, ®, 1); lua_pop(L, 1); break; case COMPLEX_DOUBLE_TYPE: /* on 64 bit, returned complex doubles use xmm0, xmm1, on 32 bit * there is a hidden first parameter that points to 16 bytes where * the returned arg is stored (this is popped by the called * function) */ #if defined _WIN64 || defined __amd64__ | call_rr extern check_complex_double, L_ARG, i add_float(Dst, ct, ®, 1); | movq xmm0, xmm1 add_float(Dst, ct, ®, 1); #else | lea rax, [rsp+reg.off] | sub rsp, 4 | call_rrr extern check_complex_double, rax, L_ARG, i reg.off += 16; #endif lua_pop(L, 1); break; case FLOAT_TYPE: | call_rr extern check_double, L_ARG, i add_float(Dst, ct, ®, 0); lua_pop(L, 1); break; case COMPLEX_FLOAT_TYPE: #if defined _WIN64 || defined __amd64__ | call_rr extern check_complex_float, L_ARG, i /* complex floats are two floats packed into a double */ add_float(Dst, ct, ®, 1); #else /* returned complex floats use eax and edx */ | call_rr extern check_complex_float, L_ARG, i | mov [rsp], eax | fld dword [rsp] add_float(Dst, ct, ®, 0); | mov [rsp], edx | fld dword [rsp] add_float(Dst, ct, ®, 0); #endif lua_pop(L, 1); break; default: luaL_error(L, "NYI: call arg type"); } } } if (ct->has_var_arg) { #ifdef _WIN64 |.if X64WIN if (reg.regs < MAX_REGISTERS(ct)) { assert(reg.regs == nargs); | cmp TOP, MAX_REGISTERS(ct) | jle >1 | // unpack onto stack | mov rax, rsp | add rax, 32 + 8*MAX_REGISTERS(ct) | call_rrrr extern unpack_varargs_stack, L_ARG, MAX_REGISTERS(ct)+1, TOP, rax | // unpack to registers | mov rax, rsp | add rax, 32 + 8*(reg.regs) | call_rrrr extern unpack_varargs_reg, L_ARG, nargs+1, MAX_REGISTERS(ct), rax | jmp >2 |1: | // unpack just to registers | mov rax, rsp | add rax, 32 + 8*(reg.regs) | call_rrrr extern unpack_varargs_reg, L_ARG, nargs+1, TOP, rax |2: } else { | // unpack just to stack | mov rax, rsp | add rax, reg.off | call_rrrr extern unpack_varargs_stack, L_ARG, nargs+1, TOP, rax } for (i = nargs; i < MAX_REGISTERS(ct); i++) { reg.is_int[i] = reg.is_float[i] = 1; } reg.regs = MAX_REGISTERS(ct); #elif defined __amd64__ |.elif X64 if (reg.floats < MAX_FLOAT_REGISTERS(ct)) { | mov rax, rsp | add rax, 32 + 8*(MAX_INT_REGISTERS(ct) + reg.floats) | call_rrrrr extern unpack_varargs_float, L_ARG, nargs+1, TOP, MAX_FLOAT_REGISTERS(ct) - reg.floats, rax } if (reg.ints < MAX_INT_REGISTERS(ct)) { | mov rax, rsp | add rax, 32 + 8*(reg.ints) | call_rrrrr extern unpack_varargs_int, L_ARG, nargs+1, TOP, MAX_INT_REGISTERS(ct) - reg.ints, rax } | mov rax, rsp | add rax, reg.off | call_rrrrrr extern unpack_varargs_stack_skip, L_ARG, nargs+1, TOP, MAX_INT_REGISTERS(ct) - reg.ints, MAX_FLOAT_REGISTERS(ct) - reg.floats, rax reg.floats = MAX_FLOAT_REGISTERS(ct); reg.ints = MAX_INT_REGISTERS(ct); #else |.else | mov rax, rsp | add rax, reg.off | call_rrrr extern unpack_varargs_stack, L_ARG, nargs+1, TOP, rax |.endif #endif } | mov64 rcx, perr | mov eax, dword [rcx] | call_r extern SetLastError, rax /* remove the stack space to call local functions */ |.if X32WIN | add rsp, 28 // SetLastError will have already popped 4 |.else | add rsp, 32 |.endif #ifdef _WIN64 |.if X64WIN switch (reg.regs) { case 4: if (reg.is_float[3]) { | movq xmm3, qword [rsp + 8*3] } if (reg.is_int[3]) { | mov r9, [rsp + 8*3] } case 3: if (reg.is_float[2]) { | movq xmm2, qword [rsp + 8*2] } if (reg.is_int[2]) { | mov r8, [rsp + 8*2] } case 2: if (reg.is_float[1]) { | movq xmm1, qword [rsp + 8*1] } if (reg.is_int[1]) { | mov rdx, [rsp + 8*1] } case 1: if (reg.is_float[0]) { | movq xmm0, qword [rsp] } if (reg.is_int[0]) { | mov rcx, [rsp] } case 0: break; } /* don't remove the space for the registers as we need 32 bytes of register overflow space */ assert(REGISTER_STACK_SPACE(ct) == 32); #elif defined __amd64__ |.elif X64 switch (reg.floats) { case 8: | movq xmm7, qword [rsp + 8*(MAX_INT_REGISTERS(ct)+7)] case 7: | movq xmm6, qword [rsp + 8*(MAX_INT_REGISTERS(ct)+6)] case 6: | movq xmm5, qword [rsp + 8*(MAX_INT_REGISTERS(ct)+5)] case 5: | movq xmm4, qword [rsp + 8*(MAX_INT_REGISTERS(ct)+4)] case 4: | movq xmm3, qword [rsp + 8*(MAX_INT_REGISTERS(ct)+3)] case 3: | movq xmm2, qword [rsp + 8*(MAX_INT_REGISTERS(ct)+2)] case 2: | movq xmm1, qword [rsp + 8*(MAX_INT_REGISTERS(ct)+1)] case 1: | movq xmm0, qword [rsp + 8*(MAX_INT_REGISTERS(ct))] case 0: break; } switch (reg.ints) { case 6: | mov r9, [rsp + 8*5] case 5: | mov r8, [rsp + 8*4] case 4: | mov rcx, [rsp + 8*3] case 3: | mov rdx, [rsp + 8*2] case 2: | mov rsi, [rsp + 8*1] case 1: | mov rdi, [rsp] case 0: break; } | add rsp, REGISTER_STACK_SPACE(ct) #else |.else if (ct->calling_convention == FAST_CALL) { switch (reg.ints) { case 2: | mov edx, [rsp + 4] case 1: | mov ecx, [rsp] case 0: break; } | add rsp, REGISTER_STACK_SPACE(ct) } |.endif #endif #ifdef __amd64__ if (ct->has_var_arg) { /* al stores an upper limit on the number of float register, note that * its allowed to be more than the actual number of float registers used as * long as its 0-8 */ |.if X64 and not X64WIN | mov al, 8 |.endif } #endif | call extern FUNCTION | sub rsp, 48 // 32 to be able to call local functions, 16 so we can store some local variables /* note on windows X86 the stack may be only aligned to 4 (stdcall will * have popped a multiple of 4 bytes), but we don't need 16 byte alignment on * that platform */ lua_rawgeti(L, ct_usr, 0); mbr_ct = (const struct ctype*) lua_touserdata(L, -1); if (mbr_ct->pointers || mbr_ct->type == INTPTR_TYPE) { lua_getuservalue(L, -1); num_upvals += 2; | mov [rsp+32], rax // save the pointer | get_errno | call_rrp extern push_cdata, L_ARG, lua_upvalueindex(num_upvals), mbr_ct | mov rcx, [rsp+32] | mov [rax], rcx // *(void**) cdata = val | jmp ->lua_return_arg } else { switch (mbr_ct->type) { case FUNCTION_PTR_TYPE: lua_getuservalue(L, -1); num_upvals += 2; | mov [rsp+32], rax // save the function pointer | get_errno | call_rrp extern push_cdata, L_ARG, lua_upvalueindex(num_upvals), mbr_ct | mov rcx, [rsp+32] | mov [rax], rcx // *(cfunction**) cdata = val | jmp ->lua_return_arg break; case INT64_TYPE: num_upvals++; | // save the return value |.if X64 | mov [rsp+32], rax |.else | mov [rsp+36], edx // high | mov [rsp+32], eax // low |.endif | | get_errno | call_rrp extern push_cdata, L_ARG, 0, mbr_ct | | // *(int64_t*) cdata = val |.if X64 | mov rcx, [rsp+32] | mov [rax], rcx |.else | mov rcx, [rsp+36] | mov rdx, [rsp+32] | mov [rax+4], rcx | mov [rax], rdx |.endif | | jmp ->lua_return_arg break; case COMPLEX_FLOAT_TYPE: num_upvals++; |.if X64 | // complex floats are returned as two floats packed into xmm0 | movq qword [rsp+32], xmm0 |.else | // complex floats are returned as floats in eax and edx | mov [rsp+32], eax | mov [rsp+36], edx |.endif | | get_errno | call_rrp extern push_cdata, L_ARG, 0, mbr_ct | | // ((complex_float*) cdata) = val |.if X64 | mov rcx, [rsp+32] | mov [rax], rcx |.else | mov ecx, [rsp+32] | mov [rax], ecx | mov ecx, [rsp+36] | mov [rax+4], ecx |.endif | | jmp ->lua_return_arg break; case COMPLEX_DOUBLE_TYPE: num_upvals++; |.if X64 | // complex doubles are returned as xmm0 and xmm1 | movq qword [rsp+40], xmm1 | movq qword [rsp+32], xmm0 | | get_errno | call_rrp extern push_cdata, L_ARG, 0, mbr_ct | | // ((complex_double*) cdata)->real = val0 | // ((complex_double*) cdata)->imag = val1 | mov rcx, [rsp+40] | mov [rax+8], rcx | mov rcx, [rsp+32] | mov [rax], rcx | |.else | // On 32 bit we have already handled this by pushing a new cdata | // and handing the cdata ptr in as the hidden first param, but | // still need to add mbr_ct as an upval as its used earlier. | // Hidden param was popped by called function, we need to realign. | sub rsp, 4 | get_errno |.endif | | jmp ->lua_return_arg break; case VOID_TYPE: lua_pop(L, 1); | jmp ->lua_return_void break; case BOOL_TYPE: lua_pop(L, 1); | jmp ->lua_return_bool break; case INT8_TYPE: lua_pop(L, 1); if (mbr_ct->is_unsigned) { | movzx eax, al } else { | movsx eax, al } | jmp ->lua_return_int break; case INT16_TYPE: lua_pop(L, 1); if (mbr_ct->is_unsigned) { | movzx eax, ax } else { | movsx eax, ax } | jmp ->lua_return_int break; case INT32_TYPE: case ENUM_TYPE: lua_pop(L, 1); if (mbr_ct->is_unsigned) { | jmp ->lua_return_uint } else { | jmp ->lua_return_int } break; case FLOAT_TYPE: lua_pop(L, 1); |.if X64 | cvtss2sd xmm0, xmm0 |.endif | jmp ->lua_return_double break; case DOUBLE_TYPE: lua_pop(L, 1); | jmp ->lua_return_double break; default: luaL_error(L, "NYI: call return type"); } } assert(lua_gettop(L) == top + num_upvals); { cfunction f = compile(Dst, L, func, LUA_NOREF); /* add a callback as an upval so that the jitted code gets cleaned up when * the function gets gc'd */ push_callback(L, f); lua_pushcclosure(L, (lua_CFunction) f, num_upvals+1); } }