/* output a single import thunk */ static void output_import_thunk( const char *name, const char *table, int pos ) { output( "\n\t.align %d\n", get_alignment(4) ); output( "\t%s\n", func_declaration(name) ); output( "%s\n", asm_globl(name) ); output_cfi( ".cfi_startproc" ); switch(target_cpu) { case CPU_x86: if (!UsePIC) { output( "\tjmp *(%s+%d)\n", table, pos ); } else { output( "\tcall %s\n", asm_name("__wine_spec_get_pc_thunk_eax") ); output( "1:\tjmp *%s+%d-1b(%%eax)\n", table, pos ); } break; case CPU_x86_64: output( "\tjmpq *%s+%d(%%rip)\n", table, pos ); break; case CPU_ARM: output( "\tldr IP,1f\n"); output( "\tldr PC,[PC,IP]\n" ); output( "1:\t.long %s+%u-(1b+4)\n", table, pos ); break; case CPU_ARM64: output( "\tadr x9, 1f\n" ); output( "\tldur x9, [x9, #0]\n" ); if (pos & 0xf000) output( "\tadd x9, x9, #%u\n", pos & 0xf000 ); if (pos & 0x0f00) output( "\tadd x9, x9, #%u\n", pos & 0x0f00 ); if (pos & 0x00f0) output( "\tadd x9, x9, #%u\n", pos & 0x00f0 ); if (pos & 0x000f) output( "\tadd x9, x9, #%u\n", pos & 0x000f ); output( "\tldur x9, [x9, #0]\n" ); output( "\tbr x9\n" ); output( "1:\t.quad %s\n", table ); break; case CPU_POWERPC: output( "\tmr %s, %s\n", ppc_reg(0), ppc_reg(31) ); if (target_platform == PLATFORM_APPLE) { output( "\tlis %s, ha16(%s+%d+32768)\n", ppc_reg(31), table, pos ); output( "\tla %s, lo16(%s+%d)(%s)\n", ppc_reg(31), table, pos, ppc_reg(31) ); } else { output( "\tlis %s, (%s+%d+32768)@h\n", ppc_reg(31), table, pos ); output( "\tla %s, (%s+%d)@l(%s)\n", ppc_reg(31), table, pos, ppc_reg(31) ); } output( "\tlwz %s, 0(%s)\n", ppc_reg(31), ppc_reg(31) ); output( "\tmtctr %s\n", ppc_reg(31) ); output( "\tmr %s, %s\n", ppc_reg(31), ppc_reg(0) ); output( "\tbctr\n" ); break; } output_cfi( ".cfi_endproc" ); output_function_size( name ); }
/* output the get_pc thunk if needed
 *
 * The thunk copies the word at the top of the stack (its own return
 * address) into %eax and returns. It is only emitted when generating
 * position-independent code for CPU_x86.
 */
void output_get_pc_thunk(void)
{
    static const char thunk_name[] = "__wine_spec_get_pc_thunk_eax";

    if (target_cpu != CPU_x86 || !UsePIC) return;

    output( "\n\t.text\n" );
    output( "\t.align %d\n", get_alignment(4) );
    output( "\t%s\n", func_declaration(thunk_name) );
    output( "%s:\n", asm_name(thunk_name) );

    output_cfi( ".cfi_startproc" );
    output( "\tmovl (%%esp),%%eax\n" );
    output( "\tret\n" );
    output_cfi( ".cfi_endproc" );

    output_function_size( thunk_name );
}
/* output the delayed import thunks of a Win32 module */ static void output_delayed_import_thunks( const DLLSPEC *spec ) { int i, idx, j, pos, extra_stack_storage = 0; static const char delayed_import_loaders[] = "__wine_spec_delayed_import_loaders"; static const char delayed_import_thunks[] = "__wine_spec_delayed_import_thunks"; if (!nb_delayed) return; output( "\n/* delayed import thunks */\n\n" ); output( "\t.text\n" ); output( "\t.align %d\n", get_alignment(8) ); output( "%s:\n", asm_name(delayed_import_loaders)); output( "\t%s\n", func_declaration("__wine_delay_load_asm") ); output( "%s:\n", asm_name("__wine_delay_load_asm") ); output_cfi( ".cfi_startproc" ); switch(target_cpu) { case CPU_x86: output( "\tpushl %%ecx\n" ); output_cfi( ".cfi_adjust_cfa_offset 4" ); output( "\tpushl %%edx\n" ); output_cfi( ".cfi_adjust_cfa_offset 4" ); output( "\tpushl %%eax\n" ); output_cfi( ".cfi_adjust_cfa_offset 4" ); output( "\tcall %s\n", asm_name("__wine_spec_delay_load") ); output_cfi( ".cfi_adjust_cfa_offset -4" ); output( "\tpopl %%edx\n" ); output_cfi( ".cfi_adjust_cfa_offset -4" ); output( "\tpopl %%ecx\n" ); output_cfi( ".cfi_adjust_cfa_offset -4" ); output( "\tjmp *%%eax\n" ); break; case CPU_x86_64: output( "\tsubq $88,%%rsp\n" ); output_cfi( ".cfi_adjust_cfa_offset 88" ); output( "\tmovq %%rdx,80(%%rsp)\n" ); output( "\tmovq %%rcx,72(%%rsp)\n" ); output( "\tmovq %%r8,64(%%rsp)\n" ); output( "\tmovq %%r9,56(%%rsp)\n" ); output( "\tmovq %%r10,48(%%rsp)\n" ); output( "\tmovq %%r11,40(%%rsp)\n" ); output( "\tmovq %%rax,%%rcx\n" ); output( "\tcall %s\n", asm_name("__wine_spec_delay_load") ); output( "\tmovq 40(%%rsp),%%r11\n" ); output( "\tmovq 48(%%rsp),%%r10\n" ); output( "\tmovq 56(%%rsp),%%r9\n" ); output( "\tmovq 64(%%rsp),%%r8\n" ); output( "\tmovq 72(%%rsp),%%rcx\n" ); output( "\tmovq 80(%%rsp),%%rdx\n" ); output( "\taddq $88,%%rsp\n" ); output_cfi( ".cfi_adjust_cfa_offset -88" ); output( "\tjmp *%%rax\n" ); break; case CPU_SPARC: output( "\tsave %%sp, -96, 
%%sp\n" ); output( "\tcall %s\n", asm_name("__wine_spec_delay_load") ); output( "\tmov %%g1, %%o0\n" ); output( "\tjmp %%o0\n" ); output( "\trestore\n" ); break; case CPU_ARM: output( "\tstmfd SP!, {r4-r10,FP,LR}\n" ); output( "\tmov LR,PC\n"); output( "\tadd LR,LR,#8\n"); output( "\tldr PC,[PC,#-4]\n"); output( "\t.long %s\n", asm_name("__wine_spec_delay_load") ); output( "\tmov IP,r0\n"); output( "\tldmfd SP!, {r4-r10,FP,LR}\n" ); output( "\tldmfd SP!, {r0-r3}\n" ); output( "\tmov PC,IP\n"); break; case CPU_ARM64: output( "\tstp x29, x30, [sp,#-16]!\n" ); output( "\tmov x29, sp\n" ); output( "\tadr x9, 1f\n" ); output( "\tldur x9, [x9, #0]\n" ); output( "\tblr x9\n" ); output( "\tmov x9, x0\n" ); output( "\tldp x29, x30, [sp],#16\n" ); output( "\tldp x0, x1, [sp,#16]\n" ); output( "\tldp x2, x3, [sp,#32]\n" ); output( "\tldp x4, x5, [sp,#48]\n" ); output( "\tldp x6, x7, [sp],#80\n" ); output( "\tbr x9\n" ); /* or "ret x9" */ output( "1:\t.quad %s\n", asm_name("__wine_spec_delay_load") ); break; case CPU_POWERPC: if (target_platform == PLATFORM_APPLE) extra_stack_storage = 56; /* Save all callee saved registers into a stackframe. 
*/ output( "\tstwu %s, -%d(%s)\n",ppc_reg(1), 48+extra_stack_storage, ppc_reg(1)); output( "\tstw %s, %d(%s)\n", ppc_reg(3), 4+extra_stack_storage, ppc_reg(1)); output( "\tstw %s, %d(%s)\n", ppc_reg(4), 8+extra_stack_storage, ppc_reg(1)); output( "\tstw %s, %d(%s)\n", ppc_reg(5), 12+extra_stack_storage, ppc_reg(1)); output( "\tstw %s, %d(%s)\n", ppc_reg(6), 16+extra_stack_storage, ppc_reg(1)); output( "\tstw %s, %d(%s)\n", ppc_reg(7), 20+extra_stack_storage, ppc_reg(1)); output( "\tstw %s, %d(%s)\n", ppc_reg(8), 24+extra_stack_storage, ppc_reg(1)); output( "\tstw %s, %d(%s)\n", ppc_reg(9), 28+extra_stack_storage, ppc_reg(1)); output( "\tstw %s, %d(%s)\n", ppc_reg(10),32+extra_stack_storage, ppc_reg(1)); output( "\tstw %s, %d(%s)\n", ppc_reg(11),36+extra_stack_storage, ppc_reg(1)); output( "\tstw %s, %d(%s)\n", ppc_reg(12),40+extra_stack_storage, ppc_reg(1)); /* r0 -> r3 (arg1) */ output( "\tmr %s, %s\n", ppc_reg(3), ppc_reg(0)); /* save return address */ output( "\tmflr %s\n", ppc_reg(0)); output( "\tstw %s, %d(%s)\n", ppc_reg(0), 44+extra_stack_storage, ppc_reg(1)); /* Call the __wine_delay_load function, arg1 is arg1. */ output( "\tbl %s\n", asm_name("__wine_spec_delay_load") ); /* Load return value from call into ctr register */ output( "\tmtctr %s\n", ppc_reg(3)); /* restore all saved registers and drop stackframe. 
*/ output( "\tlwz %s, %d(%s)\n", ppc_reg(3), 4+extra_stack_storage, ppc_reg(1)); output( "\tlwz %s, %d(%s)\n", ppc_reg(4), 8+extra_stack_storage, ppc_reg(1)); output( "\tlwz %s, %d(%s)\n", ppc_reg(5), 12+extra_stack_storage, ppc_reg(1)); output( "\tlwz %s, %d(%s)\n", ppc_reg(6), 16+extra_stack_storage, ppc_reg(1)); output( "\tlwz %s, %d(%s)\n", ppc_reg(7), 20+extra_stack_storage, ppc_reg(1)); output( "\tlwz %s, %d(%s)\n", ppc_reg(8), 24+extra_stack_storage, ppc_reg(1)); output( "\tlwz %s, %d(%s)\n", ppc_reg(9), 28+extra_stack_storage, ppc_reg(1)); output( "\tlwz %s, %d(%s)\n", ppc_reg(10),32+extra_stack_storage, ppc_reg(1)); output( "\tlwz %s, %d(%s)\n", ppc_reg(11),36+extra_stack_storage, ppc_reg(1)); output( "\tlwz %s, %d(%s)\n", ppc_reg(12),40+extra_stack_storage, ppc_reg(1)); /* Load return value from call into return register */ output( "\tlwz %s, %d(%s)\n", ppc_reg(0), 44+extra_stack_storage, ppc_reg(1)); output( "\tmtlr %s\n", ppc_reg(0)); output( "\taddi %s, %s, %d\n", ppc_reg(1), ppc_reg(1), 48+extra_stack_storage); /* branch to ctr register. */ output( "\tbctr\n"); break; } output_cfi( ".cfi_endproc" ); output_function_size( "__wine_delay_load_asm" ); output( "\n" ); for (i = idx = 0; i < nb_imports; i++) { if (!dll_imports[i]->delay) continue; for (j = 0; j < dll_imports[i]->nb_imports; j++) { ORDDEF *odp = dll_imports[i]->imports[j]; const char *name = odp->name ? 
odp->name : odp->export_name; output( ".L__wine_delay_imp_%d_%s:\n", i, name ); output_cfi( ".cfi_startproc" ); switch(target_cpu) { case CPU_x86: output( "\tmovl $%d, %%eax\n", (idx << 16) | j ); output( "\tjmp %s\n", asm_name("__wine_delay_load_asm") ); break; case CPU_x86_64: output( "\tmovq $%d,%%rax\n", (idx << 16) | j ); output( "\tjmp %s\n", asm_name("__wine_delay_load_asm") ); break; case CPU_SPARC: output( "\tset %d, %%g1\n", (idx << 16) | j ); output( "\tb,a %s\n", asm_name("__wine_delay_load_asm") ); output( "\tnop\n" ); break; case CPU_ARM: output( "\tstmfd SP!, {r0-r3}\n" ); output( "\tmov r0, #%d\n", idx ); output( "\tmov r1, #16384\n" ); output( "\tmul r1, r0, r1\n" ); output( "\tmov r0, r1\n" ); output( "\tmov r1, #4\n" ); output( "\tmul r1, r0, r1\n" ); output( "\tmov r0, r1\n" ); output( "\tadd r0, #%d\n", j ); output( "\tldr PC,[PC,#-4]\n"); output( "\t.long %s\n", asm_name("__wine_delay_load_asm") ); break; case CPU_ARM64: output( "\tstp x6, x7, [sp,#-80]!\n" ); output( "\tstp x4, x5, [sp,#48]\n" ); output( "\tstp x2, x3, [sp,#32]\n" ); output( "\tstp x0, x1, [sp,#16]\n" ); output( "\tmov x0, #%d\n", idx ); output( "\tmov x1, #16384\n" ); output( "\tmul x1, x0, x1\n" ); output( "\tmov x0, x1\n" ); output( "\tmov x1, #4\n" ); output( "\tmul x1, x0, x1\n" ); output( "\tmov x0, x1\n" ); output( "\tadd x0, x0, #%d\n", j ); output( "\tadr x9, 1f\n" ); output( "\tldur x9, [x9, #0]\n" ); output( "\tbr x9\n" ); output( "1:\t.quad %s\n", asm_name("__wine_delay_load_asm") ); break; case CPU_POWERPC: switch(target_platform) { case PLATFORM_APPLE: /* On Darwin we can use r0 and r2 */ /* Upper part in r2 */ output( "\tlis %s, %d\n", ppc_reg(2), idx); /* Lower part + r2 -> r0, Note we can't use r0 directly */ output( "\taddi %s, %s, %d\n", ppc_reg(0), ppc_reg(2), j); output( "\tb %s\n", asm_name("__wine_delay_load_asm") ); break; default: /* On linux we can't use r2 since r2 is not a scratch register (hold the TOC) */ /* Save r13 on the stack */ output( 
"\taddi %s, %s, -0x4\n", ppc_reg(1), ppc_reg(1)); output( "\tstw %s, 0(%s)\n", ppc_reg(13), ppc_reg(1)); /* Upper part in r13 */ output( "\tlis %s, %d\n", ppc_reg(13), idx); /* Lower part + r13 -> r0, Note we can't use r0 directly */ output( "\taddi %s, %s, %d\n", ppc_reg(0), ppc_reg(13), j); /* Restore r13 */ output( "\tstw %s, 0(%s)\n", ppc_reg(13), ppc_reg(1)); output( "\taddic %s, %s, 0x4\n", ppc_reg(1), ppc_reg(1)); output( "\tb %s\n", asm_name("__wine_delay_load_asm") ); break; } break; } output_cfi( ".cfi_endproc" ); } idx++; } output_function_size( delayed_import_loaders ); output( "\n\t.align %d\n", get_alignment(get_ptr_size()) ); output( "%s:\n", asm_name(delayed_import_thunks)); for (i = pos = 0; i < nb_imports; i++) { if (!dll_imports[i]->delay) continue; for (j = 0; j < dll_imports[i]->nb_imports; j++, pos += get_ptr_size()) { ORDDEF *odp = dll_imports[i]->imports[j]; output_import_thunk( odp->name ? odp->name : odp->export_name, ".L__wine_delay_IAT", pos ); } } output_function_size( delayed_import_thunks ); }
/* output a single import thunk */ static void output_import_thunk( const char *name, const char *table, int pos ) { output( "\n\t.align %d\n", get_alignment(4) ); output( "\t%s\n", func_declaration(name) ); output( "%s\n", asm_globl(name) ); output_cfi( ".cfi_startproc" ); switch(target_cpu) { case CPU_x86: if (!UsePIC) { output( "\tjmp *(%s+%d)\n", table, pos ); } else { output( "\tcall %s\n", asm_name("__wine_spec_get_pc_thunk_eax") ); output( "1:\tjmp *%s+%d-1b(%%eax)\n", table, pos ); } break; case CPU_x86_64: output( "\tjmpq *%s+%d(%%rip)\n", table, pos ); break; case CPU_SPARC: if ( !UsePIC ) { output( "\tsethi %%hi(%s+%d), %%g1\n", table, pos ); output( "\tld [%%g1+%%lo(%s+%d)], %%g1\n", table, pos ); output( "\tjmp %%g1\n" ); output( "\tnop\n" ); } else { /* Hmpf. Stupid sparc assembler always interprets global variable names as GOT offsets, so we have to do it the long way ... */ output( "\tsave %%sp, -96, %%sp\n" ); output( "0:\tcall 1f\n" ); output( "\tnop\n" ); output( "1:\tsethi %%hi(%s+%d-0b), %%g1\n", table, pos ); output( "\tor %%g1, %%lo(%s+%d-0b), %%g1\n", table, pos ); output( "\tld [%%g1+%%o7], %%g1\n" ); output( "\tjmp %%g1\n" ); output( "\trestore\n" ); } break; case CPU_ARM: output( "\tldr IP,[PC,#0]\n"); output( "\tldr PC,[IP,#%d]\n", pos); output( "\t.long %s\n", table ); break; case CPU_ARM64: output( "\tadr x9, 1f\n" ); output( "\tldur x9, [x9, #0]\n" ); if (pos & 0xf000) output( "\tadd x9, x9, #%u\n", pos & 0xf000 ); if (pos & 0x0f00) output( "\tadd x9, x9, #%u\n", pos & 0x0f00 ); if (pos & 0x00f0) output( "\tadd x9, x9, #%u\n", pos & 0x00f0 ); if (pos & 0x000f) output( "\tadd x9, x9, #%u\n", pos & 0x000f ); output( "\tldur x9, [x9, #0]\n" ); output( "\tbr x9\n" ); output( "1:\t.quad %s\n", table ); break; case CPU_POWERPC: output( "\tmr %s, %s\n", ppc_reg(0), ppc_reg(31) ); if (target_platform == PLATFORM_APPLE) { output( "\tlis %s, ha16(%s+%d+32768)\n", ppc_reg(31), table, pos ); output( "\tla %s, lo16(%s+%d)(%s)\n", ppc_reg(31), 
table, pos, ppc_reg(31) ); } else { output( "\tlis %s, (%s+%d+32768)@h\n", ppc_reg(31), table, pos ); output( "\tla %s, (%s+%d)@l(%s)\n", ppc_reg(31), table, pos, ppc_reg(31) ); } output( "\tlwz %s, 0(%s)\n", ppc_reg(31), ppc_reg(31) ); output( "\tmtctr %s\n", ppc_reg(31) ); output( "\tmr %s, %s\n", ppc_reg(31), ppc_reg(0) ); output( "\tbctr\n" ); break; } output_cfi( ".cfi_endproc" ); output_function_size( name ); }
/*******************************************************************
 *         output_stubs
 *
 * Output the functions for stub entry points.
 *
 * For every entry point of type TYPE_STUB a small function is emitted
 * that passes the spec file name and either the exported name or the
 * ordinal to __wine_spec_unimplemented_stub. The exported-name strings
 * are emitted afterwards in the string section, but only if at least
 * one stub referenced one.
 */
void output_stubs( DLLSPEC *spec )
{
    const char *name, *exp_name;
    int i, n, count = 0;

    if (!has_stubs( spec )) return;

    output( "\n/* stub functions */\n\n" );
    output( "\t.text\n" );

    for (i = 0; i < spec->nb_entry_points; i++)
    {
        ORDDEF *odp = &spec->entry_points[i];

        if (odp->type != TYPE_STUB) continue;

        name = get_stub_name( odp, spec );
        exp_name = odp->name ? odp->name : odp->export_name;

        output( "\t.align %d\n", get_alignment(4) );
        output( "\t%s\n", func_declaration(name) );
        output( "%s:\n", asm_name(name) );
        output_cfi( ".cfi_startproc" );

        switch (target_cpu)
        {
        case CPU_x86:
            /* flesh out the stub a bit to make safedisc happy */
            for (n = 0; n < 9; n++) output(" \tnop\n" );

            output( "\tsubl $12,%%esp\n" );
            output_cfi( ".cfi_adjust_cfa_offset 12" );
            if (!UsePIC)
            {
                if (exp_name)
                {
                    output( "\tmovl $.L%s_string,4(%%esp)\n", name );
                    count++;
                }
                else
                    output( "\tmovl $%d,4(%%esp)\n", odp->ordinal );
                output( "\tmovl $.L__wine_spec_file_name,(%%esp)\n" );
            }
            else
            {
                /* address everything relative to label 1, whose address ends up in %eax */
                output( "\tcall %s\n", asm_name("__wine_spec_get_pc_thunk_eax") );
                output( "1:" );
                if (exp_name)
                {
                    output( "\tleal .L%s_string-1b(%%eax),%%ecx\n", name );
                    output( "\tmovl %%ecx,4(%%esp)\n" );
                    count++;
                }
                else
                    output( "\tmovl $%d,4(%%esp)\n", odp->ordinal );
                output( "\tleal .L__wine_spec_file_name-1b(%%eax),%%ecx\n" );
                output( "\tmovl %%ecx,(%%esp)\n" );
            }
            output( "\tcall %s\n", asm_name("__wine_spec_unimplemented_stub") );
            break;

        case CPU_x86_64:
            output( "\tsubq $8,%%rsp\n" );
            output_cfi( ".cfi_adjust_cfa_offset 8" );
            output( "\tleaq .L__wine_spec_file_name(%%rip),%%rdi\n" );
            if (exp_name)
            {
                output( "leaq .L%s_string(%%rip),%%rsi\n", name );
                count++;
            }
            else
                output( "\tmovq $%d,%%rsi\n", odp->ordinal );
            output( "\tcall %s\n", asm_name("__wine_spec_unimplemented_stub") );
            break;

        case CPU_ARM:
            /* each "mov PC,PC" skips over the literal word that follows it */
            output( "\tldr r0,[PC,#0]\n");
            output( "\tmov PC,PC\n");
            output( "\t.long .L__wine_spec_file_name\n" );
            output( "\tldr r1,[PC,#0]\n");
            output( "\tmov PC,PC\n");
            if (exp_name)
            {
                output( "\t.long .L%s_string\n", name );
                count++;
            }
            else
                output( "\t.long %d\n", odp->ordinal );
            output( "\tbl %s\n", asm_name("__wine_spec_unimplemented_stub") );
            break;

        default:
            assert(0);
        }
        output_cfi( ".cfi_endproc" );
        output_function_size( name );
    }

    /* no stub referenced a name string: nothing more to emit */
    if (!count) return;

    output( "\t%s\n", get_asm_string_section() );
    for (i = 0; i < spec->nb_entry_points; i++)
    {
        ORDDEF *odp = &spec->entry_points[i];

        if (odp->type != TYPE_STUB) continue;
        exp_name = odp->name ? odp->name : odp->export_name;
        if (!exp_name) continue;

        name = get_stub_name( odp, spec );
        output( ".L%s_string:\n", name );
        output( "\t%s \"%s\"\n", get_asm_string_keyword(), exp_name );
    }
}
/*******************************************************************
 *         BuildCallTo32CBClient
 *
 * Call a CBClient relay stub from 32-bit code (KERNEL.620).
 *
 * Since the relay stub is itself 32-bit, this should not be a problem;
 * unfortunately, the relay stubs are expected to switch back to a
 * 16-bit stack (and 16-bit code) after completion :-(
 *
 * This would conflict with our 16- vs. 32-bit stack handling, so
 * we simply switch *back* to our 32-bit stack before returning to
 * the caller ...
 *
 * The CBClient relay stub expects to be called with the following
 * 16-bit stack layout, and with ebp and ebx pointing into the 16-bit
 * stack at the designated places:
 *
 *    ...
 *  (ebp+14) original arguments to the callback routine
 *  (ebp+10) far return address to original caller
 *  (ebp+6)  Thunklet target address
 *  (ebp+2)  Thunklet relay ID code
 *  (ebp)    BP (saved by CBClientGlueSL)
 *  (ebp-2)  SI (saved by CBClientGlueSL)
 *  (ebp-4)  DI (saved by CBClientGlueSL)
 *  (ebp-6)  DS (saved by CBClientGlueSL)
 *
 *   ...     buffer space used by the 16-bit side glue for temp copies
 *
 *  (ebx+4)  far return address to 16-bit side glue code
 *  (ebx)    saved 16-bit ss:sp (pointing to ebx+4)
 *
 * The 32-bit side glue code accesses both the original arguments (via ebp)
 * and the temporary copies prepared by the 16-bit side glue (via ebx).
 * After completion, the stub will load ss:sp from the buffer at ebx
 * and perform a far return to 16-bit code.
 *
 * To trick the relay stub into returning to us, we replace the 16-bit
 * return address to the glue code by a cs:ip pair pointing to our
 * return entry point (the original return address is saved first).
 * Our return stub thus called will then reload the 32-bit ss:esp and
 * return to 32-bit code (by using an ss:esp value that we have also
 * pushed onto the 16-bit stack before and a cs:eip value found at
 * that position on the 32-bit stack).  The ss:esp to be restored is
 * found relative to the 16-bit stack pointer at:
 *
 *  (ebx-4)   ss  (flat)
 *  (ebx-8)   sp  (32-bit stack pointer)
 *
 * The second variant of this routine, CALL32_CBClientEx, which is used
 * to implement KERNEL.621, has to cope with yet another problem: Here,
 * the 32-bit side directly returns to the caller of the CBClient thunklet,
 * restoring registers saved by CBClientGlueSL and cleaning up the stack.
 * As we have to return to our 32-bit code first, we have to adapt the
 * layout of our temporary area so as to include values for the registers
 * that are to be restored, and later (in the implementation of KERNEL.621)
 * we *really* restore them. The return stub restores DS, DI, SI, and BP
 * from the stack, skips the next 8 bytes (CBClient relay code / target),
 * and then performs a lret NN, where NN is the number of arguments to be
 * removed. Thus, we prepare our temporary area as follows:
 *
 *     (ebx+22) 16-bit cs  (this segment)
 *     (ebx+20) 16-bit ip  ('16-bit' return entry point)
 *     (ebx+16) 32-bit ss  (flat)
 *     (ebx+12) 32-bit sp  (32-bit stack pointer)
 *     (ebx+10) 16-bit bp  (points to ebx+24)
 *     (ebx+8)  16-bit si  (ignored)
 *     (ebx+6)  16-bit di  (ignored)
 *     (ebx+4)  16-bit ds  (we actually use the flat DS here)
 *     (ebx+2)  16-bit ss  (16-bit stack segment)
 *     (ebx+0)  16-bit sp  (points to ebx+4)
 *
 * Note that we ensure that DS is not changed and remains the flat segment,
 * and the 32-bit stack pointer our own return stub needs fits just
 * perfectly into the 8 bytes that are skipped by the Windows stub.
 * One problem is that we have to determine the number of removed arguments,
 * as these have to be really removed in KERNEL.621. Thus, the BP value
 * that we place in the temporary area to be restored, contains the value
 * that SP would have if no arguments were removed. By comparing the actual
 * value of SP with this value in our return stub we can compute the number
 * of removed arguments. This is then returned to KERNEL.621.
 *
 * The stack layout of this function:
 * (ebp+20)  nArgs     pointer to variable receiving nr. of args (Ex only)
 * (ebp+16)  esi       pointer to caller's esi value
 * (ebp+12)  arg       ebp value to be set for relay stub
 * (ebp+8)   func      CBClient relay stub address
 * (ebp+4)   ret addr
 * (ebp)     ebp
 *
 * NOTE(review): the code below loads the temporary-area pointer (ebx)
 * from 16(%ebp) and dereferences 20(%ebp) to obtain esi, so the table
 * above appears to be missing one argument slot -- confirm against the
 * C prototype of CALL32_CBClient.
 */
static void BuildCallTo32CBClient( int isEx )
{
    const char *entry_name = isEx ? "CALL32_CBClientEx" : "CALL32_CBClient";
    const char *ret_name   = isEx ? "CALL32_CBClientEx_Ret" : "CALL32_CBClient_Ret";

    function_header( entry_name );

    /* Entry code */

    output_cfi( ".cfi_startproc" );
    output( "\tpushl %%ebp\n" );
    output_cfi( ".cfi_adjust_cfa_offset 4" );
    output_cfi( ".cfi_rel_offset %%ebp,0" );
    output( "\tmovl %%esp,%%ebp\n" );
    output_cfi( ".cfi_def_cfa_register %%ebp" );
    output( "\tpushl %%edi\n" );
    output_cfi( ".cfi_rel_offset %%edi,-4" );
    output( "\tpushl %%esi\n" );
    output_cfi( ".cfi_rel_offset %%esi,-8" );
    output( "\tpushl %%ebx\n" );
    output_cfi( ".cfi_rel_offset %%ebx,-12" );

    /* Get pointer to temporary area and save the 32-bit stack pointer */

    output( "\tmovl 16(%%ebp), %%ebx\n" );
    output( "\tleal -8(%%esp), %%eax\n" );

    /* the slot receiving the 32-bit sp differs between the two layouts described above */
    if (isEx)
        output( "\tmovl %%eax, 12(%%ebx)\n" );
    else
        output( "\tmovl %%eax, -8(%%ebx)\n" );

    /* Set up registers and call CBClient relay stub (simulating a far call) */

    output( "\tmovl 20(%%ebp), %%esi\n" );
    output( "\tmovl (%%esi), %%esi\n" );

    output( "\tmovl 8(%%ebp), %%eax\n" );
    output( "\tmovl 12(%%ebp), %%ebp\n" );

    output( "\tpushl %%cs\n" );
    output( "\tcall *%%eax\n" );

    /* Return new esi value to caller */

    output( "\tmovl 32(%%esp), %%edi\n" );
    output( "\tmovl %%esi, (%%edi)\n" );

    /* Return argument size to caller */
    if (isEx)
    {
        output( "\tmovl 36(%%esp), %%ebx\n" );
        output( "\tmovl %%ebp, (%%ebx)\n" );
    }

    /* Restore registers and return */

    output( "\tpopl %%ebx\n" );
    output_cfi( ".cfi_same_value %%ebx" );
    output( "\tpopl %%esi\n" );
    output_cfi( ".cfi_same_value %%esi" );
    output( "\tpopl %%edi\n" );
    output_cfi( ".cfi_same_value %%edi" );
    output( "\tpopl %%ebp\n" );
    output_cfi( ".cfi_def_cfa %%esp,4" );
    output_cfi( ".cfi_same_value %%ebp" );
    output( "\tret\n" );
    output_cfi( ".cfi_endproc" );
    output_function_size( entry_name );

    /* '16-bit' return stub */

    function_header( ret_name );
    if (isEx)
    {
        output( "\tmovzwl %%bp, %%ebx\n" );
        output( "\tsubw %%bp, %%sp\n" );
        output( "\tmovzwl %%sp, %%ebp\n" );
        output( "\tlssl %%ss:-12(%%ebx), %%esp\n" );
    }
    else
    {
        output( "\tmovzwl %%sp, %%ebx\n" );
        output( "\tlssl %%ss:-16(%%ebx), %%esp\n" );
    }
    output( "\tlret\n" );
    output_function_size( ret_name );
}
/******************************************************************* * BuildCallTo16Core * * This routine builds the core routines used in 32->16 thunks: * * extern DWORD WINAPI wine_call_to_16( FARPROC16 target, DWORD cbArgs, PEXCEPTION_HANDLER handler ); * extern void WINAPI wine_call_to_16_regs( CONTEXT86 *context, DWORD cbArgs, PEXCEPTION_HANDLER handler ); * * These routines can be called directly from 32-bit code. * * All routines expect that the 16-bit stack contents (arguments) and the * return address (segptr to CallTo16_Ret) were already set up by the * caller; nb_args must contain the number of bytes to be conserved. The * 16-bit SS:SP will be set accordingly. * * All other registers are either taken from the CONTEXT86 structure * or else set to default values. The target routine address is either * given directly or taken from the CONTEXT86. */ static void BuildCallTo16Core( int reg_func ) { const char *name = reg_func ? "wine_call_to_16_regs" : "wine_call_to_16"; /* Function header */ function_header( name ); /* Function entry sequence */ output_cfi( ".cfi_startproc" ); output( "\tpushl %%ebp\n" ); output_cfi( ".cfi_adjust_cfa_offset 4" ); output_cfi( ".cfi_rel_offset %%ebp,0" ); output( "\tmovl %%esp, %%ebp\n" ); output_cfi( ".cfi_def_cfa_register %%ebp" ); /* Save the 32-bit registers */ output( "\tpushl %%ebx\n" ); output_cfi( ".cfi_rel_offset %%ebx,-4" ); output( "\tpushl %%esi\n" ); output_cfi( ".cfi_rel_offset %%esi,-8" ); output( "\tpushl %%edi\n" ); output_cfi( ".cfi_rel_offset %%edi,-12" ); output( "\t.byte 0x64\n\tmov %%gs,(%d)\n", GS_OFFSET ); /* Setup exception frame */ output( "\t.byte 0x64\n\tpushl (%d)\n", STACKOFFSET ); output( "\tpushl 16(%%ebp)\n" ); /* handler */ output( "\t.byte 0x64\n\tpushl (0)\n" ); output( "\t.byte 0x64\n\tmovl %%esp,(0)\n" ); /* Call the actual CallTo16 routine (simulate a lcall) */ output( "\tpushl %%cs\n" ); output( "\tcall .L%s\n", name ); /* Remove exception frame */ output( "\t.byte 0x64\n\tpopl (0)\n" ); 
output( "\taddl $4, %%esp\n" ); output( "\t.byte 0x64\n\tpopl (%d)\n", STACKOFFSET ); if ( !reg_func ) { /* Convert return value */ output( "\tandl $0xffff,%%eax\n" ); output( "\tshll $16,%%edx\n" ); output( "\torl %%edx,%%eax\n" ); } else { /* * Modify CONTEXT86 structure to contain new values * * NOTE: We restore only EAX, EBX, EDX, EDX, EBP, and ESP. * The segment registers as well as ESI and EDI should * not be modified by a well-behaved 16-bit routine in * any case. [If necessary, we could restore them as well, * at the cost of a somewhat less efficient return path.] */ output( "\tmovl 0x14(%%esp),%%edi\n" ); /* FIELD_OFFSET(STACK32FRAME,target) - FIELD_OFFSET(STACK32FRAME,edi) */ /* everything above edi has been popped already */ output( "\tmovl %%eax,0xb0(%%edi)\n"); /* Eax */ output( "\tmovl %%ebx,0xa4(%%edi)\n"); /* Ebx */ output( "\tmovl %%ecx,0xac(%%edi)\n"); /* Ecx */ output( "\tmovl %%edx,0xa8(%%edi)\n"); /* Edx */ output( "\tmovl %%ebp,0xb4(%%edi)\n"); /* Ebp */ output( "\tmovl %%esi,0xc4(%%edi)\n"); /* Esp */ /* The return glue code saved %esp into %esi */ } /* Restore the 32-bit registers */ output( "\tpopl %%edi\n" ); output_cfi( ".cfi_same_value %%edi" ); output( "\tpopl %%esi\n" ); output_cfi( ".cfi_same_value %%esi" ); output( "\tpopl %%ebx\n" ); output_cfi( ".cfi_same_value %%ebx" ); /* Function exit sequence */ output( "\tpopl %%ebp\n" ); output_cfi( ".cfi_def_cfa %%esp,4" ); output_cfi( ".cfi_same_value %%ebp" ); output( "\tret $12\n" ); output_cfi( ".cfi_endproc" ); /* Start of the actual CallTo16 routine */ output( ".L%s:\n", name ); /* Switch to the 16-bit stack */ output( "\tmovl %%esp,%%edx\n" ); output( "\t.byte 0x64\n\tmovw (%d),%%ss\n", STACKOFFSET + 2); output( "\t.byte 0x64\n\tmovw (%d),%%sp\n", STACKOFFSET ); output( "\t.byte 0x64\n\tmovl %%edx,(%d)\n", STACKOFFSET ); /* Make %bp point to the previous stackframe (built by CallFrom16) */ output( "\tmovzwl %%sp,%%ebp\n" ); output( "\tleal 0x2a(%%ebp),%%ebp\n"); /* 
FIELD_OFFSET(STACK16FRAME,bp) */ /* Add the specified offset to the new sp */ output( "\tsubw 0x2c(%%edx), %%sp\n"); /* FIELD_OFFSET(STACK32FRAME,nb_args) */ if (reg_func) { /* Push the called routine address */ output( "\tmovl 0x28(%%edx),%%edx\n"); /* FIELD_OFFSET(STACK32FRAME,target) */ output( "\tpushw 0xbc(%%edx)\n"); /* SegCs */ output( "\tpushw 0xb8(%%edx)\n"); /* Eip */ /* Get the registers */ output( "\tpushw 0x98(%%edx)\n"); /* SegDs */ output( "\tpushl 0x94(%%edx)\n"); /* SegEs */ output( "\tpopl %%es\n" ); output( "\tpushl 0x90(%%edx)\n"); /* SegFs */ output( "\tpopl %%fs\n" ); output( "\tpushl 0x8c(%%edx)\n"); /* SegGs */ output( "\tpopl %%gs\n" ); output( "\tmovl 0xb4(%%edx),%%ebp\n"); /* Ebp */ output( "\tmovl 0xa0(%%edx),%%esi\n"); /* Esi */ output( "\tmovl 0x9c(%%edx),%%edi\n"); /* Edi */ output( "\tmovl 0xb0(%%edx),%%eax\n"); /* Eax */ output( "\tmovl 0xa4(%%edx),%%ebx\n"); /* Ebx */ output( "\tmovl 0xac(%%edx),%%ecx\n"); /* Ecx */ output( "\tmovl 0xa8(%%edx),%%edx\n"); /* Edx */ /* Get the 16-bit ds */ output( "\tpopw %%ds\n" ); } else /* not a register function */ { /* Push the called routine address */ output( "\tpushl 0x28(%%edx)\n"); /* FIELD_OFFSET(STACK32FRAME,target) */ /* Set %fs and %gs to the value saved by the last CallFrom16 */ output( "\tpushw -22(%%ebp)\n" ); /* FIELD_OFFSET(STACK16FRAME,fs)-FIELD_OFFSET(STACK16FRAME,bp) */ output( "\tpopw %%fs\n" ); output( "\tpushw -20(%%ebp)\n" ); /* FIELD_OFFSET(STACK16FRAME,gs)-FIELD_OFFSET(STACK16FRAME,bp) */ output( "\tpopw %%gs\n" ); /* Set %ds and %es (and %ax just in case) equal to %ss */ output( "\tmovw %%ss,%%ax\n" ); output( "\tmovw %%ax,%%ds\n" ); output( "\tmovw %%ax,%%es\n" ); } /* Jump to the called routine */ output( "\t.byte 0x66\n" ); output( "\tlret\n" ); /* Function footer */ output_function_size( name ); }
/*******************************************************************
 *         output_relay_debug
 *
 * Output entry points for relay debugging.
 *
 * Three things are emitted for the ordinals spec->base .. spec->limit:
 *  - a table of offsets of each relay entry point from
 *    __wine_spec_relay_entry_points (0 when no relay is needed);
 *  - a table of 32-bit argument-type masks, two bits per argument slot
 *    (1 = ARG_STR, 2 = ARG_WSTR, 0 = anything else), at most 16 slots;
 *  - the relay thunks themselves, for CPU_x86 and CPU_x86_64 only;
 *    any other target_cpu hits assert(0).
 */
static void output_relay_debug( DLLSPEC *spec )
{
    int i, j;
    unsigned int pos, args, flags;

    /* first the table of entry point offsets */

    output( "\t%s\n", get_asm_rodata_section() );
    output( "\t.align %d\n", get_alignment(4) );
    output( ".L__wine_spec_relay_entry_point_offsets:\n" );

    for (i = spec->base; i <= spec->limit; i++)
    {
        ORDDEF *odp = spec->ordinals[i];

        if (needs_relay( odp ))
            output( "\t.long .L__wine_spec_relay_entry_point_%d-__wine_spec_relay_entry_points\n", i );
        else
            output( "\t.long 0\n" );
    }

    /* then the table of argument types */

    output( "\t.align %d\n", get_alignment(4) );
    output( ".L__wine_spec_relay_arg_types:\n" );

    for (i = spec->base; i <= spec->limit; i++)
    {
        ORDDEF *odp = spec->ordinals[i];
        unsigned int mask = 0;

        if (needs_relay( odp ))
        {
            for (j = pos = 0; pos < 16 && j < odp->u.func.nb_args; j++)
            {
                switch (odp->u.func.args[j])
                {
                /* unsigned constants: 2 << 30 (slot 15) does not fit in a
                 * signed int, so the shift must be done in unsigned arithmetic */
                case ARG_STR:  mask |= 1u << (2 * pos++); break;
                case ARG_WSTR: mask |= 2u << (2 * pos++); break;
                case ARG_INT64:
                case ARG_DOUBLE:
                    pos += 8 / get_ptr_size();
                    break;
                case ARG_INT128:
                    pos += (target_cpu == CPU_x86) ? 4 : 1;
                    break;
                default:
                    pos++;
                    break;
                }
            }
        }
        output( "\t.long 0x%08x\n", mask );
    }

    /* then the relay thunks */

    output( "\t.text\n" );
    output( "__wine_spec_relay_entry_points:\n" );
    output( "\tnop\n" );  /* to avoid 0 offset */

    for (i = spec->base; i <= spec->limit; i++)
    {
        ORDDEF *odp = spec->ordinals[i];

        if (!needs_relay( odp )) continue;

        output( "\t.align %d\n", get_alignment(4) );
        output( ".L__wine_spec_relay_entry_point_%d:\n", i );
        output_cfi( ".cfi_startproc" );

        args = get_args_size(odp) / get_ptr_size();
        flags = 0;

        switch (target_cpu)
        {
        case CPU_x86:
            if (odp->type == TYPE_THISCALL)  /* add the this pointer */
            {
                output( "\tpopl %%eax\n" );
                output( "\tpushl %%ecx\n" );
                output( "\tpushl %%eax\n" );
                flags |= 2;
            }
            if (odp->flags & FLAG_REGISTER)
                output( "\tpushl %%eax\n" );
            else
                output( "\tpushl %%esp\n" );
            output_cfi( ".cfi_adjust_cfa_offset 4" );

            if (odp->flags & FLAG_RET64) flags |= 1;
            /* descriptor word: flags in bits 24+, argument count in bits 16-23,
             * entry point index relative to spec->base in the low 16 bits */
            output( "\tpushl $%u\n", (flags << 24) | (args << 16) | (i - spec->base) );
            output_cfi( ".cfi_adjust_cfa_offset 4" );

            if (UsePIC)
            {
                output( "\tcall %s\n", asm_name("__wine_spec_get_pc_thunk_eax") );
                output( "1:\tleal .L__wine_spec_relay_descr-1b(%%eax),%%eax\n" );
            }
            else output( "\tmovl $.L__wine_spec_relay_descr,%%eax\n" );
            output( "\tpushl %%eax\n" );
            output_cfi( ".cfi_adjust_cfa_offset 4" );

            if (odp->flags & FLAG_REGISTER)
            {
                output( "\tcall *8(%%eax)\n" );
            }
            else
            {
                output( "\tcall *4(%%eax)\n" );
                output_cfi( ".cfi_adjust_cfa_offset -12" );
                if (odp->type == TYPE_STDCALL || odp->type == TYPE_THISCALL)
                    output( "\tret $%u\n", args * get_ptr_size() );
                else
                    output( "\tret\n" );
            }
            break;

        case CPU_x86_64:
            output( "\tsubq $40,%%rsp\n" );
            output_cfi( ".cfi_adjust_cfa_offset 40" );
            /* store up to four register arguments to the stack; the cases
             * deliberately fall through from the highest argument down */
            switch (args)
            {
            default: output( "\tmovq %%%s,72(%%rsp)\n", is_float_arg( odp, 3 ) ? "xmm3" : "r9" );
            /* fall through */
            case 3:  output( "\tmovq %%%s,64(%%rsp)\n", is_float_arg( odp, 2 ) ? "xmm2" : "r8" );
            /* fall through */
            case 2:  output( "\tmovq %%%s,56(%%rsp)\n", is_float_arg( odp, 1 ) ? "xmm1" : "rdx" );
            /* fall through */
            case 1:  output( "\tmovq %%%s,48(%%rsp)\n", is_float_arg( odp, 0 ) ? "xmm0" : "rcx" );
            /* fall through */
            case 0:  break;
            }
            output( "\tleaq 40(%%rsp),%%r8\n" );
            output( "\tmovq $%u,%%rdx\n", (flags << 24) | (args << 16) | (i - spec->base) );
            output( "\tleaq .L__wine_spec_relay_descr(%%rip),%%rcx\n" );
            output( "\tcallq *8(%%rcx)\n" );
            output( "\taddq $40,%%rsp\n" );
            output_cfi( ".cfi_adjust_cfa_offset -40" );
            output( "\tret\n" );
            break;

        default:
            assert(0);
        }
        output_cfi( ".cfi_endproc" );
    }
}
/*******************************************************************
 *         output_call16_function
 *
 * Emit the assembly for one 16-bit-to-Wine callback glue routine.
 *
 * The emitted routine is an argument conversion helper meant to be
 * invoked by the CallFrom16... core, so its prototype is one of
 * (see also CallFrom16):
 *
 *  extern WORD WINAPI __wine_spec_call16_C_xxx( FARPROC func, LPBYTE args );
 *  extern LONG WINAPI __wine_spec_call16_C_xxx( FARPROC func, LPBYTE args );
 *  extern void WINAPI __wine_spec_call16_C_xxx_regs( FARPROC func, LPBYTE args,
 *                                                    CONTEXT86 *context );
 *
 * 'C' encodes the calling convention ('p' = pascal, 'c' = cdecl) and every
 * 'x' stands for one argument:
 *   'w' = word, 's' = signed word, 'l' = long, 'p' = linear pointer,
 *   't' = linear pointer to null-terminated string,
 *   'T' = segmented pointer to null-terminated string.
 *
 * At run time the routine reads each argument from the 16-bit stack that
 * 'args' points to, at an offset derived from the calling convention and
 * the argument types, pushes it in 32-bit form and then calls the 32-bit
 * entry point 'func'.
 *
 * Register functions get their arguments (if any) converted exactly like
 * normal functions; additionally the CONTEXT86 pointer holding the current
 * register values is handed to the 32-bit routine.
 */
static void output_call16_function( ORDDEF *odp )
{
    const int argsize = get_function_argsize( odp );
    /* 'p' and 't' arguments are converted through the table kept in %esi */
    const int needs_ldt = (strpbrk( get_args_str( odp ), "pt" ) != NULL);
    char *name = strmake( ".L__wine_spec_call16_%s", get_relay_name(odp) );
    const int pascal = (odp->type == TYPE_PASCAL);
    const int varargs = (odp->type == TYPE_VARARGS);
    const int nb_args = odp->u.func.nb_args;
    int idx, offset, slot, words, misalign;

    /* function header and %ebp-based frame */
    output( "\t.align %d\n", get_alignment(4) );
    output( "\t%s\n", func_declaration(name) );
    output( "%s:\n", name );
    output_cfi( ".cfi_startproc" );
    output( "\tpushl %%ebp\n" );
    output_cfi( ".cfi_adjust_cfa_offset 4" );
    output_cfi( ".cfi_rel_offset %%ebp,0" );
    output( "\tmovl %%esp,%%ebp\n" );
    output_cfi( ".cfi_def_cfa_register %%ebp" );

    if (needs_ldt)
    {
        /* save %esi, then load it with the wine_ldt_copy table address */
        output( "\tpushl %%esi\n" );
        output_cfi( ".cfi_rel_offset %%esi,-4" );
        if (!UsePIC)
            output( "\tmovl $%s,%%esi\n", asm_name("_imp__wine_ldt_copy") );
        else
        {
            output( "\tcall %s\n", asm_name("__wine_spec_get_pc_thunk_eax") );
            output( "1:\tmovl wine_ldt_copy_ptr-1b(%%eax),%%esi\n" );
        }
    }

    /* count the words that will be on the stack when the 32-bit function is
     * called, so that padding can be inserted to keep it 16-byte aligned */
    words = 2 + nb_args;                /* return address, saved %ebp, one word per argument */
    if (needs_ldt) words++;             /* saved %esi */
    for (idx = 0; idx < nb_args; idx++)
    {
        switch (odp->u.func.args[idx])
        {
        case ARG_DOUBLE:
        case ARG_INT64:
            words++;                    /* 64-bit values need a second word */
            break;
        default:
            break;
        }
    }
    if (varargs || (odp->flags & FLAG_REGISTER)) words++;  /* va_list16 or context */

    misalign = words % 4;
    if (misalign) output( "\tsubl $%d,%%esp\n", 16 - 4 * misalign );

    if (nb_args || varargs)
        output( "\tmovl 12(%%ebp),%%ecx\n" );  /* args */

    /* the trailing extra argument is pushed first */
    if (odp->flags & FLAG_REGISTER)
    {
        output( "\tpushl 16(%%ebp)\n" );  /* context */
    }
    else if (varargs)
    {
        output( "\tleal %d(%%ecx),%%eax\n", argsize );
        output( "\tpushl %%eax\n" );  /* va_list16 */
    }

    /* Walk the arguments from last to first.  'offset' tracks the position
     * inside the 16-bit argument area: pascal starts at 0 and advances after
     * each value is read, the other conventions start at argsize and step
     * back before each value is read.  'slot' is where the current value
     * actually lives. */
    offset = pascal ? 0 : argsize;
    for (idx = nb_args; idx-- > 0; )
    {
        switch (odp->u.func.args[idx])
        {
        case ARG_WORD:
            slot = pascal ? offset : offset - 2;
            output( "\tmovzwl %d(%%ecx),%%eax\n", slot );
            output( "\tpushl %%eax\n" );
            offset = pascal ? slot + 2 : slot;
            break;

        case ARG_SWORD:
            slot = pascal ? offset : offset - 2;
            output( "\tmovswl %d(%%ecx),%%eax\n", slot );
            output( "\tpushl %%eax\n" );
            offset = pascal ? slot + 2 : slot;
            break;

        case ARG_INT64:
        case ARG_DOUBLE:
            /* first dword of a 64-bit value; the second one is handled below */
            slot = pascal ? offset : offset - 4;
            output( "\tpushl %d(%%ecx)\n", slot );
            offset = pascal ? slot + 4 : slot;
            /* fall through */
        case ARG_LONG:
        case ARG_FLOAT:
        case ARG_SEGPTR:
        case ARG_SEGSTR:
            slot = pascal ? offset : offset - 4;
            output( "\tpushl %d(%%ecx)\n", slot );
            offset = pascal ? slot + 4 : slot;
            break;

        case ARG_PTR:
        case ARG_STR:
        case ARG_WSTR:
        case ARG_INT128:
            /* segmented -> linear: table entry for (selector >> 3) plus the 16-bit offset */
            slot = pascal ? offset : offset - 4;
            output( "\tmovzwl %d(%%ecx),%%edx\n", slot + 2 );  /* sel */
            output( "\tshr $3,%%edx\n" );
            output( "\tmovzwl %d(%%ecx),%%eax\n", slot );      /* offset */
            output( "\taddl (%%esi,%%edx,4),%%eax\n" );
            output( "\tpushl %%eax\n" );
            offset = pascal ? slot + 4 : slot;
            break;

        default:
            break;
        }
    }

    /* call the 32-bit entry point (first parameter of the glue routine) */
    output( "\tcall *8(%%ebp)\n" );

    /* epilogue: restore %esi if it was saved, drop the frame and return */
    if (needs_ldt)
    {
        output( "\tmovl -4(%%ebp),%%esi\n" );
        output_cfi( ".cfi_same_value %%esi" );
    }
    output( "\tleave\n" );
    output_cfi( ".cfi_def_cfa %%esp,4" );
    output_cfi( ".cfi_same_value %%ebp" );
    output( "\tret\n" );
    output_cfi( ".cfi_endproc" );
    output_function_size( name );
    free( name );
}