/* Reads and verifies the DR marker in the given process.
 * For a 64-bit build we search for the x64 marker; for a 32-bit build,
 * the 32-bit marker.  Thin wrapper around read_and_verify_dr_marker_common().
 */
int
read_and_verify_dr_marker(HANDLE process, dr_marker_t *marker)
{
    return read_and_verify_dr_marker_common(process, marker,
                                            IF_X64_ELSE(true, false));
}
static bool check_architecture(const char *dll, char **argv) { bool is_64bit, also_32bit; if (drfront_is_64bit_app(dll, &is_64bit, &also_32bit) != DRFRONT_SUCCESS) { printf("ERROR: unable to get the architecture infomation of" " the target module %s\n", dll); return false; } if (IF_X64_ELSE(!is_64bit, is_64bit && !also_32bit)) { char *orig_argv0 = argv[0]; char root[MAXIMUM_PATH]; char buf[MAXIMUM_PATH]; char *basename; int errcode; void *inject_data; bool is_readable; if (drfront_get_app_full_path(argv[0], root, BUFFER_SIZE_ELEMENTS(root)) != DRFRONT_SUCCESS) { printf("ERROR: unable to get base dir of %s\n", argv[0]); return false; } basename = root + strlen(root) - 1; while (*basename != DIRSEP && *basename != ALT_DIRSEP && basename > root) basename--; if (basename <= root) { printf("ERROR: unable to get base dir of %s\n", argv[0]); return false; } *basename = '\0'; basename++; _snprintf(buf, BUFFER_SIZE_ELEMENTS(buf) , "%s%c..%c%s%c%s", root, DIRSEP, DIRSEP, IF_X64_ELSE("bin", "bin64"), DIRSEP, basename); NULL_TERMINATE_BUFFER(buf); if (drfront_access(buf, DRFRONT_READ, &is_readable) != DRFRONT_SUCCESS || !is_readable) { printf("ERROR: unable to find frontend %s to match target file bitwidth: " "is this an incomplete installation?\n", buf); } argv[0] = buf; #ifdef UNIX errcode = dr_inject_prepare_to_exec(buf, (const char **)argv, &inject_data); if (errcode == 0 || errcode == WARN_IMAGE_MACHINE_TYPE_MISMATCH_EXE) dr_inject_process_run(inject_data); /* shouldn't return */ printf("ERROR (%d): unable to launch frontend to match target file bitwidth\n", errcode); argv[0] = orig_argv0; return false; #else errcode = dr_inject_process_create(buf, argv, &inject_data); if (errcode == 0 || errcode == WARN_IMAGE_MACHINE_TYPE_MISMATCH_EXE) { dr_inject_process_run(inject_data); /* Wait for the child so user's shell prompt doesn't come back early */ errcode = WaitForSingleObject(dr_inject_get_process_handle(inject_data), INFINITE); if (errcode != WAIT_OBJECT_0) 
printf("WARNING: failed to wait for cross-arch frontend\n"); dr_inject_process_exit(inject_data, false); argv[0] = orig_argv0; return false; } else { printf("ERROR (%d): unable to launch frontend to match target file bitwidth\n", errcode); argv[0] = orig_argv0; return false; } #endif } return true; }
) { uint references_found = 0; /* only for debugging */ DEBUG_DECLARE(uint references_already_known = 0;) app_pc cur_addr; app_pc last_addr = text_end - sizeof(app_pc); /* inclusive */ LOG(GLOBAL, LOG_RCT, 2, "find_address_references: text["PFX", "PFX"), referto["PFX", "PFX")\n", text_start, text_end, referto_start, referto_end); ASSERT(text_start <= text_end); /* empty ok */ ASSERT(referto_start <= referto_end); /* empty ok */ ASSERT(sizeof(app_pc) == IF_X64_ELSE(8,4)); ASSERT((ptr_uint_t)(last_addr+1) == (((ptr_uint_t)last_addr)+1));/* byte increments */ ASSERT(is_readable_without_exception(text_start, text_end - text_start)); /* FIXME: could try to read dword[pc] dword[pc+4] and then merging them with shifts * and | to get dword[pc+1] dword[pc+2] dword[pc+3] instead of reading memory * but of course only if KSTAT says the latter is indeed faster! */ KSTART(rct_no_reloc); for (cur_addr = text_start; cur_addr <= last_addr; cur_addr++) { DEBUG_DECLARE(bool known_ref = false;) app_pc ref = *(app_pc*)cur_addr; /* note dereference here */ if (rct_check_ref_and_add(dcontext, ref, referto_start, referto_end
/* For 32-bit build, supports looking for x64 marker (in WOW64 process).
 * For 64-bit build, only supports looking for x64 marker.
 *
 * Follows the hook chain in the target process: reads the 5-byte jmp
 * planted at the hooked ntdll export, computes the landing pad it jumps
 * to, extracts the trampoline page from the landing pad, and finally
 * reads and verifies the dr_marker_t stored there.
 * Returns DR_MARKER_FOUND / DR_MARKER_NOT_FOUND / DR_MARKER_ERROR.
 */
static int
read_and_verify_dr_marker_common(HANDLE process, dr_marker_t *marker, bool x64)
{
    byte buf[8]; /* only needs to be 5, but dword pad just in case */
    size_t res;
    void *target = NULL;
#if !defined(NOT_DYNAMORIO_CORE) && !defined(NOT_DYNAMORIO_CORE_PROPER)
    /* Inside the core we can resolve the hooked ntdll export directly. */
    GET_NTDLL(DR_MARKER_HOOKED_FUNCTION, DR_MARKER_HOOKED_FUNCTION_ARGS);
    void *hook_func = (void *)DR_MARKER_HOOKED_FUNCTION;
#else
    if (IF_X64_ELSE(!x64, x64 && !is_wow64_process(NT_CURRENT_PROCESS)))
        return DR_MARKER_ERROR;
    if (x64) {
# ifndef X64
        /* 32-bit build reading an x64 marker in a WOW64 process: must use
         * 64-bit-aware lookups and the wow64 read syscall wrappers.
         */
        uint64 hook_func = get_proc_address_64
            (get_module_handle_64(L_DR_MARKER_HOOKED_DLL),
             DR_MARKER_HOOKED_FUNCTION_STRING);
        uint64 landing_pad = 0;
        if (hook_func == 0)
            return DR_MARKER_ERROR;
        if (!NT_SUCCESS(nt_wow64_read_virtual_memory64(process, hook_func,
                                                       buf, 5, &res)) ||
            res != 5) {
            return DR_MARKER_ERROR;
        }
        if (buf[0] != OP_jmp_byte)
            return DR_MARKER_NOT_FOUND;
        /* jmp offset + EIP (after jmp = hook_func + size of jmp (5 bytes)) */
        /* for 64-bit, the target is stored in front of the trampoline */
        landing_pad = *(int *)&buf[1] + hook_func + 5 - 8;
        if (!NT_SUCCESS(nt_wow64_read_virtual_memory64(process, landing_pad,
                                                       buf, 8, &res)) ||
            res != 8U)
            return DR_MARKER_ERROR;
        /* trampoline address is stored at the top of the landing pad for 64-bit */
        target = (void *)PAGE_START(*(ptr_int_t *)buf);
    } else {
# endif /* !X64 */
        void *hook_func = (void *)GetProcAddress(GetModuleHandle(DR_MARKER_HOOKED_DLL),
                                                 DR_MARKER_HOOKED_FUNCTION_STRING);
#endif
        void *landing_pad;
        if (hook_func == NULL)
            return DR_MARKER_ERROR;
        if (!READ_FUNC(process, hook_func, buf, 5, &res) || res != 5)
            return DR_MARKER_ERROR;
        if (buf[0] != OP_jmp_byte)
            return DR_MARKER_NOT_FOUND;
        /* jmp offset + EIP (after jmp = hook_func + size of jmp (5 bytes)) */
        landing_pad = (void *)(*(int *)&buf[1] + (ptr_int_t)hook_func + 5);
        /* for 64-bit, the target is stored in front of the trampoline */
        if (x64)
            landing_pad = (byte *)landing_pad - 8;
        /* see emit_landing_pad_code() for layout of landing pad */
        if (!READ_FUNC(process, landing_pad, buf, (x64 ? 8 : 5), &res) ||
            res != (x64 ? 8U : 5U))
            return DR_MARKER_ERROR;
        if (x64) {
            /* trampoline address is stored at the top of the landing pad for 64-bit */
            target = (void *)PAGE_START(*(ptr_int_t *)buf);
        } else {
            /* jmp offset + EIP (after jmp = landing_pad + size of jmp (5 bytes)) */
            target = (void *)PAGE_START(*(int *)&buf[1] + (ptr_int_t)landing_pad + 5);
        }
#if defined(NOT_DYNAMORIO_CORE) || defined(NOT_DYNAMORIO_CORE_PROPER)
    } /* closes the "if (x64) { ... } else {" opened above */
#endif
    if (target == NULL)
        return DR_MARKER_ERROR;
    /* The marker lives at the start of the trampoline page. */
    if (!READ_FUNC(process, target, marker, sizeof(dr_marker_t), &res) ||
        res != sizeof(dr_marker_t)) {
        return DR_MARKER_NOT_FOUND;
    }
    if (dr_marker_verify(process, marker)) {
        return DR_MARKER_FOUND;
    }
    return DR_MARKER_NOT_FOUND; /* probably some other hooker */
}
# define GS_TLS 1 /* used in arch_prctl handling */
#else
/* Linux GDT layout in x86_32:
 * 6 - TLS segment #1 0x33 [ glibc's TLS segment ]
 * 7 - TLS segment #2 0x3b [ Wine's %fs Win32 segment ]
 * 8 - TLS segment #3 0x43
 * FS and GS are not hardcoded.
 */
#endif

/* First GDT index the kernel accepts for TLS entries, per kernel flavor. */
#define GDT_ENTRY_TLS_MIN_32 6
#define GDT_ENTRY_TLS_MIN_64 12
/* When an x86-64 kernel emulates i386 it still uses entries 12-14, so an
 * ifdef on X64 cannot detect the right value at build time.
 * The actual value will be updated later in os_tls_app_seg_init.
 */
static uint gdt_entry_tls_min = IF_X64_ELSE(GDT_ENTRY_TLS_MIN_64, GDT_ENTRY_TLS_MIN_32);

/* One-time guard for the GDT slot probing done in choose_gdt_slots(). */
static bool tls_global_init = false;

/* GDT slot we use for set_thread_area.
 * This depends on the kernel, not on the app!
 */
static int tls_gdt_index = -1;

/* GDT slot we use for private library TLS. */
static int lib_tls_gdt_index = -1;

#ifdef X64
/* Set when arch_prctl succeeded but %gs reads back as 0, i.e. the kernel
 * set the gs base via MSR rather than a GDT entry.
 */
static bool tls_using_msr;
#endif

/* Indicates that on the next request for a GDT entry, we should return the GDT
/* Injects DR into the target process by copying load_dynamo() and its
 * arguments into the process and redirecting the thread's xip to the
 * copied code.
 * pass non-NULL for thandle if you want this routine to use
 * Get/SetThreadContext to get the context -- you must still pass
 * in a pointer to a cxt.
 * Returns TRUE on success.  On failure, changes already made in the
 * child's address space are NOT rolled back (see the error label).
 */
BOOL
inject_into_thread(HANDLE phandle, CONTEXT *cxt, HANDLE thandle, char *dynamo_path)
{
    size_t nbytes;
    BOOL success = FALSE;
    ptr_uint_t dynamo_entry_esp;
    ptr_uint_t dynamo_path_esp;
    LPVOID load_dynamo_code = NULL; /* = base of code allocation */
    ptr_uint_t addr;
    reg_t *bufptr;
    char buf[MAX_PATH];
    uint old_prot;
    ASSERT(cxt != NULL);
#ifndef NOT_DYNAMORIO_CORE_PROPER
    /* FIXME - if we were early injected we couldn't call inject_init during
     * startup because kernel32 wasn't loaded yet, so we call it here which
     * isn't safe because it uses app locks. If we want to support a mix
     * of early and late follow children injection we should change load_dynamo
     * to use Nt functions (which we can link) rather then kernel32 functions
     * (which we have to look up). We could also use module.c code to safely
     * walk the exports of kernel32.dll (we can cache its mod handle when it
     * is loaded).
     */
    if (!inject_initialized) {
        SYSLOG_INTERNAL_WARNING("Using late inject follow children from early injected process, unsafe LdrLock usage");
        SELF_UNPROTECT_DATASEC(DATASEC_RARELY_PROT);
        inject_init();
        SELF_PROTECT_DATASEC(DATASEC_RARELY_PROT);
    }
#else
    ASSERT(inject_initialized);
#endif
    /* soon we'll start using alternative injection with case 102 - leaving block */
    {
        reg_t app_xsp;
        if (thandle != NULL) {
            /* grab the context of the app's main thread */
            cxt->ContextFlags = CONTEXT_DR_STATE;
            if (!NT_SUCCESS(nt_get_context(thandle, cxt))) {
                display_error("GetThreadContext failed");
                goto error;
            }
        }
        app_xsp = cxt->CXT_XSP;
        /* copy load_dynamo() into the address space of the new process */
        ASSERT(BUFFER_SIZE_BYTES(buf) > SIZE_OF_LOAD_DYNAMO);
        memcpy(buf, (char*)load_dynamo, SIZE_OF_LOAD_DYNAMO);
        /* R-X protection is adequate for our non-self modifying code,
         * and we'll update that after we're done with
         * nt_write_virtual_memory() calls */
        /* get allocation, this will be freed by os_heap_free, so make sure
         * is compatible allocation method */
        if (!NT_SUCCESS(nt_remote_allocate_virtual_memory(phandle, &load_dynamo_code,
                                                          SIZE_OF_LOAD_DYNAMO,
                                                          PAGE_EXECUTE_READWRITE,
                                                          MEMORY_COMMIT))) {
            display_error("Failed to allocate memory for injection code");
            goto error;
        }
        if (!nt_write_virtual_memory(phandle, load_dynamo_code, buf,
                                     SIZE_OF_LOAD_DYNAMO, &nbytes)) {
            display_error("WriteMemory failed");
            goto error;
        }
        /* Xref PR 252745 & PR 252008 - we can use the app's stack to hold our data
         * even on WOW64 and 64-bit since we're using set context to set xsp.
         */
        /* copy the DYNAMORIO_ENTRY string to the app's stack */
        _snprintf(buf, BUFFER_SIZE_ELEMENTS(buf), "%s", DYNAMORIO_ENTRY);
        NULL_TERMINATE_BUFFER(buf);
        nbytes = strlen(buf) + 1; // include the trailing '\0'
        /* keep esp at pointer-sized alignment */
        cxt->CXT_XSP -= ALIGN_FORWARD(nbytes, XSP_SZ);
        dynamo_entry_esp = cxt->CXT_XSP;
        if (!nt_write_virtual_memory(phandle, (LPVOID)cxt->CXT_XSP,
                                     buf, nbytes, &nbytes)) {
            display_error("WriteMemory failed");
            goto error;
        }
        /* copy the dynamorio_path string to the app's stack */
        _snprintf(buf, BUFFER_SIZE_ELEMENTS(buf), "%s", dynamo_path);
        NULL_TERMINATE_BUFFER(buf);
        nbytes = strlen(buf) + 1; // include the trailing '\0'
        /* keep esp at pointer-sized byte alignment */
        cxt->CXT_XSP -= ALIGN_FORWARD(nbytes, XSP_SZ);
        dynamo_path_esp = cxt->CXT_XSP;
        if (!nt_write_virtual_memory(phandle, (LPVOID)cxt->CXT_XSP,
                                     buf, nbytes, &nbytes)) {
            display_error("WriteMemory failed");
            goto error;
        }
        /* copy the current context to the app's stack. Only need the
         * control registers, so we use a dr_mcontext_t layout.
         */
        bufptr = (reg_t*) buf;
        *bufptr++ = cxt->CXT_XDI;
        *bufptr++ = cxt->CXT_XSI;
        *bufptr++ = cxt->CXT_XBP;
        *bufptr++ = app_xsp; /* pre-injection xsp, not the current one */
        *bufptr++ = cxt->CXT_XBX;
        *bufptr++ = cxt->CXT_XDX;
        *bufptr++ = cxt->CXT_XCX;
        *bufptr++ = cxt->CXT_XAX;
#ifdef X64
        *bufptr++ = cxt->R8;
        *bufptr++ = cxt->R9;
        *bufptr++ = cxt->R10;
        *bufptr++ = cxt->R11;
        *bufptr++ = cxt->R12;
        *bufptr++ = cxt->R13;
        *bufptr++ = cxt->R14;
        *bufptr++ = cxt->R15;
#endif
        /* It would be nice to use preserve_xmm_caller_saved(), but we'd need to
         * link proc.c and deal w/ messy dependencies to get it into arch_exports.h,
         * so we do our own check. We go ahead and put in the xmm slots even
         * if the underlying processor has no xmm support: no harm done.
         */
        if (IF_X64_ELSE(true, is_wow64_process(NT_CURRENT_PROCESS))) {
            /* PR 264138: preserve xmm0-5. We fill in all slots even though
             * for 32-bit we don't use them (PR 306394).
             */
            int i, j;
            for (i = 0; i < NUM_XMM_SLOTS; i++) {
                for (j = 0; j < IF_X64_ELSE(2,4); j++) {
                    *bufptr++ = CXT_XMM(cxt, i)->reg[j];
                }
            }
        } else {
            /* skip xmm slots */
            bufptr += XMM_SLOTS_SIZE/sizeof(*bufptr);
        }
        *bufptr++ = cxt->CXT_XFLAGS;
        *bufptr++ = cxt->CXT_XIP;
        ASSERT((char *)bufptr - (char *)buf == sizeof(dr_mcontext_t));
        /* Two extra slots after the mcontext: code base and size, consumed
         * by load_dynamo().
         */
        *bufptr++ = (ptr_uint_t)load_dynamo_code;
        *bufptr++ = SIZE_OF_LOAD_DYNAMO;
        nbytes = sizeof(dr_mcontext_t) + 2*sizeof(reg_t);
        cxt->CXT_XSP -= nbytes;
#ifdef X64
        /* We need xsp to be aligned prior to each call, but we can only pad
         * before the context as all later users assume the info they need is
         * at TOS.
         */
        cxt->CXT_XSP = ALIGN_BACKWARD(cxt->CXT_XSP, XMM_ALIGN);
#endif
        if (!nt_write_virtual_memory(phandle, (LPVOID)cxt->CXT_XSP,
                                     buf, nbytes, &nbytes)) {
            display_error("WriteMemory failed");
            goto error;
        }
        /* push the address of the DYNAMORIO_ENTRY string on the app's stack */
        cxt->CXT_XSP -= XSP_SZ;
        if (!nt_write_virtual_memory(phandle, (LPVOID)cxt->CXT_XSP,
                                     &dynamo_entry_esp, sizeof(dynamo_entry_esp),
                                     &nbytes)) {
            display_error("WriteMemory failed");
            goto error;
        }
        /* push the address of GetProcAddress on the app's stack */
        ASSERT(addr_getprocaddr);
        addr = addr_getprocaddr;
        cxt->CXT_XSP -= XSP_SZ;
        if (!nt_write_virtual_memory(phandle, (LPVOID)cxt->CXT_XSP,
                                     &addr, sizeof(addr), &nbytes)) {
            display_error("WriteMemory failed");
            goto error;
        }
        /* push the address of the dynamorio_path string on the app's stack */
        cxt->CXT_XSP -= XSP_SZ;
        if (!nt_write_virtual_memory(phandle, (LPVOID)cxt->CXT_XSP,
                                     &dynamo_path_esp, sizeof(dynamo_path_esp),
                                     &nbytes)) {
            display_error("WriteMemory failed");
            goto error;
        }
        /* push the address of LoadLibraryA on the app's stack */
        ASSERT(addr_loadlibrarya);
        addr = addr_loadlibrarya;
        cxt->CXT_XSP -= XSP_SZ;
        if (!nt_write_virtual_memory(phandle, (LPVOID)cxt->CXT_XSP,
                                     &addr, sizeof(addr), &nbytes)) {
            display_error("WriteMemory failed");
            goto error;
        }
#ifdef LOAD_DYNAMO_DEBUGBREAK
        /* push the address of DebugBreak on the app's stack */
        ASSERT(addr_debugbreak);
        addr = addr_debugbreak;
        cxt->CXT_XSP -= XSP_SZ;
        if (!nt_write_virtual_memory(phandle, (LPVOID)cxt->CXT_XSP,
                                     &addr, sizeof(addr), &nbytes)) {
            display_error("WriteMemory failed");
            goto error;
        }
#endif
        /* make the code R-X now */
        if (!nt_remote_protect_virtual_memory(phandle, load_dynamo_code,
                                              SIZE_OF_LOAD_DYNAMO,
                                              PAGE_EXECUTE_READ, &old_prot)) {
            display_error("Failed to make injection code R-X");
            goto error;
        }
        ASSERT(old_prot == PAGE_EXECUTE_READWRITE);
        /* now change Eip to point to the entry point of load_dynamo(), so that
         * when we resume, load_dynamo is invoked automatically */
        cxt->CXT_XIP = (ptr_uint_t)load_dynamo_code;
        cxt->CXT_XFLAGS = 0;
        if (thandle != NULL) {
            if (!NT_SUCCESS(nt_set_context(thandle, cxt))) {
                display_error("SetThreadContext failed");
                goto error;
            }
        }
        success = TRUE;
    }
 error:
    /* we do not recover any changes in the child's address space */
    return success;
}
/* Installs the per-thread TLS base `segment` for this thread, trying each
 * available mechanism in order (arch_prctl on x64, then set_thread_area,
 * then modify_ldt) and recording the chosen type and index in os_tls.
 */
void
tls_thread_init(os_local_state_t *os_tls, byte *segment)
{
    /* We have four different ways to obtain TLS, each with its own limitations:
     *
     * 1) Piggyback on the threading system (like we do on Windows): here that would
     *    be pthreads, which uses a segment since at least RH9, and uses gdt-based
     *    segments for NPTL. The advantage is we won't run out of ldt or gdt entries
     *    (except when the app itself would). The disadvantage is we're stealing
     *    application slots and we rely on user mode interfaces.
     *
     * 2) Steal an ldt entry via SYS_modify_ldt. This suffers from the 8K ldt entry
     *    limit and requires that we update manually on a new thread. For 64-bit
     *    we're limited here to a 32-bit base. (Strangely, the kernel's
     *    include/asm-x86_64/ldt.h implies that the base is ignored: but it doesn't
     *    seem to be.)
     *
     * 3) Steal a gdt entry via SYS_set_thread_area. There is a 3rd unused entry
     *    (after pthreads and wine) we could use. The kernel swaps for us, and with
     *    CLONE_TLS the kernel will set up the entry for a new thread for us. Xref
     *    PR 192231 and PR 285898. This system call is disabled on 64-bit 2.6
     *    kernels (though the man page for arch_prctl implies it isn't for 2.5
     *    kernels?!?)
     *
     * 4) Use SYS_arch_prctl. This is only implemented on 64-bit kernels, and can
     *    only be used to set the gdt entries that fs and gs select for. Faster to
     *    use <4GB base (obtain with mmap MAP_32BIT) since can use gdt; else have to
     *    use wrmsr. The man pages say "ARCH_SET_GS is disabled in some kernels".
     */
    uint selector;
    int index = -1;
    int res;
#ifdef X64
    /* First choice is gdt, which means arch_prctl. Since this may fail
     * on some kernels, we require -heap_in_lower_4GB so we can fall back
     * on modify_ldt.
     */
    byte *cur_gs;
    res = dynamorio_syscall(SYS_arch_prctl, 2, ARCH_GET_GS, &cur_gs);
    if (res >= 0) {
        LOG(GLOBAL, LOG_THREADS, 1, "os_tls_init: cur gs base is "PFX"\n", cur_gs);
        /* If we're a non-initial thread, gs will be set to the parent thread's value */
        if (cur_gs == NULL || is_dynamo_address(cur_gs) ||
            /* By resolving i#107, we can handle gs conflicts between app and dr. */
            INTERNAL_OPTION(mangle_app_seg)) {
            res = dynamorio_syscall(SYS_arch_prctl, 2, ARCH_SET_GS, segment);
            if (res >= 0) {
                os_tls->tls_type = TLS_TYPE_ARCH_PRCTL;
                LOG(GLOBAL, LOG_THREADS, 1,
                    "os_tls_init: arch_prctl successful for base "PFX"\n", segment);
                /* Kernel should have written %gs for us if using GDT */
                if (!dynamo_initialized && read_thread_register(SEG_TLS) == 0) {
                    LOG(GLOBAL, LOG_THREADS, 1, "os_tls_init: using MSR\n");
                    tls_using_msr = true;
                }
                if (IF_CLIENT_INTERFACE_ELSE(INTERNAL_OPTION(private_loader), false)) {
                    res = dynamorio_syscall(SYS_arch_prctl, 2, ARCH_SET_FS,
                                            os_tls->os_seg_info.dr_fs_base);
                    /* Assuming set fs must be successful if set gs succeeded. */
                    ASSERT(res >= 0);
                }
            } else {
                /* we've found a kernel where ARCH_SET_GS is disabled */
                ASSERT_CURIOSITY(false && "arch_prctl failed on set but not get");
                LOG(GLOBAL, LOG_THREADS, 1,
                    "os_tls_init: arch_prctl failed: error %d\n", res);
            }
        } else {
            /* FIXME PR 205276: we don't currently handle it: fall back on ldt, but
             * we'll have the same conflict w/ the selector...
             */
            ASSERT_BUG_NUM(205276, cur_gs == NULL);
        }
    }
#endif
    if (os_tls->tls_type == TLS_TYPE_NONE) {
        /* Second choice is set_thread_area */
        /* PR 285898: if we added CLONE_SETTLS to all clone calls (and emulated vfork
         * with clone) we could avoid having to set tls up for each thread (as well
         * as solve race PR 207903), at least for kernel 2.5.32+. For now we stick
         * w/ manual setup.
         */
        our_modify_ldt_t desc;
        /* Pick which GDT slots we'll use for DR TLS and for library TLS if
         * using the private loader.
         */
        choose_gdt_slots(os_tls);
        if (tls_gdt_index > -1) {
            /* Now that we know which GDT slot to use, install the per-thread base
             * into it.
             */
            /* Base here must be 32-bit */
            IF_X64(ASSERT(DYNAMO_OPTION(heap_in_lower_4GB) &&
                          segment <= (byte*)UINT_MAX));
            initialize_ldt_struct(&desc, segment, PAGE_SIZE, tls_gdt_index);
            res = dynamorio_syscall(SYS_set_thread_area, 1, &desc);
            LOG(GLOBAL, LOG_THREADS, 3,
                "%s: set_thread_area %d => %d res, %d index\n",
                __FUNCTION__, tls_gdt_index, res, desc.entry_number);
            ASSERT(res < 0 || desc.entry_number == tls_gdt_index);
        } else {
            res = -1; /* fall back on LDT */
        }
        if (res >= 0) {
            LOG(GLOBAL, LOG_THREADS, 1,
                "os_tls_init: set_thread_area successful for base "PFX" @index %d\n",
                segment, tls_gdt_index);
            os_tls->tls_type = TLS_TYPE_GDT;
            index = tls_gdt_index;
            selector = GDT_SELECTOR(index);
            WRITE_DR_SEG(selector); /* macro needs lvalue! */
        } else {
            IF_VMX86(ASSERT_NOT_REACHED()); /* since no modify_ldt */
            LOG(GLOBAL, LOG_THREADS, 1,
                "os_tls_init: set_thread_area failed: error %d\n", res);
        }
#ifdef CLIENT_INTERFACE
        /* Install the library TLS base. */
        if (INTERNAL_OPTION(private_loader) && res >= 0) {
            app_pc base = IF_X64_ELSE(os_tls->os_seg_info.dr_fs_base,
                                      os_tls->os_seg_info.dr_gs_base);
            /* lib_tls_gdt_index is picked in choose_gdt_slots. */
            ASSERT(lib_tls_gdt_index >= gdt_entry_tls_min);
            initialize_ldt_struct(&desc, base, GDT_NO_SIZE_LIMIT,
                                  lib_tls_gdt_index);
            res = dynamorio_syscall(SYS_set_thread_area, 1, &desc);
            LOG(GLOBAL, LOG_THREADS, 3,
                "%s: set_thread_area %d => %d res, %d index\n",
                __FUNCTION__, lib_tls_gdt_index, res, desc.entry_number);
            if (res >= 0) {
                /* i558 update lib seg reg to enforce the segment changes */
                selector = GDT_SELECTOR(lib_tls_gdt_index);
                LOG(GLOBAL, LOG_THREADS, 2, "%s: setting %s to selector 0x%x\n",
                    __FUNCTION__, reg_names[LIB_SEG_TLS], selector);
                WRITE_LIB_SEG(selector);
            }
        }
#endif
    }
    if (os_tls->tls_type == TLS_TYPE_NONE) {
        /* Third choice: modify_ldt, which should be available on kernel 2.3.99+ */
        /* Base here must be 32-bit */
        IF_X64(ASSERT(DYNAMO_OPTION(heap_in_lower_4GB) &&
                      segment <= (byte*)UINT_MAX));
        /* we have the thread_initexit_lock so no race here */
        index = find_unused_ldt_index();
        selector = LDT_SELECTOR(index);
        ASSERT(index != -1);
        create_ldt_entry((void *)segment, PAGE_SIZE, index);
        os_tls->tls_type = TLS_TYPE_LDT;
        WRITE_DR_SEG(selector); /* macro needs lvalue! */
        LOG(GLOBAL, LOG_THREADS, 1,
            "os_tls_init: modify_ldt successful for base "PFX" w/ index %d\n",
            segment, index);
    }
    os_tls->ldt_index = index;
}
/* Queries the set of available GDT slots, and initializes:
 * - tls_gdt_index
 * - gdt_entry_tls_min on ia32
 * - lib_tls_gdt_index if using private loader
 * GDT slots are initialized with a base and limit of zero. The caller is
 * responsible for setting them to a real base.
 * Runs its probing only once per process (guarded by tls_global_init).
 */
static void
choose_gdt_slots(os_local_state_t *os_tls)
{
    our_modify_ldt_t desc;
    int i;
    int avail_index[GDT_NUM_TLS_SLOTS];
    our_modify_ldt_t clear_desc;
    int res;
    /* using local static b/c dynamo_initialized is not set for a client thread
     * when created in client's dr_init routine
     */
    /* FIXME: Could be racy if we have multiple threads initializing during
     * startup.
     */
    if (tls_global_init)
        return;
    tls_global_init = true;
    /* We don't want to break the assumptions of pthreads or wine,
     * so we try to take the last slot. We don't want to hardcode
     * the index b/c the kernel will let us clobber entries so we want
     * to only pass in -1.
     */
    ASSERT(!dynamo_initialized);
    ASSERT(tls_gdt_index == -1);
    for (i = 0; i < GDT_NUM_TLS_SLOTS; i++)
        avail_index[i] = -1;
    /* Claim every free slot; the last (highest) one claimed becomes ours. */
    for (i = 0; i < GDT_NUM_TLS_SLOTS; i++) {
        /* We use a base and limit of 0 for testing what's available. */
        initialize_ldt_struct(&desc, NULL, 0, -1);
        res = dynamorio_syscall(SYS_set_thread_area, 1, &desc);
        LOG(GLOBAL, LOG_THREADS, 4,
            "%s: set_thread_area -1 => %d res, %d index\n",
            __FUNCTION__, res, desc.entry_number);
        if (res >= 0) {
            /* We assume monotonic increases */
            avail_index[i] = desc.entry_number;
            ASSERT(avail_index[i] > tls_gdt_index);
            tls_gdt_index = desc.entry_number;
        } else
            break;
    }
#ifndef X64
    /* In x86-64's ia32 emulation,
     * set_thread_area(6 <= entry_number && entry_number <= 8) fails
     * with EINVAL (22) because x86-64 only accepts GDT indices 12 to 14
     * for TLS entries.
     */
    if (tls_gdt_index > (gdt_entry_tls_min + GDT_NUM_TLS_SLOTS))
        gdt_entry_tls_min = GDT_ENTRY_TLS_MIN_64; /* The kernel is x64. */
#endif
    /* Now give up the earlier slots */
    for (i = 0; i < GDT_NUM_TLS_SLOTS; i++) {
        if (avail_index[i] > -1 &&
            avail_index[i] != tls_gdt_index) {
            LOG(GLOBAL, LOG_THREADS, 4,
                "clearing set_thread_area index %d\n", avail_index[i]);
            clear_ldt_struct(&clear_desc, avail_index[i]);
            res = dynamorio_syscall(SYS_set_thread_area, 1, &clear_desc);
            ASSERT(res >= 0);
        }
    }
#ifndef VMX86_SERVER
    ASSERT_CURIOSITY(tls_gdt_index ==
                     (kernel_is_64bit() ? GDT_64BIT : GDT_32BIT));
#endif
#ifdef CLIENT_INTERFACE
    if (INTERNAL_OPTION(private_loader) && tls_gdt_index != -1) {
        /* Use the app's selector with our own TLS base for libraries. app_fs
         * and app_gs are initialized by the caller in os_tls_app_seg_init().
         */
        int index = SELECTOR_INDEX(IF_X64_ELSE(os_tls->app_fs,
                                               os_tls->app_gs));
        if (index == 0) {
            /* An index of zero means the app has no TLS (yet), and happens
             * during early injection. We use -1 to grab a new entry. When the
             * app asks for its first table entry with set_thread_area, we give
             * it this one and emulate its usage of the segment.
             */
            ASSERT_CURIOSITY(DYNAMO_OPTION(early_inject) && "app has "
                             "no TLS, but we used non-early injection");
            initialize_ldt_struct(&desc, NULL, 0, -1);
            res = dynamorio_syscall(SYS_set_thread_area, 1, &desc);
            LOG(GLOBAL, LOG_THREADS, 4,
                "%s: set_thread_area -1 => %d res, %d index\n",
                __FUNCTION__, res, desc.entry_number);
            ASSERT(res >= 0);
            if (res >= 0) {
                return_stolen_lib_tls_gdt = true;
                index = desc.entry_number;
            }
        }
        lib_tls_gdt_index = index;
    }
#endif
}
/* Security test: locates DR's dcontext and dstack from inside a running
 * process (via TLS scanning on Windows, known offsets elsewhere) and then
 * attempts to overwrite the dstack with attack code.  The test "passes" by
 * either crashing or demonstrating loss of control -- see final comment.
 */
int
main()
{
    int *pc;
    int release_build = 0; /* 1 == release, 0 == debug */
    void *dcontext;
    int *dstack;
    int tls_offs;
    ptr_int_t owning_thread;
    INIT();
#ifdef UNIX
    intercept_signal(SIGSEGV, (handler_3_t) signal_handler, false);
#else
    SetUnhandledExceptionFilter((LPTOP_LEVEL_EXCEPTION_FILTER) our_top_handler);
#endif
#ifdef WINDOWS
    /* brute force loop over all TLS entries,
     * and see whether owning_thread is GetCurrentThreadId()
     * 0:001> dt getdc owning_thread
     *    +0x05c owning_thread : 0xed8
     *
     * 0:001> dt _TEB TLS64
     *    +0xe10 TLS64 : [64] Ptr32 Void
     */
    for (tls_offs = 63; tls_offs >=0; tls_offs--) {
        enum {offsetof_TLS64_in_TEB = IF_X64_ELSE(0x1480, 0xe10)};
        dcontext_tls_offset = offsetof_TLS64_in_TEB +
            tls_offs*sizeof(void*);
        GET_DCONTEXT(dcontext);
#if VERBOSE
        print("%d idx, %x offs\n", tls_offs, dcontext_tls_offset);
#endif
        /* Guard each candidate read with setjmp: a bad slot faults. */
        where = SIGSETJMP(mark);
        if (where == 0) {
            owning_thread = *(ptr_int_t *)(((char *)dcontext) +
                                           OWNING_THREAD_OFFSET_IN_DCONTEXT);
            /* we didn't crash reading, is it really thread ID? */
#if VERBOSE
            print(" %d thread %d vs %d\n", tls_offs, owning_thread,
                  GetCurrentThreadId());
#endif
            if (owning_thread == GetCurrentThreadId()) {
#if VERBOSE
                print(" %d is dcontext!\n", tls_offs);
#endif
                break;
            }
        } else {
#if VERBOSE
            print(" %d crashed\n", tls_offs);
#endif
            /* we crashed reading, try next offset */
        }
    }
    if (tls_offs < 0) {
        print("error obtaining dcontext (TLS offset not found): "
              "are you running natively?!?\n");
        exit(1);
    }
#endif
    where = SIGSETJMP(mark);
    if (where != 0) {
        print("error obtaining dcontext (SIGSETJMP failed): "
              "are you running natively?!?\n");
        exit(1);
    }
    /* NOTE(review): no trailing ';' here -- presumably GET_DCONTEXT expands
     * to a complete statement; confirm against the macro definition.
     */
    GET_DCONTEXT(dcontext)
#if VERBOSE
    print("dcontext is "PFX"\n", dcontext);
#endif
    dstack = *(int **)(((char *)dcontext) + DSTACK_OFFSET_IN_DCONTEXT);
    if (dstack == NULL || !ALIGNED(dstack, PAGE_SIZE)) {
        print("can't find dstack: old build, or new where dstack offset changed?\n");
        /* hang rather than exit so a failure is obvious to the harness */
        while (1)
            ;
        exit(-1);
    }
    dstack_base = (int *) (((char *)dstack) - DSTACK_SIZE);
#if VERBOSE
    print("dstack is "PFX"-"PFX"\n", dstack_base, dstack);
#endif
    print("dcontext->dstack successfully obtained\n");
    where = SIGSETJMP(mark);
#if VERBOSE
    print("SIGSETJMP returned %d\n", where);
#endif
    if (where == 0) {
        /* if we do the copy in a C loop, trace heads cause us to exit before
         * we've hit the cxt switch return address, so we crash rather than taking
         * control -- so we hand-code a copy that in C looks like this:
         *   for (pc = dstack_base; pc++; pc < dstack)
         *       *pc = (int) evil;
         * we assume df is cleared
         * FIXME: popf in old fcache_return can trigger a trap crash before
         * get to ret that goes to evil!
         * FIXME: I had this getting to evil w/o crashing first, but it's
         * a little fragile, and on win32 I get issues later b/c we have
         * trampolines, etc. and so don't completely lose control.
         * But, in all cases we fail, so whether it's a nice shell code
         * execution or a crash doesn't matter -- the test does what it's supposed
         * to do!
         */
        evil_copy(dstack_base, DSTACK_SIZE / sizeof(int), (ptr_int_t)evil);
        print("wrote to entire dstack without incident!\n");
    } else if (where == 1) {
        print("error writing to "PFX" in expected dstack "PFX"-"PFX"\n",
              pc, dstack_base, dstack);
    } else if (where == 2) {
        /* NOTE(review): "runnning" typo is in a runtime output string; the
         * test harness may match this text exactly, so it is left as-is.
         */
        print("DR has been cracked! Malicious code is now runnning...\n");
    }
}