Exemplo n.º 1
0
/* Populates every field of *ldt so that it describes a present, writable data
 * segment covering [base, base+size) in descriptor slot "index".
 *
 * ldt   - descriptor to fill in; must not be NULL.
 * base  - segment base address; on x64 it is asserted to fit in 32 bits.
 * size  - segment limit; on x64 it is asserted to fit in 32 bits.  Passing
 *         GDT_NO_SIZE_LIMIT additionally turns on page granularity.
 * index - descriptor entry number to record in the struct.
 */
static void
initialize_ldt_struct(our_modify_ldt_t *ldt, void *base, size_t size, uint index)
{
    ASSERT(ldt != NULL);

    /* Which slot this descriptor is destined for. */
    ldt->entry_number = index;

    /* Base address: the descriptor field is only 32 bits wide. */
    IF_X64(ASSERT(CHECK_TRUNCATE_TYPE_uint((ptr_uint_t)base)));
    ldt->base_addr = (int)(ptr_int_t)base;

    /* Limit, plus the granularity flag that goes with it. */
    IF_X64(ASSERT(CHECK_TRUNCATE_TYPE_uint(size)));
    ldt->limit = size;
    if (size == GDT_NO_SIZE_LIMIT) {
        ldt->limit_in_pages = 1;
    } else {
        ldt->limit_in_pages = 0;
    }

    /* Attribute bits: a present, read-write data segment. */
    ldt->contents = MODIFY_LDT_CONTENTS_DATA;
    ldt->seg_32bit = IF_X64_ELSE(0, 1);
    ldt->read_exec_only = 0;
    ldt->seg_not_present = 0;

    /* The linux kernel does not care whether this is set, but vmkernel
     * requires it.  It becomes the custom AVL bit.
     */
    ldt->useable = 1;
}
Exemplo n.º 2
0
/* Writes a 4-byte pc-relative displacement to "target" at "pc" and returns the
 * address just past the written displacement (pc + 4).  The displacement is
 * measured from the end of the 4-byte field, i.e. from the start of the next
 * instruction.
 *
 * The write that inserts the relative target is done atomically so this
 * function is safe with respect to a thread executing the code containing
 * this target, presuming that the code in both the before and after states
 * is valid.
 * For x64 this routine only works for 32-bit reachability.  If further
 * reach is needed the caller must use indirection.  Xref PR 215395.
 *
 * No tracing output is produced here: this routine patches code that other
 * threads may be executing, so it must not write to stdout on every call.
 */
byte *
insert_relative_target(byte *pc, cache_pc target, bool hot_patch)
{
    /* COMPLETEDD #326 insert_relative_target */
    /* insert 4-byte pc-relative offset from the beginning of the next instruction
     */
    int value = (int)(ptr_int_t)(target - pc - 4);
    /* On x64 the displacement must be representable in 32 bits. */
    IF_X64(ASSERT(CHECK_TRUNCATE_TYPE_int(target - pc - 4)));
    ATOMIC_4BYTE_WRITE(pc, value, hot_patch);
    pc += 4;
    return pc;
}
Exemplo n.º 3
0
/* Installs "segment" as the base of this thread's DR TLS segment, trying the
 * available kernel mechanisms in order of preference:
 *   1) SYS_arch_prctl with ARCH_SET_GS (X64 builds only),
 *   2) SYS_set_thread_area into the GDT slot chosen by choose_gdt_slots(),
 *   3) SYS_modify_ldt via create_ldt_entry().
 *
 * os_tls  - per-thread state; tls_type is set to the mechanism that succeeded
 *           and ldt_index to the GDT/LDT index used (left at -1 when
 *           arch_prctl was used).
 * segment - base address of the TLS block to install.
 *
 * NOTE(review): steps 2 and 3 only run while os_tls->tls_type is
 * TLS_TYPE_NONE, so this presumably relies on the caller having initialized
 * that field -- confirm against the caller.
 */
void
tls_thread_init(os_local_state_t *os_tls, byte *segment)
{
    /* We have four different ways to obtain TLS, each with its own limitations:
     *
     * 1) Piggyback on the threading system (like we do on Windows): here that would
     *    be pthreads, which uses a segment since at least RH9, and uses gdt-based
     *    segments for NPTL.  The advantage is we won't run out of ldt or gdt entries
     *    (except when the app itself would).  The disadvantage is we're stealing
     *    application slots and we rely on user mode interfaces.
     *
     * 2) Steal an ldt entry via SYS_modify_ldt.  This suffers from the 8K ldt entry
     *    limit and requires that we update manually on a new thread.  For 64-bit
     *    we're limited here to a 32-bit base.  (Strangely, the kernel's
     *    include/asm-x86_64/ldt.h implies that the base is ignored: but it doesn't
     *    seem to be.)
     *
     * 3) Steal a gdt entry via SYS_set_thread_area.  There is a 3rd unused entry
     *    (after pthreads and wine) we could use.  The kernel swaps for us, and with
     *    CLONE_TLS the kernel will set up the entry for a new thread for us.  Xref
     *    PR 192231 and PR 285898.  This system call is disabled on 64-bit 2.6
     *    kernels (though the man page for arch_prctl implies it isn't for 2.5
     *    kernels?!?)
     *
     * 4) Use SYS_arch_prctl.  This is only implemented on 64-bit kernels, and can
     *    only be used to set the gdt entries that fs and gs select for.  Faster to
     *    use <4GB base (obtain with mmap MAP_32BIT) since can use gdt; else have to
     *    use wrmsr.  The man pages say "ARCH_SET_GS is disabled in some kernels".
     */
    uint selector;
    /* Stays -1 unless a GDT or LDT slot ends up being used. */
    int index = -1;
    int res;
#ifdef X64
    /* First choice is gdt, which means arch_prctl.  Since this may fail
     * on some kernels, we require -heap_in_lower_4GB so we can fall back
     * on modify_ldt.
     */
    byte *cur_gs;
    res = dynamorio_syscall(SYS_arch_prctl, 2, ARCH_GET_GS, &cur_gs);
    if (res >= 0) {
        LOG(GLOBAL, LOG_THREADS, 1, "os_tls_init: cur gs base is "PFX"\n", cur_gs);
        /* If we're a non-initial thread, gs will be set to the parent thread's value */
        if (cur_gs == NULL || is_dynamo_address(cur_gs) ||
            /* By resolving i#107, we can handle gs conflicts between app and dr. */
            INTERNAL_OPTION(mangle_app_seg)) {
            res = dynamorio_syscall(SYS_arch_prctl, 2, ARCH_SET_GS, segment);
            if (res >= 0) {
                os_tls->tls_type = TLS_TYPE_ARCH_PRCTL;
                LOG(GLOBAL, LOG_THREADS, 1,
                    "os_tls_init: arch_prctl successful for base "PFX"\n", segment);
                /* Kernel should have written %gs for us if using GDT */
                if (!dynamo_initialized && read_thread_register(SEG_TLS) == 0) {
                    LOG(GLOBAL, LOG_THREADS, 1, "os_tls_init: using MSR\n");
                    tls_using_msr = true;
                }
                /* With the private loader, also point fs at the library TLS base. */
                if (IF_CLIENT_INTERFACE_ELSE(INTERNAL_OPTION(private_loader), false)) {
                    res = dynamorio_syscall(SYS_arch_prctl, 2, ARCH_SET_FS,
                                            os_tls->os_seg_info.priv_lib_tls_base);
                    /* Assuming set fs must be successful if set gs succeeded. */
                    ASSERT(res >= 0);
                }
            } else {
                /* we've found a kernel where ARCH_SET_GS is disabled */
                ASSERT_CURIOSITY(false && "arch_prctl failed on set but not get");
                LOG(GLOBAL, LOG_THREADS, 1,
                    "os_tls_init: arch_prctl failed: error %d\n", res);
            }
        } else {
            /* FIXME PR 205276: we don't currently handle it: fall back on ldt, but
             * we'll have the same conflict w/ the selector...
             */
            ASSERT_BUG_NUM(205276, cur_gs == NULL);
        }
    }
#endif

    if (os_tls->tls_type == TLS_TYPE_NONE) {
        /* Second choice is set_thread_area */
        /* PR 285898: if we added CLONE_SETTLS to all clone calls (and emulated vfork
         * with clone) we could avoid having to set tls up for each thread (as well
         * as solve race PR 207903), at least for kernel 2.5.32+.  For now we stick
         * w/ manual setup.
         */
        our_modify_ldt_t desc;

        /* Pick which GDT slots we'll use for DR TLS and for library TLS if
         * using the private loader.
         */
        choose_gdt_slots(os_tls);

        if (tls_gdt_index > -1) {
            /* Now that we know which GDT slot to use, install the per-thread base
             * into it.
             */
            /* Base here must be 32-bit */
            IF_X64(ASSERT(DYNAMO_OPTION(heap_in_lower_4GB) &&
                          segment <= (byte*)UINT_MAX));
            initialize_ldt_struct(&desc, segment, PAGE_SIZE, tls_gdt_index);
            res = dynamorio_syscall(SYS_set_thread_area, 1, &desc);
            LOG(GLOBAL, LOG_THREADS, 3,
                "%s: set_thread_area %d => %d res, %d index\n",
                __FUNCTION__, tls_gdt_index, res, desc.entry_number);
            /* On success the kernel must have used the slot we asked for. */
            ASSERT(res < 0 || desc.entry_number == tls_gdt_index);
        } else {
            res = -1;  /* fall back on LDT */
        }

        if (res >= 0) {
            LOG(GLOBAL, LOG_THREADS, 1,
                "os_tls_init: set_thread_area successful for base "PFX" @index %d\n",
                segment, tls_gdt_index);
            os_tls->tls_type = TLS_TYPE_GDT;
            index = tls_gdt_index;
            selector = GDT_SELECTOR(index);
            WRITE_DR_SEG(selector); /* macro needs lvalue! */
        } else {
            IF_VMX86(ASSERT_NOT_REACHED()); /* since no modify_ldt */
            LOG(GLOBAL, LOG_THREADS, 1,
                "os_tls_init: set_thread_area failed: error %d\n", res);
        }

#ifdef CLIENT_INTERFACE
        /* Install the library TLS base. */
        /* Only attempted when the DR TLS set_thread_area above succeeded. */
        if (INTERNAL_OPTION(private_loader) && res >= 0) {
            app_pc base = os_tls->os_seg_info.priv_lib_tls_base;
            /* lib_tls_gdt_index is picked in choose_gdt_slots. */
            ASSERT(lib_tls_gdt_index >= gdt_entry_tls_min);
            initialize_ldt_struct(&desc, base, GDT_NO_SIZE_LIMIT,
                                  lib_tls_gdt_index);
            res = dynamorio_syscall(SYS_set_thread_area, 1, &desc);
            LOG(GLOBAL, LOG_THREADS, 3,
                "%s: set_thread_area %d => %d res, %d index\n",
                __FUNCTION__, lib_tls_gdt_index, res, desc.entry_number);
            if (res >= 0) {
                /* i558 update lib seg reg to enforce the segment changes */
                selector = GDT_SELECTOR(lib_tls_gdt_index);
                LOG(GLOBAL, LOG_THREADS, 2, "%s: setting %s to selector 0x%x\n",
                    __FUNCTION__, reg_names[LIB_SEG_TLS], selector);
                WRITE_LIB_SEG(selector);
            }
        }
#endif
    }

    if (os_tls->tls_type == TLS_TYPE_NONE) {
        /* Third choice: modify_ldt, which should be available on kernel 2.3.99+ */
        /* Base here must be 32-bit */
        IF_X64(ASSERT(DYNAMO_OPTION(heap_in_lower_4GB) && segment <= (byte*)UINT_MAX));
        /* we have the thread_initexit_lock so no race here */
        index = find_unused_ldt_index();
        /* NOTE(review): the selector is computed before index is checked
         * against -1 just below.
         */
        selector = LDT_SELECTOR(index);
        ASSERT(index != -1);
        create_ldt_entry((void *)segment, PAGE_SIZE, index);
        os_tls->tls_type = TLS_TYPE_LDT;
        WRITE_DR_SEG(selector); /* macro needs lvalue! */
        LOG(GLOBAL, LOG_THREADS, 1,
            "os_tls_init: modify_ldt successful for base "PFX" w/ index %d\n",
            segment, index);
    }

    /* Record the GDT/LDT index in use (-1 if arch_prctl was used). */
    os_tls->ldt_index = index;
}
Exemplo n.º 4
0
/* Hooks a location in the process "phandle" so that, when the child reaches
 * it, a small stub written into freshly allocated child memory runs.  The
 * emitted stub: saves registers and flags, restores the original 5 bytes at
 * the hook, calls LdrLoadDll on dynamo_path, resolves
 * "dynamorio_app_init_and_early_takeover" with LdrGetProcedureAddress, calls
 * it (pushing remote_code_buffer and then inject_location as its arguments),
 * restores registers and flags, and resumes at the hook location.
 *
 * phandle         - handle of the process to inject into.
 * dynamo_path     - path of the library to load in the child (narrow string).
 * map             - remote-mapping variant; not implemented (asserts).
 * inject_location - INJECT_LOCATION_* value selecting the hook point; only the
 *                   Ldr* locations pass the ASSERT_NOT_IMPLEMENTED below.
 * inject_address  - for Ldr* locations, the address in the child to hook.
 *
 * Returns true once the hook has been written and its page protection
 * restored; returns false on any failed remote read/write/allocate/protect.
 * On failure nothing already done to the child is undone.
 *
 * The emitted code is 32-bit only: every pointer-embedding step is guarded by
 * IF_X64(ASSERT_NOT_IMPLEMENTED(false)).
 *
 * FIXME - like inject_into_thread we assume esp, but we could allocate our
 * own stack in the child and swap to that for transparency. */
bool
inject_into_new_process(HANDLE phandle, char *dynamo_path, bool map,
                        uint inject_location, void *inject_address)
{
    void *hook_target = NULL, *hook_location = NULL;
    uint old_prot; 
    size_t num_bytes_out;
    /* Holds the original bytes at the hook first, later the jmp we install. */
    byte hook_buf[5];

    /* Possible child hook points */
    GET_NTDLL(KiUserApcDispatcher, (IN PVOID Unknown1, 
                                    IN PVOID Unknown2, 
                                    IN PVOID Unknown3, 
                                    IN PVOID ContextStart, 
                                    IN PVOID ContextBody));
    GET_NTDLL(KiUserExceptionDispatcher, (IN PVOID Unknown1, 
                                          IN PVOID Unknown2));

    /* Only ones that work, though I have hopes for KiUserException if can
     * find a better spot to trigger the exception, or we should implement
     * KiUserApc map requirement. */
    ASSERT_NOT_IMPLEMENTED(INJECT_LOCATION_IS_LDR(inject_location));
    switch(inject_location) {
    case INJECT_LOCATION_LdrLoadDll:
    case INJECT_LOCATION_LdrpLoadDll:
    case INJECT_LOCATION_LdrCustom:
    case INJECT_LOCATION_LdrpLoadImportModule:
    case INJECT_LOCATION_LdrDefault:
        /* caller provides the ldr address to use */
        ASSERT(inject_address != NULL);
        hook_location = inject_address;
        if (hook_location == NULL) {
            goto error;
        }
        break;
    case INJECT_LOCATION_KiUserApc:
        hook_location = (void *)KiUserApcDispatcher;
        ASSERT(map);
        break;
    case INJECT_LOCATION_KiUserException:
        hook_location = (void *)KiUserExceptionDispatcher;
        break;
    default:
        ASSERT_NOT_REACHED();
        goto error;
    }

    /* read in code at hook */
    if (!nt_read_virtual_memory(phandle, hook_location, hook_buf,
                                sizeof(hook_buf), &num_bytes_out) ||
        num_bytes_out != sizeof(hook_buf)) {
        goto error;
    }

    if (map) {
        hook_target = NULL; /* for compiler */
        /* NYI see case 102, plan is to remote map in our dll, link and rebase if
         * necessary and hook to a routine in our dll */
        /* NOTE(review): if this assert does not stop execution, the hook placed
         * at the end of this function would target NULL -- confirm. */
        ASSERT_NOT_IMPLEMENTED(false);
    } else {
        byte *remote_code_buffer = NULL, *remote_data_buffer;
        /* max usage for local_buf is for writing the dr library name
         * 2*MAX_PATH (unicode) + sizeof(UNICODE_STRING) + 2, round up to
         * 3*MAX_PATH to be safe */
        byte local_buf[3*MAX_PATH];
        byte *cur_local_pos, *cur_remote_pos, *jmp_fixup1, *jmp_fixup2;
        char *takeover_func = "dynamorio_app_init_and_early_takeover";
        PUNICODE_STRING mod, mod_remote;
        PANSI_STRING func, func_remote;
        int res;
        size_t num_bytes_in;

        GET_NTDLL(LdrLoadDll, (IN PCWSTR PathToFile OPTIONAL,
                               IN PULONG Flags OPTIONAL,
                               IN PUNICODE_STRING ModuleFileName,
                               OUT PHANDLE ModuleHandle));
        GET_NTDLL(LdrGetProcedureAddress, (IN HANDLE ModuleHandle,
                                           IN PANSI_STRING ProcedureName OPTIONAL,
                                           IN ULONG Ordinal OPTIONAL,
                                           OUT FARPROC *ProcedureAddress));
#define GET_PROC_ADDR_BAD_ADDR 0xffbadd11
        GET_NTDLL(NtProtectVirtualMemory, (IN HANDLE ProcessHandle,
                                           IN OUT PVOID *BaseAddress,
                                           IN OUT PULONG ProtectSize,
                                           IN ULONG NewProtect,
                                           OUT PULONG OldProtect));
        GET_NTDLL(NtContinue, (IN PCONTEXT Context,
                               IN BOOLEAN TestAlert));

        /* get buffer for emitted code and data */
        /* Two pages are requested: the first receives the code, the second the
         * data (see remote_data_buffer below). */
        if (!NT_SUCCESS(nt_remote_allocate_virtual_memory(phandle, &remote_code_buffer,
                                                          2*PAGE_SIZE, PAGE_READWRITE,
                                                          MEM_COMMIT))) {
            goto error;
        }
        remote_data_buffer = remote_code_buffer + PAGE_SIZE;
        
        /* write data */
        /* FIXME the two writes are similar (unicode vs ascii), could combine */
        /* First UNICODE_STRING to library */
        cur_remote_pos = remote_data_buffer;
        cur_local_pos = local_buf;
        ASSERT_ROOM(cur_local_pos, local_buf, sizeof(UNICODE_STRING));
        mod = (PUNICODE_STRING)cur_local_pos;
        memset(mod, 0, sizeof(UNICODE_STRING));
        cur_local_pos += sizeof(UNICODE_STRING);
        /* Buffer must hold the address the characters will have in the child,
         * i.e. right after the UNICODE_STRING header in the remote data page. */
        mod->Buffer = (wchar_t *)(cur_remote_pos + (cur_local_pos - local_buf));
        ASSERT_ROOM(cur_local_pos, local_buf, 2*MAX_PATH+2 /* plus null */);
        /* NOTE(review): if snwprintf's count is in wchar_t units (as with the
         * CRT's _snwprintf), 2*MAX_PATH characters is 4*MAX_PATH bytes, which
         * is more than local_buf's 3*MAX_PATH bytes -- confirm the bound. */
        res = snwprintf((wchar_t *)cur_local_pos, 2*MAX_PATH, L"%hs", dynamo_path);
        ASSERT(res > 0);
        if (res > 0) {
            cur_local_pos += (2*res);
            ASSERT_TRUNCATE(mod->Length, ushort, 2*res);
            mod->Length = (ushort)(2*res);
            mod->MaximumLength = (ushort)(2*res);
        }
        /* ensure NULL termination, just in case */
        *(wchar_t *)cur_local_pos = L'\0';
        cur_local_pos += sizeof(wchar_t);
        /* write to remote process */
        num_bytes_in = cur_local_pos - local_buf;
        if (!nt_write_virtual_memory(phandle, cur_remote_pos, local_buf,
                                     num_bytes_in, &num_bytes_out) ||
            num_bytes_out != num_bytes_in) {
            goto error;
        }
        mod_remote = (PUNICODE_STRING)cur_remote_pos;
        cur_remote_pos += num_bytes_out;

        /* now write init/takeover func */
        /* local_buf is reused from its start; the remote cursor keeps advancing. */
        cur_local_pos = local_buf;
        ASSERT_ROOM(cur_local_pos, local_buf, sizeof(ANSI_STRING));
        func = (PANSI_STRING)cur_local_pos;
        memset(func, 0, sizeof(ANSI_STRING));
        cur_local_pos += sizeof(ANSI_STRING);
        func->Buffer = (PCHAR) cur_remote_pos + (cur_local_pos - local_buf);
        ASSERT_ROOM(cur_local_pos, local_buf, strlen(takeover_func)+1);
        strncpy((char *)cur_local_pos, takeover_func, strlen(takeover_func));
        cur_local_pos += strlen(takeover_func);
        ASSERT_TRUNCATE(func->Length, ushort, strlen(takeover_func));
        func->Length = (ushort)strlen(takeover_func);
        func->MaximumLength = (ushort)strlen(takeover_func);
        *cur_local_pos++ = '\0'; /* ensure NULL termination, just in case */
        /* write to remote_process */
        num_bytes_in = cur_local_pos - local_buf;
        if (!nt_write_virtual_memory(phandle, cur_remote_pos, local_buf,
                                     num_bytes_in, &num_bytes_out) ||
            num_bytes_out != num_bytes_in) {
            goto error;
        }
        func_remote = (PANSI_STRING)cur_remote_pos;
        cur_remote_pos += num_bytes_out;
        
        /* now make data page read only */
        /* NOTE(review): the result is only checked via ASSERT; a failure is
         * otherwise ignored here -- confirm that is intended. */
        res = nt_remote_protect_virtual_memory(phandle, remote_data_buffer, 
                                               PAGE_SIZE, PAGE_READONLY,
                                               &old_prot);
        ASSERT(res);
        
/* The macros below append raw x86 encodings at cur_local_pos and advance it.
 * They are multi-statement and unbraced, so they must only be used as full
 * statements.  All of them are #undef'd at the end of this block. */
#define INSERT_INT(value)         \
  *(int *)cur_local_pos = value;  \
  cur_local_pos += sizeof(int)

#define PUSH_IMMEDIATE(value)     \
  *cur_local_pos++ = PUSH_IMM32;  \
  INSERT_INT(value)

#define PUSH_SHORT_IMMEDIATE(value)     \
  *cur_local_pos++ = PUSH_IMM8;         \
  *cur_local_pos++ = value

#define MOV_ESP_TO_EAX()                \
  *cur_local_pos++ = MOV_RM32_2_REG32;  \
  *cur_local_pos++ = MOV_ESP_2_EAX_RM

/* FIXME - all values are small use imm8 version */
#define ADD_TO_EAX(value)               \
  *cur_local_pos++ = ADD_EAX_IMM32;     \
  INSERT_INT(value)

/* Emits target minus the remote address just past the 4-byte field, where that
 * remote address is cur_remote_pos plus the current offset into local_buf. */
#define INSERT_REL32_ADDRESS(target)    \
  IF_X64(ASSERT_NOT_IMPLEMENTED(false)); \
  INSERT_INT((int)(ptr_int_t)((byte *)target - \
                              (((cur_local_pos - local_buf)+4)+cur_remote_pos)))

#define CALL(target_func)               \
  *cur_local_pos++ = CALL_REL32;        \
  INSERT_REL32_ADDRESS(target_func)

/* ecx will hold OldProtection afterwards */
#define PROT_IN_ECX 0xbad15bad /* doesn't match a PAGE_* define */
#define CHANGE_PROTECTION(start, size, new_protection)                \
  *cur_local_pos++ = PUSH_EAX; /* OldProtect slot */                  \
  MOV_ESP_TO_EAX(); /* get &OldProtect */                             \
  IF_X64(ASSERT_NOT_IMPLEMENTED(false));                              \
  PUSH_IMMEDIATE((int)(ALIGN_FORWARD(start+size, PAGE_SIZE) -         \
                 ALIGN_BACKWARD(start, PAGE_SIZE))); /* ProtectSize */ \
  PUSH_IMMEDIATE((int)ALIGN_BACKWARD(start, PAGE_SIZE)); /* BaseAddress */ \
  *cur_local_pos++ = PUSH_EAX; /* arg 5 &OldProtect */                \
  if (new_protection == PROT_IN_ECX) {                                \
      *cur_local_pos++ = PUSH_ECX; /* arg 4 NewProtect */             \
  } else {                                                            \
      PUSH_IMMEDIATE(new_protection); /* arg 4 NewProtect */          \
  }                                                                   \
  ADD_TO_EAX(-4); /* get &ProtectSize */                              \
  *cur_local_pos++ = PUSH_EAX; /* arg 3 &ProtectSize */               \
  ADD_TO_EAX(-4); /* get &BaseAddress */                              \
  *cur_local_pos++ = PUSH_EAX; /* arg 2 &BaseAddress */               \
  PUSH_IMMEDIATE((int)(ptr_int_t)NT_CURRENT_PROCESS); /* arg ProcessHandle */ \
  CALL(NtProtectVirtualMemory);                                       \
  /* no error checking, can't really do anything about it, FIXME */   \
  /* stdcall so just the three slots we made for the ptr arguments    \
   * left on the stack */                                             \
  *cur_local_pos++ = POP_ECX; /* pop BaseAddress */                   \
  *cur_local_pos++ = POP_ECX; /* pop ProtectSize */                   \
  *cur_local_pos++ = POP_ECX /* pop OldProtect into ecx */


        /* write code */
        /* xref case 3821, first call to a possibly hooked routine should be
         * more than 5 bytes into the page, which is satisfied (though is not
         * clear if any hookers would manage to get in first). */
        cur_remote_pos = remote_code_buffer;
        cur_local_pos = local_buf;
        hook_target = cur_remote_pos;
        /* for inject_location INJECT_LOCATION_Ldr* we stick the address used
         * at the start of the code for the child's use */
        if (INJECT_LOCATION_IS_LDR(inject_location)) {
            IF_X64(ASSERT_NOT_IMPLEMENTED(false));
            INSERT_INT((int)(ptr_int_t)inject_address);
            hook_target = cur_remote_pos + 4;  /* skip the address */
        }

#if DEBUG_LOOP
        /* jmp rel8 with displacement 0xfe (-2): jumps back to itself */
        *cur_local_pos++ = JMP_REL8;
        *cur_local_pos++ = 0xfe;
#endif

        /* save current state */
        *cur_local_pos++ = PUSHA;
        *cur_local_pos++ = PUSHF;

        /* restore trampoline, first make writable */
        CHANGE_PROTECTION(hook_location, 5, PAGE_EXECUTE_READWRITE);
        *cur_local_pos++ = MOV_IMM32_2_RM32; /* restore first 4 bytes of hook */
        *cur_local_pos++ = MOV_IMM_RM_ABS;
        IF_X64(ASSERT_NOT_IMPLEMENTED(false));
        INSERT_INT((int)(ptr_int_t)hook_location);
        INSERT_INT(*(int *)hook_buf);
        *cur_local_pos++ = MOV_IMM8_2_RM8; /* restore 5th byte of the hook */
        *cur_local_pos++ = MOV_IMM_RM_ABS;
        IF_X64(ASSERT_NOT_IMPLEMENTED(false));
        INSERT_INT((int)(ptr_int_t)hook_location+4);
        *cur_local_pos++ = hook_buf[4];
        /* hook restored, restore protection */
        CHANGE_PROTECTION(hook_location, 5, PROT_IN_ECX);

        if (inject_location == INJECT_LOCATION_KiUserException) {
            /* Making the first page of the image unreadable triggers an exception
             * too early to use the loader, might try pointing the import table ptr
             * to bad memory instead TOTRY, whatever we do should fixup here */
            ASSERT_NOT_IMPLEMENTED(false);
        }
        
        /* call LdrLoadDll to load dr library */
        *cur_local_pos++ = PUSH_EAX; /* need slot for OUT hmodule*/
        MOV_ESP_TO_EAX();
        *cur_local_pos++ = PUSH_EAX; /* arg 4 OUT *hmodule */
        IF_X64(ASSERT_NOT_IMPLEMENTED(false));
        PUSH_IMMEDIATE((int)(ptr_int_t)mod_remote); /* our library name */
        PUSH_SHORT_IMMEDIATE(0x0); /* Flags OPTIONAL */
        PUSH_SHORT_IMMEDIATE(0x0); /* PathToFile OPTIONAL */
        CALL(LdrLoadDll); /* see signature at declaration above */

        /* stdcall so removed args so top of stack is now the slot containing the
         * returned handle.  Use LdrGetProcedureAddress to get the address of the
         * dr init and takeover function. Is ok to call even if LdrLoadDll failed,
         * so we check for errors afterwards. */
        *cur_local_pos++ = POP_ECX; /* dr module handle */
        *cur_local_pos++ = PUSH_ECX; /* need slot for out ProcedureAddress */
        MOV_ESP_TO_EAX();
        *cur_local_pos++ = PUSH_EAX; /* arg 4 OUT *ProcedureAddress */
        PUSH_SHORT_IMMEDIATE(0x0); /* Ordinal OPTIONAL */
        IF_X64(ASSERT_NOT_IMPLEMENTED(false));
        PUSH_IMMEDIATE((int)(ptr_int_t)func_remote); /* func name */
        *cur_local_pos++ = PUSH_ECX; /* module handle */
        CALL(LdrGetProcedureAddress); /* see signature at declaration above */

        /* Top of stack is now the dr init and takeover function (stdcall removed
         * args). Check for errors and bail (FIXME debug build report somehow?) */
        *cur_local_pos++ = CMP_EAX_IMM32;
        INSERT_INT(STATUS_SUCCESS);
        *cur_local_pos++ = POP_EAX; /* dr init_and_takeover function */
        *cur_local_pos++ = JNZ_REL8; /* FIXME - should check >= 0 instead? */
        /* The rel8 displacement byte is reserved here and filled in below. */
        jmp_fixup1 = cur_local_pos++; /* jmp to after call below */
        /* Xref case 8373, LdrGetProcedureAddress sometimes returns an
         * address of 0xffbadd11 even though it returned STATUS_SUCCESS */
        *cur_local_pos++ = CMP_EAX_IMM32;
        INSERT_INT(GET_PROC_ADDR_BAD_ADDR);
        *cur_local_pos++ = JZ_REL8; /* JZ == JE */
        jmp_fixup2 = cur_local_pos++; /* jmp to after call below */
        IF_X64(ASSERT_NOT_IMPLEMENTED(false));
        PUSH_IMMEDIATE((int)(ptr_int_t)remote_code_buffer); /* arg to takeover func */
        PUSH_IMMEDIATE(inject_location); /* arg to takeover func */
        *cur_local_pos++ = CALL_RM32; /* call EAX */
        *cur_local_pos++ = CALL_EAX_RM;
        *cur_local_pos++ = POP_ECX; /* cdecl so pop arg */
        *cur_local_pos++ = POP_ECX; /* cdecl so pop arg */
        /* Now patch the jnz above (if error) to go to here */
        ASSERT_TRUNCATE(*jmp_fixup1, byte, cur_local_pos - (jmp_fixup1+1));
        *jmp_fixup1 = (byte)(cur_local_pos - (jmp_fixup1+1)); /* target of jnz */
        ASSERT_TRUNCATE(*jmp_fixup2, byte, cur_local_pos - (jmp_fixup2+1));
        *jmp_fixup2 = (byte)(cur_local_pos - (jmp_fixup2+1)); /* target of jz */
        *cur_local_pos++ = POPF;
        *cur_local_pos++ = POPA;
        if (inject_location != INJECT_LOCATION_KiUserException) {
            /* jmp back to the hook location to resume execution */
            *cur_local_pos++ = JMP_REL32;
            INSERT_REL32_ADDRESS(hook_location);
        } else {
            /* we triggered the exception, so do an NtContinue back */
            /* see callback.c, esp+4 holds CONTEXT ** */
            *cur_local_pos++ = POP_EAX;  /* EXCEPTION_RECORD ** */
            *cur_local_pos++ = POP_EAX;  /* CONTEXT ** */
            PUSH_SHORT_IMMEDIATE(FALSE); /* arg 2 TestAlert */
            *cur_local_pos++ = MOV_RM32_2_REG32;
            *cur_local_pos++ = MOV_derefEAX_2_EAX_RM; /* CONTEXT * -> EAX */
            *cur_local_pos++ = PUSH_EAX; /* push CONTEXT * (arg 1) */
            CALL(NtContinue);
            /* should never get here, will be zeroed memory so will crash if
             * we do happen to get here, good enough reporting */
        }

        /* Our emitted code above is much less than the sizeof local_buf,
         * but we'll add a check here (after the fact so not robust if really
         * overflowed) that we didn't even come close (someone adding large
         * amounts of code should hit this). FIXME - do better? */
        ASSERT_ROOM(cur_local_pos, local_buf, MAX_PATH);
        num_bytes_in = cur_local_pos - local_buf;
        if (!nt_write_virtual_memory(phandle, cur_remote_pos, local_buf,
                                     num_bytes_in, &num_bytes_out) ||
            num_bytes_out != num_bytes_in) {
            goto error;
        }
        cur_remote_pos += num_bytes_out;
        /* now make code page rx */
        /* NOTE(review): as with the data page, the result is only checked via
         * ASSERT -- confirm that is intended. */
        res = nt_remote_protect_virtual_memory(phandle, remote_code_buffer, 
                                               PAGE_SIZE, PAGE_EXECUTE_READ,
                                               &old_prot);
        ASSERT(res);

#undef INSERT_INT
#undef PUSH_IMMEDIATE
#undef PUSH_SHORT_IMMEDIATE
#undef MOV_ESP_TO_EAX
#undef ADD_TO_EAX
#undef INSERT_REL32_ADDRESS
#undef CALL
#undef PROT_IN_ECX
#undef CHANGE_PROTECTION
    }

    /* place hook */
    ASSERT(sizeof(hook_buf) == 5); /* standard 5 byte jmp rel32 hook */
    hook_buf[0] = JMP_REL32;
    IF_X64(ASSERT_NOT_IMPLEMENTED(false));
    /* rel32 displacement is relative to the end of the 5-byte jmp */
    *(int *)(&hook_buf[1]) = (int)((byte *)hook_target - ((byte *)hook_location + 5));
    /* Make the hook bytes writable, write the jmp, then put back the
     * protection the first call reported in old_prot. */
    if (!nt_remote_protect_virtual_memory(phandle, hook_location,
                                          sizeof(hook_buf),
                                          PAGE_EXECUTE_READWRITE, &old_prot)) {
        goto error;
    }
    if (!nt_write_virtual_memory(phandle, hook_location, hook_buf,
                                 sizeof(hook_buf), &num_bytes_out) ||
        num_bytes_out != sizeof(hook_buf)) {
        goto error;
    }
    if (!nt_remote_protect_virtual_memory(phandle, hook_location,
                                          sizeof(hook_buf),
                                          old_prot, &old_prot)) {
        goto error;
    }

    return true;

    error:
    /* we do not recover any changes in the child's address space */
    return false;
}