mach_error_t mach_inject( const mach_inject_entry threadEntry, const void *paramBlock, size_t paramSize, pid_t targetProcess, vm_size_t stackSize ) { assert( threadEntry ); assert( targetProcess > 0 ); assert( stackSize == 0 || stackSize > 1024 ); // Find the image. const void *image; unsigned long imageSize; unsigned int jumpTableOffset; unsigned int jumpTableSize; mach_error_t err = machImageForPointer( threadEntry, &image, &imageSize, &jumpTableOffset, &jumpTableSize ); // Initialize stackSize to default if requested. if( stackSize == 0 ) /** @bug We only want an 8K default, fix the plop-in-the-middle code below. */ stackSize = 16 * 1024; // Convert PID to Mach Task ref. mach_port_t remoteTask = 0; if( !err ) { err = task_for_pid( mach_task_self(), targetProcess, &remoteTask ); #if defined(__i386__) if (err == 5) fprintf(stderr, "Could not access task for pid %d. You probably need to add user to procmod group\n", targetProcess); #endif } /** @todo Would be nice to just allocate one block for both the remote stack *and* the remoteCode (including the parameter data block once that's written. */ // Allocate the remoteStack. vm_address_t remoteStack = (vm_address_t)NULL; if( !err ) err = vm_allocate( remoteTask, &remoteStack, stackSize, 1 ); // Allocate the code. vm_address_t remoteCode = (vm_address_t)NULL; if( !err ) err = vm_allocate( remoteTask, &remoteCode, imageSize, 1 ); if( !err ) { ASSERT_CAST( pointer_t, image ); #if defined (__ppc__) || defined (__ppc64__) err = vm_write( remoteTask, remoteCode, (pointer_t) image, imageSize ); #elif defined (__i386__) // on intel, jump table use relative jump instructions (jmp), which means // the offset needs to be corrected. We thus copy the image and fix the offset by hand. 
ptrdiff_t fixUpOffset = (ptrdiff_t) (image - remoteCode); void * fixedUpImage = fixedUpImageFromImage(image, imageSize, jumpTableOffset, jumpTableSize, fixUpOffset); err = vm_write( remoteTask, remoteCode, (pointer_t) fixedUpImage, imageSize ); free(fixedUpImage); #endif } // Allocate the paramBlock if specified. vm_address_t remoteParamBlock = (vm_address_t)NULL; if( !err && paramBlock != NULL && paramSize ) { err = vm_allocate( remoteTask, &remoteParamBlock, paramSize, 1 ); if( !err ) { ASSERT_CAST( pointer_t, paramBlock ); err = vm_write( remoteTask, remoteParamBlock, (pointer_t) paramBlock, paramSize ); } } // Calculate offsets. ptrdiff_t threadEntryOffset, imageOffset; if( !err ) { //assert( (void*)threadEntry >= image && (void*)threadEntry <= (image+imageSize) ); ASSERT_CAST( void*, threadEntry ); threadEntryOffset = ((void*) threadEntry) - image; ASSERT_CAST( void*, remoteCode ); imageOffset = ((void*) remoteCode) - image; } // Allocate the thread. thread_act_t remoteThread; #if defined (__ppc__) || defined (__ppc64__) if( !err ) { ppc_thread_state_t remoteThreadState; /** @bug Stack math should be more sophisticated than this (ala redzone). 
*/ remoteStack += stackSize / 2; bzero( &remoteThreadState, sizeof(remoteThreadState) ); ASSERT_CAST( unsigned int, remoteCode ); remoteThreadState.srr0 = (unsigned int) remoteCode; remoteThreadState.srr0 += threadEntryOffset; assert( remoteThreadState.srr0 < (remoteCode + imageSize) ); ASSERT_CAST( unsigned int, remoteStack ); remoteThreadState.r1 = (unsigned int) remoteStack; ASSERT_CAST( unsigned int, imageOffset ); remoteThreadState.r3 = (unsigned int) imageOffset; ASSERT_CAST( unsigned int, remoteParamBlock ); remoteThreadState.r4 = (unsigned int) remoteParamBlock; ASSERT_CAST( unsigned int, paramSize ); remoteThreadState.r5 = (unsigned int) paramSize; ASSERT_CAST( unsigned int, 0xDEADBEEF ); remoteThreadState.lr = (unsigned int) 0xDEADBEEF; #if 0 printf( "remoteCode start: %p\n", (void*) remoteCode ); printf( "remoteCode size: %ld\n", imageSize ); printf( "remoteCode pc: %p\n", (void*) remoteThreadState.srr0 ); printf( "remoteCode end: %p\n", (void*) (((char*)remoteCode)+imageSize) ); fflush(0); #endif err = thread_create_running( remoteTask, PPC_THREAD_STATE, (thread_state_t) &remoteThreadState, PPC_THREAD_STATE_COUNT, &remoteThread ); }
/*
 * Set up the initial state of a MACH thread
 *
 * Builds an architecture-specific register state so that the thread begins
 * executing `routine` with `thread` as its single argument and with its
 * stack pointer derived from `vsp`.
 *
 *   thread      pthread whose kernel thread is being set up; also the
 *               argument handed to `routine`.
 *   routine     entry point loaded into the program counter.
 *   vsp         top of the stack to use for the new thread.
 *   suspended   non-zero: the kernel thread already exists; its current
 *               state is fetched, patched and written back.
 *               zero: a new kernel thread is created running in the
 *               current task and recorded on `thread`.
 *   needresume  only consulted when `suspended` is non-zero; if set, the
 *               thread is resumed after its state has been written.
 *
 * Return codes of the Mach calls are deliberately discarded (best effort).
 */
void
_pthread_setup(pthread_t thread, void (*routine)(pthread_t), void *vsp,
	       int suspended, int needresume)
{
#if defined(__i386__)
	i386_thread_state_t state = {0};
	thread_state_flavor_t flavor = x86_THREAD_STATE32;
	mach_msg_type_number_t count = i386_THREAD_STATE_COUNT;
#elif defined(__x86_64__)
	x86_thread_state64_t state = {0};
	thread_state_flavor_t flavor = x86_THREAD_STATE64;
	mach_msg_type_number_t count = x86_THREAD_STATE64_COUNT;
#elif defined(__arm__)
	arm_thread_state_t state = {0};
	thread_state_flavor_t flavor = ARM_THREAD_STATE;
	mach_msg_type_number_t count = ARM_THREAD_STATE_COUNT;
#else
#error _pthread_setup not defined for this architecture
#endif

	/* Start from the live register state when the thread already exists. */
	if (suspended) {
		(void)thread_get_state(_pthread_kernel_thread(thread),
				       flavor,
				       (thread_state_t)&state,
				       &count);
	}

#if defined(__i386__)
	uintptr_t *sp = vsp;

	state.__eip = (uintptr_t)routine;

	// We need to simulate a 16-byte aligned stack frame as if we had
	// executed a call instruction. Since we're "pushing" one argument,
	// we need to adjust the pointer by 12 bytes (3 * sizeof (int *))
	sp -= 3;			// make sure stack is aligned
	*--sp = (uintptr_t)thread;	// argument to function
	*--sp = 0;			// fake return address
	state.__esp = (uintptr_t)sp;	// set stack pointer
#elif defined(__x86_64__)
	uintptr_t *sp = vsp;

	state.__rip = (uintptr_t)routine;

	// We need to simulate a 16-byte aligned stack frame as if we had
	// executed a call instruction. The stack should already be aligned
	// before it comes to us and we don't need to push any arguments,
	// so we shouldn't need to change it.
	state.__rdi = (uintptr_t)thread;	// argument to function
	*--sp = 0;				// fake return address
	state.__rsp = (uintptr_t)sp;		// set stack pointer
#elif defined(__arm__)
	state.__pc = (uintptr_t)routine;

	// Detect switch to thumb mode: a set low bit in the entry address
	// is moved out of the pc and into the Thumb bit of the cpsr.
	if (state.__pc & 1) {
		state.__pc &= ~1;
		state.__cpsr |= 0x20; /* PSR_THUMB */
	}

	state.__sp = (uintptr_t)vsp - C_ARGSAVE_LEN - C_RED_ZONE;
	state.__r[0] = (uintptr_t)thread;	// argument to function
#else
#error _pthread_setup not defined for this architecture
#endif

	if (suspended) {
		(void)thread_set_state(_pthread_kernel_thread(thread),
				       flavor,
				       (thread_state_t)&state,
				       count);
		if (needresume) {
			(void)thread_resume(_pthread_kernel_thread(thread));
		}
	} else {
		/*
		 * Initialised so that a failed thread_create_running() (whose
		 * result is ignored) records a null port rather than an
		 * indeterminate value.
		 */
		mach_port_t kernel_thread = MACH_PORT_NULL;

		(void)thread_create_running(mach_task_self(),
					    flavor,
					    (thread_state_t)&state,
					    count,
					    &kernel_thread);
		_pthread_set_kernel_thread(thread, kernel_thread);
	}
}
mach_error_t mach_inject( const mach_inject_entry threadEntry, const void *paramBlock, size_t paramSize, pid_t targetProcess, vm_size_t stackSize ) { ;//assertCodePtr( threadEntry ); ;//assertPtrIfNotNull( paramBlock ); ;//assertPositive( targetProcess ); ;//assertIsTrue( stackSize == 0 || stackSize > 1024 ); // Find the image. const void *image; unsigned long imageSize; mach_error_t err = machImageForPointer( threadEntry, &image, &imageSize ); // Initialize stackSize to default if requested. if( stackSize == 0 ) /** @bug We only want an 8K default, fix the plop-in-the-middle code below. */ stackSize = 16 * 1024; // Convert PID to Mach Task ref. mach_port_t remoteTask = 0; if( !err ) err = task_for_pid( mach_task_self(), targetProcess, &remoteTask ); /** @todo Would be nice to just allocate one block for both the remote stack *and* the remoteCode (including the parameter data block once that's written. */ // Allocate the remoteStack. vm_address_t remoteStack = 0; if( !err ) err = vm_allocate( remoteTask, &remoteStack, stackSize, 1 ); // Allocate the code. vm_address_t remoteCode = 0; if( !err ) err = vm_allocate( remoteTask, &remoteCode, imageSize, 1 ); if( !err ) { ASSERT_CAST( pointer_t, image ); err = vm_write( remoteTask, remoteCode, (pointer_t) image, imageSize ); } // Allocate the paramBlock if specified. vm_address_t remoteParamBlock = 0; if( !err && paramBlock != NULL && paramSize ) { err = vm_allocate( remoteTask, &remoteParamBlock, paramSize, 1 ); if( !err ) { ASSERT_CAST( pointer_t, paramBlock ); err = vm_write( remoteTask, remoteParamBlock, (pointer_t) paramBlock, paramSize ); } } // Calculate offsets. ptrdiff_t threadEntryOffset, imageOffset; if( !err ) { ;//assertIsWithinRange( threadEntry, image, image+imageSize ); ASSERT_CAST( void*, threadEntry ); threadEntryOffset = ((void*) threadEntry) - image; ASSERT_CAST( void*, remoteCode ); imageOffset = ((void*) remoteCode) - image; } // Allocate the thread. 
thread_act_t remoteThread; if( !err ) { ppc_thread_state_t remoteThreadState; /** @bug Stack math should be more sophisticated than this (ala redzone). */ remoteStack += stackSize / 2; bzero( &remoteThreadState, sizeof(remoteThreadState) ); ASSERT_CAST( unsigned int, remoteCode ); remoteThreadState.srr0 = (unsigned int) remoteCode; remoteThreadState.srr0 += threadEntryOffset; assert( remoteThreadState.srr0 < (remoteCode + imageSize) ); ASSERT_CAST( unsigned int, remoteStack ); remoteThreadState.r1 = (unsigned int) remoteStack; ASSERT_CAST( unsigned int, imageOffset ); remoteThreadState.r3 = (unsigned int) imageOffset; ASSERT_CAST( unsigned int, remoteParamBlock ); remoteThreadState.r4 = (unsigned int) remoteParamBlock; ASSERT_CAST( unsigned int, paramSize ); remoteThreadState.r5 = (unsigned int) paramSize; ASSERT_CAST( unsigned int, 0xDEADBEEF ); remoteThreadState.lr = (unsigned int) 0xDEADBEEF; //printf( "remoteCode start: %p\n", (void*) remoteCode ); //printf( "remoteCode size: %ld\n", imageSize ); //printf( "remoteCode pc: %p\n", (void*) remoteThreadState.srr0 ); //printf( "remoteCode end: %p\n", (void*) (((char*)remoteCode)+imageSize) ); fflush(0); err = thread_create_running( remoteTask, PPC_THREAD_STATE, (thread_state_t) &remoteThreadState, PPC_THREAD_STATE_COUNT, &remoteThread ); }
int main(int argc, const char *argv[]) { if (argc < 3) return -1; kern_return_t ret; vm_address_t r_libname; vm_address_t stack; vm_address_t code; thread_t thread; x86_thread_state64_t state; mach_port_t task; const char *libname = argv[1]; unsigned long long stackContents[5], stack_size, i; // the stack contents has to be an odd number of ull's for some reason (some aligning issue) in dlopen unsigned char codeContents[38]; bzero(codeContents, sizeof(codeContents)); bzero(stackContents, sizeof(stackContents)); codeContents[0] = 0x55; // push rbp codeContents[1] = 0x48; codeContents[2] = 0x89; codeContents[3] = 0xe5; // mov %rsp, %rbp codeContents[4] = 0x48; codeContents[5] = 0xbf; // mov r_libname, %rdi stackContents[1] = (unsigned long long)dlopen; stackContents[2] = (unsigned long long)mach_thread_self; stackContents[4] = (unsigned long long)thread_suspend; stack_size = 65536; if (strcmp(argv[2], "self") == 0) task = mach_task_self(); else ENSURE_SUCCESS(task_for_pid(mach_task_self(), atoi(argv[2]), &task)); ENSURE_SUCCESS(vm_allocate(task, &r_libname, strlen(libname) + 1, true)); ENSURE_SUCCESS(vm_allocate(task, &stack, stack_size, true)); ENSURE_SUCCESS(vm_allocate(task, &code, sizeof(codeContents), true)); stackContents[0] = code; stackContents[3] = (unsigned long long)code + 27; ENSURE_SUCCESS(vm_write(task, r_libname, (vm_offset_t)libname, strlen(libname) + 1)); ENSURE_SUCCESS(vm_write(task, stack + stack_size-sizeof(stackContents), (vm_offset_t)stackContents, sizeof(stackContents))); memcpy(&codeContents[6], &r_libname, sizeof(unsigned long long)); codeContents[14] = 0x48; codeContents[15] = 0xbe; codeContents[16] = 0x2; // mov 0x2, %rsi codeContents[24] = 0x5d; // pop %rbp codeContents[25] = 0x90; // nop / int 3 depending if im debugging codeContents[26] = 0xc3; // ret codeContents[27] = 0x48; codeContents[28] = 0x89; codeContents[29] = 0xc7; codeContents[30] = 0xc3; ENSURE_SUCCESS(vm_write(task, code, (vm_offset_t)codeContents, sizeof(codeContents))); 
ENSURE_SUCCESS(vm_protect(task, code, sizeof(codeContents), false, VM_PROT_EXECUTE | VM_PROT_READ)); printf("Created code region at %p:\n", (void *)code); for (i = 0; i < sizeof(codeContents); i++) { printf("0x%02x ", codeContents[i]); } puts(""); printf("Created stack at %p with top of stack at %p\n", (void*)stack, (void*)(stack + stack_size)); for (i = 0; i < sizeof(stackContents) / sizeof(stackContents[0]); i++) { printf("0x%02llx:\t0x%02llx\n", (stack + stack_size - sizeof(stackContents) + (i * sizeof(unsigned long long))), stackContents[i]); } bzero(&state, sizeof(state)); state.__rip = (uint64_t)dlsym(RTLD_DEFAULT, "_pthread_set_self"); state.__rdi = stack; state.__rsp = stack + stack_size-sizeof(stackContents); // end of stack minus returns state.__rbp = state.__rsp; printf("Found _pthread_set_self at %p\n", (void *)state.__rip); ENSURE_SUCCESS(thread_create_running(task, x86_THREAD_STATE64, (thread_state_t)(&state), x86_THREAD_STATE64_COUNT, &thread)); if (strcmp(argv[2], "self") == 0) { int rv = pthread_join(*(pthread_t *)stack, NULL); if (rv) { fprintf(stderr, "pthread_join: (%d) %s\n", rv, strerror(rv)); sleep(1); // let the dylib actually load in the other thread, it wouldn't appear that there exists mach thread waiting, and I'm too lazy to create a semaphore and using the value of `stack` for a pthread_t in pthread_join doesn't work } } }