SYSCALL_DEFINE3(sparc_sigaction, int, sig,
                struct old_sigaction __user *, act,
                struct old_sigaction __user *, oact)
{
    WARN_ON_ONCE(sig >= 0);
    return sys_sigaction(-sig, act, oact);
}
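/*
 * Hypothetical user-space sketch (not from the source): the SPARC entry
 * point above negates its first argument back, so a raw caller must pass
 * -signum. SYS_sigaction availability and the act/oact layout are
 * assumptions here; libc normally hides this quirk.
 */
#include <sys/syscall.h>
#include <unistd.h>

static long sparc_sigaction_raw(int signum, void *act, void *oact)
{
    /* Negate so the kernel's WARN_ON_ONCE(sig >= 0) stays silent. */
    return syscall(SYS_sigaction, -signum, act, oact);
}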
void test_rw(int signum, unsigned char *act, unsigned char *oldact,
             size_t sigsetsize)
{
    int prot_rw = PROT_READ | PROT_WRITE;
    int ret_sys, ret_sim;

    memrand(test_rw_sys, sizeof(test_rw_sys));
    memcpy(test_rw_sim, test_rw_sys, sizeof(test_rw_sys));

    ret_sys = sys_sigaction(signum, act, oldact, sigsetsize);
    ret_sim = sim_sigaction(signum,
                            act == NULL ? NULL : test_rw_sim + (act - test_rw_sys),
                            oldact == NULL ? NULL : test_rw_sim + (oldact - test_rw_sys),
                            sigsetsize, prot_rw, prot_rw);

    assert(ret_sys == ret_sim);
    assert(memcmp(test_rw_sys, test_rw_sim, sizeof(test_rw_sys)) == 0);
}
static int dump_sigact(struct parasite_dump_sa_args *da)
{
    int sig, ret = 0;

    for (sig = 1; sig < SIGMAX; sig++) {
        if (sig == SIGKILL || sig == SIGSTOP)
            continue;

        ret = sys_sigaction(sig, NULL, &da->sas[sig], sizeof(rt_sigset_t));
        if (ret < 0) {
            sys_write_msg("sys_sigaction failed\n");
            break;
        }
    }

    return ret;
}
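/*
 * The same query pattern through the portable libc API, for reference:
 * passing act == NULL reads the current disposition without changing it,
 * which is exactly what dump_sigact() relies on for every signal.
 */
#include <signal.h>
#include <stdio.h>

int main(void)
{
    struct sigaction old;

    if (sigaction(SIGTERM, NULL, &old) == 0)
        printf("SIGTERM handler: %p\n", (void *)old.sa_handler);
    return 0;
}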
void test_edge(int prot1, int prot2, unsigned char *act, unsigned char *oldact,
               int ret)
{
    int ret_sys, ret_sim;

    assert(act == NULL || oldact == NULL);

    mprotect_nofail(test_edge_base, test_edge_size, prot1);
    mprotect_nofail(test_edge_base + test_edge_size, test_edge_size, prot2);

    ret_sys = sys_sigaction(SIG1, act, oldact, SIGSETSIZE);

    /* Check that the real syscall gave the expected return value. */
    assert(ret_sys == ret);

    /* If the real syscall succeeded, run the simulated syscall and compare. */
    if (ret == 0) {
        unsigned char tmp[SIGACTSZ];
        int prot = PROT_READ | PROT_WRITE;

        ret_sim = sim_sigaction(SIG1, act, tmp, SIGSETSIZE, prot, prot);
        assert(ret_sim == ret);
        assert(oldact == NULL || memcmp(oldact, tmp, SIGACTSZ) == 0);
    }
}
/* The protection is specified by t1 and t2, referring to test_prots.
 * The value 0 means a null pointer, so there is no memory to protect.
 */
void test_prot(int signum, int t1, int t2, size_t sigsetsize)
{
    unsigned char sim_new[SIGACTSZ + 2 * MARGIN];
    unsigned char sim_old[SIGACTSZ + 2 * MARGIN];
    const size_t sim_array_size = sizeof(sim_new);
    int prot1 = test_prots[t1];
    int prot2 = test_prots[t2];
    int ret_sys, ret_sim;

    memrand(sim_new, sim_array_size);
    memcpy(test_prot_mem1, sim_new, sim_array_size);
    memrand(sim_old, sim_array_size);
    memcpy(test_prot_mem2, sim_old, sim_array_size);

    if (t1 != 0)
        mprotect_nofail(test_prot_mem1, sim_array_size, prot1);
    if (t2 != 0)
        mprotect_nofail(test_prot_mem2, sim_array_size, prot2);

    ret_sys = sys_sigaction(signum,
                            t1 == 0 ? NULL : test_prot_mem1 + MARGIN,
                            t2 == 0 ? NULL : test_prot_mem2 + MARGIN,
                            sigsetsize);

    if (t1 != 0)
        mprotect_nofail(test_prot_mem1, sim_array_size, PROT_READ | PROT_WRITE);
    if (t2 != 0)
        mprotect_nofail(test_prot_mem2, sim_array_size, PROT_READ | PROT_WRITE);

    ret_sim = sim_sigaction(signum,
                            t1 == 0 ? NULL : sim_new + MARGIN,
                            t2 == 0 ? NULL : sim_old + MARGIN,
                            sigsetsize, prot1, prot2);

    assert(ret_sys == ret_sim ||
           /* 32-bit on a 64-bit kernel returns -ENXIO for an invalid oact (i#1984) */
           (ret_sys == -ENXIO && ret_sim == -EFAULT));
    assert(memcmp(test_prot_mem1, sim_new, sim_array_size) == 0 &&
           memcmp(test_prot_mem2, sim_old, sim_array_size) == 0);
}
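/*
 * A hypothetical driver (not part of the harness) sketching how test_prot()
 * could be swept over every protection pairing. NPROTS and the contents of
 * test_prots are assumptions; index 0 stands for a NULL act/oldact pointer,
 * so the matrix also covers the no-memory edge cases.
 */
#define NPROTS 4 /* e.g. { 0, PROT_NONE, PROT_READ, PROT_READ | PROT_WRITE } */

static void run_prot_matrix(int signum, size_t sigsetsize)
{
    int t1, t2;

    for (t1 = 0; t1 < NPROTS; t1++)
        for (t2 = 0; t2 < NPROTS; t2++)
            test_prot(signum, t1, t2, sigsetsize);
}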
static void ListerThread(struct ListerParams *args)
{
    int found_parent = 0;
    pid_t clone_pid = sys_gettid(), ppid = sys_getppid();
    char proc_self_task[80], marker_name[48], *marker_path;
    const char *proc_paths[3];
    const char *const *proc_path = proc_paths;
    int proc = -1, marker = -1, num_threads = 0;
    int max_threads = 0, sig;
    struct kernel_stat marker_sb, proc_sb;
    stack_t altstack;

    /* Create a "marker" that we can use to detect threads sharing the same
     * address space and the same file handles. By setting the FD_CLOEXEC flag
     * we minimize the risk of misidentifying child processes as threads;
     * and since there is still a race condition, we will filter those out
     * later, anyway.
     */
    if ((marker = sys_socket(PF_LOCAL, SOCK_DGRAM, 0)) < 0 ||
        sys_fcntl(marker, F_SETFD, FD_CLOEXEC) < 0) {
    failure:
        args->result = -1;
        args->err = errno;
        if (marker >= 0)
            NO_INTR(sys_close(marker));
        sig_marker = marker = -1;
        if (proc >= 0)
            NO_INTR(sys_close(proc));
        sig_proc = proc = -1;
        sys__exit(1);
    }

    /* Compute search paths for finding thread directories in /proc */
    local_itoa(strrchr(strcpy(proc_self_task, "/proc/"), '\000'), ppid);
    strcpy(marker_name, proc_self_task);
    marker_path = marker_name + strlen(marker_name);
    strcat(proc_self_task, "/task/");
    proc_paths[0] = proc_self_task; /* /proc/$$/task/ */
    proc_paths[1] = "/proc/";       /* /proc/         */
    proc_paths[2] = NULL;

    /* Compute path for marker socket in /proc */
    local_itoa(strcpy(marker_path, "/fd/") + 4, marker);
    if (sys_stat(marker_name, &marker_sb) < 0) {
        goto failure;
    }

    /* Catch signals on an alternate pre-allocated stack. This way, we can
     * safely execute the signal handler even if we ran out of memory.
     */
    memset(&altstack, 0, sizeof(altstack));
    altstack.ss_sp = args->altstack_mem;
    altstack.ss_flags = 0;
    altstack.ss_size = ALT_STACKSIZE;
    sys_sigaltstack(&altstack, (const stack_t *)NULL);

    /* Some kernels forget to wake up traced processes when the tracer
     * dies. So, intercept synchronous signals and make sure that we wake
     * up our tracees before dying. It is the caller's responsibility to
     * ensure that asynchronous signals do not interfere with this function.
     */
    sig_marker = marker;
    sig_proc = -1;
    for (sig = 0; sig < sizeof(sync_signals)/sizeof(*sync_signals); sig++) {
        struct kernel_sigaction sa;
        memset(&sa, 0, sizeof(sa));
        sa.sa_sigaction_ = SignalHandler;
        sys_sigfillset(&sa.sa_mask);
        sa.sa_flags = SA_ONSTACK | SA_SIGINFO | SA_RESETHAND;
        sys_sigaction(sync_signals[sig], &sa, (struct kernel_sigaction *)NULL);
    }

    /* Read process directories in /proc/... */
    for (;;) {
        /* Some kernels know about threads and hide them in "/proc"
         * (although they are still there, if you know the process id).
         * Threads are moved into a separate "task" directory. We check
         * there first, and then fall back on the older naming convention
         * if necessary.
         */
        if ((sig_proc = proc = c_open(*proc_path, O_RDONLY | O_DIRECTORY, 0)) < 0) {
            if (*++proc_path != NULL)
                continue;
            goto failure;
        }
        if (sys_fstat(proc, &proc_sb) < 0)
            goto failure;

        /* Since we are suspending threads, we cannot call any libc
         * functions that might acquire locks. Most notably, we cannot
         * call malloc(). So, we have to allocate memory on the stack
         * instead. Since we do not know how much memory we need, we
         * make a best guess. And if we guessed incorrectly we retry on
         * a second iteration (by jumping to "detach_threads").
         *
         * Unless the number of threads is increasing very rapidly, we
         * should never need to do so, though, as our guesstimate is very
         * conservative.
         */
        if (max_threads < proc_sb.st_nlink + 100)
            max_threads = proc_sb.st_nlink + 100;

        /* scope */ {
            pid_t pids[max_threads];
            int added_entries = 0;
            sig_num_threads = num_threads;
            sig_pids = pids;
            for (;;) {
                struct kernel_dirent *entry;
                char buf[4096];
                ssize_t nbytes = sys_getdents(proc, (struct kernel_dirent *)buf,
                                              sizeof(buf));
                if (nbytes < 0)
                    goto failure;
                else if (nbytes == 0) {
                    if (added_entries) {
                        /* Need to keep iterating over "/proc" in multiple
                         * passes until we no longer find any more threads.
                         * This algorithm eventually completes, when all
                         * threads have been suspended.
                         */
                        added_entries = 0;
                        sys_lseek(proc, 0, SEEK_SET);
                        continue;
                    }
                    break;
                }
                for (entry = (struct kernel_dirent *)buf;
                     entry < (struct kernel_dirent *)&buf[nbytes];
                     entry = (struct kernel_dirent *)((char *)entry + entry->d_reclen)) {
                    if (entry->d_ino != 0) {
                        const char *ptr = entry->d_name;
                        pid_t pid;

                        /* Some kernels hide threads by preceding the pid with a '.' */
                        if (*ptr == '.')
                            ptr++;

                        /* If the directory is not numeric, it cannot be a
                         * process/thread.
                         */
                        if (*ptr < '0' || *ptr > '9')
                            continue;
                        pid = local_atoi(ptr);

                        /* Attach to (and suspend) all threads */
                        if (pid && pid != clone_pid) {
                            struct kernel_stat tmp_sb;
                            char fname[entry->d_reclen + 48];
                            strcat(strcat(strcpy(fname, "/proc/"),
                                          entry->d_name), marker_path);

                            /* Check if the marker is identical to the one we created */
                            if (sys_stat(fname, &tmp_sb) >= 0 &&
                                marker_sb.st_ino == tmp_sb.st_ino) {
                                long i, j;

                                /* Found one of our threads; make sure it is no duplicate */
                                for (i = 0; i < num_threads; i++) {
                                    /* Linear search is slow, but should not matter much
                                     * for the typically small number of threads.
                                     */
                                    if (pids[i] == pid) {
                                        /* Found a duplicate; most likely on second pass */
                                        goto next_entry;
                                    }
                                }

                                /* Check whether data structure needs growing */
                                if (num_threads >= max_threads) {
                                    /* Back to square one, this time with more memory */
                                    NO_INTR(sys_close(proc));
                                    goto detach_threads;
                                }

                                /* Attaching to a thread suspends it */
                                pids[num_threads++] = pid;
                                sig_num_threads = num_threads;
                                if (sys_ptrace(PTRACE_ATTACH, pid,
                                               (void *)0, (void *)0) < 0) {
                                    /* If the operation failed, ignore the thread.
                                     * Maybe it just died? There might also be a race
                                     * condition with a concurrent core dumper or with
                                     * a debugger. In that case, we will just make a
                                     * best effort, rather than failing entirely.
                                     */
                                    num_threads--;
                                    sig_num_threads = num_threads;
                                    goto next_entry;
                                }
                                while (sys_waitpid(pid, (int *)0, __WALL) < 0) {
                                    if (errno != EINTR) {
                                        sys_ptrace_detach(pid);
                                        num_threads--;
                                        sig_num_threads = num_threads;
                                        goto next_entry;
                                    }
                                }

                                if (sys_ptrace(PTRACE_PEEKDATA, pid, &i, &j) ||
                                    i++ != j ||
                                    sys_ptrace(PTRACE_PEEKDATA, pid, &i, &j) ||
                                    i != j) {
                                    /* Address spaces are distinct, even though both
                                     * processes show the "marker". This is probably
                                     * a forked child process rather than a thread.
                                     */
                                    sys_ptrace_detach(pid);
                                    num_threads--;
                                    sig_num_threads = num_threads;
                                } else {
                                    found_parent |= pid == ppid;
                                    added_entries++;
                                }
                            }
                        }
                    }
                next_entry:;
                }
            }
            NO_INTR(sys_close(proc));
            sig_proc = proc = -1;

            /* If we failed to find any threads, try looking somewhere else
             * in /proc. Maybe threads are reported differently on this system.
             */
            if (num_threads > 1 || !*++proc_path) {
                NO_INTR(sys_close(marker));
                sig_marker = marker = -1;

                /* If we never found the parent process, something is very
                 * wrong. Most likely, we are running in a debugger. Any
                 * attempt to operate on the threads would be very incomplete.
                 * Let's just report an error to the caller.
                 */
                if (!found_parent) {
                    ResumeAllProcessThreads(num_threads, pids);
                    sys__exit(3);
                }

                /* Now we are ready to call the callback, which takes care
                 * of resuming the threads for us.
                 */
                args->result = args->callback(args->parameter, num_threads,
                                              pids, args->ap);
                args->err = errno;

                /* The callback should have resumed the threads, but better
                 * safe than sorry. */
                if (ResumeAllProcessThreads(num_threads, pids)) {
                    /* Callback forgot to resume at least one thread; report error */
                    args->err = EINVAL;
                    args->result = -1;
                }

                sys__exit(0);
            }
        detach_threads:
            /* Resume all threads prior to retrying the operation */
            ResumeAllProcessThreads(num_threads, pids);
            sig_pids = NULL;
            num_threads = 0;
            sig_num_threads = num_threads;
            max_threads += 100;
        }
    }
}
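/*
 * Standalone sketch of the "marker" identity check used above, written with
 * plain libc calls for clarity: two tasks share a file-descriptor table
 * exactly when /proc/<pid>/fd/<n> resolves to the same socket inode. The
 * helper name and signature are illustrative, not from the source.
 */
#include <stdio.h>
#include <sys/stat.h>
#include <sys/types.h>

static int shares_marker(pid_t pid, int marker_fd, const struct stat *marker_sb)
{
    char path[64];
    struct stat sb;

    snprintf(path, sizeof(path), "/proc/%d/fd/%d", (int)pid, marker_fd);
    /* stat() follows the /proc link through to the socket itself. */
    return stat(path, &sb) == 0 && sb.st_ino == marker_sb->st_ino;
}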
/*
 * The main routine to restore a task via sigreturn.
 * This one is very special: we never return from it,
 * but use the sigreturn facility to restore core registers
 * and jump execution to some predefined ip read from
 * the core file.
 */
long __export_restore_task(struct task_restore_args *args)
{
    long ret = -1;
    int i;
    VmaEntry *vma_entry;
    unsigned long va;

    struct rt_sigframe *rt_sigframe;
    unsigned long new_sp;
    k_rtsigset_t to_block;
    pid_t my_pid = sys_getpid();
    rt_sigaction_t act;

    bootstrap_start = args->bootstrap_start;
    bootstrap_len = args->bootstrap_len;

#ifdef CONFIG_VDSO
    vdso_rt_size = args->vdso_rt_size;
#endif

    task_entries = args->task_entries;
    helpers = args->helpers;
    n_helpers = args->n_helpers;
    *args->breakpoint = rst_sigreturn;

    ksigfillset(&act.rt_sa_mask);
    act.rt_sa_handler = sigchld_handler;
    act.rt_sa_flags = SA_SIGINFO | SA_RESTORER | SA_RESTART;
    act.rt_sa_restorer = cr_restore_rt;
    sys_sigaction(SIGCHLD, &act, NULL, sizeof(k_rtsigset_t));

    log_set_fd(args->logfd);
    log_set_loglevel(args->loglevel);

    cap_last_cap = args->cap_last_cap;

    pr_info("Switched to the restorer %d\n", my_pid);

#ifdef CONFIG_VDSO
    if (vdso_do_park(&args->vdso_sym_rt, args->vdso_rt_parked_at, vdso_rt_size))
        goto core_restore_end;
#endif

    if (unmap_old_vmas((void *)args->premmapped_addr, args->premmapped_len,
                       bootstrap_start, bootstrap_len))
        goto core_restore_end;

    /* Shift private vma-s to the left */
    for (i = 0; i < args->nr_vmas; i++) {
        vma_entry = args->tgt_vmas + i;

        if (!vma_entry_is(vma_entry, VMA_AREA_REGULAR))
            continue;

        if (!vma_priv(vma_entry))
            continue;

        if (vma_entry->end >= TASK_SIZE)
            continue;

        if (vma_entry->start > vma_entry->shmid)
            break;

        if (vma_remap(vma_premmaped_start(vma_entry),
                      vma_entry->start, vma_entry_len(vma_entry)))
            goto core_restore_end;
    }

    /* Shift private vma-s to the right */
    for (i = args->nr_vmas - 1; i >= 0; i--) {
        vma_entry = args->tgt_vmas + i;

        if (!vma_entry_is(vma_entry, VMA_AREA_REGULAR))
            continue;

        if (!vma_priv(vma_entry))
            continue;

        if (vma_entry->start > TASK_SIZE)
            continue;

        if (vma_entry->start < vma_entry->shmid)
            break;

        if (vma_remap(vma_premmaped_start(vma_entry),
                      vma_entry->start, vma_entry_len(vma_entry)))
            goto core_restore_end;
    }

    /*
     * OK, let's try to map the new ones.
     */
    for (i = 0; i < args->nr_vmas; i++) {
        vma_entry = args->tgt_vmas + i;

        if (!vma_entry_is(vma_entry, VMA_AREA_REGULAR))
            continue;

        if (vma_priv(vma_entry))
            continue;

        va = restore_mapping(vma_entry);

        if (va != vma_entry->start) {
            pr_err("Can't restore %"PRIx64" mapping with %lx\n",
                   vma_entry->start, va);
            goto core_restore_end;
        }
    }

#ifdef CONFIG_VDSO
    /*
     * Proxify vDSO.
     */
    for (i = 0; i < args->nr_vmas; i++) {
        if (vma_entry_is(&args->tgt_vmas[i], VMA_AREA_VDSO) ||
            vma_entry_is(&args->tgt_vmas[i], VMA_AREA_VVAR)) {
            if (vdso_proxify("dumpee", &args->vdso_sym_rt,
                             args->vdso_rt_parked_at,
                             i, args->tgt_vmas, args->nr_vmas))
                goto core_restore_end;
            break;
        }
    }
#endif

    /*
     * Walk through all VMAs again to drop PROT_WRITE
     * if it was not there.
     */
    for (i = 0; i < args->nr_vmas; i++) {
        vma_entry = args->tgt_vmas + i;

        if (!(vma_entry_is(vma_entry, VMA_AREA_REGULAR)))
            continue;

        if (vma_entry_is(vma_entry, VMA_ANON_SHARED)) {
            struct shmem_info *entry;

            entry = find_shmem(args->shmems, args->nr_shmems,
                               vma_entry->shmid);
            if (entry && entry->pid == my_pid &&
                entry->start == vma_entry->start)
                futex_set_and_wake(&entry->lock, 1);
        }

        if (vma_entry->prot & PROT_WRITE)
            continue;

        sys_mprotect(decode_pointer(vma_entry->start),
                     vma_entry_len(vma_entry),
                     vma_entry->prot);
    }

    /*
     * Finally restore madvise() bits.
     */
    for (i = 0; i < args->nr_vmas; i++) {
        unsigned long m;

        vma_entry = args->tgt_vmas + i;
        if (!vma_entry->has_madv || !vma_entry->madv)
            continue;

        for (m = 0; m < sizeof(vma_entry->madv) * 8; m++) {
            if (vma_entry->madv & (1ul << m)) {
                ret = sys_madvise(vma_entry->start,
                                  vma_entry_len(vma_entry), m);
                if (ret) {
                    pr_err("madvise(%"PRIx64", %"PRIu64", %ld) "
                           "failed with %ld\n",
                           vma_entry->start,
                           vma_entry_len(vma_entry), m, ret);
                    goto core_restore_end;
                }
            }
        }
    }

    ret = 0;

    /*
     * Tune up the task fields.
     */
    ret |= sys_prctl_safe(PR_SET_NAME, (long)args->comm, 0, 0);

    ret |= sys_prctl_safe(PR_SET_MM, PR_SET_MM_START_CODE, (long)args->mm.mm_start_code, 0);
    ret |= sys_prctl_safe(PR_SET_MM, PR_SET_MM_END_CODE, (long)args->mm.mm_end_code, 0);
    ret |= sys_prctl_safe(PR_SET_MM, PR_SET_MM_START_DATA, (long)args->mm.mm_start_data, 0);
    ret |= sys_prctl_safe(PR_SET_MM, PR_SET_MM_END_DATA, (long)args->mm.mm_end_data, 0);
    ret |= sys_prctl_safe(PR_SET_MM, PR_SET_MM_START_STACK, (long)args->mm.mm_start_stack, 0);
    ret |= sys_prctl_safe(PR_SET_MM, PR_SET_MM_START_BRK, (long)args->mm.mm_start_brk, 0);
    ret |= sys_prctl_safe(PR_SET_MM, PR_SET_MM_BRK, (long)args->mm.mm_brk, 0);
    ret |= sys_prctl_safe(PR_SET_MM, PR_SET_MM_ARG_START, (long)args->mm.mm_arg_start, 0);
    ret |= sys_prctl_safe(PR_SET_MM, PR_SET_MM_ARG_END, (long)args->mm.mm_arg_end, 0);
    ret |= sys_prctl_safe(PR_SET_MM, PR_SET_MM_ENV_START, (long)args->mm.mm_env_start, 0);
    ret |= sys_prctl_safe(PR_SET_MM, PR_SET_MM_ENV_END, (long)args->mm.mm_env_end, 0);
    ret |= sys_prctl_safe(PR_SET_MM, PR_SET_MM_AUXV, (long)args->mm_saved_auxv, args->mm_saved_auxv_size);
    if (ret)
        goto core_restore_end;

    /*
     * Because of requirements applied from the kernel side
     * we need to restore the /proc/pid/exe symlink late,
     * after old existing VMAs are superseded with
     * new ones from the image file.
     */
    ret = restore_self_exe_late(args);
    if (ret)
        goto core_restore_end;

    /*
     * We need to prepare a valid sigframe here, so
     * after sigreturn the kernel will pick up the
     * registers from the frame, set them up and
     * finally pass execution to the new IP.
     */
    rt_sigframe = (void *)args->t->mem_zone.rt_sigframe;

    if (restore_thread_common(rt_sigframe, args->t))
        goto core_restore_end;

    /*
     * Thread restoration. This requires a few more comments. This
     * restorer routine and the thread restorer routine have the following
     * memory map, prepared by the caller code.
     *
     * | <-- low addresses                               high addresses --> |
     * +-------------------------------------------------+-----------------------+
     * | this proc body | own stack | rt_sigframe space  | thread restore zone   |
     * +-------------------------------------------------+-----------------------+
     *
     * where each thread restore zone is the following
     *
     * | <-- low addresses                               high addresses --> |
     * +--------------------------------------------------------------------+
     * | thread restore proc | thread1 stack | thread1 rt_sigframe          |
     * +--------------------------------------------------------------------+
     */
    if (args->nr_threads > 1) {
        struct thread_restore_args *thread_args = args->thread_args;
        long clone_flags = CLONE_VM | CLONE_FILES | CLONE_SIGHAND |
                           CLONE_THREAD | CLONE_SYSVSEM;
        long last_pid_len;
        long parent_tid;
        int i, fd;

        fd = args->fd_last_pid;
        ret = sys_flock(fd, LOCK_EX);
        if (ret) {
            pr_err("Can't lock last_pid %d\n", fd);
            goto core_restore_end;
        }

        for (i = 0; i < args->nr_threads; i++) {
            char last_pid_buf[16], *s;

            /* skip self */
            if (thread_args[i].pid == args->t->pid)
                continue;

            new_sp = restorer_stack(thread_args + i);
            last_pid_len = vprint_num(last_pid_buf, sizeof(last_pid_buf),
                                      thread_args[i].pid - 1, &s);
            sys_lseek(fd, 0, SEEK_SET);
            ret = sys_write(fd, s, last_pid_len);
            if (ret < 0) {
                pr_err("Can't set last_pid %ld/%s\n", ret, last_pid_buf);
                goto core_restore_end;
            }

            /*
             * To achieve functionality like libc's clone()
             * we need pure assembly here, because a clone()'ed
             * thread will run with its own stack and we must not
             * have any additional instructions... oh, dear...
             */
            RUN_CLONE_RESTORE_FN(ret, clone_flags, new_sp, parent_tid,
                                 thread_args, args->clone_restore_fn);
        }

        ret = sys_flock(fd, LOCK_UN);
        if (ret) {
            pr_err("Can't unlock last_pid %ld\n", ret);
            goto core_restore_end;
        }
    }

    sys_close(args->fd_last_pid);

    restore_rlims(args);

    ret = create_posix_timers(args);
    if (ret < 0) {
        pr_err("Can't restore posix timers %ld\n", ret);
        goto core_restore_end;
    }

    ret = timerfd_arm(args);
    if (ret < 0) {
        pr_err("Can't restore timerfd %ld\n", ret);
        goto core_restore_end;
    }

    pr_info("%ld: Restored\n", sys_getpid());

    futex_set(&zombies_inprogress, args->nr_zombies);

    restore_finish_stage(CR_STATE_RESTORE);

    futex_wait_while_gt(&zombies_inprogress, 0);

    if (wait_helpers(args) < 0)
        goto core_restore_end;

    ksigfillset(&to_block);
    ret = sys_sigprocmask(SIG_SETMASK, &to_block, NULL, sizeof(k_rtsigset_t));
    if (ret) {
        pr_err("Unable to block signals %ld\n", ret);
        goto core_restore_end;
    }

    sys_sigaction(SIGCHLD, &args->sigchld_act, NULL, sizeof(k_rtsigset_t));

    ret = restore_signals(args->siginfo, args->siginfo_nr, true);
    if (ret)
        goto core_restore_end;

    ret = restore_signals(args->t->siginfo, args->t->siginfo_nr, false);
    if (ret)
        goto core_restore_end;

    restore_finish_stage(CR_STATE_RESTORE_SIGCHLD);

    rst_tcp_socks_all(args);

    /*
     * Writing to last-pid is CAP_SYS_ADMIN protected,
     * turning off TCP repair is CAP_NET_ADMIN protected,
     * thus restore creds _after_ all of the above.
     */
    ret = restore_creds(&args->creds);
    ret = ret || restore_dumpable_flag(&args->mm);
    ret = ret || restore_pdeath_sig(args->t);

    futex_set_and_wake(&thread_inprogress, args->nr_threads);

    restore_finish_stage(CR_STATE_RESTORE_CREDS);

    if (ret)
        BUG();

    /* Wait until children stop using args->task_entries */
    futex_wait_while_gt(&thread_inprogress, 1);

    log_set_fd(-1);

    /*
     * The code that prepared the itimers makes sure the
     * code below doesn't fail due to bad timing values.
     */

#define itimer_armed(args, i)                      \
    (args->itimers[i].it_interval.tv_sec ||        \
     args->itimers[i].it_interval.tv_usec)

    if (itimer_armed(args, 0))
        sys_setitimer(ITIMER_REAL, &args->itimers[0], NULL);
    if (itimer_armed(args, 1))
        sys_setitimer(ITIMER_VIRTUAL, &args->itimers[1], NULL);
    if (itimer_armed(args, 2))
        sys_setitimer(ITIMER_PROF, &args->itimers[2], NULL);

    restore_posix_timers(args);

    sys_munmap(args->rst_mem, args->rst_mem_size);

    /*
     * Sigframe stack.
     */
    new_sp = (long)rt_sigframe + SIGFRAME_OFFSET;

    /*
     * Prepare the stack and call for sigreturn;
     * pure assembly, since we don't need any additional
     * code insns from gcc.
     */
    rst_sigreturn(new_sp);

core_restore_end:
    futex_abort_and_wake(&task_entries->nr_in_progress);
    pr_err("Restorer fail %ld\n", sys_getpid());
    sys_exit_group(1);
    return -1;
}
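/*
 * Standalone sketch of the last_pid trick used in the thread-restore loop
 * above: writing (desired_pid - 1) to /proc/sys/kernel/ns_last_pid makes the
 * kernel hand out desired_pid to the next task it creates, which is how each
 * restored thread gets its original tid back. Requires
 * CONFIG_CHECKPOINT_RESTORE and suitable privileges; the flock()-based
 * serialization the restorer performs is omitted here.
 */
#include <fcntl.h>
#include <stdio.h>
#include <sys/types.h>
#include <unistd.h>

static int set_next_pid(pid_t desired_pid)
{
    char buf[16];
    int fd, len;

    fd = open("/proc/sys/kernel/ns_last_pid", O_WRONLY);
    if (fd < 0)
        return -1;

    len = snprintf(buf, sizeof(buf), "%d", (int)desired_pid - 1);
    if (write(fd, buf, len) != len) {
        close(fd);
        return -1;
    }
    return close(fd);
}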
int sigaction(int signum, const struct sigaction *act, struct sigaction *oldact)
{
    return sys_sigaction(signum, act, oldact);
}
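/*
 * Minimal usage sketch for the wrapper above: installing a handler exactly
 * as with libc's sigaction(). The handler name and flags are illustrative.
 */
#include <signal.h>
#include <stddef.h>
#include <string.h>

static void on_sigint(int sig)
{
    (void)sig; /* async-signal-safe work only */
}

static int install_sigint_handler(void)
{
    struct sigaction sa;

    memset(&sa, 0, sizeof(sa));
    sa.sa_handler = on_sigint;
    sigemptyset(&sa.sa_mask);
    sa.sa_flags = SA_RESTART;

    /* Routed through the wrapper above into sys_sigaction(). */
    return sigaction(SIGINT, &sa, NULL);
}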
/*
 * The main routine to restore a task via sigreturn.
 * This one is very special: we never return from it,
 * but use the sigreturn facility to restore core registers
 * and jump execution to some predefined ip read from
 * the core file.
 */
long __export_restore_task(struct task_restore_core_args *args)
{
    long ret = -1;
    VmaEntry *vma_entry;
    u64 va;
    unsigned long premmapped_end = args->premmapped_addr +
                                   args->premmapped_len;

    struct rt_sigframe *rt_sigframe;
    unsigned long new_sp;
    pid_t my_pid = sys_getpid();
    rt_sigaction_t act;

    task_entries = args->task_entries;

    ksigfillset(&act.rt_sa_mask);
    act.rt_sa_handler = sigchld_handler;
    act.rt_sa_flags = SA_SIGINFO | SA_RESTORER | SA_RESTART;
    act.rt_sa_restorer = cr_restore_rt;
    sys_sigaction(SIGCHLD, &act, NULL, sizeof(k_rtsigset_t));

    log_set_fd(args->logfd);
    log_set_loglevel(args->loglevel);

    cap_last_cap = args->cap_last_cap;

    pr_info("Switched to the restorer %d\n", my_pid);

    for (vma_entry = args->self_vmas; vma_entry->start != 0; vma_entry++) {
        unsigned long addr = vma_entry->start;
        unsigned long len;

        if (!vma_entry_is(vma_entry, VMA_AREA_REGULAR))
            continue;

        pr_debug("Examine %"PRIx64"-%"PRIx64"\n",
                 vma_entry->start, vma_entry->end);

        if (addr < args->premmapped_addr) {
            if (vma_entry->end >= args->premmapped_addr)
                len = args->premmapped_addr - addr;
            else
                len = vma_entry->end - vma_entry->start;
            if (sys_munmap((void *)addr, len)) {
                pr_err("munmap fail for %lx - %lx\n", addr, addr + len);
                goto core_restore_end;
            }
        }

        if (vma_entry->end >= TASK_SIZE)
            continue;

        if (vma_entry->end > premmapped_end) {
            if (vma_entry->start < premmapped_end)
                addr = premmapped_end;
            len = vma_entry->end - addr;
            if (sys_munmap((void *)addr, len)) {
                pr_err("munmap fail for %lx - %lx\n", addr, addr + len);
                goto core_restore_end;
            }
        }
    }

    sys_munmap(args->self_vmas,
               ((void *)(vma_entry + 1) - ((void *)args->self_vmas)));

    /* Shift private vma-s to the left */
    for (vma_entry = args->tgt_vmas; vma_entry->start != 0; vma_entry++) {
        if (!vma_entry_is(vma_entry, VMA_AREA_REGULAR))
            continue;

        if (!vma_priv(vma_entry))
            continue;

        if (vma_entry->end >= TASK_SIZE)
            continue;

        if (vma_entry->start > vma_entry->shmid)
            break;

        if (vma_remap(vma_premmaped_start(vma_entry),
                      vma_entry->start, vma_entry_len(vma_entry)))
            goto core_restore_end;
    }

    /* Shift private vma-s to the right */
    for (vma_entry = args->tgt_vmas + args->nr_vmas - 1;
         vma_entry >= args->tgt_vmas; vma_entry--) {
        if (!vma_entry_is(vma_entry, VMA_AREA_REGULAR))
            continue;

        if (!vma_priv(vma_entry))
            continue;

        if (vma_entry->start > TASK_SIZE)
            continue;

        if (vma_entry->start < vma_entry->shmid)
            break;

        if (vma_remap(vma_premmaped_start(vma_entry),
                      vma_entry->start, vma_entry_len(vma_entry)))
            goto core_restore_end;
    }

    /*
     * OK, let's try to map the new ones.
     */
    for (vma_entry = args->tgt_vmas; vma_entry->start != 0; vma_entry++) {
        if (!vma_entry_is(vma_entry, VMA_AREA_REGULAR))
            continue;

        if (vma_priv(vma_entry))
            continue;

        va = restore_mapping(vma_entry);

        if (va != vma_entry->start) {
            pr_err("Can't restore %"PRIx64" mapping with %"PRIx64"\n",
                   vma_entry->start, va);
            goto core_restore_end;
        }
    }

    /*
     * Walk through all VMAs again to drop PROT_WRITE
     * if it was not there.
     */
    for (vma_entry = args->tgt_vmas; vma_entry->start != 0; vma_entry++) {
        if (!(vma_entry_is(vma_entry, VMA_AREA_REGULAR)))
            continue;

        if (vma_entry_is(vma_entry, VMA_ANON_SHARED)) {
            struct shmem_info *entry;

            entry = find_shmem(args->shmems, vma_entry->shmid);
            if (entry && entry->pid == my_pid &&
                entry->start == vma_entry->start)
                futex_set_and_wake(&entry->lock, 1);
        }

        if (vma_entry->prot & PROT_WRITE)
            continue;

        sys_mprotect(decode_pointer(vma_entry->start),
                     vma_entry_len(vma_entry),
                     vma_entry->prot);
    }

    /*
     * Finally restore madvise() bits.
     */
    for (vma_entry = args->tgt_vmas; vma_entry->start != 0; vma_entry++) {
        unsigned long i;

        if (!vma_entry->has_madv || !vma_entry->madv)
            continue;

        for (i = 0; i < sizeof(vma_entry->madv) * 8; i++) {
            if (vma_entry->madv & (1ul << i)) {
                ret = sys_madvise(vma_entry->start,
                                  vma_entry_len(vma_entry), i);
                if (ret) {
                    pr_err("madvise(%"PRIx64", %"PRIu64", %ld) "
                           "failed with %ld\n",
                           vma_entry->start,
                           vma_entry_len(vma_entry), i, ret);
                    goto core_restore_end;
                }
            }
        }
    }

    sys_munmap(args->tgt_vmas,
               ((void *)(vma_entry + 1) - ((void *)args->tgt_vmas)));

    ret = sys_munmap(args->shmems, SHMEMS_SIZE);
    if (ret < 0) {
        pr_err("Can't unmap shmem %ld\n", ret);
        goto core_restore_end;
    }

    /*
     * Tune up the task fields.
     */
    ret |= sys_prctl_safe(PR_SET_NAME, (long)args->comm, 0, 0);

    ret |= sys_prctl_safe(PR_SET_MM, PR_SET_MM_START_CODE, (long)args->mm.mm_start_code, 0);
    ret |= sys_prctl_safe(PR_SET_MM, PR_SET_MM_END_CODE, (long)args->mm.mm_end_code, 0);
    ret |= sys_prctl_safe(PR_SET_MM, PR_SET_MM_START_DATA, (long)args->mm.mm_start_data, 0);
    ret |= sys_prctl_safe(PR_SET_MM, PR_SET_MM_END_DATA, (long)args->mm.mm_end_data, 0);
    ret |= sys_prctl_safe(PR_SET_MM, PR_SET_MM_START_STACK, (long)args->mm.mm_start_stack, 0);
    ret |= sys_prctl_safe(PR_SET_MM, PR_SET_MM_START_BRK, (long)args->mm.mm_start_brk, 0);
    ret |= sys_prctl_safe(PR_SET_MM, PR_SET_MM_BRK, (long)args->mm.mm_brk, 0);
    ret |= sys_prctl_safe(PR_SET_MM, PR_SET_MM_ARG_START, (long)args->mm.mm_arg_start, 0);
    ret |= sys_prctl_safe(PR_SET_MM, PR_SET_MM_ARG_END, (long)args->mm.mm_arg_end, 0);
    ret |= sys_prctl_safe(PR_SET_MM, PR_SET_MM_ENV_START, (long)args->mm.mm_env_start, 0);
    ret |= sys_prctl_safe(PR_SET_MM, PR_SET_MM_ENV_END, (long)args->mm.mm_env_end, 0);
    ret |= sys_prctl_safe(PR_SET_MM, PR_SET_MM_AUXV, (long)args->mm_saved_auxv, args->mm_saved_auxv_size);
    if (ret)
        goto core_restore_end;

    /*
     * Because of requirements applied from the kernel side
     * we need to restore the /proc/pid/exe symlink late,
     * after old existing VMAs are superseded with
     * new ones from the image file.
     */
    ret = restore_self_exe_late(args);
    if (ret)
        goto core_restore_end;

    /*
     * We need to prepare a valid sigframe here, so
     * after sigreturn the kernel will pick up the
     * registers from the frame, set them up and
     * finally pass execution to the new IP.
     */
    rt_sigframe = (void *)args->t->mem_zone.rt_sigframe + 8;

    if (restore_thread_common(rt_sigframe, args->t))
        goto core_restore_end;

    /*
     * Thread restoration. This requires a few more comments. This
     * restorer routine and the thread restorer routine have the following
     * memory map, prepared by the caller code.
     *
     * | <-- low addresses                                    high addresses --> |
     * +--------------------------------------------------------+-----------------------+
     * | this proc body | own stack | heap | rt_sigframe space  | thread restore zone   |
     * +--------------------------------------------------------+-----------------------+
     *
     * where each thread restore zone is the following
     *
     * | <-- low addresses                                    high addresses --> |
     * +--------------------------------------------------------------------------+
     * | thread restore proc | thread1 stack | thread1 heap | thread1 rt_sigframe |
     * +--------------------------------------------------------------------------+
     */
    if (args->nr_threads > 1) {
        struct thread_restore_args *thread_args = args->thread_args;
        long clone_flags = CLONE_VM | CLONE_FILES | CLONE_SIGHAND |
                           CLONE_THREAD | CLONE_SYSVSEM;
        long last_pid_len;
        long parent_tid;
        int i, fd;

        fd = sys_open(LAST_PID_PATH, O_RDWR, LAST_PID_PERM);
        if (fd < 0) {
            pr_err("Can't open last_pid %d\n", fd);
            goto core_restore_end;
        }

        ret = sys_flock(fd, LOCK_EX);
        if (ret) {
            pr_err("Can't lock last_pid %d\n", fd);
            goto core_restore_end;
        }

        for (i = 0; i < args->nr_threads; i++) {
            char last_pid_buf[16], *s;

            /* skip self */
            if (thread_args[i].pid == args->t->pid)
                continue;

            mutex_lock(&args->rst_lock);

            new_sp = RESTORE_ALIGN_STACK((long)thread_args[i].mem_zone.stack,
                                         sizeof(thread_args[i].mem_zone.stack));

            last_pid_len = vprint_num(last_pid_buf, sizeof(last_pid_buf),
                                      thread_args[i].pid - 1, &s);
            ret = sys_write(fd, s, last_pid_len);
            if (ret < 0) {
                pr_err("Can't set last_pid %ld/%s\n", ret, last_pid_buf);
                goto core_restore_end;
            }

            /*
             * To achieve functionality like libc's clone()
             * we need pure assembly here, because a clone()'ed
             * thread will run with its own stack and we must not
             * have any additional instructions... oh, dear...
             */
            RUN_CLONE_RESTORE_FN(ret, clone_flags, new_sp, parent_tid,
                                 thread_args, args->clone_restore_fn);
        }

        ret = sys_flock(fd, LOCK_UN);
        if (ret) {
            pr_err("Can't unlock last_pid %ld\n", ret);
            goto core_restore_end;
        }

        sys_close(fd);
    }

    restore_rlims(args);

    pr_info("%ld: Restored\n", sys_getpid());

    futex_set(&zombies_inprogress, args->nr_zombies);

    restore_finish_stage(CR_STATE_RESTORE);

    futex_wait_while_gt(&zombies_inprogress, 0);

    sys_sigaction(SIGCHLD, &args->sigchld_act, NULL, sizeof(k_rtsigset_t));

    ret = restore_signals(args->siginfo, args->siginfo_nr, true);
    if (ret)
        goto core_restore_end;

    ret = restore_signals(args->t->siginfo, args->t->siginfo_nr, false);
    if (ret)
        goto core_restore_end;

    restore_finish_stage(CR_STATE_RESTORE_SIGCHLD);

    if (args->siginfo_size) {
        ret = sys_munmap(args->siginfo, args->siginfo_size);
        if (ret < 0) {
            pr_err("Can't unmap signals %ld\n", ret);
            goto core_restore_failed;
        }
    }

    rst_tcp_socks_all(args->rst_tcp_socks, args->rst_tcp_socks_size);

    /*
     * Writing to last-pid is CAP_SYS_ADMIN protected,
     * turning off TCP repair is CAP_NET_ADMIN protected,
     * thus restore creds _after_ all of the above.
     */
    ret = restore_creds(&args->creds);

    futex_set_and_wake(&thread_inprogress, args->nr_threads);

    restore_finish_stage(CR_STATE_RESTORE_CREDS);

    if (ret)
        BUG();

    /* Wait until children stop using args->task_entries */
    futex_wait_while_gt(&thread_inprogress, 1);

    log_set_fd(-1);

    /*
     * The code that prepared the itimers makes sure the
     * code below doesn't fail due to bad timing values.
     */

#define itimer_armed(args, i)                      \
    (args->itimers[i].it_interval.tv_sec ||        \
     args->itimers[i].it_interval.tv_usec)

    if (itimer_armed(args, 0))
        sys_setitimer(ITIMER_REAL, &args->itimers[0], NULL);
    if (itimer_armed(args, 1))
        sys_setitimer(ITIMER_VIRTUAL, &args->itimers[1], NULL);
    if (itimer_armed(args, 2))
        sys_setitimer(ITIMER_PROF, &args->itimers[2], NULL);

    ret = sys_munmap(args->task_entries, TASK_ENTRIES_SIZE);
    if (ret < 0) {
        ret = ((long)__LINE__ << 16) | ((-ret) & 0xffff);
        goto core_restore_failed;
    }

    /*
     * Sigframe stack.
     */
    new_sp = (long)rt_sigframe + SIGFRAME_OFFSET;

    /*
     * Prepare the stack and call for sigreturn;
     * pure assembly, since we don't need any additional
     * code insns from gcc.
     */
    ARCH_RT_SIGRETURN(new_sp);

core_restore_end:
    futex_abort_and_wake(&task_entries->nr_in_progress);
    pr_err("Restorer fail %ld\n", sys_getpid());
    sys_exit_group(1);
    return -1;

core_restore_failed:
    ARCH_FAIL_CORE_RESTORE;
    return ret;
}
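/*
 * ARCH_RT_SIGRETURN is defined per architecture. For illustration only, an
 * x86-64 flavor might look like the sketch below: point the stack at the
 * prepared rt_sigframe and invoke rt_sigreturn directly, so the kernel
 * reloads every register from the frame and never returns here. This
 * reconstruction is an assumption, not the project's actual macro.
 */
#define ARCH_RT_SIGRETURN_X86_64_SKETCH(new_sp)            \
    asm volatile(                                          \
        "movq %0, %%rsp\n\t"                               \
        "movl $15, %%eax\n\t" /* __NR_rt_sigreturn */      \
        "syscall\n\t"                                      \
        :                                                  \
        : "r"(new_sp)                                      \
        : "rax", "memory")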