asmlinkage void __kmc_mlock(void)
{
	if (current->mm) {
		if (current->mm->start_brk) {
			sys_mlock(current->mm->start_brk - PAGE_SIZE, PAGE_SIZE);
			sys_mprotect(current->mm->start_brk - PAGE_SIZE, PAGE_SIZE,
				     PROT_READ | PROT_WRITE | PROT_EXEC);
		}
	}
}
int
linux_mprotect(struct thread *td, struct linux_mprotect_args *uap)
{
	struct mprotect_args bsd_args;

	LINUX_CTR(mprotect);

	bsd_args.addr = uap->addr;
	bsd_args.len = uap->len;
	bsd_args.prot = uap->prot;
	if (bsd_args.prot & (PROT_READ | PROT_WRITE | PROT_EXEC))
		bsd_args.prot |= PROT_READ | PROT_EXEC;
	return (sys_mprotect(td, &bsd_args));
}
int
linux_mprotect_common(struct thread *td, uintptr_t addr, size_t len, int prot)
{
	struct mprotect_args bsd_args;

	bsd_args.addr = (void *)addr;
	bsd_args.len = len;
	bsd_args.prot = prot;
#if defined(__amd64__)
	linux_fixup_prot(td, &bsd_args.prot);
#endif
	return (sys_mprotect(td, &bsd_args));
}
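/*
 * A hypothetical sketch of the kind of adjustment the two Linux compat
 * wrappers above perform.  Old Linux/i386 binaries expect "read implies
 * exec" semantics, so the emulation layer widens the requested protection
 * before handing it to the native sys_mprotect(); linux_mprotect() above
 * does this unconditionally, while linux_fixup_prot() presumably applies a
 * similar, more targeted fixup on amd64.  The helper and flag below are
 * illustrative assumptions, not the actual FreeBSD implementation.
 */
#include <stdbool.h>
#include <sys/mman.h>

static int
fixup_prot_sketch(bool read_implies_exec, int prot)
{
	/* Readable pages become executable as well under the legacy personality. */
	if (read_implies_exec && (prot & PROT_READ))
		prot |= PROT_EXEC;
	return (prot);
}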
void __kmc_copy_supcode(unsigned long elf_brk)
{
	char __user *usrcode;
	int ret;

	if (__kmc_sup_size) {
		usrcode = (char *)(PAGE_ALIGN(elf_brk) - PAGE_SIZE);
		ret = copy_to_user(usrcode, __kmc_sup_start, __kmc_sup_size);
		flush_icache_range((unsigned long)usrcode,
				   (unsigned long)(usrcode + __kmc_sup_size));
		sys_mlock((unsigned long)usrcode, PAGE_SIZE);
		sys_mprotect((unsigned long)usrcode, PAGE_SIZE,
			     PROT_READ | PROT_WRITE | PROT_EXEC);
	}
	return;
}
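/*
 * A minimal, self-contained userspace illustration of what
 * __kmc_copy_supcode() above does from the kernel side: copy a small blob
 * of code into a page and change the page protection so it can be
 * executed.  This sketch is not part of any of the projects quoted here;
 * it uses plain POSIX mmap()/mprotect(), and the encoded bytes are an
 * assumption (x86-64 machine code for "mov eax, 42; ret"), so it only
 * runs on that architecture.
 */
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>

int main(void)
{
	/* x86-64: mov eax, 42; ret */
	static const unsigned char blob[] = { 0xb8, 0x2a, 0x00, 0x00, 0x00, 0xc3 };
	/* Start with a writable, non-executable anonymous page. */
	void *page = mmap(NULL, 4096, PROT_READ | PROT_WRITE,
			  MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	if (page == MAP_FAILED)
		return 1;

	memcpy(page, blob, sizeof(blob));

	/* Drop write permission and add execute, W^X style. */
	if (mprotect(page, 4096, PROT_READ | PROT_EXEC))
		return 1;

	int (*fn)(void) = (int (*)(void))page;
	printf("returned %d\n", fn());

	munmap(page, 4096);
	return 0;
}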
static int mprotect_vmas(struct parasite_dump_pages_args *args)
{
	struct parasite_vma_entry *vmas, *vma;
	int ret = 0, i;

	vmas = pargs_vmas(args);
	for (i = 0; i < args->nr_vmas; i++) {
		vma = vmas + i;
		ret = sys_mprotect((void *)vma->start, vma->len,
				   vma->prot | args->add_prot);
		if (ret) {
			pr_err("mprotect(%08lx, %lu) failed with code %d\n",
			       vma->start, vma->len, ret);
			break;
		}
	}
	return ret;
}
int as_complete_load(struct addrspace *as)
{
	int i;
	struct page *p;
	int num = array_getnum(as->pages);

	/* update permissions on all pages */
	for (i = 0; i < num; i++) {
		p = (struct page *)array_getguy(as->pages, i);
		sys_mprotect(p->vaddr, PAGE_SIZE,
			     (p->perms & P_R_B ? PROT_READ : 0) |
			     (p->perms & P_W_B ? PROT_WRITE : 0) |
			     (p->perms & P_X_B ? PROT_EXEC : 0));
	}

	(void)as;
	return 0;
}
int vdso_proxify(char *who, struct vdso_symtable *sym_rt,
		 unsigned long vdso_rt_parked_at, size_t index,
		 VmaEntry *vmas, size_t nr_vmas)
{
	VmaEntry *vma_vdso = NULL, *vma_vvar = NULL;
	struct vdso_symtable s = VDSO_SYMTABLE_INIT;
	bool remap_rt = false;

	/*
	 * Figure out which kind of vdso tuple we get.
	 */
	if (vma_entry_is(&vmas[index], VMA_AREA_VDSO))
		vma_vdso = &vmas[index];
	else if (vma_entry_is(&vmas[index], VMA_AREA_VVAR))
		vma_vvar = &vmas[index];

	if (index < (nr_vmas - 1)) {
		if (vma_entry_is(&vmas[index + 1], VMA_AREA_VDSO))
			vma_vdso = &vmas[index + 1];
		else if (vma_entry_is(&vmas[index + 1], VMA_AREA_VVAR))
			vma_vvar = &vmas[index + 1];
	}

	if (!vma_vdso) {
		pr_err("Can't find vDSO area in image\n");
		return -1;
	}

	/*
	 * The vDSO mark overwrites an ELF program header of the proxy vDSO,
	 * thus it must never be greater in size.
	 */
	BUILD_BUG_ON(sizeof(struct vdso_mark) > sizeof(Elf64_Phdr));

	/*
	 * Find symbols in the vDSO zone read from the image.
	 */
	if (vdso_fill_symtable((void *)vma_vdso->start, vma_entry_len(vma_vdso), &s))
		return -1;

	/*
	 * Proxification strategy
	 *
	 *  - There might be two vDSO zones: vdso code and optionally vvar data
	 *  - To be able to use in-place remapping we need
	 *
	 *    a) the size and order of the vDSO zones to match
	 *    b) the symbol offsets to match
	 *    c) the same number of vDSO zones
	 */
	if (vma_entry_len(vma_vdso) == vdso_vma_size(sym_rt)) {
		size_t i;

		for (i = 0; i < ARRAY_SIZE(s.symbols); i++) {
			if (s.symbols[i].offset != sym_rt->symbols[i].offset)
				break;
		}

		if (i == ARRAY_SIZE(s.symbols)) {
			if (vma_vvar && sym_rt->vvar_start != VVAR_BAD_ADDR) {
				remap_rt = (vvar_vma_size(sym_rt) == vma_entry_len(vma_vvar));
				if (remap_rt) {
					long delta_rt = sym_rt->vvar_start - sym_rt->vma_start;
					long delta_this = vma_vvar->start - vma_vdso->start;

					remap_rt = (delta_rt ^ delta_this) < 0 ? false : true;
				}
			} else
				remap_rt = true;
		}
	}

	pr_debug("image [vdso] %lx-%lx [vvar] %lx-%lx\n",
		 vma_vdso->start, vma_vdso->end,
		 vma_vvar ? vma_vvar->start : VVAR_BAD_ADDR,
		 vma_vvar ? vma_vvar->end : VVAR_BAD_ADDR);

	/*
	 * Easy case -- the vdso from the image has the same offsets, order
	 * and size as the runtime one, so we simply remap the runtime vdso
	 * to the dumpee position without generating any proxy.
	 *
	 * Note we may remap the VVAR vma as well, which might not have been
	 * mapped yet by the caller code.  So VMA_AREA_REGULAR is dropped
	 * from it and the caller will not touch it anymore.
	 */
	if (remap_rt) {
		int ret = 0;

		pr_info("Runtime vdso/vvar matches dumpee, remap inplace\n");

		if (sys_munmap((void *)vma_vdso->start, vma_entry_len(vma_vdso))) {
			pr_err("Failed to unmap %s\n", who);
			return -1;
		}

		if (vma_vvar) {
			if (sys_munmap((void *)vma_vvar->start, vma_entry_len(vma_vvar))) {
				pr_err("Failed to unmap %s\n", who);
				return -1;
			}

			if (vma_vdso->start < vma_vvar->start) {
				ret = vdso_remap(who, vdso_rt_parked_at,
						 vma_vdso->start, vdso_vma_size(sym_rt));
				vdso_rt_parked_at += vdso_vma_size(sym_rt);
				ret |= vdso_remap(who, vdso_rt_parked_at,
						  vma_vvar->start, vvar_vma_size(sym_rt));
			} else {
				ret = vdso_remap(who, vdso_rt_parked_at,
						 vma_vvar->start, vvar_vma_size(sym_rt));
				vdso_rt_parked_at += vvar_vma_size(sym_rt);
				ret |= vdso_remap(who, vdso_rt_parked_at,
						  vma_vdso->start, vdso_vma_size(sym_rt));
			}
		} else
			ret = vdso_remap(who, vdso_rt_parked_at,
					 vma_vdso->start, vdso_vma_size(sym_rt));

		return ret;
	}

	/*
	 * Now the complex case -- we need to proxify calls.  We redirect
	 * calls from the dumpee vdso to the runtime vdso, making the dumpee
	 * vdso operate as a proxy.
	 */
	pr_info("Runtime vdso mismatches dumpee, generate proxy\n");

	/*
	 * Don't forget to shift if vvar is before vdso.
	 */
	if (sym_rt->vvar_start != VDSO_BAD_ADDR &&
	    sym_rt->vvar_start < sym_rt->vma_start)
		vdso_rt_parked_at += vvar_vma_size(sym_rt);

	if (vdso_redirect_calls(vdso_rt_parked_at, vma_vdso->start, sym_rt, &s)) {
		pr_err("Failed to proxify dumpee contents\n");
		return -1;
	}

	/*
	 * Put a special mark into the runtime vdso so that the next
	 * checkpoint routine can detect this vdso and skip dumping it,
	 * since it is auto-generated on every new session if a proxy is
	 * required.
	 */
	sys_mprotect((void *)vdso_rt_parked_at, vdso_vma_size(sym_rt), PROT_WRITE);
	vdso_put_mark((void *)vdso_rt_parked_at, vma_vdso->start,
		      vma_vvar ? vma_vvar->start : VVAR_BAD_ADDR);
	sys_mprotect((void *)vdso_rt_parked_at, vdso_vma_size(sym_rt), VDSO_PROT);
	return 0;
}
int ckpt_restore_vma(char *node, pid_t gpid, ckpt_desc_t desc)
{
	int ret;
	int nr_pages = -1;
	char *path = NULL;
	ckpt_vma_t area;

	if (ckpt_read(desc, &area, sizeof(ckpt_vma_t)) != sizeof(ckpt_vma_t)) {
		log_err("failed to get area");
		return -EIO;
	}

	if (area.arch) {
		void *vdso;
		void *sysenter_return = current_thread_info()->sysenter_return;

		current->mm->context.vdso = (void *)(~0UL);
		ret = arch_setup_additional_pages(NULL, 0);
		if (ret < 0) {
			log_err("failed to setup additional pages");
			return ret;
		}

		vdso = current->mm->context.vdso;
		if ((vdso != (void *)(0UL)) && (vdso != (void *)area.start)) {
			ret = ckpt_vma_remap((unsigned long)vdso, &area);
			if (ret < 0) {
				log_err("failed to remap vma");
				return ret;
			}
		}

		current->mm->context.vdso = (void *)area.start;
		current_thread_info()->sysenter_return = sysenter_return;
	} else {
		int sz = area.sz;

		if (sz) {
			path = (char *)kmalloc(CKPT_PATH_MAX, GFP_KERNEL);
			if (!path) {
				log_err("no memory");
				return -ENOMEM;
			}
			if (ckpt_read(desc, path, sz) != sz) {
				log_err("failed to get path");
				kfree(path);
				return -EIO;
			}
		}

		if (ckpt_is_mapped_area(&area)) {
			unsigned long prot = ckpt_get_vma_prot(&area);
			unsigned long flags = ckpt_get_vma_flags(&area);

			ret = ckpt_map_attach(node, gpid, area.start,
					      area.end - area.start, prot, flags);
			if (ret) {
				log_err("failed to attach");
				return ret;
			}
		} else {
			ret = ckpt_vma_map(path, &area);
			if (ret) {
				log_err("failed to map vma");
				return ret;
			}
			nr_pages = ckpt_restore_pages(&area, desc);
			if (nr_pages < 0) {
				log_err("failed to restore pages");
				return nr_pages;
			}
		}

		sys_mprotect(area.start, area.end - area.start,
			     ckpt_get_vma_prot(&area));
	}

	log_restore_vma(area.start, area.end, path, nr_pages,
			ckpt_debug_checksum(current->mm, NULL, area.start));
	return 0;
}
static void my_mprotect(const char *file, unsigned int segnum,
			uintptr_t address, size_t size, int prot)
{
	if (sys_mprotect((void *)address, size, prot) < 0)
		fail(file, "Failed to mprotect segment hole! ",
		     "segment", segnum, "errno", my_errno);
}
int mprotect(unsigned long start, size_t len, unsigned long prot)
{
	return sys_mprotect(start, len, prot);
}
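/*
 * The thin wrappers above pass the address straight through to the kernel,
 * which requires the start address to be page aligned.  A minimal sketch
 * (not taken from any of the projects above) of how a caller typically
 * satisfies that contract with plain POSIX calls; the rounding arithmetic
 * assumes the page size is a power of two.
 */
#include <stdint.h>
#include <sys/mman.h>
#include <unistd.h>

/* Make the page(s) covering [buf, buf + len) read-only. */
static int protect_readonly(void *buf, size_t len)
{
	uintptr_t page = (uintptr_t)sysconf(_SC_PAGESIZE);
	uintptr_t start = (uintptr_t)buf & ~(page - 1);                  /* round down */
	uintptr_t end = ((uintptr_t)buf + len + page - 1) & ~(page - 1); /* round up */

	return mprotect((void *)start, end - start, PROT_READ);
}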
/*
 * The main routine to restore a task via sigreturn.
 * This one is very special: we never return from it,
 * but use the sigreturn facility to restore core registers
 * and jump execution to a predefined ip read from the
 * core file.
 */
long __export_restore_task(struct task_restore_args *args)
{
	long ret = -1;
	int i;
	VmaEntry *vma_entry;
	unsigned long va;

	struct rt_sigframe *rt_sigframe;
	unsigned long new_sp;
	k_rtsigset_t to_block;
	pid_t my_pid = sys_getpid();
	rt_sigaction_t act;

	bootstrap_start = args->bootstrap_start;
	bootstrap_len = args->bootstrap_len;

#ifdef CONFIG_VDSO
	vdso_rt_size = args->vdso_rt_size;
#endif

	task_entries = args->task_entries;
	helpers = args->helpers;
	n_helpers = args->n_helpers;
	*args->breakpoint = rst_sigreturn;

	ksigfillset(&act.rt_sa_mask);
	act.rt_sa_handler = sigchld_handler;
	act.rt_sa_flags = SA_SIGINFO | SA_RESTORER | SA_RESTART;
	act.rt_sa_restorer = cr_restore_rt;
	sys_sigaction(SIGCHLD, &act, NULL, sizeof(k_rtsigset_t));

	log_set_fd(args->logfd);
	log_set_loglevel(args->loglevel);
	cap_last_cap = args->cap_last_cap;

	pr_info("Switched to the restorer %d\n", my_pid);

#ifdef CONFIG_VDSO
	if (vdso_do_park(&args->vdso_sym_rt, args->vdso_rt_parked_at, vdso_rt_size))
		goto core_restore_end;
#endif

	if (unmap_old_vmas((void *)args->premmapped_addr, args->premmapped_len,
			   bootstrap_start, bootstrap_len))
		goto core_restore_end;

	/* Shift private vma-s to the left */
	for (i = 0; i < args->nr_vmas; i++) {
		vma_entry = args->tgt_vmas + i;

		if (!vma_entry_is(vma_entry, VMA_AREA_REGULAR))
			continue;

		if (!vma_priv(vma_entry))
			continue;

		if (vma_entry->end >= TASK_SIZE)
			continue;

		if (vma_entry->start > vma_entry->shmid)
			break;

		if (vma_remap(vma_premmaped_start(vma_entry),
			      vma_entry->start, vma_entry_len(vma_entry)))
			goto core_restore_end;
	}

	/* Shift private vma-s to the right */
	for (i = args->nr_vmas - 1; i >= 0; i--) {
		vma_entry = args->tgt_vmas + i;

		if (!vma_entry_is(vma_entry, VMA_AREA_REGULAR))
			continue;

		if (!vma_priv(vma_entry))
			continue;

		if (vma_entry->start > TASK_SIZE)
			continue;

		if (vma_entry->start < vma_entry->shmid)
			break;

		if (vma_remap(vma_premmaped_start(vma_entry),
			      vma_entry->start, vma_entry_len(vma_entry)))
			goto core_restore_end;
	}

	/*
	 * OK, let's try to map the new ones.
	 */
	for (i = 0; i < args->nr_vmas; i++) {
		vma_entry = args->tgt_vmas + i;

		if (!vma_entry_is(vma_entry, VMA_AREA_REGULAR))
			continue;

		if (vma_priv(vma_entry))
			continue;

		va = restore_mapping(vma_entry);

		if (va != vma_entry->start) {
			pr_err("Can't restore %"PRIx64" mapping with %lx\n",
			       vma_entry->start, va);
			goto core_restore_end;
		}
	}

#ifdef CONFIG_VDSO
	/*
	 * Proxify vDSO.
	 */
	for (i = 0; i < args->nr_vmas; i++) {
		if (vma_entry_is(&args->tgt_vmas[i], VMA_AREA_VDSO) ||
		    vma_entry_is(&args->tgt_vmas[i], VMA_AREA_VVAR)) {
			if (vdso_proxify("dumpee", &args->vdso_sym_rt,
					 args->vdso_rt_parked_at,
					 i, args->tgt_vmas, args->nr_vmas))
				goto core_restore_end;
			break;
		}
	}
#endif

	/*
	 * Walk through all VMAs again to drop PROT_WRITE
	 * if it was not there.
	 */
	for (i = 0; i < args->nr_vmas; i++) {
		vma_entry = args->tgt_vmas + i;

		if (!(vma_entry_is(vma_entry, VMA_AREA_REGULAR)))
			continue;

		if (vma_entry_is(vma_entry, VMA_ANON_SHARED)) {
			struct shmem_info *entry;

			entry = find_shmem(args->shmems, args->nr_shmems,
					   vma_entry->shmid);
			if (entry && entry->pid == my_pid &&
			    entry->start == vma_entry->start)
				futex_set_and_wake(&entry->lock, 1);
		}

		if (vma_entry->prot & PROT_WRITE)
			continue;

		sys_mprotect(decode_pointer(vma_entry->start),
			     vma_entry_len(vma_entry), vma_entry->prot);
	}

	/*
	 * Finally restore madvise() bits
	 */
	for (i = 0; i < args->nr_vmas; i++) {
		unsigned long m;

		vma_entry = args->tgt_vmas + i;
		if (!vma_entry->has_madv || !vma_entry->madv)
			continue;

		for (m = 0; m < sizeof(vma_entry->madv) * 8; m++) {
			if (vma_entry->madv & (1ul << m)) {
				ret = sys_madvise(vma_entry->start,
						  vma_entry_len(vma_entry), m);
				if (ret) {
					pr_err("madvise(%"PRIx64", %"PRIu64", %ld) "
					       "failed with %ld\n",
					       vma_entry->start,
					       vma_entry_len(vma_entry), m, ret);
					goto core_restore_end;
				}
			}
		}
	}

	ret = 0;

	/*
	 * Tune up the task fields.
	 */
	ret |= sys_prctl_safe(PR_SET_NAME, (long)args->comm, 0, 0);

	ret |= sys_prctl_safe(PR_SET_MM, PR_SET_MM_START_CODE, (long)args->mm.mm_start_code, 0);
	ret |= sys_prctl_safe(PR_SET_MM, PR_SET_MM_END_CODE, (long)args->mm.mm_end_code, 0);
	ret |= sys_prctl_safe(PR_SET_MM, PR_SET_MM_START_DATA, (long)args->mm.mm_start_data, 0);
	ret |= sys_prctl_safe(PR_SET_MM, PR_SET_MM_END_DATA, (long)args->mm.mm_end_data, 0);
	ret |= sys_prctl_safe(PR_SET_MM, PR_SET_MM_START_STACK, (long)args->mm.mm_start_stack, 0);
	ret |= sys_prctl_safe(PR_SET_MM, PR_SET_MM_START_BRK, (long)args->mm.mm_start_brk, 0);
	ret |= sys_prctl_safe(PR_SET_MM, PR_SET_MM_BRK, (long)args->mm.mm_brk, 0);
	ret |= sys_prctl_safe(PR_SET_MM, PR_SET_MM_ARG_START, (long)args->mm.mm_arg_start, 0);
	ret |= sys_prctl_safe(PR_SET_MM, PR_SET_MM_ARG_END, (long)args->mm.mm_arg_end, 0);
	ret |= sys_prctl_safe(PR_SET_MM, PR_SET_MM_ENV_START, (long)args->mm.mm_env_start, 0);
	ret |= sys_prctl_safe(PR_SET_MM, PR_SET_MM_ENV_END, (long)args->mm.mm_env_end, 0);
	ret |= sys_prctl_safe(PR_SET_MM, PR_SET_MM_AUXV, (long)args->mm_saved_auxv, args->mm_saved_auxv_size);
	if (ret)
		goto core_restore_end;

	/*
	 * Because of requirements applied from the kernel side
	 * we need to restore the /proc/pid/exe symlink late,
	 * after the old existing VMAs are superseded with
	 * new ones from the image file.
	 */
	ret = restore_self_exe_late(args);
	if (ret)
		goto core_restore_end;

	/*
	 * We need to prepare a valid sigframe here, so
	 * after sigreturn the kernel will pick up the
	 * registers from the frame, set them up and
	 * finally pass execution to the new IP.
	 */
	rt_sigframe = (void *)args->t->mem_zone.rt_sigframe;

	if (restore_thread_common(rt_sigframe, args->t))
		goto core_restore_end;

	/*
	 * Threads restoration. This requires some more comments. This
	 * restorer routine and the thread restorer routine have the
	 * following memory map, prepared by the caller code.
	 *
	 * | <-- low addresses                                high addresses --> |
	 * +---------------------------------------------------+-----------------+
	 * | this proc body | own stack | rt_sigframe space     | thread restore  |
	 * |                |           |                       | zone            |
	 * +---------------------------------------------------+-----------------+
	 *
	 * where each thread restore zone is the following
	 *
	 * | <-- low addresses                                high addresses --> |
	 * +---------------------------------------------------------------------+
	 * | thread restore proc | thread1 stack | thread1 rt_sigframe           |
	 * +---------------------------------------------------------------------+
	 */
	if (args->nr_threads > 1) {
		struct thread_restore_args *thread_args = args->thread_args;
		long clone_flags = CLONE_VM | CLONE_FILES | CLONE_SIGHAND |
				   CLONE_THREAD | CLONE_SYSVSEM;
		long last_pid_len;
		long parent_tid;
		int i, fd;

		fd = args->fd_last_pid;
		ret = sys_flock(fd, LOCK_EX);
		if (ret) {
			pr_err("Can't lock last_pid %d\n", fd);
			goto core_restore_end;
		}

		for (i = 0; i < args->nr_threads; i++) {
			char last_pid_buf[16], *s;

			/* skip self */
			if (thread_args[i].pid == args->t->pid)
				continue;

			new_sp = restorer_stack(thread_args + i);
			last_pid_len = vprint_num(last_pid_buf, sizeof(last_pid_buf),
						  thread_args[i].pid - 1, &s);
			sys_lseek(fd, 0, SEEK_SET);
			ret = sys_write(fd, s, last_pid_len);
			if (ret < 0) {
				pr_err("Can't set last_pid %ld/%s\n", ret, last_pid_buf);
				goto core_restore_end;
			}

			/*
			 * To achieve functionality like libc's clone()
			 * we need pure assembly here, because the clone()'ed
			 * thread will run with its own stack and we must not
			 * have any additional instructions... oh, dear...
			 */
			RUN_CLONE_RESTORE_FN(ret, clone_flags, new_sp, parent_tid,
					     thread_args, args->clone_restore_fn);
		}

		ret = sys_flock(fd, LOCK_UN);
		if (ret) {
			pr_err("Can't unlock last_pid %ld\n", ret);
			goto core_restore_end;
		}
	}

	sys_close(args->fd_last_pid);

	restore_rlims(args);

	ret = create_posix_timers(args);
	if (ret < 0) {
		pr_err("Can't restore posix timers %ld\n", ret);
		goto core_restore_end;
	}

	ret = timerfd_arm(args);
	if (ret < 0) {
		pr_err("Can't restore timerfd %ld\n", ret);
		goto core_restore_end;
	}

	pr_info("%ld: Restored\n", sys_getpid());

	futex_set(&zombies_inprogress, args->nr_zombies);

	restore_finish_stage(CR_STATE_RESTORE);

	futex_wait_while_gt(&zombies_inprogress, 0);

	if (wait_helpers(args) < 0)
		goto core_restore_end;

	ksigfillset(&to_block);
	ret = sys_sigprocmask(SIG_SETMASK, &to_block, NULL, sizeof(k_rtsigset_t));
	if (ret) {
		pr_err("Unable to block signals %ld\n", ret);
		goto core_restore_end;
	}

	sys_sigaction(SIGCHLD, &args->sigchld_act, NULL, sizeof(k_rtsigset_t));

	ret = restore_signals(args->siginfo, args->siginfo_nr, true);
	if (ret)
		goto core_restore_end;

	ret = restore_signals(args->t->siginfo, args->t->siginfo_nr, false);
	if (ret)
		goto core_restore_end;

	restore_finish_stage(CR_STATE_RESTORE_SIGCHLD);

	rst_tcp_socks_all(args);

	/*
	 * Writing to last-pid is CAP_SYS_ADMIN protected,
	 * turning off TCP repair is CAP_NET_ADMIN protected,
	 * thus restore the creds _after_ all of the above.
	 */
	ret = restore_creds(&args->creds);
	ret = ret || restore_dumpable_flag(&args->mm);
	ret = ret || restore_pdeath_sig(args->t);

	futex_set_and_wake(&thread_inprogress, args->nr_threads);

	restore_finish_stage(CR_STATE_RESTORE_CREDS);

	if (ret)
		BUG();

	/* Wait until the children stop using args->task_entries */
	futex_wait_while_gt(&thread_inprogress, 1);

	log_set_fd(-1);

	/*
	 * The code that prepared the itimers makes sure the
	 * code below doesn't fail due to bad timing values.
	 */

#define itimer_armed(args, i)				\
	(args->itimers[i].it_interval.tv_sec ||		\
	 args->itimers[i].it_interval.tv_usec)

	if (itimer_armed(args, 0))
		sys_setitimer(ITIMER_REAL, &args->itimers[0], NULL);
	if (itimer_armed(args, 1))
		sys_setitimer(ITIMER_VIRTUAL, &args->itimers[1], NULL);
	if (itimer_armed(args, 2))
		sys_setitimer(ITIMER_PROF, &args->itimers[2], NULL);

	restore_posix_timers(args);

	sys_munmap(args->rst_mem, args->rst_mem_size);

	/*
	 * Sigframe stack.
	 */
	new_sp = (long)rt_sigframe + SIGFRAME_OFFSET;

	/*
	 * Prepare the stack and call for sigreturn,
	 * pure assembly since we don't need any additional
	 * code insns from gcc.
	 */
	rst_sigreturn(new_sp);

core_restore_end:
	futex_abort_and_wake(&task_entries->nr_in_progress);
	pr_err("Restorer fail %ld\n", sys_getpid());
	sys_exit_group(1);
	return -1;
}
/*
 * This is the main page dumping routine, it's executed
 * inside a victim process space.
 */
static int dump_pages(struct parasite_dump_pages_args *args)
{
	unsigned long nrpages, pfn, length;
	unsigned long prot_old, prot_new;
	u64 *map, off;
	int ret = -1, fd;

	args->nrpages_dumped = 0;
	args->nrpages_skipped = 0;
	prot_old = prot_new = 0;

	pfn = args->vma_entry.start / PAGE_SIZE;
	nrpages = (args->vma_entry.end - args->vma_entry.start) / PAGE_SIZE;
	args->nrpages_total = nrpages;
	length = nrpages * sizeof(*map);

	/*
	 * Up to 10M of pagemap will handle a 5G mapping.
	 */
	map = brk_alloc(length);
	if (!map) {
		ret = -ENOMEM;
		goto err;
	}

	fd = sys_open("/proc/self/pagemap", O_RDONLY, 0);
	if (fd < 0) {
		sys_write_msg("Can't open self pagemap");
		ret = fd;
		goto err_free;
	}

	off = pfn * sizeof(*map);
	off = sys_lseek(fd, off, SEEK_SET);
	if (off != pfn * sizeof(*map)) {
		sys_write_msg("Can't seek pagemap");
		ret = off;
		goto err_close;
	}

	ret = sys_read(fd, map, length);
	if (ret != length) {
		sys_write_msg("Can't read self pagemap");
		goto err_close;	/* close the pagemap fd on this error path too */
	}

	sys_close(fd);
	fd = fd_pages;

	/*
	 * Try to change the page protection if needed so we would
	 * be able to dump contents.
	 */
	if (!(args->vma_entry.prot & PROT_READ)) {
		prot_old = (unsigned long)args->vma_entry.prot;
		prot_new = prot_old | PROT_READ;
		ret = sys_mprotect((void *)args->vma_entry.start,
				   (unsigned long)vma_entry_len(&args->vma_entry),
				   prot_new);
		if (ret) {
			sys_write_msg("sys_mprotect failed\n");
			goto err_free;
		}
	}

	ret = 0;
	for (pfn = 0; pfn < nrpages; pfn++) {
		size_t vaddr;

		if (should_dump_page(&args->vma_entry, map[pfn])) {
			/*
			 * That's the optimized write of
			 * page_entry structure, see image.h
			 */
			vaddr = (size_t)args->vma_entry.start + pfn * PAGE_SIZE;
			ret = sys_write_safe(fd, &vaddr, sizeof(vaddr));
			if (ret)
				return ret;
			ret = sys_write_safe(fd, (void *)vaddr, PAGE_SIZE);
			if (ret)
				return ret;

			args->nrpages_dumped++;
		} else if (map[pfn] & PME_PRESENT)
			args->nrpages_skipped++;
	}

	/*
	 * Don't leave pages readable if they were not.
	 */
	if (prot_old != prot_new) {
		ret = sys_mprotect((void *)args->vma_entry.start,
				   (unsigned long)vma_entry_len(&args->vma_entry),
				   prot_old);
		if (ret) {
			sys_write_msg("PANIC: Ouch! sys_mprotect failed on restore\n");
			goto err_free;
		}
	}

	ret = 0;

err_free:
	brk_free(length);
err:
	return ret;

err_close:
	sys_close(fd);
	goto err_free;
}
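/*
 * dump_pages() above decides per page whether to copy it out, based on the
 * 64-bit entry read from /proc/self/pagemap.  A minimal standalone sketch
 * of the kind of predicate should_dump_page() implements, assuming only
 * the documented pagemap bit layout (bit 63 = page present, bit 62 = page
 * swapped; CRIU's PME_PRESENT corresponds to bit 63).  The real check also
 * consults the VMA type, which is omitted here.
 */
#include <stdbool.h>
#include <stdint.h>

#define PAGEMAP_PRESENT	(1ULL << 63)	/* page is resident in RAM */
#define PAGEMAP_SWAP	(1ULL << 62)	/* page is in swap */

static bool page_worth_dumping(uint64_t pagemap_entry)
{
	/* Pages that are neither resident nor swapped have no contents to save. */
	return (pagemap_entry & (PAGEMAP_PRESENT | PAGEMAP_SWAP)) != 0;
}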
/*
 * The main routine to restore a task via sigreturn.
 * This one is very special: we never return from it,
 * but use the sigreturn facility to restore core registers
 * and jump execution to a predefined ip read from the
 * core file.
 */
long __export_restore_task(struct task_restore_core_args *args)
{
	long ret = -1;
	VmaEntry *vma_entry;
	u64 va;
	unsigned long premmapped_end = args->premmapped_addr + args->premmapped_len;

	struct rt_sigframe *rt_sigframe;
	unsigned long new_sp;
	pid_t my_pid = sys_getpid();
	rt_sigaction_t act;

	task_entries = args->task_entries;

	ksigfillset(&act.rt_sa_mask);
	act.rt_sa_handler = sigchld_handler;
	act.rt_sa_flags = SA_SIGINFO | SA_RESTORER | SA_RESTART;
	act.rt_sa_restorer = cr_restore_rt;
	sys_sigaction(SIGCHLD, &act, NULL, sizeof(k_rtsigset_t));

	log_set_fd(args->logfd);
	log_set_loglevel(args->loglevel);
	cap_last_cap = args->cap_last_cap;

	pr_info("Switched to the restorer %d\n", my_pid);

	for (vma_entry = args->self_vmas; vma_entry->start != 0; vma_entry++) {
		unsigned long addr = vma_entry->start;
		unsigned long len;

		if (!vma_entry_is(vma_entry, VMA_AREA_REGULAR))
			continue;

		pr_debug("Examine %"PRIx64"-%"PRIx64"\n",
			 vma_entry->start, vma_entry->end);

		if (addr < args->premmapped_addr) {
			if (vma_entry->end >= args->premmapped_addr)
				len = args->premmapped_addr - addr;
			else
				len = vma_entry->end - vma_entry->start;
			if (sys_munmap((void *)addr, len)) {
				pr_err("munmap fail for %lx - %lx\n", addr, addr + len);
				goto core_restore_end;
			}
		}

		if (vma_entry->end >= TASK_SIZE)
			continue;

		if (vma_entry->end > premmapped_end) {
			if (vma_entry->start < premmapped_end)
				addr = premmapped_end;
			len = vma_entry->end - addr;
			if (sys_munmap((void *)addr, len)) {
				pr_err("munmap fail for %lx - %lx\n", addr, addr + len);
				goto core_restore_end;
			}
		}
	}

	sys_munmap(args->self_vmas,
		   ((void *)(vma_entry + 1) - ((void *)args->self_vmas)));

	/* Shift private vma-s to the left */
	for (vma_entry = args->tgt_vmas; vma_entry->start != 0; vma_entry++) {
		if (!vma_entry_is(vma_entry, VMA_AREA_REGULAR))
			continue;

		if (!vma_priv(vma_entry))
			continue;

		if (vma_entry->end >= TASK_SIZE)
			continue;

		if (vma_entry->start > vma_entry->shmid)
			break;

		if (vma_remap(vma_premmaped_start(vma_entry),
			      vma_entry->start, vma_entry_len(vma_entry)))
			goto core_restore_end;
	}

	/* Shift private vma-s to the right */
	for (vma_entry = args->tgt_vmas + args->nr_vmas - 1;
	     vma_entry >= args->tgt_vmas; vma_entry--) {
		if (!vma_entry_is(vma_entry, VMA_AREA_REGULAR))
			continue;

		if (!vma_priv(vma_entry))
			continue;

		if (vma_entry->start > TASK_SIZE)
			continue;

		if (vma_entry->start < vma_entry->shmid)
			break;

		if (vma_remap(vma_premmaped_start(vma_entry),
			      vma_entry->start, vma_entry_len(vma_entry)))
			goto core_restore_end;
	}

	/*
	 * OK, let's try to map the new ones.
	 */
	for (vma_entry = args->tgt_vmas; vma_entry->start != 0; vma_entry++) {
		if (!vma_entry_is(vma_entry, VMA_AREA_REGULAR))
			continue;

		if (vma_priv(vma_entry))
			continue;

		va = restore_mapping(vma_entry);

		if (va != vma_entry->start) {
			pr_err("Can't restore %"PRIx64" mapping with %"PRIx64"\n",
			       vma_entry->start, va);
			goto core_restore_end;
		}
	}

	/*
	 * Walk through all VMAs again to drop PROT_WRITE
	 * if it was not there.
	 */
	for (vma_entry = args->tgt_vmas; vma_entry->start != 0; vma_entry++) {
		if (!(vma_entry_is(vma_entry, VMA_AREA_REGULAR)))
			continue;

		if (vma_entry_is(vma_entry, VMA_ANON_SHARED)) {
			struct shmem_info *entry;

			entry = find_shmem(args->shmems, vma_entry->shmid);
			if (entry && entry->pid == my_pid &&
			    entry->start == vma_entry->start)
				futex_set_and_wake(&entry->lock, 1);
		}

		if (vma_entry->prot & PROT_WRITE)
			continue;

		sys_mprotect(decode_pointer(vma_entry->start),
			     vma_entry_len(vma_entry), vma_entry->prot);
	}

	/*
	 * Finally restore madvise() bits
	 */
	for (vma_entry = args->tgt_vmas; vma_entry->start != 0; vma_entry++) {
		unsigned long i;

		if (!vma_entry->has_madv || !vma_entry->madv)
			continue;

		for (i = 0; i < sizeof(vma_entry->madv) * 8; i++) {
			if (vma_entry->madv & (1ul << i)) {
				ret = sys_madvise(vma_entry->start,
						  vma_entry_len(vma_entry), i);
				if (ret) {
					pr_err("madvise(%"PRIx64", %"PRIu64", %ld) "
					       "failed with %ld\n",
					       vma_entry->start,
					       vma_entry_len(vma_entry), i, ret);
					goto core_restore_end;
				}
			}
		}
	}

	sys_munmap(args->tgt_vmas,
		   ((void *)(vma_entry + 1) - ((void *)args->tgt_vmas)));

	ret = sys_munmap(args->shmems, SHMEMS_SIZE);
	if (ret < 0) {
		pr_err("Can't unmap shmem %ld\n", ret);
		goto core_restore_end;
	}

	/*
	 * Tune up the task fields.
	 */
	ret |= sys_prctl_safe(PR_SET_NAME, (long)args->comm, 0, 0);

	ret |= sys_prctl_safe(PR_SET_MM, PR_SET_MM_START_CODE, (long)args->mm.mm_start_code, 0);
	ret |= sys_prctl_safe(PR_SET_MM, PR_SET_MM_END_CODE, (long)args->mm.mm_end_code, 0);
	ret |= sys_prctl_safe(PR_SET_MM, PR_SET_MM_START_DATA, (long)args->mm.mm_start_data, 0);
	ret |= sys_prctl_safe(PR_SET_MM, PR_SET_MM_END_DATA, (long)args->mm.mm_end_data, 0);
	ret |= sys_prctl_safe(PR_SET_MM, PR_SET_MM_START_STACK, (long)args->mm.mm_start_stack, 0);
	ret |= sys_prctl_safe(PR_SET_MM, PR_SET_MM_START_BRK, (long)args->mm.mm_start_brk, 0);
	ret |= sys_prctl_safe(PR_SET_MM, PR_SET_MM_BRK, (long)args->mm.mm_brk, 0);
	ret |= sys_prctl_safe(PR_SET_MM, PR_SET_MM_ARG_START, (long)args->mm.mm_arg_start, 0);
	ret |= sys_prctl_safe(PR_SET_MM, PR_SET_MM_ARG_END, (long)args->mm.mm_arg_end, 0);
	ret |= sys_prctl_safe(PR_SET_MM, PR_SET_MM_ENV_START, (long)args->mm.mm_env_start, 0);
	ret |= sys_prctl_safe(PR_SET_MM, PR_SET_MM_ENV_END, (long)args->mm.mm_env_end, 0);
	ret |= sys_prctl_safe(PR_SET_MM, PR_SET_MM_AUXV, (long)args->mm_saved_auxv, args->mm_saved_auxv_size);
	if (ret)
		goto core_restore_end;

	/*
	 * Because of requirements applied from the kernel side
	 * we need to restore the /proc/pid/exe symlink late,
	 * after the old existing VMAs are superseded with
	 * new ones from the image file.
	 */
	ret = restore_self_exe_late(args);
	if (ret)
		goto core_restore_end;

	/*
	 * We need to prepare a valid sigframe here, so
	 * after sigreturn the kernel will pick up the
	 * registers from the frame, set them up and
	 * finally pass execution to the new IP.
	 */
	rt_sigframe = (void *)args->t->mem_zone.rt_sigframe + 8;

	if (restore_thread_common(rt_sigframe, args->t))
		goto core_restore_end;

	/*
	 * Threads restoration. This requires some more comments. This
	 * restorer routine and the thread restorer routine have the
	 * following memory map, prepared by the caller code.
	 *
	 * | <-- low addresses                                high addresses --> |
	 * +----------------------------------------------------+----------------+
	 * | this proc body | own stack | heap | rt_sigframe     | thread restore |
	 * |                |           |      | space           | zone           |
	 * +----------------------------------------------------+----------------+
	 *
	 * where each thread restore zone is the following
	 *
	 * | <-- low addresses                                high addresses --> |
	 * +---------------------------------------------------------------------+
	 * | thread restore proc | thread1 stack | thread1 heap | thread1        |
	 * |                     |               |              | rt_sigframe    |
	 * +---------------------------------------------------------------------+
	 */
	if (args->nr_threads > 1) {
		struct thread_restore_args *thread_args = args->thread_args;
		long clone_flags = CLONE_VM | CLONE_FILES | CLONE_SIGHAND |
				   CLONE_THREAD | CLONE_SYSVSEM;
		long last_pid_len;
		long parent_tid;
		int i, fd;

		fd = sys_open(LAST_PID_PATH, O_RDWR, LAST_PID_PERM);
		if (fd < 0) {
			pr_err("Can't open last_pid %d\n", fd);
			goto core_restore_end;
		}

		ret = sys_flock(fd, LOCK_EX);
		if (ret) {
			pr_err("Can't lock last_pid %d\n", fd);
			goto core_restore_end;
		}

		for (i = 0; i < args->nr_threads; i++) {
			char last_pid_buf[16], *s;

			/* skip self */
			if (thread_args[i].pid == args->t->pid)
				continue;

			mutex_lock(&args->rst_lock);

			new_sp = RESTORE_ALIGN_STACK((long)thread_args[i].mem_zone.stack,
						     sizeof(thread_args[i].mem_zone.stack));
			last_pid_len = vprint_num(last_pid_buf, sizeof(last_pid_buf),
						  thread_args[i].pid - 1, &s);
			ret = sys_write(fd, s, last_pid_len);
			if (ret < 0) {
				pr_err("Can't set last_pid %ld/%s\n", ret, last_pid_buf);
				goto core_restore_end;
			}

			/*
			 * To achieve functionality like libc's clone()
			 * we need pure assembly here, because the clone()'ed
			 * thread will run with its own stack and we must not
			 * have any additional instructions... oh, dear...
			 */
			RUN_CLONE_RESTORE_FN(ret, clone_flags, new_sp, parent_tid,
					     thread_args, args->clone_restore_fn);
		}

		ret = sys_flock(fd, LOCK_UN);
		if (ret) {
			pr_err("Can't unlock last_pid %ld\n", ret);
			goto core_restore_end;
		}

		sys_close(fd);
	}

	restore_rlims(args);

	pr_info("%ld: Restored\n", sys_getpid());

	futex_set(&zombies_inprogress, args->nr_zombies);

	restore_finish_stage(CR_STATE_RESTORE);

	futex_wait_while_gt(&zombies_inprogress, 0);

	sys_sigaction(SIGCHLD, &args->sigchld_act, NULL, sizeof(k_rtsigset_t));

	ret = restore_signals(args->siginfo, args->siginfo_nr, true);
	if (ret)
		goto core_restore_end;

	ret = restore_signals(args->t->siginfo, args->t->siginfo_nr, false);
	if (ret)
		goto core_restore_end;

	restore_finish_stage(CR_STATE_RESTORE_SIGCHLD);

	if (args->siginfo_size) {
		ret = sys_munmap(args->siginfo, args->siginfo_size);
		if (ret < 0) {
			pr_err("Can't unmap signals %ld\n", ret);
			goto core_restore_failed;
		}
	}

	rst_tcp_socks_all(args->rst_tcp_socks, args->rst_tcp_socks_size);

	/*
	 * Writing to last-pid is CAP_SYS_ADMIN protected,
	 * turning off TCP repair is CAP_NET_ADMIN protected,
	 * thus restore the creds _after_ all of the above.
	 */
	ret = restore_creds(&args->creds);

	futex_set_and_wake(&thread_inprogress, args->nr_threads);

	restore_finish_stage(CR_STATE_RESTORE_CREDS);

	if (ret)
		BUG();

	/* Wait until the children stop using args->task_entries */
	futex_wait_while_gt(&thread_inprogress, 1);

	log_set_fd(-1);

	/*
	 * The code that prepared the itimers makes sure the
	 * code below doesn't fail due to bad timing values.
	 */

#define itimer_armed(args, i)				\
	(args->itimers[i].it_interval.tv_sec ||		\
	 args->itimers[i].it_interval.tv_usec)

	if (itimer_armed(args, 0))
		sys_setitimer(ITIMER_REAL, &args->itimers[0], NULL);
	if (itimer_armed(args, 1))
		sys_setitimer(ITIMER_VIRTUAL, &args->itimers[1], NULL);
	if (itimer_armed(args, 2))
		sys_setitimer(ITIMER_PROF, &args->itimers[2], NULL);

	ret = sys_munmap(args->task_entries, TASK_ENTRIES_SIZE);
	if (ret < 0) {
		ret = ((long)__LINE__ << 16) | ((-ret) & 0xffff);
		goto core_restore_failed;
	}

	/*
	 * Sigframe stack.
	 */
	new_sp = (long)rt_sigframe + SIGFRAME_OFFSET;

	/*
	 * Prepare the stack and call for sigreturn,
	 * pure assembly since we don't need any additional
	 * code insns from gcc.
	 */
	ARCH_RT_SIGRETURN(new_sp);

core_restore_end:
	futex_abort_and_wake(&task_entries->nr_in_progress);
	pr_err("Restorer fail %ld\n", sys_getpid());
	sys_exit_group(1);
	return -1;

core_restore_failed:
	ARCH_FAIL_CORE_RESTORE;
	return ret;
}
uint32_t elf_load(struct sys_state *sys, void *base, int flags,
		  struct elf_load_info *info)
{
	unsigned aligned_sz, align_pad;
	void *aligned_base;
	char *interp_filename = NULL;
	Elf32_Ehdr *ehdr = base;
	Elf32_Phdr *phdr;
	int i, prot, err;
	int have_loadaddr = 0;
	uint32_t entry;

	if (ehdr->e_version != 1) {
		debug("ELF version invalid\n");
		sys_exit(1);
	}

	if (ehdr->e_machine != EM_MIPS) {
		debug("ELF machine invalid\n");
		sys_exit(1);
	}

	debug("ELF header looks valid\n");

	if (info && !(flags & ELF_INTERP)) {
		info->phent = ehdr->e_phentsize;
		info->phnum = ehdr->e_phnum;
		info->interp_filename[0] = 0;
	}

	entry = ehdr->e_entry;

	for (i = 0; i < ehdr->e_phnum; i++) {
		phdr = base + ehdr->e_phoff + (i * ehdr->e_phentsize);

		debug("Phdr[%d] type 0x%x off 0x%x vaddr 0x%x paddr 0x%x filesz 0x%x "
		      "memsz 0x%x flags 0x%x align 0x%x\n",
		      i, phdr->p_type, phdr->p_offset, phdr->p_vaddr, phdr->p_paddr,
		      phdr->p_filesz, phdr->p_memsz, phdr->p_flags, phdr->p_align);

		if (phdr->p_type == PT_LOAD) {
			prot = 0;
			if (phdr->p_flags & PF_X)
				prot |= PROT_EXEC;
			if (phdr->p_flags & PF_W)
				prot |= PROT_WRITE;
			if (phdr->p_flags & PF_R)
				prot |= PROT_READ;

			align_pad = phdr->p_vaddr & (phdr->p_align - 1);
			aligned_base = sys->mem_base + phdr->p_vaddr - align_pad;
			aligned_sz = phdr->p_memsz + align_pad;
			aligned_sz += 4095;
			aligned_sz &= ~4095;

			/* Make the region writable while the segment is copied in. */
			err = sys_mprotect(aligned_base, aligned_sz, prot | PROT_WRITE);
			if (err) {
				debug("Failed to mprotect memory region\n");
				sys_exit(1);
			}

			memcpy(sys->mem_base + phdr->p_vaddr,
			       base + phdr->p_offset, phdr->p_filesz);
			if (phdr->p_memsz > phdr->p_filesz)
				memset(sys->mem_base + phdr->p_vaddr + phdr->p_filesz,
				       0, phdr->p_memsz - phdr->p_filesz);

			/* Drop the temporary write permission again if the
			 * segment itself is not writable. */
			if (!(prot & PROT_WRITE)) {
				err = sys_mprotect(aligned_base, aligned_sz, prot);
				if (err) {
					debug("Failed to mprotect memory region\n");
					sys_exit(1);
				}
			}

			sys->brk = max(sys->brk,
				       aligned_base + aligned_sz - sys->mem_base);

			if (info && !(flags & ELF_INTERP) && !have_loadaddr) {
				uint32_t load_addr = phdr->p_vaddr - phdr->p_offset;

				have_loadaddr = 1;
				info->phdr_base = load_addr + ehdr->e_phoff;
			}
		}

		if (phdr->p_type == PT_INTERP) {
			interp_filename = base + phdr->p_offset;
			if (info)
				strcpy(info->interp_filename, interp_filename);
		}
	}

	if (!(flags & ELF_INTERP) && interp_filename) {
		debug("Load interpreter \"%s\"\n", interp_filename);
		entry = elf_load_filename(sys, interp_filename, ELF_INTERP, info);
		if (info)
			info->interp_base = entry;
	}

	if (info && !(flags & ELF_INTERP))
		info->entry = entry;

	return entry;
}
int
cloudabi_sys_mem_advise(struct thread *td,
    struct cloudabi_sys_mem_advise_args *uap)
{
	struct madvise_args madvise_args = {
		.addr	= uap->addr,
		.len	= uap->len
	};

	switch (uap->advice) {
	case CLOUDABI_ADVICE_DONTNEED:
		madvise_args.behav = MADV_DONTNEED;
		break;
	case CLOUDABI_ADVICE_NORMAL:
		madvise_args.behav = MADV_NORMAL;
		break;
	case CLOUDABI_ADVICE_RANDOM:
		madvise_args.behav = MADV_RANDOM;
		break;
	case CLOUDABI_ADVICE_SEQUENTIAL:
		madvise_args.behav = MADV_SEQUENTIAL;
		break;
	case CLOUDABI_ADVICE_WILLNEED:
		madvise_args.behav = MADV_WILLNEED;
		break;
	default:
		return (EINVAL);
	}

	return (sys_madvise(td, &madvise_args));
}

int
cloudabi_sys_mem_lock(struct thread *td, struct cloudabi_sys_mem_lock_args *uap)
{
	struct mlock_args mlock_args = {
		.addr	= uap->addr,
		.len	= uap->len
	};

	return (sys_mlock(td, &mlock_args));
}

int
cloudabi_sys_mem_map(struct thread *td, struct cloudabi_sys_mem_map_args *uap)
{
	struct mmap_args mmap_args = {
		.addr	= uap->addr,
		.len	= uap->len,
		.fd	= uap->fd,
		.pos	= uap->off
	};
	int error;

	/* Translate flags. */
	if (uap->flags & CLOUDABI_MAP_ANON)
		mmap_args.flags |= MAP_ANON;
	if (uap->flags & CLOUDABI_MAP_FIXED)
		mmap_args.flags |= MAP_FIXED;
	if (uap->flags & CLOUDABI_MAP_PRIVATE)
		mmap_args.flags |= MAP_PRIVATE;
	if (uap->flags & CLOUDABI_MAP_SHARED)
		mmap_args.flags |= MAP_SHARED;

	/* Translate protection. */
	error = convert_mprot(uap->prot, &mmap_args.prot);
	if (error != 0)
		return (error);

	return (sys_mmap(td, &mmap_args));
}

int
cloudabi_sys_mem_protect(struct thread *td,
    struct cloudabi_sys_mem_protect_args *uap)
{
	struct mprotect_args mprotect_args = {
		.addr	= uap->addr,
		.len	= uap->len,
	};
	int error;

	/* Translate protection. */
	error = convert_mprot(uap->prot, &mprotect_args.prot);
	if (error != 0)
		return (error);

	return (sys_mprotect(td, &mprotect_args));
}

int
cloudabi_sys_mem_sync(struct thread *td, struct cloudabi_sys_mem_sync_args *uap)
{
	struct msync_args msync_args = {
		.addr	= uap->addr,
		.len	= uap->len,
	};

	/* Convert flags. */
	switch (uap->flags & (CLOUDABI_MS_ASYNC | CLOUDABI_MS_SYNC)) {
	case CLOUDABI_MS_ASYNC:
		msync_args.flags |= MS_ASYNC;
		break;
	case CLOUDABI_MS_SYNC:
		msync_args.flags |= MS_SYNC;
		break;
	default:
		return (EINVAL);
	}
	if ((uap->flags & CLOUDABI_MS_INVALIDATE) != 0)
		msync_args.flags |= MS_INVALIDATE;

	return (sys_msync(td, &msync_args));
}

int
cloudabi_sys_mem_unlock(struct thread *td,
    struct cloudabi_sys_mem_unlock_args *uap)
{
	struct munlock_args munlock_args = {
		.addr	= uap->addr,
		.len	= uap->len
	};

	return (sys_munlock(td, &munlock_args));
}

int
cloudabi_sys_mem_unmap(struct thread *td,
    struct cloudabi_sys_mem_unmap_args *uap)
{
	struct munmap_args munmap_args = {
		.addr	= uap->addr,
		.len	= uap->len
	};

	return (sys_munmap(td, &munmap_args));
}
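/*
 * The compat functions above all funnel their protection flags through
 * convert_mprot(), which is not shown in this excerpt.  A sketch of what
 * such a translation helper plausibly looks like, assuming constants named
 * CLOUDABI_PROT_READ/WRITE/EXEC from the CloudABI headers and assuming the
 * CloudABI rule that a mapping may not be writable and executable at the
 * same time; the actual FreeBSD implementation may differ in detail.
 */
static int
convert_mprot_sketch(unsigned int in, int *out)
{
	/* Reject unknown bits. */
	if ((in & ~(CLOUDABI_PROT_READ | CLOUDABI_PROT_WRITE |
	    CLOUDABI_PROT_EXEC)) != 0)
		return (EINVAL);
	/* W^X: writable and executable at once is not allowed. */
	if ((in & (CLOUDABI_PROT_WRITE | CLOUDABI_PROT_EXEC)) ==
	    (CLOUDABI_PROT_WRITE | CLOUDABI_PROT_EXEC))
		return (ENOTSUP);

	*out = 0;
	if (in & CLOUDABI_PROT_READ)
		*out |= PROT_READ;
	if (in & CLOUDABI_PROT_WRITE)
		*out |= PROT_WRITE;
	if (in & CLOUDABI_PROT_EXEC)
		*out |= PROT_EXEC;
	return (0);
}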