Esempio n. 1
0
/*
 * Restore the pieces of per-thread state handled here, in this order:
 *   - the address passed to sys_set_tid_address() (args->clear_tid_addr);
 *   - the futex robust list, only when args->has_futex is set and
 *     args->futex_rla_len is non-zero;
 *   - whatever restore_sched_info() restores from args->sp;
 *   - the registers handled by restore_nonsigframe_gpregs();
 *   - TLS via restore_tls().
 *
 * The sigframe argument is not referenced by this routine.
 *
 * Returns 0 on success and -1 when either sys_set_robust_list() or
 * restore_nonsigframe_gpregs() reports a failure. The result of
 * sys_set_tid_address() is not checked.
 */
static int restore_thread_common(struct rt_sigframe *sigframe,
		struct thread_restore_args *args)
{
	int *tid_addr = (int *)decode_pointer(args->clear_tid_addr);
	int err = 0;

	sys_set_tid_address(tid_addr);

	/* The robust list is optional; leave err at 0 when there is none */
	if (args->has_futex && args->futex_rla_len)
		err = sys_set_robust_list(decode_pointer(args->futex_rla),
					  args->futex_rla_len);
	if (err) {
		pr_err("Failed to recover futex robust list: %d\n", err);
		return -1;
	}

	restore_sched_info(&args->sp);

	if (restore_nonsigframe_gpregs(&args->gpregs) != 0)
		return -1;

	restore_tls(&args->tls);
	return 0;
}
Esempio n. 2
0
/*
 * Recreate the mapping described by vma_entry at its recorded start
 * address.
 *
 * SysV IPC areas are attached with sys_shmat() (read-only unless the
 * entry has PROT_WRITE). Everything else is mapped with sys_mmap() and
 * MAP_FIXED; the descriptor in vma_entry->fd, if any, is closed
 * afterwards regardless of the mmap result.
 *
 * Returns whatever sys_shmat()/sys_mmap() returned; the caller is
 * expected to compare it with vma_entry->start to detect failure.
 */
static unsigned long restore_mapping(const VmaEntry *vma_entry)
{
	int prot	= vma_entry->prot;
	int flags	= vma_entry->flags | MAP_FIXED;
	unsigned long addr;

	if (vma_entry_is(vma_entry, VMA_AREA_SYSVIPC))
		return sys_shmat(vma_entry->fd, decode_pointer(vma_entry->start),
				 (vma_entry->prot & PROT_WRITE) ? 0 : SHM_RDONLY);

	/*
	 * Restore of shared mappings is tricky, since
	 * we open anonymous mapping via map_files/
	 * MAP_ANONYMOUS should be eliminated so fd would
	 * be taken into account by a kernel.
	 *
	 * The "no descriptor" sentinel is tested as plain -1 everywhere
	 * in this function: an int -1 converts correctly to the type of
	 * ->fd whatever its width, whereas -1UL only matches when ->fd
	 * is no wider than unsigned long.
	 */
	if (vma_entry_is(vma_entry, VMA_ANON_SHARED) && (vma_entry->fd != -1))
		flags &= ~MAP_ANONYMOUS;

	/* A mapping of file with MAP_SHARED is up to date */
	if (vma_entry->fd == -1 || !(vma_entry->flags & MAP_SHARED))
		prot |= PROT_WRITE;

	pr_debug("\tmmap(%"PRIx64" -> %"PRIx64", %x %x %d)\n",
			vma_entry->start, vma_entry->end,
			prot, flags, (int)vma_entry->fd);
	/*
	 * Should map memory here. Note we map them as
	 * writable since we're going to restore page
	 * contents.
	 */
	addr = sys_mmap(decode_pointer(vma_entry->start),
			vma_entry_len(vma_entry),
			prot, flags,
			vma_entry->fd,
			vma_entry->pgoff);

	/* The descriptor was only needed to establish the mapping */
	if (vma_entry->fd != -1)
		sys_close(vma_entry->fd);

	return addr;
}
Esempio n. 3
0
/*
 * Copy the alternate signal stack description from sas into the
 * uc_stack member reached through RT_SIGFRAME_UC(sigframe).
 * A NULL sas leaves the sigframe untouched.
 */
static inline void setup_sas(struct rt_sigframe* sigframe, ThreadSasEntry *sas)
{
	if (!sas)
		return;

#define UC	RT_SIGFRAME_UC(sigframe)
	UC->uc_stack.ss_sp	= (void *)decode_pointer(sas->ss_sp);
	UC->uc_stack.ss_flags	= (int)sas->ss_flags;
	UC->uc_stack.ss_size	= (size_t)sas->ss_size;
#undef UC
}
Esempio n. 4
0
int
cb_erase(critbit_tree* cb, const void* key, size_t keylen) {
  void** iter = NULL;
  void* ptr = cb->root;
  struct critbit_node* parent = 0;
  unsigned char* bytes = (unsigned char*)key;
  int branch = 0;

  if(!cb->root)
    return CB_NOTFOUND;

  for(;;) {
    int type;

    type = decode_pointer(&ptr);
    if(type == INTERNAL_NODE) {
      iter = parent ? &parent->child[branch] : &cb->root;
      parent = (struct critbit_node*)ptr;
      branch = (keylen <= parent->byte) ? 0 : ((1 + ((bytes[parent->byte] | parent->mask) & 0xFF)) >> 8);
      ptr = parent->child[branch];
    } else {
Esempio n. 5
0
/*
 * Visit the subtree rooted at ptr, child[0] before child[1].
 *
 * For an external node whose stored length is at least keylen and for
 * which byte_diff(key, keylen, match) is 0, match_cb is invoked and its
 * return value is returned. On an internal node the walk of child[1] is
 * skipped as soon as child[0] yields a non-zero result, which is then
 * returned to the caller. Returns 0 when nothing stopped the walk.
 */
static int
cb_foreach_i(void* ptr, const void* key, size_t keylen, int (*match_cb)(const void* match, const void* key, size_t keylen, void*), void* data) {
  void* match;
  size_t len;

  if(decode_pointer(&ptr) == INTERNAL_NODE) {
    struct critbit_node* node = (struct critbit_node*)ptr;
    int left = cb_foreach_i(node->child[0], key, keylen, match_cb, data);

    if(left)
      return left;
    return cb_foreach_i(node->child[1], key, keylen, match_cb, data);
  }

  /* reached an external node */
  from_external_node(ptr, &match, &len);
  if(len < keylen || byte_diff(key, keylen, match) != 0)
    return 0;

  return match_cb(match, key, keylen, data);
}
Esempio n. 6
0
/*
 * Translate a page server request into an iovec: the base is the
 * decoded ps->vaddr and the length is ps->nr_pages pages of
 * PAGE_SIZE bytes each.
 */
static void psi2iovec(struct page_server_iov *ps, struct iovec *iov)
{
	iov->iov_len = ps->nr_pages * PAGE_SIZE;
	iov->iov_base = decode_pointer(ps->vaddr);
}
Esempio n. 7
0
/*
 * Fill the area at addr with the page contents stored in the images
 * for si->shmid.
 *
 * Two image layouts are handled. When the CR_FD_SHMEM_PAGEMAP image
 * opens, each PagemapEntry read from it gives an offset and a page
 * count, and the data comes from the image returned by
 * open_pages_image(). Otherwise the CR_FD_SHM_PAGES_OLD image is used,
 * where every record is a 64-bit offset followed by one page of data.
 *
 * Return value:
 *   - the (<= 0) value of the record reader when it stops the loop;
 *   - -1 when read() does not deliver the full amount requested;
 *   - -1 when the images cannot be opened, in which case the area
 *     [addr, addr + si->size) is also unmapped.
 *
 * NOTE(review): a record that would reach past si->size stops the loop
 * without touching ret, so the positive value left by the record
 * reader is returned as is -- confirm callers treat that as intended.
 */
static int restore_shmem_content(void *addr, struct shmem_info *si)
{
	int ret = 0;
	int fd, fd_pg;

	fd = open_image(CR_FD_SHMEM_PAGEMAP, O_RSTR, si->shmid);
	if (fd >= 0) {
		fd_pg = open_pages_image(O_RSTR, fd);
		if (fd_pg < 0)
			goto out_close;
	} else {
		/* No pagemap image: try the older pages-only image */
		fd_pg = open_image(CR_FD_SHM_PAGES_OLD, O_RSTR, si->shmid);
		if (fd_pg < 0)
			goto err_unmap;
	}

	for (;;) {
		unsigned long vaddr;
		unsigned nr_pages;

		if (fd < 0) {
			__u64 img_vaddr;

			ret = read_img_eof(fd_pg, &img_vaddr);
			if (ret <= 0)
				break;

			nr_pages = 1;
			vaddr = (unsigned long)decode_pointer(img_vaddr);
		} else {
			PagemapEntry *pe;

			ret = pb_read_one_eof(fd, &pe, PB_PAGEMAP);
			if (ret <= 0)
				break;

			/* Take what is needed before the entry is released */
			nr_pages = pe->nr_pages;
			vaddr = (unsigned long)decode_pointer(pe->vaddr);

			pagemap_entry__free_unpacked(pe, NULL);
		}

		/* Never write beyond the end of the area */
		if (vaddr + nr_pages * PAGE_SIZE > si->size)
			break;

		ret = read(fd_pg, addr + vaddr, nr_pages * PAGE_SIZE);
		if (ret != nr_pages * PAGE_SIZE) {
			ret = -1;
			break;
		}
	}

	close_safe(&fd_pg);
	close_safe(&fd);
	return ret;

out_close:
	close_safe(&fd);
err_unmap:
	munmap(addr,  si->size);
	return -1;
}
Esempio n. 8
0
/*
 * The main routine to restore task via sigreturn.
 * This one is very special, we never return there
 * but use sigreturn facility to restore core registers
 * and jump execution to some predefined ip read from
 * core file.
 *
 * Sequence, as implemented below:
 *   1. copy bootstrap/vdso/helper parameters from args into globals,
 *      install a temporary SIGCHLD handler and set up logging;
 *   2. park the vDSO (CONFIG_VDSO only) and unmap the old VMAs;
 *   3. move private VMAs to their final addresses (two passes),
 *      then map the remaining regular VMAs with restore_mapping();
 *   4. proxify the vDSO (CONFIG_VDSO only), drop PROT_WRITE where the
 *      image did not have it, and replay madvise() bits;
 *   5. restore prctl() mm fields, the exe link and the per-thread
 *      state of the leader, then clone the other threads;
 *   6. restore rlimits, timers, signals, TCP sockets and creds;
 *   7. arm itimers, unmap args->rst_mem and sigreturn.
 *
 * Any failure jumps to core_restore_end, which aborts the shared
 * nr_in_progress futex, logs and calls sys_exit_group(1). The
 * "return -1" after it only runs if sys_exit_group() returns.
 */
long __export_restore_task(struct task_restore_args *args)
{
	long ret = -1;
	int i;
	VmaEntry *vma_entry;
	unsigned long va;

	struct rt_sigframe *rt_sigframe;
	unsigned long new_sp;
	k_rtsigset_t to_block;
	pid_t my_pid = sys_getpid();
	rt_sigaction_t act;

	bootstrap_start = args->bootstrap_start;
	bootstrap_len	= args->bootstrap_len;

#ifdef CONFIG_VDSO
	vdso_rt_size	= args->vdso_rt_size;
#endif

	task_entries = args->task_entries;
	helpers = args->helpers;
	n_helpers = args->n_helpers;
	*args->breakpoint = rst_sigreturn;

	/* Temporary SIGCHLD handler; args->sigchld_act is installed later */
	ksigfillset(&act.rt_sa_mask);
	act.rt_sa_handler = sigchld_handler;
	act.rt_sa_flags = SA_SIGINFO | SA_RESTORER | SA_RESTART;
	act.rt_sa_restorer = cr_restore_rt;
	sys_sigaction(SIGCHLD, &act, NULL, sizeof(k_rtsigset_t));

	log_set_fd(args->logfd);
	log_set_loglevel(args->loglevel);

	cap_last_cap = args->cap_last_cap;

	pr_info("Switched to the restorer %d\n", my_pid);

#ifdef CONFIG_VDSO
	if (vdso_do_park(&args->vdso_sym_rt, args->vdso_rt_parked_at, vdso_rt_size))
		goto core_restore_end;
#endif

	if (unmap_old_vmas((void *)args->premmapped_addr, args->premmapped_len,
				bootstrap_start, bootstrap_len))
		goto core_restore_end;

	/*
	 * Shift private vma-s to the left.
	 * NOTE(review): ->shmid is compared with ->start here; presumably
	 * it carries the premapped address for private VMAs -- confirm
	 * against vma_premmaped_start().
	 */
	for (i = 0; i < args->nr_vmas; i++) {
		vma_entry = args->tgt_vmas + i;

		if (!vma_entry_is(vma_entry, VMA_AREA_REGULAR))
			continue;

		if (!vma_priv(vma_entry))
			continue;

		if (vma_entry->end >= TASK_SIZE)
			continue;

		if (vma_entry->start > vma_entry->shmid)
			break;

		if (vma_remap(vma_premmaped_start(vma_entry),
				vma_entry->start, vma_entry_len(vma_entry)))
			goto core_restore_end;
	}

	/* Shift private vma-s to the right (walks the array backwards) */
	for (i = args->nr_vmas - 1; i >= 0; i--) {
		vma_entry = args->tgt_vmas + i;

		if (!vma_entry_is(vma_entry, VMA_AREA_REGULAR))
			continue;

		if (!vma_priv(vma_entry))
			continue;

		if (vma_entry->start > TASK_SIZE)
			continue;

		if (vma_entry->start < vma_entry->shmid)
			break;

		if (vma_remap(vma_premmaped_start(vma_entry),
				vma_entry->start, vma_entry_len(vma_entry)))
			goto core_restore_end;
	}

	/*
	 * OK, lets try to map new one.
	 * Only regular, non-private VMAs are mapped here.
	 */
	for (i = 0; i < args->nr_vmas; i++) {
		vma_entry = args->tgt_vmas + i;

		if (!vma_entry_is(vma_entry, VMA_AREA_REGULAR))
			continue;

		if (vma_priv(vma_entry))
			continue;

		va = restore_mapping(vma_entry);

		if (va != vma_entry->start) {
			pr_err("Can't restore %"PRIx64" mapping with %lx\n", vma_entry->start, va);
			goto core_restore_end;
		}
	}

#ifdef CONFIG_VDSO
	/*
	 * Proxify vDSO. Only the first VDSO/VVAR entry found triggers
	 * the call; the loop stops right after it.
	 */
	for (i = 0; i < args->nr_vmas; i++) {
		if (vma_entry_is(&args->tgt_vmas[i], VMA_AREA_VDSO) ||
		    vma_entry_is(&args->tgt_vmas[i], VMA_AREA_VVAR)) {
			if (vdso_proxify("dumpee", &args->vdso_sym_rt,
					 args->vdso_rt_parked_at,
					 i, args->tgt_vmas, args->nr_vmas))
				goto core_restore_end;
			break;
		}
	}
#endif

	/*
	 * Walk through all VMAs again to drop PROT_WRITE
	 * if it was not there. For anonymous shared areas
	 * owned by this task the shmem entry lock is released
	 * on the way. The sys_mprotect() result is not checked.
	 */
	for (i = 0; i < args->nr_vmas; i++) {
		vma_entry = args->tgt_vmas + i;

		if (!(vma_entry_is(vma_entry, VMA_AREA_REGULAR)))
			continue;

		if (vma_entry_is(vma_entry, VMA_ANON_SHARED)) {
			struct shmem_info *entry;

			entry = find_shmem(args->shmems, args->nr_shmems,
						  vma_entry->shmid);
			if (entry && entry->pid == my_pid &&
			    entry->start == vma_entry->start)
				futex_set_and_wake(&entry->lock, 1);
		}

		if (vma_entry->prot & PROT_WRITE)
			continue;

		sys_mprotect(decode_pointer(vma_entry->start),
			     vma_entry_len(vma_entry),
			     vma_entry->prot);
	}

	/*
	 * Finally restore madvise() bits: bit number m set in
	 * ->madv means advice value m is applied to the area.
	 */
	for (i = 0; i < args->nr_vmas; i++) {
		unsigned long m;

		vma_entry = args->tgt_vmas + i;
		if (!vma_entry->has_madv || !vma_entry->madv)
			continue;

		for (m = 0; m < sizeof(vma_entry->madv) * 8; m++) {
			if (vma_entry->madv & (1ul << m)) {
				ret = sys_madvise(vma_entry->start,
						  vma_entry_len(vma_entry),
						  m);
				if (ret) {
					pr_err("madvise(%"PRIx64", %"PRIu64", %ld) "
					       "failed with %ld\n",
						vma_entry->start,
						vma_entry_len(vma_entry),
						m, ret);
					goto core_restore_end;
				}
			}
		}
	}

	ret = 0;

	/*
	 * Tune up the task fields. Results are OR-ed together,
	 * so any non-zero return aborts the restore below.
	 */
	ret |= sys_prctl_safe(PR_SET_NAME, (long)args->comm, 0, 0);

	ret |= sys_prctl_safe(PR_SET_MM, PR_SET_MM_START_CODE,	(long)args->mm.mm_start_code, 0);
	ret |= sys_prctl_safe(PR_SET_MM, PR_SET_MM_END_CODE,	(long)args->mm.mm_end_code, 0);
	ret |= sys_prctl_safe(PR_SET_MM, PR_SET_MM_START_DATA,	(long)args->mm.mm_start_data, 0);
	ret |= sys_prctl_safe(PR_SET_MM, PR_SET_MM_END_DATA,	(long)args->mm.mm_end_data, 0);
	ret |= sys_prctl_safe(PR_SET_MM, PR_SET_MM_START_STACK,	(long)args->mm.mm_start_stack, 0);
	ret |= sys_prctl_safe(PR_SET_MM, PR_SET_MM_START_BRK,	(long)args->mm.mm_start_brk, 0);
	ret |= sys_prctl_safe(PR_SET_MM, PR_SET_MM_BRK,		(long)args->mm.mm_brk, 0);
	ret |= sys_prctl_safe(PR_SET_MM, PR_SET_MM_ARG_START,	(long)args->mm.mm_arg_start, 0);
	ret |= sys_prctl_safe(PR_SET_MM, PR_SET_MM_ARG_END,	(long)args->mm.mm_arg_end, 0);
	ret |= sys_prctl_safe(PR_SET_MM, PR_SET_MM_ENV_START,	(long)args->mm.mm_env_start, 0);
	ret |= sys_prctl_safe(PR_SET_MM, PR_SET_MM_ENV_END,	(long)args->mm.mm_env_end, 0);
	ret |= sys_prctl_safe(PR_SET_MM, PR_SET_MM_AUXV,	(long)args->mm_saved_auxv, args->mm_saved_auxv_size);
	if (ret)
		goto core_restore_end;

	/*
	 * Because of requirements applied from kernel side
	 * we need to restore /proc/pid/exe symlink late,
	 * after old existing VMAs are superseded with
	 * new ones from image file.
	 */
	ret = restore_self_exe_late(args);
	if (ret)
		goto core_restore_end;

	/*
	 * We need to prepare a valid sigframe here, so
	 * after sigreturn the kernel will pick up the
	 * registers from the frame, set them up and
	 * finally pass execution to the new IP.
	 */
	rt_sigframe = (void *)args->t->mem_zone.rt_sigframe;

	if (restore_thread_common(rt_sigframe, args->t))
		goto core_restore_end;

	/*
	 * Threads restoration. This requires some more comments. This
	 * restorer routine and thread restorer routine has the following
	 * memory map, prepared by a caller code.
	 *
	 * | <-- low addresses                                          high addresses --> |
	 * +-------------------------------------------------------+-----------------------+
	 * | this proc body | own stack | rt_sigframe space | thread restore zone   |
	 * +-------------------------------------------------------+-----------------------+
	 *
	 * where each thread restore zone is the following
	 *
	 * | <-- low addresses                                     high addresses --> |
	 * +--------------------------------------------------------------------------+
	 * | thread restore proc | thread1 stack | thread1 rt_sigframe |
	 * +--------------------------------------------------------------------------+
	 */

	if (args->nr_threads > 1) {
		struct thread_restore_args *thread_args = args->thread_args;
		long clone_flags = CLONE_VM | CLONE_FILES | CLONE_SIGHAND	|
				   CLONE_THREAD | CLONE_SYSVSEM;
		long last_pid_len;
		long parent_tid;
		/* NOTE(review): this i shadows the function-level i */
		int i, fd;

		/* Hold the last_pid file lock across the whole clone loop */
		fd = args->fd_last_pid;
		ret = sys_flock(fd, LOCK_EX);
		if (ret) {
			pr_err("Can't lock last_pid %d\n", fd);
			goto core_restore_end;
		}

		for (i = 0; i < args->nr_threads; i++) {
			char last_pid_buf[16], *s;

			/* skip self */
			if (thread_args[i].pid == args->t->pid)
				continue;

			/*
			 * Write "wanted pid - 1" to the last_pid file, from
			 * offset 0, right before cloning.
			 */
			new_sp = restorer_stack(thread_args + i);
			last_pid_len = vprint_num(last_pid_buf, sizeof(last_pid_buf), thread_args[i].pid - 1, &s);
			sys_lseek(fd, 0, SEEK_SET);
			ret = sys_write(fd, s, last_pid_len);
			if (ret < 0) {
				pr_err("Can't set last_pid %ld/%s\n", ret, last_pid_buf);
				goto core_restore_end;
			}

			/*
			 * To achieve functionality like libc's clone()
			 * we need a pure assembly here, because clone()'ed
			 * thread will run with own stack and we must not
			 * have any additional instructions... oh, dear...
			 *
			 * NOTE(review): the macro receives thread_args, not
			 * thread_args + i; presumably it picks up the local
			 * i on its own -- confirm before renaming locals.
			 */

			RUN_CLONE_RESTORE_FN(ret, clone_flags, new_sp, parent_tid, thread_args, args->clone_restore_fn);
		}

		ret = sys_flock(fd, LOCK_UN);
		if (ret) {
			pr_err("Can't unlock last_pid %ld\n", ret);
			goto core_restore_end;
		}

	}

	sys_close(args->fd_last_pid);

	restore_rlims(args);

	ret = create_posix_timers(args);
	if (ret < 0) {
		pr_err("Can't restore posix timers %ld\n", ret);
		goto core_restore_end;
	}

	ret = timerfd_arm(args);
	if (ret < 0) {
		pr_err("Can't restore timerfd %ld\n", ret);
		goto core_restore_end;
	}

	pr_info("%ld: Restored\n", sys_getpid());

	futex_set(&zombies_inprogress, args->nr_zombies);

	restore_finish_stage(CR_STATE_RESTORE);

	/* Wait until zombies_inprogress drops to 0 */
	futex_wait_while_gt(&zombies_inprogress, 0);

	if (wait_helpers(args) < 0)
		goto core_restore_end;

	/* Block every signal before the original SIGCHLD action goes back */
	ksigfillset(&to_block);
	ret = sys_sigprocmask(SIG_SETMASK, &to_block, NULL, sizeof(k_rtsigset_t));
	if (ret) {
		/* NOTE(review): this message has no trailing newline */
		pr_err("Unable to block signals %ld", ret);
		goto core_restore_end;
	}

	sys_sigaction(SIGCHLD, &args->sigchld_act, NULL, sizeof(k_rtsigset_t));

	ret = restore_signals(args->siginfo, args->siginfo_nr, true);
	if (ret)
		goto core_restore_end;

	ret = restore_signals(args->t->siginfo, args->t->siginfo_nr, false);
	if (ret)
		goto core_restore_end;

	restore_finish_stage(CR_STATE_RESTORE_SIGCHLD);

	rst_tcp_socks_all(args);

	/*
	 * Writing to last-pid is CAP_SYS_ADMIN protected,
	 * turning off TCP repair is CAP_NET_ADMIN protected,
	 * thus restore* creds _after_ all of the above.
	 */

	/* Each later step only runs if the earlier ones returned 0 */
	ret = restore_creds(&args->creds);
	ret = ret || restore_dumpable_flag(&args->mm);
	ret = ret || restore_pdeath_sig(args->t);

	futex_set_and_wake(&thread_inprogress, args->nr_threads);

	restore_finish_stage(CR_STATE_RESTORE_CREDS);

	/* The stage is finished first; a creds failure is fatal after it */
	if (ret)
		BUG();

	/* Wait until children stop to use args->task_entries */
	futex_wait_while_gt(&thread_inprogress, 1);

	log_set_fd(-1);

	/*
	 * The code that prepared the itimers makes sure the
	 * code below doesn't fail due to bad timing values.
	 */

#define itimer_armed(args, i)				\
		(args->itimers[i].it_interval.tv_sec ||	\
		 args->itimers[i].it_interval.tv_usec)

	if (itimer_armed(args, 0))
		sys_setitimer(ITIMER_REAL, &args->itimers[0], NULL);
	if (itimer_armed(args, 1))
		sys_setitimer(ITIMER_VIRTUAL, &args->itimers[1], NULL);
	if (itimer_armed(args, 2))
		sys_setitimer(ITIMER_PROF, &args->itimers[2], NULL);

	restore_posix_timers(args);

	sys_munmap(args->rst_mem, args->rst_mem_size);

	/*
	 * Sigframe stack.
	 */
	new_sp = (long)rt_sigframe + SIGFRAME_OFFSET;

	/*
	 * Prepare the stack and call for sigreturn,
	 * pure assembly since we don't need any additional
	 * code insns from gcc.
	 */
	rst_sigreturn(new_sp);

	/* Common failure exit: abort the shared futex, log, kill the group */
core_restore_end:
	futex_abort_and_wake(&task_entries->nr_in_progress);
	pr_err("Restorer fail %ld\n", sys_getpid());
	sys_exit_group(1);
	return -1;
}
Esempio n. 9
0
/*
 * The main routine to restore task via sigreturn.
 * This one is very special, we never return there
 * but use sigreturn facility to restore core registers
 * and jump execution to some predefined ip read from
 * core file.
 *
 * Sequence, as implemented below:
 *   1. install a temporary SIGCHLD handler and set up logging;
 *   2. unmap the parts of the current (self) VMAs lying outside the
 *      premapped area, then unmap the self_vmas array itself;
 *   3. move private VMAs to their final addresses (two passes),
 *      then map the remaining regular VMAs with restore_mapping();
 *   4. drop PROT_WRITE where the image did not have it, replay
 *      madvise() bits and unmap the tgt_vmas and shmems areas;
 *   5. restore prctl() mm fields, the exe link and the per-thread
 *      state of the leader, then clone the other threads;
 *   6. restore rlimits, signals, TCP sockets and creds;
 *   7. arm itimers, unmap task_entries and sigreturn.
 *
 * Two failure exits exist: core_restore_end (abort the shared futex,
 * log, sys_exit_group(1)) and core_restore_failed, which runs
 * ARCH_FAIL_CORE_RESTORE and is used by the paths that jump to it
 * directly.
 */
long __export_restore_task(struct task_restore_core_args *args)
{
	long ret = -1;
	VmaEntry *vma_entry;
	u64 va;
	unsigned long premmapped_end = args->premmapped_addr + args->premmapped_len;

	struct rt_sigframe *rt_sigframe;
	unsigned long new_sp;
	pid_t my_pid = sys_getpid();
	rt_sigaction_t act;

	task_entries = args->task_entries;

	/* Temporary SIGCHLD handler; args->sigchld_act is installed later */
	ksigfillset(&act.rt_sa_mask);
	act.rt_sa_handler = sigchld_handler;
	act.rt_sa_flags = SA_SIGINFO | SA_RESTORER | SA_RESTART;
	act.rt_sa_restorer = cr_restore_rt;
	sys_sigaction(SIGCHLD, &act, NULL, sizeof(k_rtsigset_t));

	log_set_fd(args->logfd);
	log_set_loglevel(args->loglevel);

	cap_last_cap = args->cap_last_cap;

	pr_info("Switched to the restorer %d\n", my_pid);

	/*
	 * Drop the current mappings around the premapped area
	 * [premmapped_addr, premmapped_end). The array is terminated
	 * by an entry whose start is 0.
	 */
	for (vma_entry = args->self_vmas; vma_entry->start != 0; vma_entry++) {
		unsigned long addr = vma_entry->start;
		unsigned long len;

		if (!vma_entry_is(vma_entry, VMA_AREA_REGULAR))
			continue;

		pr_debug("Examine %"PRIx64"-%"PRIx64"\n", vma_entry->start, vma_entry->end);

		/* Part of the VMA that lies below the premapped area */
		if (addr < args->premmapped_addr) {
			if (vma_entry->end >= args->premmapped_addr)
				len = args->premmapped_addr - addr;
			else
				len = vma_entry->end - vma_entry->start;
			if (sys_munmap((void *) addr, len)) {
				pr_err("munmap fail for %lx - %lx\n", addr, addr + len);
				goto core_restore_end;
			}
		}

		if (vma_entry->end >= TASK_SIZE)
			continue;

		/* Part of the VMA that lies above the premapped area */
		if (vma_entry->end > premmapped_end) {
			if (vma_entry->start < premmapped_end)
				addr = premmapped_end;
			len = vma_entry->end - addr;
			if (sys_munmap((void *) addr, len)) {
				pr_err("munmap fail for %lx - %lx\n", addr, addr + len);
				goto core_restore_end;
			}
		}
	}

	/* The list itself (terminator included) is no longer needed */
	sys_munmap(args->self_vmas,
			((void *)(vma_entry + 1) - ((void *)args->self_vmas)));

	/*
	 * Shift private vma-s to the left.
	 * NOTE(review): ->shmid is compared with ->start here; presumably
	 * it carries the premapped address for private VMAs -- confirm
	 * against vma_premmaped_start().
	 */
	for (vma_entry = args->tgt_vmas; vma_entry->start != 0; vma_entry++) {
		if (!vma_entry_is(vma_entry, VMA_AREA_REGULAR))
			continue;

		if (!vma_priv(vma_entry))
			continue;

		if (vma_entry->end >= TASK_SIZE)
			continue;

		if (vma_entry->start > vma_entry->shmid)
			break;

		if (vma_remap(vma_premmaped_start(vma_entry),
				vma_entry->start, vma_entry_len(vma_entry)))
			goto core_restore_end;
	}

	/* Shift private vma-s to the right (walks the array backwards) */
	for (vma_entry = args->tgt_vmas + args->nr_vmas -1;
				vma_entry >= args->tgt_vmas; vma_entry--) {
		if (!vma_entry_is(vma_entry, VMA_AREA_REGULAR))
			continue;

		if (!vma_priv(vma_entry))
			continue;

		if (vma_entry->start > TASK_SIZE)
			continue;

		if (vma_entry->start < vma_entry->shmid)
			break;

		if (vma_remap(vma_premmaped_start(vma_entry),
				vma_entry->start, vma_entry_len(vma_entry)))
			goto core_restore_end;
	}

	/*
	 * OK, lets try to map new one.
	 * Only regular, non-private VMAs are mapped here.
	 */
	for (vma_entry = args->tgt_vmas; vma_entry->start != 0; vma_entry++) {
		if (!vma_entry_is(vma_entry, VMA_AREA_REGULAR))
			continue;

		if (vma_priv(vma_entry))
			continue;

		va = restore_mapping(vma_entry);

		if (va != vma_entry->start) {
			pr_err("Can't restore %"PRIx64" mapping with %"PRIx64"\n", vma_entry->start, va);
			goto core_restore_end;
		}
	}

	/*
	 * Walk through all VMAs again to drop PROT_WRITE
	 * if it was not there. For anonymous shared areas
	 * owned by this task the shmem entry lock is released
	 * on the way. The sys_mprotect() result is not checked.
	 */
	for (vma_entry = args->tgt_vmas; vma_entry->start != 0; vma_entry++) {
		if (!(vma_entry_is(vma_entry, VMA_AREA_REGULAR)))
			continue;

		if (vma_entry_is(vma_entry, VMA_ANON_SHARED)) {
			struct shmem_info *entry;

			entry = find_shmem(args->shmems,
						  vma_entry->shmid);
			if (entry && entry->pid == my_pid &&
			    entry->start == vma_entry->start)
				futex_set_and_wake(&entry->lock, 1);
		}

		if (vma_entry->prot & PROT_WRITE)
			continue;

		sys_mprotect(decode_pointer(vma_entry->start),
			     vma_entry_len(vma_entry),
			     vma_entry->prot);
	}

	/*
	 * Finally restore madvise() bits: bit number i set in
	 * ->madv means advice value i is applied to the area.
	 */
	for (vma_entry = args->tgt_vmas; vma_entry->start != 0; vma_entry++) {
		unsigned long i;

		if (!vma_entry->has_madv || !vma_entry->madv)
			continue;
		for (i = 0; i < sizeof(vma_entry->madv) * 8; i++) {
			if (vma_entry->madv & (1ul << i)) {
				ret = sys_madvise(vma_entry->start,
						  vma_entry_len(vma_entry),
						  i);
				if (ret) {
					pr_err("madvise(%"PRIx64", %"PRIu64", %ld) "
					       "failed with %ld\n",
						vma_entry->start,
						vma_entry_len(vma_entry),
						i, ret);
					goto core_restore_end;
				}
			}
		}
	}

	/* vma_entry points at the terminator here; unmap the whole list */
	sys_munmap(args->tgt_vmas,
			((void *)(vma_entry + 1) - ((void *)args->tgt_vmas)));

	ret = sys_munmap(args->shmems, SHMEMS_SIZE);
	if (ret < 0) {
		pr_err("Can't unmap shmem %ld\n", ret);
		goto core_restore_end;
	}

	/*
	 * Tune up the task fields. Results are OR-ed into ret
	 * (left over from the sys_munmap() above), so any
	 * non-zero return aborts the restore below.
	 */
	ret |= sys_prctl_safe(PR_SET_NAME, (long)args->comm, 0, 0);

	ret |= sys_prctl_safe(PR_SET_MM, PR_SET_MM_START_CODE,	(long)args->mm.mm_start_code, 0);
	ret |= sys_prctl_safe(PR_SET_MM, PR_SET_MM_END_CODE,	(long)args->mm.mm_end_code, 0);
	ret |= sys_prctl_safe(PR_SET_MM, PR_SET_MM_START_DATA,	(long)args->mm.mm_start_data, 0);
	ret |= sys_prctl_safe(PR_SET_MM, PR_SET_MM_END_DATA,	(long)args->mm.mm_end_data, 0);
	ret |= sys_prctl_safe(PR_SET_MM, PR_SET_MM_START_STACK,	(long)args->mm.mm_start_stack, 0);
	ret |= sys_prctl_safe(PR_SET_MM, PR_SET_MM_START_BRK,	(long)args->mm.mm_start_brk, 0);
	ret |= sys_prctl_safe(PR_SET_MM, PR_SET_MM_BRK,		(long)args->mm.mm_brk, 0);
	ret |= sys_prctl_safe(PR_SET_MM, PR_SET_MM_ARG_START,	(long)args->mm.mm_arg_start, 0);
	ret |= sys_prctl_safe(PR_SET_MM, PR_SET_MM_ARG_END,	(long)args->mm.mm_arg_end, 0);
	ret |= sys_prctl_safe(PR_SET_MM, PR_SET_MM_ENV_START,	(long)args->mm.mm_env_start, 0);
	ret |= sys_prctl_safe(PR_SET_MM, PR_SET_MM_ENV_END,	(long)args->mm.mm_env_end, 0);
	ret |= sys_prctl_safe(PR_SET_MM, PR_SET_MM_AUXV,	(long)args->mm_saved_auxv, args->mm_saved_auxv_size);
	if (ret)
		goto core_restore_end;

	/*
	 * Because of requirements applied from kernel side
	 * we need to restore /proc/pid/exe symlink late,
	 * after old existing VMAs are superseded with
	 * new ones from image file.
	 */
	ret = restore_self_exe_late(args);
	if (ret)
		goto core_restore_end;

	/*
	 * We need to prepare a valid sigframe here, so
	 * after sigreturn the kernel will pick up the
	 * registers from the frame, set them up and
	 * finally pass execution to the new IP.
	 *
	 * NOTE(review): the frame is taken 8 bytes into the
	 * mem_zone.rt_sigframe area; the reason is not visible
	 * here -- presumably an alignment requirement, confirm.
	 */
	rt_sigframe = (void *)args->t->mem_zone.rt_sigframe + 8;

	if (restore_thread_common(rt_sigframe, args->t))
		goto core_restore_end;

	/*
	 * Threads restoration. This requires some more comments. This
	 * restorer routine and thread restorer routine has the following
	 * memory map, prepared by a caller code.
	 *
	 * | <-- low addresses                                          high addresses --> |
	 * +-------------------------------------------------------+-----------------------+
	 * | this proc body | own stack | heap | rt_sigframe space | thread restore zone   |
	 * +-------------------------------------------------------+-----------------------+
	 *
	 * where each thread restore zone is the following
	 *
	 * | <-- low addresses                                     high addresses --> |
	 * +--------------------------------------------------------------------------+
	 * | thread restore proc | thread1 stack | thread1 heap | thread1 rt_sigframe |
	 * +--------------------------------------------------------------------------+
	 */

	if (args->nr_threads > 1) {
		struct thread_restore_args *thread_args = args->thread_args;
		long clone_flags = CLONE_VM | CLONE_FILES | CLONE_SIGHAND	|
				   CLONE_THREAD | CLONE_SYSVSEM;
		long last_pid_len;
		long parent_tid;
		int i, fd;

		/* Hold the last_pid file lock across the whole clone loop */
		fd = sys_open(LAST_PID_PATH, O_RDWR, LAST_PID_PERM);
		if (fd < 0) {
			pr_err("Can't open last_pid %d\n", fd);
			goto core_restore_end;
		}

		ret = sys_flock(fd, LOCK_EX);
		if (ret) {
			pr_err("Can't lock last_pid %d\n", fd);
			goto core_restore_end;
		}

		for (i = 0; i < args->nr_threads; i++) {
			char last_pid_buf[16], *s;

			/* skip self */
			if (thread_args[i].pid == args->t->pid)
				continue;

			/*
			 * NOTE(review): rst_lock is taken once per cloned
			 * thread and never released in this function;
			 * presumably the new thread unlocks it -- confirm.
			 */
			mutex_lock(&args->rst_lock);

			new_sp =
				RESTORE_ALIGN_STACK((long)thread_args[i].mem_zone.stack,
						    sizeof(thread_args[i].mem_zone.stack));

			/*
			 * Write "wanted pid - 1" to the last_pid file.
			 * NOTE(review): the file offset is not rewound
			 * between iterations -- confirm that repeated
			 * writes behave as intended for this file.
			 */
			last_pid_len = vprint_num(last_pid_buf, sizeof(last_pid_buf), thread_args[i].pid - 1, &s);
			ret = sys_write(fd, s, last_pid_len);
			if (ret < 0) {
				pr_err("Can't set last_pid %ld/%s\n", ret, last_pid_buf);
				goto core_restore_end;
			}

			/*
			 * To achieve functionality like libc's clone()
			 * we need a pure assembly here, because clone()'ed
			 * thread will run with own stack and we must not
			 * have any additional instructions... oh, dear...
			 *
			 * NOTE(review): the macro receives thread_args, not
			 * thread_args + i; presumably it picks up the local
			 * i on its own -- confirm before renaming locals.
			 */

			RUN_CLONE_RESTORE_FN(ret, clone_flags, new_sp, parent_tid, thread_args, args->clone_restore_fn);
		}

		ret = sys_flock(fd, LOCK_UN);
		if (ret) {
			pr_err("Can't unlock last_pid %ld\n", ret);
			goto core_restore_end;
		}

		sys_close(fd);
	}

	restore_rlims(args);

	pr_info("%ld: Restored\n", sys_getpid());

	futex_set(&zombies_inprogress, args->nr_zombies);

	restore_finish_stage(CR_STATE_RESTORE);

	/* Wait until zombies_inprogress drops to 0 */
	futex_wait_while_gt(&zombies_inprogress, 0);

	sys_sigaction(SIGCHLD, &args->sigchld_act, NULL, sizeof(k_rtsigset_t));

	ret = restore_signals(args->siginfo, args->siginfo_nr, true);
	if (ret)
		goto core_restore_end;

	ret = restore_signals(args->t->siginfo, args->t->siginfo_nr, false);
	if (ret)
		goto core_restore_end;

	restore_finish_stage(CR_STATE_RESTORE_SIGCHLD);

	/* The siginfo area has been consumed by restore_signals() above */
	if (args->siginfo_size) {
		ret = sys_munmap(args->siginfo, args->siginfo_size);
		if (ret < 0) {
			pr_err("Can't unmap signals %ld\n", ret);
			goto core_restore_failed;
		}
	}

	rst_tcp_socks_all(args->rst_tcp_socks, args->rst_tcp_socks_size);

	/*
	 * Writing to last-pid is CAP_SYS_ADMIN protected,
	 * turning off TCP repair is CAP_NET_ADMIN protected,
	 * thus restore* creds _after_ all of the above.
	 */

	ret = restore_creds(&args->creds);

	futex_set_and_wake(&thread_inprogress, args->nr_threads);

	restore_finish_stage(CR_STATE_RESTORE_CREDS);

	/* The stage is finished first; a creds failure is fatal after it */
	if (ret)
		BUG();

	/* Wait until children stop to use args->task_entries */
	futex_wait_while_gt(&thread_inprogress, 1);

	log_set_fd(-1);

	/*
	 * The code that prepared the itimers makes sure the
	 * code below doesn't fail due to bad timing values.
	 */

#define itimer_armed(args, i)				\
		(args->itimers[i].it_interval.tv_sec ||	\
		 args->itimers[i].it_interval.tv_usec)

	if (itimer_armed(args, 0))
		sys_setitimer(ITIMER_REAL, &args->itimers[0], NULL);
	if (itimer_armed(args, 1))
		sys_setitimer(ITIMER_VIRTUAL, &args->itimers[1], NULL);
	if (itimer_armed(args, 2))
		sys_setitimer(ITIMER_PROF, &args->itimers[2], NULL);

	/*
	 * task_entries goes away here, so failures from now on use
	 * core_restore_failed. With logging already switched off, the
	 * error is packed as (source line << 16) | errno into ret.
	 */
	ret = sys_munmap(args->task_entries, TASK_ENTRIES_SIZE);
	if (ret < 0) {
		ret = ((long)__LINE__ << 16) | ((-ret) & 0xffff);
		goto core_restore_failed;
	}

	/*
	 * Sigframe stack.
	 */
	new_sp = (long)rt_sigframe + SIGFRAME_OFFSET;

	/*
	 * Prepare the stack and call for sigreturn,
	 * pure assembly since we don't need any additional
	 * code insns from gcc.
	 */
	ARCH_RT_SIGRETURN(new_sp);

	/* Common failure exit: abort the shared futex, log, kill the group */
core_restore_end:
	futex_abort_and_wake(&task_entries->nr_in_progress);
	pr_err("Restorer fail %ld\n", sys_getpid());
	sys_exit_group(1);
	return -1;

	/* Failure exit for the paths that jump here directly */
core_restore_failed:
	ARCH_FAIL_CORE_RESTORE;

	return ret;
}