Exemple #1
0
static void event_set_signal (unsigned long long cts, unsigned long long nts)
{
#if defined(have_sys_alarm)
    sys_alarm (nts - cts);
#elif defined(have_sys_setitimer)
    struct sc_val { long sec; long usec; } tv = { nts - cts, 0 };
    struct ti_val { struct sc_val i; struct sc_val c; } ti = { tv, tv };

    sys_setitimer (0 /* ITIMER_REAL */, &ti, 0);
#endif
}
Exemple #2
0
/* Note:
   for SCO OpenServer 5.0.6 the original nap was fixed so that it
   no longer waits a minimum of 2 tick (20ms)
   but fewer time with a 10 ms granularity */
long
xnx_nap(long period)
{
#ifdef USE_NEW_NAP_CODE
	// Hz means the number of jiffies per second.
	// the below code needs HZ to be 1 <= HZ <= 1000
	// in order to work correctly in any case.
#if HZ > 1000
#error this code only works with HZ <= 1000
#endif
	struct timeval tv1, tv2;
	struct timezone tz;
	mm_segment_t oldfs;
	long period_s; // seconds part
	long period_ms_hz; // milli seconds part, scaled to a base of HZ
	long period_j; // jiffies

	if (!period)
		return 0; // zereo request, zero reply

	oldfs = get_fs();
	set_fs(get_ds());
	sys_gettimeofday(&tv1, &tz);

	period_s = period / 1000;
	period_ms_hz = (period - period_s * 1000) * HZ;
	period_j = period_s * HZ + period_ms_hz / 1000;
	// take care of rounding errors, round up
	if (period > period_j * (1000 / HZ)) period_j++;

	set_current_state(TASK_INTERRUPTIBLE);
	schedule_timeout (period_j);

	sys_gettimeofday(&tv2, &tz);
	set_fs(oldfs);

	if (signal_pending(current))
		return -EINTR; // interrupted
	return (tv2.tv_sec - tv1.tv_sec) * 1000
		+ (tv2.tv_usec - tv1.tv_usec + 500) / 1000;
#else
	__sighandler_t old_handler;
	struct itimerval it;
	struct timeval tv1, tv2;
	struct timezone tz;
	mm_segment_t oldfs;

	if (!period)
		return 0;

	it.it_interval.tv_sec = 0;
	it.it_interval.tv_usec = 0;
	it.it_value.tv_sec = 0;
	it.it_value.tv_usec = period * 1000;

	oldfs = get_fs();
	set_fs(get_ds());

	sys_gettimeofday(&tv1, &tz);
	old_handler = sigaction(SIGALRM, SIG_DFL); // SIG_DFL -> terminate
	sys_setitimer(ITIMER_REAL, &it, NULL);
	sys_pause();
	sigaction(SIGALRM, old_handler);
	sys_gettimeofday(&tv2, &tz);
	set_fs(oldfs);

	deactivate_signal(current, SIGALRM);

	if (signal_pending(current))
		return -EINTR;
	return ((tv2.tv_sec - tv1.tv_sec) * 1000000
			+ (tv2.tv_usec - tv1.tv_usec)) / 1000;
#endif
}
Exemple #3
0
/*
 * The main routine to restore task via sigreturn.
 * This one is very special, we never return there
 * but use sigreturn facility to restore core registers
 * and jump execution to some predefined ip read from
 * core file.
 */
long __export_restore_task(struct task_restore_args *args)
{
	long ret = -1;
	int i;
	VmaEntry *vma_entry;
	unsigned long va;

	struct rt_sigframe *rt_sigframe;
	unsigned long new_sp;
	k_rtsigset_t to_block;
	pid_t my_pid = sys_getpid();
	rt_sigaction_t act;

	bootstrap_start = args->bootstrap_start;
	bootstrap_len	= args->bootstrap_len;

#ifdef CONFIG_VDSO
	vdso_rt_size	= args->vdso_rt_size;
#endif

	task_entries = args->task_entries;
	helpers = args->helpers;
	n_helpers = args->n_helpers;
	*args->breakpoint = rst_sigreturn;

	ksigfillset(&act.rt_sa_mask);
	act.rt_sa_handler = sigchld_handler;
	act.rt_sa_flags = SA_SIGINFO | SA_RESTORER | SA_RESTART;
	act.rt_sa_restorer = cr_restore_rt;
	sys_sigaction(SIGCHLD, &act, NULL, sizeof(k_rtsigset_t));

	log_set_fd(args->logfd);
	log_set_loglevel(args->loglevel);

	cap_last_cap = args->cap_last_cap;

	pr_info("Switched to the restorer %d\n", my_pid);

#ifdef CONFIG_VDSO
	if (vdso_do_park(&args->vdso_sym_rt, args->vdso_rt_parked_at, vdso_rt_size))
		goto core_restore_end;
#endif

	if (unmap_old_vmas((void *)args->premmapped_addr, args->premmapped_len,
				bootstrap_start, bootstrap_len))
		goto core_restore_end;

	/* Shift private vma-s to the left */
	for (i = 0; i < args->nr_vmas; i++) {
		vma_entry = args->tgt_vmas + i;

		if (!vma_entry_is(vma_entry, VMA_AREA_REGULAR))
			continue;

		if (!vma_priv(vma_entry))
			continue;

		if (vma_entry->end >= TASK_SIZE)
			continue;

		if (vma_entry->start > vma_entry->shmid)
			break;

		if (vma_remap(vma_premmaped_start(vma_entry),
				vma_entry->start, vma_entry_len(vma_entry)))
			goto core_restore_end;
	}

	/* Shift private vma-s to the right */
	for (i = args->nr_vmas - 1; i >= 0; i--) {
		vma_entry = args->tgt_vmas + i;

		if (!vma_entry_is(vma_entry, VMA_AREA_REGULAR))
			continue;

		if (!vma_priv(vma_entry))
			continue;

		if (vma_entry->start > TASK_SIZE)
			continue;

		if (vma_entry->start < vma_entry->shmid)
			break;

		if (vma_remap(vma_premmaped_start(vma_entry),
				vma_entry->start, vma_entry_len(vma_entry)))
			goto core_restore_end;
	}

	/*
	 * OK, lets try to map new one.
	 */
	for (i = 0; i < args->nr_vmas; i++) {
		vma_entry = args->tgt_vmas + i;

		if (!vma_entry_is(vma_entry, VMA_AREA_REGULAR))
			continue;

		if (vma_priv(vma_entry))
			continue;

		va = restore_mapping(vma_entry);

		if (va != vma_entry->start) {
			pr_err("Can't restore %"PRIx64" mapping with %lx\n", vma_entry->start, va);
			goto core_restore_end;
		}
	}

#ifdef CONFIG_VDSO
	/*
	 * Proxify vDSO.
	 */
	for (i = 0; i < args->nr_vmas; i++) {
		if (vma_entry_is(&args->tgt_vmas[i], VMA_AREA_VDSO) ||
		    vma_entry_is(&args->tgt_vmas[i], VMA_AREA_VVAR)) {
			if (vdso_proxify("dumpee", &args->vdso_sym_rt,
					 args->vdso_rt_parked_at,
					 i, args->tgt_vmas, args->nr_vmas))
				goto core_restore_end;
			break;
		}
	}
#endif

	/*
	 * Walk though all VMAs again to drop PROT_WRITE
	 * if it was not there.
	 */
	for (i = 0; i < args->nr_vmas; i++) {
		vma_entry = args->tgt_vmas + i;

		if (!(vma_entry_is(vma_entry, VMA_AREA_REGULAR)))
			continue;

		if (vma_entry_is(vma_entry, VMA_ANON_SHARED)) {
			struct shmem_info *entry;

			entry = find_shmem(args->shmems, args->nr_shmems,
						  vma_entry->shmid);
			if (entry && entry->pid == my_pid &&
			    entry->start == vma_entry->start)
				futex_set_and_wake(&entry->lock, 1);
		}

		if (vma_entry->prot & PROT_WRITE)
			continue;

		sys_mprotect(decode_pointer(vma_entry->start),
			     vma_entry_len(vma_entry),
			     vma_entry->prot);
	}

	/*
	 * Finally restore madivse() bits
	 */
	for (i = 0; i < args->nr_vmas; i++) {
		unsigned long m;

		vma_entry = args->tgt_vmas + i;
		if (!vma_entry->has_madv || !vma_entry->madv)
			continue;

		for (m = 0; m < sizeof(vma_entry->madv) * 8; m++) {
			if (vma_entry->madv & (1ul << m)) {
				ret = sys_madvise(vma_entry->start,
						  vma_entry_len(vma_entry),
						  m);
				if (ret) {
					pr_err("madvise(%"PRIx64", %"PRIu64", %ld) "
					       "failed with %ld\n",
						vma_entry->start,
						vma_entry_len(vma_entry),
						m, ret);
					goto core_restore_end;
				}
			}
		}
	}

	ret = 0;

	/*
	 * Tune up the task fields.
	 */
	ret |= sys_prctl_safe(PR_SET_NAME, (long)args->comm, 0, 0);

	ret |= sys_prctl_safe(PR_SET_MM, PR_SET_MM_START_CODE,	(long)args->mm.mm_start_code, 0);
	ret |= sys_prctl_safe(PR_SET_MM, PR_SET_MM_END_CODE,	(long)args->mm.mm_end_code, 0);
	ret |= sys_prctl_safe(PR_SET_MM, PR_SET_MM_START_DATA,	(long)args->mm.mm_start_data, 0);
	ret |= sys_prctl_safe(PR_SET_MM, PR_SET_MM_END_DATA,	(long)args->mm.mm_end_data, 0);
	ret |= sys_prctl_safe(PR_SET_MM, PR_SET_MM_START_STACK,	(long)args->mm.mm_start_stack, 0);
	ret |= sys_prctl_safe(PR_SET_MM, PR_SET_MM_START_BRK,	(long)args->mm.mm_start_brk, 0);
	ret |= sys_prctl_safe(PR_SET_MM, PR_SET_MM_BRK,		(long)args->mm.mm_brk, 0);
	ret |= sys_prctl_safe(PR_SET_MM, PR_SET_MM_ARG_START,	(long)args->mm.mm_arg_start, 0);
	ret |= sys_prctl_safe(PR_SET_MM, PR_SET_MM_ARG_END,	(long)args->mm.mm_arg_end, 0);
	ret |= sys_prctl_safe(PR_SET_MM, PR_SET_MM_ENV_START,	(long)args->mm.mm_env_start, 0);
	ret |= sys_prctl_safe(PR_SET_MM, PR_SET_MM_ENV_END,	(long)args->mm.mm_env_end, 0);
	ret |= sys_prctl_safe(PR_SET_MM, PR_SET_MM_AUXV,	(long)args->mm_saved_auxv, args->mm_saved_auxv_size);
	if (ret)
		goto core_restore_end;

	/*
	 * Because of requirements applied from kernel side
	 * we need to restore /proc/pid/exe symlink late,
	 * after old existing VMAs are superseded with
	 * new ones from image file.
	 */
	ret = restore_self_exe_late(args);
	if (ret)
		goto core_restore_end;

	/*
	 * We need to prepare a valid sigframe here, so
	 * after sigreturn the kernel will pick up the
	 * registers from the frame, set them up and
	 * finally pass execution to the new IP.
	 */
	rt_sigframe = (void *)args->t->mem_zone.rt_sigframe;

	if (restore_thread_common(rt_sigframe, args->t))
		goto core_restore_end;

	/*
	 * Threads restoration. This requires some more comments. This
	 * restorer routine and thread restorer routine has the following
	 * memory map, prepared by a caller code.
	 *
	 * | <-- low addresses                                          high addresses --> |
	 * +-------------------------------------------------------+-----------------------+
	 * | this proc body | own stack | rt_sigframe space | thread restore zone   |
	 * +-------------------------------------------------------+-----------------------+
	 *
	 * where each thread restore zone is the following
	 *
	 * | <-- low addresses                                     high addresses --> |
	 * +--------------------------------------------------------------------------+
	 * | thread restore proc | thread1 stack | thread1 rt_sigframe |
	 * +--------------------------------------------------------------------------+
	 */

	if (args->nr_threads > 1) {
		struct thread_restore_args *thread_args = args->thread_args;
		long clone_flags = CLONE_VM | CLONE_FILES | CLONE_SIGHAND	|
				   CLONE_THREAD | CLONE_SYSVSEM;
		long last_pid_len;
		long parent_tid;
		int i, fd;

		fd = args->fd_last_pid;
		ret = sys_flock(fd, LOCK_EX);
		if (ret) {
			pr_err("Can't lock last_pid %d\n", fd);
			goto core_restore_end;
		}

		for (i = 0; i < args->nr_threads; i++) {
			char last_pid_buf[16], *s;

			/* skip self */
			if (thread_args[i].pid == args->t->pid)
				continue;

			new_sp = restorer_stack(thread_args + i);
			last_pid_len = vprint_num(last_pid_buf, sizeof(last_pid_buf), thread_args[i].pid - 1, &s);
			sys_lseek(fd, 0, SEEK_SET);
			ret = sys_write(fd, s, last_pid_len);
			if (ret < 0) {
				pr_err("Can't set last_pid %ld/%s\n", ret, last_pid_buf);
				goto core_restore_end;
			}

			/*
			 * To achieve functionality like libc's clone()
			 * we need a pure assembly here, because clone()'ed
			 * thread will run with own stack and we must not
			 * have any additional instructions... oh, dear...
			 */

			RUN_CLONE_RESTORE_FN(ret, clone_flags, new_sp, parent_tid, thread_args, args->clone_restore_fn);
		}

		ret = sys_flock(fd, LOCK_UN);
		if (ret) {
			pr_err("Can't unlock last_pid %ld\n", ret);
			goto core_restore_end;
		}

	}

	sys_close(args->fd_last_pid);

	restore_rlims(args);

	ret = create_posix_timers(args);
	if (ret < 0) {
		pr_err("Can't restore posix timers %ld\n", ret);
		goto core_restore_end;
	}

	ret = timerfd_arm(args);
	if (ret < 0) {
		pr_err("Can't restore timerfd %ld\n", ret);
		goto core_restore_end;
	}

	pr_info("%ld: Restored\n", sys_getpid());

	futex_set(&zombies_inprogress, args->nr_zombies);

	restore_finish_stage(CR_STATE_RESTORE);

	futex_wait_while_gt(&zombies_inprogress, 0);

	if (wait_helpers(args) < 0)
		goto core_restore_end;

	ksigfillset(&to_block);
	ret = sys_sigprocmask(SIG_SETMASK, &to_block, NULL, sizeof(k_rtsigset_t));
	if (ret) {
		pr_err("Unable to block signals %ld", ret);
		goto core_restore_end;
	}

	sys_sigaction(SIGCHLD, &args->sigchld_act, NULL, sizeof(k_rtsigset_t));

	ret = restore_signals(args->siginfo, args->siginfo_nr, true);
	if (ret)
		goto core_restore_end;

	ret = restore_signals(args->t->siginfo, args->t->siginfo_nr, false);
	if (ret)
		goto core_restore_end;

	restore_finish_stage(CR_STATE_RESTORE_SIGCHLD);

	rst_tcp_socks_all(args);

	/*
	 * Writing to last-pid is CAP_SYS_ADMIN protected,
	 * turning off TCP repair is CAP_SYS_NED_ADMIN protected,
	 * thus restore* creds _after_ all of the above.
	 */

	ret = restore_creds(&args->creds);
	ret = ret || restore_dumpable_flag(&args->mm);
	ret = ret || restore_pdeath_sig(args->t);

	futex_set_and_wake(&thread_inprogress, args->nr_threads);

	restore_finish_stage(CR_STATE_RESTORE_CREDS);

	if (ret)
		BUG();

	/* Wait until children stop to use args->task_entries */
	futex_wait_while_gt(&thread_inprogress, 1);

	log_set_fd(-1);

	/*
	 * The code that prepared the itimers makes shure the
	 * code below doesn't fail due to bad timing values.
	 */

#define itimer_armed(args, i)				\
		(args->itimers[i].it_interval.tv_sec ||	\
		 args->itimers[i].it_interval.tv_usec)

	if (itimer_armed(args, 0))
		sys_setitimer(ITIMER_REAL, &args->itimers[0], NULL);
	if (itimer_armed(args, 1))
		sys_setitimer(ITIMER_VIRTUAL, &args->itimers[1], NULL);
	if (itimer_armed(args, 2))
		sys_setitimer(ITIMER_PROF, &args->itimers[2], NULL);

	restore_posix_timers(args);

	sys_munmap(args->rst_mem, args->rst_mem_size);

	/*
	 * Sigframe stack.
	 */
	new_sp = (long)rt_sigframe + SIGFRAME_OFFSET;

	/*
	 * Prepare the stack and call for sigreturn,
	 * pure assembly since we don't need any additional
	 * code insns from gcc.
	 */
	rst_sigreturn(new_sp);

core_restore_end:
	futex_abort_and_wake(&task_entries->nr_in_progress);
	pr_err("Restorer fail %ld\n", sys_getpid());
	sys_exit_group(1);
	return -1;
}
Exemple #4
0
/*
 * The main routine to restore task via sigreturn.
 * This one is very special, we never return there
 * but use sigreturn facility to restore core registers
 * and jump execution to some predefined ip read from
 * core file.
 */
long __export_restore_task(struct task_restore_core_args *args)
{
	long ret = -1;
	VmaEntry *vma_entry;
	u64 va;
	unsigned long premmapped_end = args->premmapped_addr + args->premmapped_len;

	struct rt_sigframe *rt_sigframe;
	unsigned long new_sp;
	pid_t my_pid = sys_getpid();
	rt_sigaction_t act;

	task_entries = args->task_entries;

	ksigfillset(&act.rt_sa_mask);
	act.rt_sa_handler = sigchld_handler;
	act.rt_sa_flags = SA_SIGINFO | SA_RESTORER | SA_RESTART;
	act.rt_sa_restorer = cr_restore_rt;
	sys_sigaction(SIGCHLD, &act, NULL, sizeof(k_rtsigset_t));

	log_set_fd(args->logfd);
	log_set_loglevel(args->loglevel);

	cap_last_cap = args->cap_last_cap;

	pr_info("Switched to the restorer %d\n", my_pid);

	for (vma_entry = args->self_vmas; vma_entry->start != 0; vma_entry++) {
		unsigned long addr = vma_entry->start;
		unsigned long len;

		if (!vma_entry_is(vma_entry, VMA_AREA_REGULAR))
			continue;

		pr_debug("Examine %"PRIx64"-%"PRIx64"\n", vma_entry->start, vma_entry->end);

		if (addr < args->premmapped_addr) {
			if (vma_entry->end >= args->premmapped_addr)
				len = args->premmapped_addr - addr;
			else
				len = vma_entry->end - vma_entry->start;
			if (sys_munmap((void *) addr, len)) {
				pr_err("munmap fail for %lx - %lx\n", addr, addr + len);
				goto core_restore_end;
			}
		}

		if (vma_entry->end >= TASK_SIZE)
			continue;

		if (vma_entry->end > premmapped_end) {
			if (vma_entry->start < premmapped_end)
				addr = premmapped_end;
			len = vma_entry->end - addr;
			if (sys_munmap((void *) addr, len)) {
				pr_err("munmap fail for %lx - %lx\n", addr, addr + len);
				goto core_restore_end;
			}
		}
	}

	sys_munmap(args->self_vmas,
			((void *)(vma_entry + 1) - ((void *)args->self_vmas)));

	/* Shift private vma-s to the left */
	for (vma_entry = args->tgt_vmas; vma_entry->start != 0; vma_entry++) {
		if (!vma_entry_is(vma_entry, VMA_AREA_REGULAR))
			continue;

		if (!vma_priv(vma_entry))
			continue;

		if (vma_entry->end >= TASK_SIZE)
			continue;

		if (vma_entry->start > vma_entry->shmid)
			break;

		if (vma_remap(vma_premmaped_start(vma_entry),
				vma_entry->start, vma_entry_len(vma_entry)))
			goto core_restore_end;
	}

	/* Shift private vma-s to the right */
	for (vma_entry = args->tgt_vmas + args->nr_vmas -1;
				vma_entry >= args->tgt_vmas; vma_entry--) {
		if (!vma_entry_is(vma_entry, VMA_AREA_REGULAR))
			continue;

		if (!vma_priv(vma_entry))
			continue;

		if (vma_entry->start > TASK_SIZE)
			continue;

		if (vma_entry->start < vma_entry->shmid)
			break;

		if (vma_remap(vma_premmaped_start(vma_entry),
				vma_entry->start, vma_entry_len(vma_entry)))
			goto core_restore_end;
	}

	/*
	 * OK, lets try to map new one.
	 */
	for (vma_entry = args->tgt_vmas; vma_entry->start != 0; vma_entry++) {
		if (!vma_entry_is(vma_entry, VMA_AREA_REGULAR))
			continue;

		if (vma_priv(vma_entry))
			continue;

		va = restore_mapping(vma_entry);

		if (va != vma_entry->start) {
			pr_err("Can't restore %"PRIx64" mapping with %"PRIx64"\n", vma_entry->start, va);
			goto core_restore_end;
		}
	}

	/*
	 * Walk though all VMAs again to drop PROT_WRITE
	 * if it was not there.
	 */
	for (vma_entry = args->tgt_vmas; vma_entry->start != 0; vma_entry++) {
		if (!(vma_entry_is(vma_entry, VMA_AREA_REGULAR)))
			continue;

		if (vma_entry_is(vma_entry, VMA_ANON_SHARED)) {
			struct shmem_info *entry;

			entry = find_shmem(args->shmems,
						  vma_entry->shmid);
			if (entry && entry->pid == my_pid &&
			    entry->start == vma_entry->start)
				futex_set_and_wake(&entry->lock, 1);
		}

		if (vma_entry->prot & PROT_WRITE)
			continue;

		sys_mprotect(decode_pointer(vma_entry->start),
			     vma_entry_len(vma_entry),
			     vma_entry->prot);
	}

	/*
	 * Finally restore madivse() bits
	 */
	for (vma_entry = args->tgt_vmas; vma_entry->start != 0; vma_entry++) {
		unsigned long i;

		if (!vma_entry->has_madv || !vma_entry->madv)
			continue;
		for (i = 0; i < sizeof(vma_entry->madv) * 8; i++) {
			if (vma_entry->madv & (1ul << i)) {
				ret = sys_madvise(vma_entry->start,
						  vma_entry_len(vma_entry),
						  i);
				if (ret) {
					pr_err("madvise(%"PRIx64", %"PRIu64", %ld) "
					       "failed with %ld\n",
						vma_entry->start,
						vma_entry_len(vma_entry),
						i, ret);
					goto core_restore_end;
				}
			}
		}
	}

	sys_munmap(args->tgt_vmas,
			((void *)(vma_entry + 1) - ((void *)args->tgt_vmas)));

	ret = sys_munmap(args->shmems, SHMEMS_SIZE);
	if (ret < 0) {
		pr_err("Can't unmap shmem %ld\n", ret);
		goto core_restore_end;
	}

	/*
	 * Tune up the task fields.
	 */
	ret |= sys_prctl_safe(PR_SET_NAME, (long)args->comm, 0, 0);

	ret |= sys_prctl_safe(PR_SET_MM, PR_SET_MM_START_CODE,	(long)args->mm.mm_start_code, 0);
	ret |= sys_prctl_safe(PR_SET_MM, PR_SET_MM_END_CODE,	(long)args->mm.mm_end_code, 0);
	ret |= sys_prctl_safe(PR_SET_MM, PR_SET_MM_START_DATA,	(long)args->mm.mm_start_data, 0);
	ret |= sys_prctl_safe(PR_SET_MM, PR_SET_MM_END_DATA,	(long)args->mm.mm_end_data, 0);
	ret |= sys_prctl_safe(PR_SET_MM, PR_SET_MM_START_STACK,	(long)args->mm.mm_start_stack, 0);
	ret |= sys_prctl_safe(PR_SET_MM, PR_SET_MM_START_BRK,	(long)args->mm.mm_start_brk, 0);
	ret |= sys_prctl_safe(PR_SET_MM, PR_SET_MM_BRK,		(long)args->mm.mm_brk, 0);
	ret |= sys_prctl_safe(PR_SET_MM, PR_SET_MM_ARG_START,	(long)args->mm.mm_arg_start, 0);
	ret |= sys_prctl_safe(PR_SET_MM, PR_SET_MM_ARG_END,	(long)args->mm.mm_arg_end, 0);
	ret |= sys_prctl_safe(PR_SET_MM, PR_SET_MM_ENV_START,	(long)args->mm.mm_env_start, 0);
	ret |= sys_prctl_safe(PR_SET_MM, PR_SET_MM_ENV_END,	(long)args->mm.mm_env_end, 0);
	ret |= sys_prctl_safe(PR_SET_MM, PR_SET_MM_AUXV,	(long)args->mm_saved_auxv, args->mm_saved_auxv_size);
	if (ret)
		goto core_restore_end;

	/*
	 * Because of requirements applied from kernel side
	 * we need to restore /proc/pid/exe symlink late,
	 * after old existing VMAs are superseded with
	 * new ones from image file.
	 */
	ret = restore_self_exe_late(args);
	if (ret)
		goto core_restore_end;

	/*
	 * We need to prepare a valid sigframe here, so
	 * after sigreturn the kernel will pick up the
	 * registers from the frame, set them up and
	 * finally pass execution to the new IP.
	 */
	rt_sigframe = (void *)args->t->mem_zone.rt_sigframe + 8;

	if (restore_thread_common(rt_sigframe, args->t))
		goto core_restore_end;

	/*
	 * Threads restoration. This requires some more comments. This
	 * restorer routine and thread restorer routine has the following
	 * memory map, prepared by a caller code.
	 *
	 * | <-- low addresses                                          high addresses --> |
	 * +-------------------------------------------------------+-----------------------+
	 * | this proc body | own stack | heap | rt_sigframe space | thread restore zone   |
	 * +-------------------------------------------------------+-----------------------+
	 *
	 * where each thread restore zone is the following
	 *
	 * | <-- low addresses                                     high addresses --> |
	 * +--------------------------------------------------------------------------+
	 * | thread restore proc | thread1 stack | thread1 heap | thread1 rt_sigframe |
	 * +--------------------------------------------------------------------------+
	 */

	if (args->nr_threads > 1) {
		struct thread_restore_args *thread_args = args->thread_args;
		long clone_flags = CLONE_VM | CLONE_FILES | CLONE_SIGHAND	|
				   CLONE_THREAD | CLONE_SYSVSEM;
		long last_pid_len;
		long parent_tid;
		int i, fd;

		fd = sys_open(LAST_PID_PATH, O_RDWR, LAST_PID_PERM);
		if (fd < 0) {
			pr_err("Can't open last_pid %d\n", fd);
			goto core_restore_end;
		}

		ret = sys_flock(fd, LOCK_EX);
		if (ret) {
			pr_err("Can't lock last_pid %d\n", fd);
			goto core_restore_end;
		}

		for (i = 0; i < args->nr_threads; i++) {
			char last_pid_buf[16], *s;

			/* skip self */
			if (thread_args[i].pid == args->t->pid)
				continue;

			mutex_lock(&args->rst_lock);

			new_sp =
				RESTORE_ALIGN_STACK((long)thread_args[i].mem_zone.stack,
						    sizeof(thread_args[i].mem_zone.stack));

			last_pid_len = vprint_num(last_pid_buf, sizeof(last_pid_buf), thread_args[i].pid - 1, &s);
			ret = sys_write(fd, s, last_pid_len);
			if (ret < 0) {
				pr_err("Can't set last_pid %ld/%s\n", ret, last_pid_buf);
				goto core_restore_end;
			}

			/*
			 * To achieve functionality like libc's clone()
			 * we need a pure assembly here, because clone()'ed
			 * thread will run with own stack and we must not
			 * have any additional instructions... oh, dear...
			 */

			RUN_CLONE_RESTORE_FN(ret, clone_flags, new_sp, parent_tid, thread_args, args->clone_restore_fn);
		}

		ret = sys_flock(fd, LOCK_UN);
		if (ret) {
			pr_err("Can't unlock last_pid %ld\n", ret);
			goto core_restore_end;
		}

		sys_close(fd);
	}

	restore_rlims(args);

	pr_info("%ld: Restored\n", sys_getpid());

	futex_set(&zombies_inprogress, args->nr_zombies);

	restore_finish_stage(CR_STATE_RESTORE);

	futex_wait_while_gt(&zombies_inprogress, 0);

	sys_sigaction(SIGCHLD, &args->sigchld_act, NULL, sizeof(k_rtsigset_t));

	ret = restore_signals(args->siginfo, args->siginfo_nr, true);
	if (ret)
		goto core_restore_end;

	ret = restore_signals(args->t->siginfo, args->t->siginfo_nr, false);
	if (ret)
		goto core_restore_end;

	restore_finish_stage(CR_STATE_RESTORE_SIGCHLD);

	if (args->siginfo_size) {
		ret = sys_munmap(args->siginfo, args->siginfo_size);
		if (ret < 0) {
			pr_err("Can't unmap signals %ld\n", ret);
			goto core_restore_failed;
		}
	}

	rst_tcp_socks_all(args->rst_tcp_socks, args->rst_tcp_socks_size);

	/* 
	 * Writing to last-pid is CAP_SYS_ADMIN protected,
	 * turning off TCP repair is CAP_SYS_NED_ADMIN protected,
	 * thus restore* creds _after_ all of the above.
	 */

	ret = restore_creds(&args->creds);

	futex_set_and_wake(&thread_inprogress, args->nr_threads);

	restore_finish_stage(CR_STATE_RESTORE_CREDS);

	if (ret)
		BUG();

	/* Wait until children stop to use args->task_entries */
	futex_wait_while_gt(&thread_inprogress, 1);

	log_set_fd(-1);

	/*
	 * The code that prepared the itimers makes shure the
	 * code below doesn't fail due to bad timing values.
	 */

#define itimer_armed(args, i)				\
		(args->itimers[i].it_interval.tv_sec ||	\
		 args->itimers[i].it_interval.tv_usec)

	if (itimer_armed(args, 0))
		sys_setitimer(ITIMER_REAL, &args->itimers[0], NULL);
	if (itimer_armed(args, 1))
		sys_setitimer(ITIMER_VIRTUAL, &args->itimers[1], NULL);
	if (itimer_armed(args, 2))
		sys_setitimer(ITIMER_PROF, &args->itimers[2], NULL);

	ret = sys_munmap(args->task_entries, TASK_ENTRIES_SIZE);
	if (ret < 0) {
		ret = ((long)__LINE__ << 16) | ((-ret) & 0xffff);
		goto core_restore_failed;
	}

	/*
	 * Sigframe stack.
	 */
	new_sp = (long)rt_sigframe + SIGFRAME_OFFSET;

	/*
	 * Prepare the stack and call for sigreturn,
	 * pure assembly since we don't need any additional
	 * code insns from gcc.
	 */
	ARCH_RT_SIGRETURN(new_sp);

core_restore_end:
	futex_abort_and_wake(&task_entries->nr_in_progress);
	pr_err("Restorer fail %ld\n", sys_getpid());
	sys_exit_group(1);
	return -1;

core_restore_failed:
	ARCH_FAIL_CORE_RESTORE;

	return ret;
}