/*
 * Disable checkpointing by entering the BLCR critical section for this
 * component's client id.
 *
 * The call is traced at verbosity level 10 on the component's output handle.
 * The result of cr_enter_cs() is not inspected.
 *
 * Returns: OPAL_SUCCESS, unconditionally.
 */
int opal_crs_blcr_disable_checkpoint(void)
{
    opal_output_verbose(10,
                        mca_crs_blcr_component.super.output_handle,
                        "crs:blcr: disable_checkpoint()");

    /* Step into the BLCR critical section. */
    cr_enter_cs(client_id);

    return OPAL_SUCCESS;
}
/*
 * SA_SIGINFO-style signal handler reacting to a child state change.
 *
 * The whole handler runs inside the checkpoint critical section of cr_id.
 * It polls srun_pid without blocking; when that child has been reaped, its
 * status is logged and handed to mimic_exit(). Whenever control gets past
 * that point, srun_pid is sent SIGKILL before the critical section is left.
 *
 * None of the three handler arguments is used.
 */
static void on_child_exit(int signum, siginfo_t *siginfo, void *arg)
{
    int status;
    pid_t reaped;

    cr_enter_cs(cr_id);

    /*
     * Note: when srun_cr has been checkpointed and restarted after srun
     * exited, srun_pid refers to the newly started srun.
     */
    reaped = waitpid(srun_pid, &status, WNOHANG);
    if (reaped == srun_pid) {
        verbose("srun(%d) exited, status: %d", srun_pid, status);
        mimic_exit(status);
    }

    kill(srun_pid, SIGKILL);

    cr_leave_cs(cr_id);
}
int main(int argc, char **argv) { int debug_level, sig, srun_fd; struct sigaction sa; log_options_t logopt = LOG_OPTS_STDERR_ONLY; struct sockaddr_un ca; unsigned int ca_len = sizeof(ca); atexit(remove_listen_socket); /* copied from srun */ debug_level = _slurm_debug_env_val(); logopt.stderr_level += debug_level; log_init(xbasename(argv[0]), logopt, 0, NULL); if (init_srun_argv(argc, argv)) { fatal("failed to initialize arguments for running srun"); } if ((cr_id = cr_init()) < 0) { fatal("failed to initialize libcr: %s", cr_strerror(errno)); } (void)cr_register_callback(cr_callback, NULL, CR_THREAD_CONTEXT); /* forward signals. copied from cr_restart */ sa.sa_sigaction = signal_child; sa.sa_flags = SA_RESTART | SA_NODEFER | SA_SIGINFO; sigemptyset(&sa.sa_mask); for (sig = 0; sig < _NSIG; sig ++) { if (sig == SIGSTOP || sig == SIGKILL || sig == SIGCHLD) continue; sigaction(sig, &sa, NULL); } sa.sa_sigaction = on_child_exit; sa.sa_flags = SA_RESTART | SA_SIGINFO | SA_NOCLDSTOP; sigaction(SIGCHLD, &sa, NULL); cr_enter_cs(cr_id); /* BEGIN CS: avoid race condition of whether srun is forked */ if ( fork_exec_srun() ) { fatal("failed fork/exec/wait srun"); } cr_leave_cs(cr_id); /* END CS */ while (1) { pthread_mutex_lock(&step_launch_mutex); while (step_launched) { /* just avoid busy waiting */ pthread_cond_wait(&step_launch_cond, &step_launch_mutex); } pthread_mutex_unlock(&step_launch_mutex); if (_wait_for_srun_connect() < 0) continue; cr_enter_cs(cr_id); /* BEGIN CS: checkpoint(callback) will be delayed */ srun_fd = accept(listen_fd, (struct sockaddr*)&ca, &ca_len); if (srun_fd < 0) { /* restarted before enter CS. socket will not be restored */ if (errno == EBADF) { cr_leave_cs(cr_id); continue; } else { fatal("failed to accept socket: %m"); } } _read_info_from_srun(srun_fd); close(srun_fd); step_launched = 1; debug2("step launched"); cr_leave_cs(cr_id); /* END CS */ } return 0; }
int hypre_SMGResidual( void *residual_vdata, hypre_StructMatrix *A, hypre_StructVector *x, hypre_StructVector *b, hypre_StructVector *r ) { int ierr = 0; hypre_SMGResidualData *residual_data = (hypre_SMGResidualData *)residual_vdata; hypre_IndexRef base_stride = (residual_data -> base_stride); hypre_BoxArray *base_points = (residual_data -> base_points); hypre_ComputePkg *compute_pkg = (residual_data -> compute_pkg); hypre_CommHandle *comm_handle; hypre_BoxArrayArray *compute_box_aa; hypre_BoxArray *compute_box_a; hypre_Box *compute_box; hypre_Box *A_data_box; hypre_Box *x_data_box; hypre_Box *b_data_box; hypre_Box *r_data_box; int Ai; int xi; int bi; int ri; double *Ap; double *xp; double *bp; double *rp; hypre_Index loop_size; hypre_IndexRef start; hypre_StructStencil *stencil; hypre_Index *stencil_shape; int stencil_size; int compute_i, i, j, si; int loopi, loopj, loopk; hypre_BeginTiming(residual_data -> time_index); /*----------------------------------------------------------------------- * Compute residual r = b - Ax *-----------------------------------------------------------------------*/ stencil = hypre_StructMatrixStencil(A); stencil_shape = hypre_StructStencilShape(stencil); stencil_size = hypre_StructStencilSize(stencil); for (compute_i = 0; compute_i < 2; compute_i++) { switch(compute_i) { case 0: { xp = hypre_StructVectorData(x); hypre_InitializeIndtComputations(compute_pkg, xp, &comm_handle); compute_box_aa = hypre_ComputePkgIndtBoxes(compute_pkg); /*---------------------------------------- * Copy b into r *----------------------------------------*/ compute_box_a = base_points; hypre_ForBoxI(i, compute_box_a) { compute_box = hypre_BoxArrayBox(compute_box_a, i); start = hypre_BoxIMin(compute_box); b_data_box = hypre_BoxArrayBox(hypre_StructVectorDataSpace(b), i); r_data_box = hypre_BoxArrayBox(hypre_StructVectorDataSpace(r), i); bp = hypre_StructVectorBoxData(b, i); rp = hypre_StructVectorBoxData(r, i); hypre_BoxGetStrideSize(compute_box, base_stride, 
loop_size); hypre_BoxLoop2Begin(loop_size, b_data_box, start, base_stride, bi, r_data_box, start, base_stride, ri); #define HYPRE_BOX_SMP_PRIVATE loopk,loopi,loopj,bi,ri #include "hypre_box_smp_forloop.h" hypre_BoxLoop2For(loopi, loopj, loopk, bi, ri) { rp[ri] = bp[bi]; } hypre_BoxLoop2End(bi, ri); } } break; case 1: { hypre_FinalizeIndtComputations(comm_handle); compute_box_aa = hypre_ComputePkgDeptBoxes(compute_pkg); } break; } /*-------------------------------------------------------------------- * Compute r -= A*x *--------------------------------------------------------------------*/ hypre_ForBoxArrayI(i, compute_box_aa) { compute_box_a = hypre_BoxArrayArrayBoxArray(compute_box_aa, i); A_data_box = hypre_BoxArrayBox(hypre_StructMatrixDataSpace(A), i); x_data_box = hypre_BoxArrayBox(hypre_StructVectorDataSpace(x), i); r_data_box = hypre_BoxArrayBox(hypre_StructVectorDataSpace(r), i); rp = hypre_StructVectorBoxData(r, i); hypre_ForBoxI(j, compute_box_a) { compute_box = hypre_BoxArrayBox(compute_box_a, j); start = hypre_BoxIMin(compute_box); for (si = 0; si < stencil_size; si++) { Ap = hypre_StructMatrixBoxData(A, i, si); xp = hypre_StructVectorBoxData(x, i) + hypre_BoxOffsetDistance(x_data_box, stencil_shape[si]); hypre_BoxGetStrideSize(compute_box, base_stride, loop_size); hypre_BoxLoop3Begin(loop_size, A_data_box, start, base_stride, Ai, x_data_box, start, base_stride, xi, r_data_box, start, base_stride, ri); #if 0 /* The following portion is preprocessed to be handled by ROSE outliner */ #define HYPRE_BOX_SMP_PRIVATE loopk,loopi,loopj,Ai,xi,ri #include "hypre_box_smp_forloop.h" hypre_BoxLoop3For(loopi, loopj, loopk, Ai, xi, ri) { rp[ri] -= Ap[Ai] * xp[xi]; } hypre_BoxLoop3End(Ai, xi, ri); #else for (hypre__block = 0; hypre__block < hypre__num_blocks; hypre__block++) { loopi = 0; loopj = 0; loopk = 0; hypre__nx = hypre__mx; hypre__ny = hypre__my; hypre__nz = hypre__mz; if (hypre__num_blocks > 1) { if (hypre__dir == 0) { loopi = hypre__block * hypre__div + 
(((hypre__mod) < (hypre__block)) ? (hypre__mod) : (hypre__block)); hypre__nx = hypre__div + ((hypre__mod > hypre__block) ? 1 : 0); } else if (hypre__dir == 1) { loopj = hypre__block * hypre__div + (((hypre__mod) < (hypre__block)) ? (hypre__mod) : (hypre__block)); hypre__ny = hypre__div + ((hypre__mod > hypre__block) ? 1 : 0); } else if (hypre__dir == 2) { loopk = hypre__block * hypre__div + (((hypre__mod) < (hypre__block)) ? (hypre__mod) : (hypre__block)); hypre__nz = hypre__div + ((hypre__mod > hypre__block) ? 1 : 0); } }; Ai = hypre__i1start + loopi * hypre__sx1 + loopj * hypre__sy1 + loopk * hypre__sz1; xi = hypre__i2start + loopi * hypre__sx2 + loopj * hypre__sy2 + loopk * hypre__sz2; ri = hypre__i3start + loopi * hypre__sx3 + loopj * hypre__sy3 + loopk * hypre__sz3; //begin of the loop #if 0 // for (loopk = 0; loopk < hypre__nz; loopk++) { for (loopj = 0; loopj < hypre__ny; loopj++) { for (loopi = 0; loopi < hypre__nx; loopi++) { { rp[ri] -= Ap[Ai] * xp[xi]; } Ai += hypre__sx1; xi += hypre__sx2; ri += hypre__sx3; } Ai += hypre__sy1 - hypre__nx * hypre__sx1; xi += hypre__sy2 - hypre__nx * hypre__sx2; ri += hypre__sy3 - hypre__nx * hypre__sx3; } Ai += hypre__sz1 - hypre__ny * hypre__sy1; xi += hypre__sz2 - hypre__ny * hypre__sy2; ri += hypre__sz3 - hypre__ny * hypre__sy3; } // end of the loop #else #if BLCR_CHECKPOINTING // Only checkpoint it at the first occurrance. 
if (g_checkpoint_flag == 0) { int err; cr_checkpoint_args_t cr_args; cr_checkpoint_handle_t cr_handle; cr_initialize_checkpoint_args_t(&cr_args); cr_args.cr_scope = CR_SCOPE_PROC;// a process cr_args.cr_target = 0; //self cr_args.cr_signal = SIGKILL; // kill after checkpointing cr_args.cr_fd = open("dump.yy", O_WRONLY|O_CREAT|O_LARGEFILE, 0400); if (cr_args.cr_fd < 0) { printf("Error: cannot open file for checkpoiting context\n"); abort(); } g_checkpoint_flag ++; printf("Checkpoiting: starting here ..\n"); err = cr_request_checkpoint(&cr_args, &cr_handle); if (err < 0) { printf("cannot request checkpoining! err=%d\n",err); abort(); } // block until the request is served cr_enter_cs(cr); cr_leave_cs(cr); printf("Checkpoiting: restarting here ..\n"); } #endif OUT__1__6755__(&Ai,&xi,&ri,&Ap,&xp,&rp,&loopi,&loopj,&loopk,&hypre__sx1,&hypre__sy1,&hypre__sz1,&hypre__sx2,&hypre__sy2,&hypre__sz2,&hypre__sx3,&hypre__sy3,&hypre__sz3,&hypre__nx,&hypre__ny,&hypre__nz); #endif } };