/* * Executes all tasks which are in the r-queue at the moment of the * call. This routine is called to make sure that all tasks in the * queue are dequeued before continueing with other tasks. */ void PMR_process_r_queue(int tid, proc_t *procinfo, val_t *Wstruct, vec_t *Zstruct, tol_t *tolstruct, workQ_t *workQ, counter_t *num_left, double *work, int *iwork) { int thread_support = procinfo->thread_support; int t, num_tasks; int status; task_t *task; num_tasks = PMR_get_num_tasks(workQ->r_queue); for (t=0; t<num_tasks; t++) { task = PMR_remove_task_at_front(workQ->r_queue); if ( task != NULL ) { if (task->flag == CLUSTER_TASK_FLAG) { if (thread_support != MPI_THREAD_FUNNELED || tid == 0) { /* if MPI_THREAD_FUNNELED only tid==0 should process * these tasks, otherwise any thread can do it */ status = PMR_process_c_task((cluster_t *) task->data, tid, procinfo, Wstruct, Zstruct, tolstruct, workQ, num_left, work, iwork); if (status == C_TASK_PROCESSED) { free(task); } else { PMR_insert_task_at_back(workQ->r_queue, task); } } else { PMR_insert_task_at_back(workQ->r_queue, task); } } /* end if cluster task */ if (task->flag == REFINE_TASK_FLAG) { PMR_process_r_task((refine_t *) task->data, procinfo, Wstruct, tolstruct, work, iwork); free(task); } } /* end if task removed */ } /* end for t */ } /* end process_entire_r_queue */
/* TODO: Refactor this routine */ static inline int create_subtasks (cluster_t *cl, int tid, proc_t *procinfo, rrr_t *RRR, val_t *Wstruct, vec_t *Zstruct, workQ_t *workQ, counter_t *num_left) { /* From inputs */ int cl_begin = cl->begin; int cl_end = cl->end; int depth = cl->depth; int bl_begin = cl->bl_begin; int bl_end = cl->bl_end; int bl_size = bl_end - bl_begin + 1; double bl_spdiam = cl->bl_spdiam; double lgap; int pid = procinfo->pid; int nproc = procinfo->nproc; int nthreads = procinfo->nthreads; bool proc_involved=true; double *restrict Wgap = Wstruct->Wgap; double *restrict Wshifted = Wstruct->Wshifted; int *restrict iproc = Wstruct->iproc; int ldz = Zstruct->ldz; double *restrict Z = Zstruct->Z; int *restrict Zindex = Zstruct->Zindex; /* others */ int i, l, k; int max_size; task_t *task; bool task_inserted; int new_first, new_last, new_size, new_ftt1, new_ftt2; int sn_first, sn_last, sn_size; rrr_t *RRR_parent; int new_lpid, new_rpid; double *restrict D_parent; double *restrict L_parent; int my_first, my_last; bool copy_parent_rrr; max_size = fmax(1, PMR_get_counter_value(num_left) / (fmin(depth+1,4)*nthreads) ); task_inserted = true; new_first = cl_begin; for (i=cl_begin; i<=cl_end; i++) { if ( i == cl_end ) new_last = i; else if ( Wgap[i] >= MIN_RELGAP*fabs(Wshifted[i]) ) new_last = i; else continue; new_size = new_last - new_first + 1; if (new_size == 1) { /* singleton was found */ if (new_first==cl_begin || task_inserted==true) { /* initialize new singleton task */ sn_first = new_first; sn_last = new_first; sn_size = 1; } else { /* extend singleton task by one */ sn_last++; sn_size++; } /* insert task if ... */ if (i==cl_end || sn_size>=max_size || Wgap[i+1] < MIN_RELGAP*fabs(Wshifted[i+1])) { /* Check if process involved in s-task */ proc_involved = false; for (k=sn_first; k<=sn_last; k++) { if (iproc[k] == pid) { proc_involved = true; break; } } if (proc_involved == false) { task_inserted = true; new_first = i + 1; continue; } /* Insert task as process is involved */ if (sn_first == cl_begin) { lgap = cl->lgap; } else { lgap = Wgap[sn_first-1]; } PMR_increment_rrr_dependencies(RRR); task = PMR_create_s_task(sn_first, sn_last, depth+1, bl_begin, bl_end, bl_spdiam, lgap, RRR); PMR_insert_task_at_back(workQ->s_queue, task); task_inserted = true; } else { task_inserted = false; } } else { /* cluster was found */ /* check if process involved in processing the new cluster */ new_lpid = nproc-1; new_rpid = -1; for (l=new_first; l<=new_last; l++) { if (iproc[l] != -1) { new_lpid = imin(new_lpid, iproc[l]); new_rpid = imax(new_rpid, iproc[l]); } } if (new_lpid > pid || new_rpid < pid) { task_inserted = true; new_first = i + 1; continue; } /* find gap to the left */ if (new_first == cl_begin) { lgap = cl->lgap; } else { lgap = Wgap[new_first - 1]; } /* determine where to store the parent rrr needed by the * cluster to find its new rrr */ my_first = imax(new_first, cl->proc_W_begin); my_last = imin(new_last, cl->proc_W_end); if ( my_first == my_last ) { /* only one eigenvalue of cluster belongs to process */ copy_parent_rrr = true; } else { /* store parent rrr in Z at column new_ftt */ copy_parent_rrr = false; } new_ftt1 = Zindex[my_first ]; new_ftt2 = Zindex[my_first + 1]; if (copy_parent_rrr == true) { /* Copy parent RRR into alloceted arrays and mark them * for freeing later */ D_parent = (double *) malloc(bl_size * sizeof(double)); assert(D_parent != NULL); L_parent = (double *) malloc(bl_size * sizeof(double)); assert(L_parent != NULL); memcpy(D_parent, RRR->D, bl_size*sizeof(double)); memcpy(L_parent, RRR->L, bl_size*sizeof(double)); RRR_parent = PMR_create_rrr(D_parent, L_parent, NULL, NULL, bl_size, depth); PMR_set_copied_parent_rrr_flag(RRR_parent, true); } else { /* copy parent RRR into Z to make cluster task independent */ memcpy(&Z[new_ftt1*ldz+bl_begin], RRR->D, bl_size*sizeof(double)); memcpy(&Z[new_ftt2*ldz+bl_begin], RRR->L, bl_size*sizeof(double)); RRR_parent = PMR_create_rrr(&Z[new_ftt1*ldz + bl_begin], &Z[new_ftt2*ldz + bl_begin], NULL, NULL, bl_size, depth); } /* Create the task for the cluster and put it in the queue */ task = PMR_create_c_task(new_first, new_last, depth+1, bl_begin, bl_end, bl_spdiam, lgap, cl->proc_W_begin, cl->proc_W_end, new_lpid, new_rpid, RRR_parent); if (new_lpid != new_rpid) PMR_insert_task_at_back(workQ->r_queue, task); else PMR_insert_task_at_back(workQ->c_queue, task); task_inserted = true; } /* if singleton or cluster found */ new_first = i + 1; } /* end i */ /* set flag in RRR that last singleton is created */ PMR_set_parent_processed_flag(RRR); /* clean up */ PMR_try_destroy_rrr(RRR); free(cl); return 0; } /* end create_subtasks */
/* * Initialize work queue by putting all tasks for the process * into the work queue. */ static int init_workQ (proc_t *procinfo, in_t *Dstruct, val_t *Wstruct, int *nzp, workQ_t *workQ) { int pid = procinfo->pid; int nproc = procinfo->nproc; int nthreads = procinfo->nthreads; double *restrict D = Dstruct->D; double *restrict L = Dstruct->E; int nsplit = Dstruct->nsplit; int *restrict isplit = Dstruct->isplit; double *restrict W = Wstruct->W; double *restrict Werr = Wstruct->Werr; double *restrict Wgap = Wstruct->Wgap; int *restrict iproc = Wstruct->iproc; double *restrict Wshifted = Wstruct->Wshifted; double *restrict gersch = Wstruct->gersch; int nz = *nzp; /* Loop over blocks */ int i, j, k, l; int ibegin = 0; for ( j=0; j<nsplit; j++ ) { int iend = isplit[j] - 1; int isize = iend - ibegin + 1; double sigma = L[iend]; /* Use Gerschgorin disks to find spectral diameter */ double gl = gersch[2*ibegin ]; double gu = gersch[2*ibegin + 1]; for (i=ibegin+1; i<iend; i++) { gl = fmin(gl, gersch[2*i ]); gu = fmax(gu, gersch[2*i + 1]); } double spdiam = gu - gl; double avggap = spdiam / (isize-1); /* Find eigenvalues in block */ int nbl = 0; int iWbegin = iend + 1; int iWend = ibegin - 1; for (i=ibegin; i<=iend; i++) { if (nbl == 0 && iproc[i] == pid) { iWbegin = i; iWend = i; nbl++; k = i+1; while (k <=iend && iproc[k] == pid) { iWend++; nbl++; k++; } /* iWend = iWbegin + nbl - 1; instead of incrementing in loop */ } } /* If no eigenvalues for process in block continue */ if (nbl == 0) { ibegin = iend + 1; continue; } /* Compute DL and DLL for later computation of singletons * (freed when all singletons of root are computed) */ double *DL = (double*)malloc(isize*sizeof(double)); assert(DL != NULL); double *DLL = (double*)malloc(isize*sizeof(double)); assert(DLL != NULL); for (i=0; i<isize-1; i++) { double tmp = D[i+ibegin]*L[i+ibegin]; DL[i] = tmp; DLL[i] = tmp*L[i+ibegin]; } rrr_t *RRR = PMR_create_rrr(&D[ibegin], &L[ibegin], DL, DLL, isize, 0); PMR_increment_rrr_dependencies(RRR); /* In W apply shift of current block to eigenvalues * to get unshifted values w.r.t. T */ for (i=ibegin; i<=iend; i++) W[i] += sigma; /* Split eigenvalues of block into singletons and clusters * and add them to process work queue */ int max_size = imax(1, nz/nthreads); bool task_inserted = false; int new_first=ibegin, new_last; int sn_first, sn_last, sn_size; for (i=ibegin; i<=iend; i++) { if (i == iend) new_last = i; else if (Wgap[i] >= MIN_RELGAP*fabs(Wshifted[i])) new_last = i; else continue; /* Skip rest if no eigenvalues of process */ if (new_first > iWend || new_last < iWbegin) { new_first = i + 1; continue; } int new_size = new_last - new_first + 1; if (new_size == 1) { /* Singleton was found */ if (new_first < iWbegin || new_first > iWend) { new_first = i + 1; continue; } else { if (new_first==iWbegin || task_inserted==true) { /* Initialize new singleton task */ sn_first = new_first; sn_last = new_first; sn_size = 1; } else { /* Extend singleton task by one */ sn_last++; sn_size++; } } /* Insert task if ... */ if (i==iWend || sn_size>=max_size || Wgap[i+1] < MIN_RELGAP*fabs(Wshifted[i+1])) { double lgap; if (sn_first == ibegin) { lgap = fmax(0.0, W[ibegin] - Werr[ibegin] - gl ); } else { lgap = Wgap[sn_first-1]; } PMR_increment_rrr_dependencies(RRR); task_t *task = PMR_create_s_task (sn_first, sn_last, 1, ibegin, iend, spdiam, lgap, RRR); PMR_insert_task_at_back(workQ->s_queue, task); task_inserted = true; } else { task_inserted = false; } } else { /* Cluster was found */ int cl_first = new_first; int cl_last = new_last; int cl_size = new_size; /* Split cluster into clusters by absolut criterion */ if (cl_size > 3) { /* Split cluster to smaller clusters [cl_first:cl_last] */ for (k=new_first+1; k<new_last; k++) { if (k == new_last-1) cl_last = new_last; else if (k != cl_first && Wgap[k] > 0.8*avggap) cl_last = k; else continue; /* Skip cluster if no eigenvalues of process in it */ if (cl_last < iWbegin || cl_first > iWend) { cl_first = k + 1; continue; } /* Record left gap of cluster */ double lgap; if (cl_first == ibegin) { lgap = fmax(0.0, W[ibegin] - Werr[ibegin] - gl); } else { lgap = Wgap[cl_first-1]; } /* Determine processes involved in processing the cluster */ int left_pid = nproc-1; int right_pid = 0; for (l=cl_first; l<=cl_last; l++) { if (iproc[l] != -1) { left_pid = imin(left_pid, iproc[l]); right_pid = imax(right_pid, iproc[l]); } } rrr_t *RRR_parent = PMR_create_rrr(&D[ibegin], &L[ibegin], NULL, NULL, isize, 0); task_t *task = PMR_create_c_task (cl_first, cl_last, 1, ibegin, iend, spdiam, lgap, iWbegin, iWend, left_pid, right_pid, RRR_parent); /* Insert task into queue, depending if cluster need * communication with other processes */ if (left_pid != right_pid) PMR_insert_task_at_back(workQ->r_queue, task); else PMR_insert_task_at_back(workQ->c_queue, task); cl_first = k + 1; } /* end k */ } else { /* Cluster is too small to split, so insert it to queue */ /* Record left gap of cluster */ double lgap; if (cl_first == ibegin) { lgap = fmax(0.0, W[ibegin] - Werr[ibegin] - gl ); } else { lgap = Wgap[cl_first-1]; } /* Determine processes involved */ int left_pid = nproc-1; int right_pid = 0; for (l=cl_first; l<=cl_last; l++) { if (iproc[l] != -1) { left_pid = imin(left_pid, iproc[l]); right_pid = imax(right_pid, iproc[l]); } } rrr_t *RRR_parent = PMR_create_rrr (&D[ibegin], &L[ibegin], NULL, NULL, isize, 0); task_t *task = PMR_create_c_task (cl_first, cl_last, 1, ibegin, iend, spdiam, lgap, iWbegin, iWend, left_pid, right_pid, RRR_parent); /* Insert task into queue, depending if cluster need * communication with other processes */ if (left_pid != right_pid) PMR_insert_task_at_back(workQ->r_queue, task); else PMR_insert_task_at_back(workQ->c_queue, task); } task_inserted = true; } /* end new_size */ new_first = i + 1; } /* end of splitting eigenvalues into tasks */ /* Set flag in RRR that last singleton is created */ PMR_set_parent_processed_flag(RRR); PMR_try_destroy_rrr(RRR); ibegin = iend + 1; } /* end loop over blocks */ return 0; }
/* * Refine eigenvalues with respect to new rrr */ static inline int refine_eigvals (cluster_t *cl, int rf_begin, int rf_end, int tid, proc_t *procinfo, rrr_t *RRR, val_t *Wstruct, vec_t *Zstruct, tol_t *tolstruct, counter_t *num_left, workQ_t *workQ, double *work, int *iwork) { int rf_size = rf_end-rf_begin+1; int bl_begin = cl->bl_begin; int bl_end = cl->bl_end; int bl_size = bl_end - bl_begin + 1; double bl_spdiam = cl->bl_spdiam; double *restrict D = RRR->D; double *restrict L = RRR->L; double *restrict DLL = RRR->DLL; double *restrict W = Wstruct->W; double *restrict Werr = Wstruct->Werr; double *restrict Wgap = Wstruct->Wgap; int *restrict Windex = Wstruct->Windex; double *restrict Wshifted = Wstruct->Wshifted; double pivmin = tolstruct->pivmin; double rtol1 = tolstruct->rtol1; double rtol2 = tolstruct->rtol2; /* Determine if refinement should be split into tasks */ int left = PMR_get_counter_value(num_left); int nz = Zstruct->nz; int nthreads = procinfo->nthreads; int MIN_REFINE_CHUNK = fmax(2,nz/(4*nthreads)); int own_part = (int)fmax(ceil((double)left/nthreads),MIN_REFINE_CHUNK); int offset, i, p, q; double savegap; task_t *task; if (own_part < rf_size) { int others_part = rf_size - own_part; int num_tasks = iceil(rf_size, own_part) - 1; /* >1 */ int chunk = others_part/num_tasks; /* floor */ int ts_begin=rf_begin, ts_end; p = Windex[rf_begin]; for (i=0; i<num_tasks; i++) { ts_end = ts_begin + chunk - 1; q = p + chunk - 1; task = PMR_create_r_task (ts_begin, ts_end, D, DLL, p, q, bl_size, bl_spdiam, tid); if (ts_begin <= ts_end) PMR_insert_task_at_back(workQ->r_queue, task); else PMR_refine_sem_post(task->data); /* case chunk=0 */ ts_begin = ts_end + 1; p = q + 1; } ts_end = rf_end; q = Windex[rf_end]; offset = Windex[ts_begin] - 1; /* Call bisection routine to refine the values */ if (ts_begin <= ts_end) { int info; odrrb (&bl_size, D, DLL, &p, &q, &rtol1, &rtol2, &offset, &Wshifted[ts_begin], &Wgap[ts_begin], &Werr[ts_begin], work, iwork, &pivmin, &bl_spdiam, &bl_size, &info); assert( info == 0 ); } /* Empty "all" r-queue refine tasks before waiting */ int num_iter = PMR_get_num_tasks(workQ->r_queue); for (i=0; i<num_iter; i++) { task = PMR_remove_task_at_front(workQ->r_queue); if (task != NULL) { if (task->flag == REFINE_TASK_FLAG) { PMR_process_r_task ((refine_t*)task->data, procinfo, Wstruct, tolstruct, work, iwork); free(task); } else { PMR_insert_task_at_back(workQ->r_queue, task); } } /* if task */ } /* end for i */ /* Barrier: wait until all created tasks finished */ int count = num_tasks; while (count > 0) { while ( PMR_refine_sem_wait(task->data) != 0 ) { }; count--; } PMR_refine_sem_destroy(task->data); /* Edit right gap at splitting point */ ts_begin = rf_begin; for (i=0; i<num_tasks; i++) { ts_end = ts_begin + chunk - 1; Wgap[ts_end] = fmax(0.0, Wshifted[ts_end + 1] - Werr[ts_end + 1] - Wshifted[ts_end] - Werr[ts_end]); ts_begin = ts_end + 1; } } else { /* Refinement of cluster without creating tasks */ /* 'p' and 'q' are local (within block) indices of * the first/last eigenvalue of the cluster */ p = Windex[rf_begin]; q = Windex[rf_end]; offset = Windex[rf_begin] - 1; /* = p - 1 */ if (p == q) { savegap = Wgap[rf_begin]; Wgap[rf_begin] = 0.0; } /* Bisection routine to refine the values */ int info; odrrb (&bl_size, D, DLL, &p, &q, &rtol1, &rtol2, &offset, &Wshifted[rf_begin], &Wgap[rf_begin], &Werr[rf_begin], work, iwork, &pivmin, &bl_spdiam, &bl_size, &info); assert( info == 0 ); if (p == q) Wgap[rf_begin] = savegap; } /* end refine with or without creating tasks */ /* refined eigenvalues with all shifts applied in W */ double sigma = L[bl_size-1]; for (i=rf_begin; i<=rf_end; i++) W[i] = Wshifted[i] + sigma; return 0; } /* end refine_eigvals */
static inline void init_workQ(workQ_t *workQ, in_t *Dstruct, val_t *Wstruct) { double *restrict D = Dstruct->D; double *restrict L = Dstruct->E; int *restrict isplit = Dstruct->isplit; int m = Wstruct->m; double *vlp = Wstruct->vlp; double *restrict W = Wstruct->W; double *restrict Werr = Wstruct->Werr; int *restrict iblock = Wstruct->iblock; int j; int begin , end; int Wbegin, Wend; int nbl; double sigma; rrr_t *RRR; double lgap; task_t *task; /* For every unreducible block of the matrix create a task * and put it in the queue */ begin = 0; Wbegin = 0; for (j=0; j<Dstruct->nsplit; j++) { end = isplit[j] - 1; sigma = L[end]; Wend = Wbegin-1; while (Wend < m-1 && iblock[Wend + 1] == j+1) { Wend++; } if (Wend < Wbegin) { begin = end + 1; continue; } nbl = end - begin + 1; RRR = PMR_create_rrr(&D[begin], &L[begin], NULL, NULL, nbl, -1); if (nbl == 1) { /* To make sure that RRR is freed when s-task is processed */ PMR_increment_rrr_dependencies(RRR); PMR_set_parent_processed_flag(RRR); task = PMR_create_s_task(Wbegin, Wbegin, 1, begin, end, Wbegin, Wend, 0, 0, RRR); PMR_insert_task_at_back(workQ->s_queue, task); begin = end + 1; Wbegin = Wend + 1; continue; } lgap = fmax(0.0, (W[Wbegin]+sigma) - Werr[Wbegin] - (*vlp) ); task = PMR_create_c_task(Wbegin, Wend, 0, begin, end, Wbegin, Wend, 0, lgap, RRR); PMR_insert_task_at_back(workQ->c_queue, task); begin = end + 1; Wbegin = Wend + 1; } /* end of loop over block */ }