static int eigval_subset_proc(proc_t *procinfo, char *range, in_t *Dstruct, double *E2, int ifirst, int ilast, tol_t *tolstruct, val_t *Wstruct, double *work, int *iwork) { /* Input parameter */ int max_nthreads = procinfo->nthreads; int n = Dstruct->n; double *restrict D = Dstruct->D; double *restrict E = Dstruct->E; int nsplit = Dstruct->nsplit; int *restrict isplit = Dstruct->isplit; int isize = ilast-ifirst+1; double *restrict W = Wstruct->W; double *restrict Werr = Wstruct->Werr; double *restrict Wgap = Wstruct->Wgap; int *restrict Windex = Wstruct->Windex; int *restrict iblock = Wstruct->iblock; double *restrict gersch = Wstruct->gersch; double pivmin = tolstruct->pivmin; double gl, gu, wl, wu; /* Tolerances */ double bsrtol, rtl; /* Multithreading */ int nthreads; int iifirst, iilast, chunk; pthread_t *threads; pthread_attr_t attr; auxarg1_t *auxarg1; auxarg2_t *auxarg2; void *status; /* Create random vector to perturb rrr, same seed */ int two_n = 2*n; int iseed[4] = {1,1,1,1}; double *randvec; /* loop over blocks */ int jbl, num_vals; int bl_begin, bl_end, bl_size; int bl_Wbegin, bl_Wend; double isleft, isright, spdiam; int i_low, i_upp, bl_m; double sigma, s1, s2; int sgndef, cnt, negcnt_lft, negcnt_rgt; double tau; /* Compute RRR */ int jtry, off_L, off_invD; double Dpivot, Dmax; bool noREP; /* Refine eigenvalues */ int off_DE2, offset; int rf_begin, rf_end; /* Others */ int IONE = 1, ITWO = 2; int info, m, i, j; double dummy, tmp, tmp1, tmp2; /* Allocate workspace */ randvec = (double *) malloc( 2*n * sizeof(double) ); assert(randvec != NULL); threads = (pthread_t *) malloc( max_nthreads * sizeof(pthread_t) ); assert(threads != NULL); if (max_nthreads > 1) { pthread_attr_init(&attr); pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_JOINABLE); pthread_attr_setscope(&attr, PTHREAD_SCOPE_SYSTEM); } /* Set tolerance parameters */ bsrtol = sqrt(DBL_EPSILON); rtl = sqrt(DBL_EPSILON); /* create random vector to perturb rrr and broadcast it */ LAPACK(dlarnv)(&ITWO, iseed, &two_n, randvec); /* compute approximations of the eigenvalues with muliple threads * equivalent to: * LAPACK(dlarrd) * ("I", "B", &n, &dummy, &dummy, &ifirst, &ilast, gersch, * &bsrtol, D, E, E2, &pivmin, &nsplit, isplit, &m, W, Werr, * &wl, &wu, iblock, Windex, work, iwork, &info); * assert(info == 0); * assert(m == ilast-ifirst+1); */ nthreads = max_nthreads; while (nthreads > 1 && isize / nthreads < 2) nthreads--; if (nthreads > 1) { /* each threads computes W[iifirst:iilast] and places them in * work[0:n-1]; the corresponding errors in work[n:2*n-1]; * the blocks they belong in iwork[0:n-1]; and their indices in * iwork[n:2*n-1]; */ iifirst = ifirst; chunk = isize / nthreads; for (i=1; i<nthreads; i++) { iilast = iifirst + chunk - 1; auxarg1 = create_auxarg1(n, D, E, E2, ifirst, ilast, iifirst, iilast, nsplit, isplit, bsrtol, pivmin, gersch, &work[0], &work[n], &iwork[n], &iwork[0]); info = pthread_create(&threads[i], &attr, eigval_subset_thread_a, (void *) auxarg1); assert(info == 0); iifirst = iilast + 1; } iilast = ilast; auxarg1 = create_auxarg1(n, D, E, E2, ifirst, ilast, iifirst, iilast, nsplit, isplit, bsrtol, pivmin, gersch, &work[0], &work[n], &iwork[n], &iwork[0]); status = eigval_subset_thread_a( (void *) auxarg1 ); assert(status == NULL); /* join threads */ for (i=1; i<nthreads; i++) { info = pthread_join(threads[i], &status); assert(info == 0 && status == NULL); } /* sort, m counts the numbers of eigenvalues computed by process */ m = 0; for (i=1; i<=nsplit; i++) { for (j=0; j<isize; j++) { if (iwork[j] == i) { W[m] = work[j]; Werr[m] = work[j+n]; iblock[m] = iwork[j]; Windex[m] = iwork[j+n]; m++; } } } } else { /* no multithreaded computation */ LAPACK(dlarrd) ("I", "B", &n, &dummy, &dummy, &ifirst, &ilast, gersch, &bsrtol, D, E, E2, &pivmin, &nsplit, isplit, &m, W, Werr, &wl, &wu, iblock, Windex, work, iwork, &info); assert(info == 0); assert(m == ilast-ifirst+1); } /* loop over unreduced blocks */ num_vals = m; m = 0; /* accumulates eigenvalues found or refined */ bl_begin = 0; bl_Wbegin = 0; for (jbl=0; jbl<nsplit; jbl++) { bl_end = isplit[jbl] - 1; bl_size = bl_end - bl_begin + 1; /* deal with 1x1 block immediately */ if (bl_size == 1) { E[bl_end] = 0.0; /* if eigenvalue part of process work */ if (iblock[bl_Wbegin] == jbl+1) { W[m] = D[bl_begin]; Werr[m] = 0.0; Werr[m] = 0.0; iblock[m] = jbl + 1; Windex[m] = 1; m++; bl_Wbegin++; } bl_begin = bl_end + 1; continue; } /* COMPUTE ROOT RRR */ /* store shift of initial RRR, here set to zero */ E[bl_end] = 0.0; /* find outer bounds GL, GU for block and spectral diameter */ gl = D[bl_begin]; gu = D[bl_begin]; for (i = bl_begin; i <= bl_end; i++) { gl = fmin(gl, gersch[2*i] ); gu = fmax(gu, gersch[2*i+1]); } spdiam = gu - gl; /* find approximation of extremal eigenvalues of the block * dlarrk computes one eigenvalue of tridiagonal matrix T * tmp1 and tmp2 one hold the eigenvalue and error, respectively */ LAPACK(dlarrk) (&bl_size, &IONE, &gl, &gu, &D[bl_begin], &E2[bl_begin], &pivmin, &rtl, &tmp1, &tmp2, &info); assert(info == 0); /* if info=-1 => eigenvalue did not converge */ isleft = fmax(gl, tmp1-tmp2 - HUNDRED*DBL_EPSILON*fabs(tmp1-tmp2) ); LAPACK(dlarrk) (&bl_size, &bl_size, &gl, &gu, &D[bl_begin], &E2[bl_begin], &pivmin, &rtl, &tmp1, &tmp2, &info); assert(info == 0); /* if info=-1 => eigenvalue did not converge */ isright = fmin(gu, tmp1+tmp2 + HUNDRED*DBL_EPSILON*fabs(tmp1+tmp2) ); spdiam = isright - isleft; /* compute negcount at points s1 and s2 */ s1 = isleft + HALF * spdiam; s2 = isright - FOURTH * spdiam; /* not needed currently */ /* compute negcount at points s1 and s2 */ /* cnt = number of eigenvalues in (s1,s2] = count_right - count_left * negcnt_lft = number of eigenvalues smaller equals than s1 * negcnt_rgt = number of eigenvalues smaller equals than s2 */ LAPACK(dlarrc) ("T", &bl_size, &s1, &s2, &D[bl_begin], &E[bl_begin], &pivmin, &cnt, &negcnt_lft, &negcnt_rgt, &info); assert(info == 0); /* if more of the desired eigenvectors are in the left part shift left * and the other way around */ if ( negcnt_lft >= bl_size - negcnt_lft ) { /* shift left */ sigma = isleft; sgndef = ONE; } else { /* shift right */ sigma = isright; sgndef = -ONE; } /* define increment to perturb initial shift to find RRR * with not too much element growth */ tau = spdiam*DBL_EPSILON*n + 2.0*pivmin; /* try to find initial RRR of block: * need work space of 3*n here to store D, L, D^-1 of possible * representation: * D_try = work[0 : n-1] * L_try = work[n :2*n-1] * inv(D_try) = work[2*n:3*n-1] */ off_L = n; off_invD = 2*n; for (jtry = 0; jtry < MAX_TRY_RRRR; jtry++) { Dpivot = D[bl_begin] - sigma; work[0] = Dpivot; Dmax = fabs( work[0] ); j = bl_begin; for (i = 0; i < bl_size-1; i++) { work[i+off_invD] = 1.0 / work[i]; tmp = E[j] * work[i+off_invD]; work[i+off_L] = tmp; Dpivot = (D[j+1] - sigma) - tmp*E[j]; work[i+1] = Dpivot; Dmax = fmax(Dmax, fabs(Dpivot) ); j++; } /* except representation only if not too much element growth */ if (Dmax > MAX_GROWTH*spdiam) { noREP = true; } else { noREP = false; } if (noREP == true) { /* if all eigenvalues are desired shift is made definite to use DQDS * so we should not end here */ if (jtry == MAX_TRY_RRRR-2) { if (sgndef == ONE) { /* floating point comparison okay here */ sigma = gl - FUDGE_FACTOR*spdiam*DBL_EPSILON*n - FUDGE_FACTOR*2.0*pivmin; } else { sigma = gu + FUDGE_FACTOR*spdiam*DBL_EPSILON*n + FUDGE_FACTOR*2.0*pivmin; } } else if (jtry == MAX_TRY_RRRR-1) { fprintf(stderr,"No initial representation could be found.\n"); exit(3); } else { sigma -= sgndef*tau; tau *= 2.0; continue; } } else { /* found representation */ break; } } /* end trying to find initial RRR of block */ /* save initial RRR and corresponding shift */ E[bl_end] = sigma; memcpy(&D[bl_begin], &work[0], bl_size * sizeof(double) ); memcpy(&E[bl_begin], &work[n], (bl_size-1) * sizeof(double) ); /* work[0:4*n-1] can now be used again for anything */ /* perturb root rrr by small relative amount, first make sure * that at least two values are actually disturbed enough, * which might not be necessary */ while( fabs(randvec[bl_begin])*RAND_FACTOR < 1.0 ) randvec[bl_begin] *= 2.0; while( fabs(randvec[bl_end]) *RAND_FACTOR < 1.0 ) randvec[bl_end] *= 2.0; for (i=bl_begin; i<bl_end; i++) { D[i] *= 1.0 + DBL_EPSILON*RAND_FACTOR*randvec[i]; E[i] *= 1.0 + DBL_EPSILON*RAND_FACTOR*randvec[i+n]; } D[bl_end] *= 1.0 + DBL_EPSILON*RAND_FACTOR*randvec[bl_end]; /* REFINE EIGENVALUES WITH REPECT TO RRR */ /* count number of eigenvalues in block and find smallest * and largest index of block */ bl_m = 0; i_low = n; i_upp = 1; for (i=bl_Wbegin; i<num_vals; i++) { if (iblock[i] == jbl+1) { bl_m++; i_low = imin(i_low, Windex[i]); i_upp = imax(i_upp, Windex[i]); } else { break; } } if (bl_m == 0) { bl_begin = bl_end + 1; continue; /* go to next block */ } /* last index of W to store eigenvalues of block */ bl_Wend = bl_Wbegin + bl_m - 1; /* calculate gaps */ for (i=bl_Wbegin; i<bl_Wend; i++) { Wgap[i] = fmax(0.0, (W[i+1] - Werr[i+1]) - (W[i] + Werr[i]) ); } Wgap[bl_Wend] = fmax(0.0, gu - (W[bl_Wend] + Werr[bl_Wend]) ); /* shift eigenvalues to be consistent with dqds * and compute eigenvalues of SHIFTED matrix */ for (i=bl_Wbegin; i<=bl_Wend; i++) { W[i] -= sigma; Werr[i] += fabs(W[i])*DBL_EPSILON; } /* work for sequential dlarrb = work[0:2*n-1] * iwork for sequential dlarrb = iwork[0:2*n-1] * DE2 = work[2*n:3*n-1] strting at bl_begin */ off_DE2 = 2*n; /* compute DE2 at store it in work[bl_begin+2*n:bl_end-1+2*n] */ for (i=bl_begin; i<bl_end; i++) { work[i+off_DE2] = D[i]*E[i]*E[i]; } nthreads = max_nthreads; while (nthreads > 1 && bl_m/nthreads < 2) { nthreads--; } if (nthreads > 1) { rf_begin = bl_Wbegin; chunk = bl_m / nthreads; for (i=1; i<nthreads; i++) { rf_end = rf_begin + chunk - 1; auxarg2 = create_auxarg2(bl_size, &D[bl_begin], &work[bl_begin+off_DE2], rf_begin, rf_end, Wstruct, tolstruct->rtol1, tolstruct->rtol2, pivmin, spdiam); info = pthread_create(&threads[i], &attr, eigval_subset_thread_r, (void *) auxarg2); assert(info == 0); rf_begin = rf_end + 1; } rf_end = bl_Wend; auxarg2 = create_auxarg2(bl_size, &D[bl_begin], &work[bl_begin+off_DE2], rf_begin, rf_end, Wstruct, tolstruct->rtol1, tolstruct->rtol2, pivmin, spdiam); status = eigval_subset_thread_r( (void *) auxarg2 ); assert(status == NULL); /* join threads */ for (i=1; i<nthreads; i++) { info = pthread_join(threads[i], &status); assert(info == 0 && status == NULL); } /* should update gaps at splitting points here, but the gaps * will be recomputed anyway */ } else { offset = i_low-1; /* refine eigenvalues found by dlarrd for i_low:i_upp */ LAPACK(dlarrb) (&bl_size, &D[bl_begin], &work[bl_begin+off_DE2], &i_low, &i_upp, &tolstruct->rtol1, &tolstruct->rtol2, &offset, &W[bl_Wbegin], &Wgap[bl_Wbegin], &Werr[bl_Wbegin], work, iwork, &pivmin, &spdiam, &bl_size, &info); assert(info == 0); /* needs work of dim(2*n) and iwork of dim(2*n) */ } /* dlarrb computes gaps correctly, but not last one; * this is ignored since the gaps are recomputed anyway */ /* this makes sure that the indices are in the right order */ for (i=i_low; i<=i_upp; i++) { Windex[m] = i; m++; } /* proceed with next block */ bl_begin = bl_end + 1; bl_Wbegin = bl_Wend + 1; } /* end of loop over unreduced blocks */ /* clean up */ free(randvec); free(threads); if (max_nthreads > 1) { pthread_attr_destroy(&attr); } return(0); }
static int eigval_refine_proc (proc_t *procinfo, int ifirst, int ilast, int n, double *D, double *E, double *E2, int *Windex, int *iblock, double *gersch, tol_t *tolstruct, double *W, double *Werr, double *Wgap, double *work, int *iwork) { /* Input parameter */ int isize = ilast-ifirst+1; double pivmin = tolstruct->pivmin; /* Multithreading */ int nthreads; int max_nthreads = procinfo->nthreads; int chunk; pthread_t *threads; pthread_attr_t attr; auxarg2_t *auxarg2; int info, i; /* Allocate space */ threads = (pthread_t *) malloc( max_nthreads * sizeof(pthread_t) ); assert(threads != NULL); int *isplit = (int *) malloc( n * sizeof(int) ); assert(isplit != NULL); /* This is an unreduced block (nsplit=1) */ isplit[0] = n; /* Prepare multi-threading */ if (max_nthreads > 1) { pthread_attr_init(&attr); pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_JOINABLE); pthread_attr_setscope(&attr, PTHREAD_SCOPE_SYSTEM); } /* find outer bounds GL, GU for block and spectral diameter */ double gl = D[0]; double gu = D[0]; for (i = 0; i < n; i++) { gl = fmin(gl, gersch[2*i] ); gu = fmax(gu, gersch[2*i+1]); } double spdiam = gu - gl; /* REFINE EIGENVALUES i_low:i_upp WITH REPECT TO RRR */ int i_low = Windex[0]; int i_upp = Windex[isize-1]; double sigma = E[n-1]; /* calculate gaps */ for (i=0; i<isize-1; i++) { Wgap[i] = fmax(0.0, (W[i+1] - Werr[i+1]) - (W[i] + Werr[i]) ); } Wgap[isize-1] = fmax(0.0, gu - (W[isize-1] + Werr[isize-1]) ); /* shift eigenvalues to be consistent with dqds * and compute eigenvalues of SHIFTED matrix */ for (i=0; i<isize; i++) { W[i] -= sigma; Werr[i] += fabs(W[i])*DBL_EPSILON; } /* work for sequential odrrb = work[0:2*n-1] * iwork for sequential odrrb = iwork[0:2*n-1] * DE2 = work[2*n:3*n-1] strting at bl_begin */ int off_DE2 = 2*n; /* compute DE2 at store it in work[bl_begin+2*n:bl_end-1+2*n] */ for (i=0; i<n; i++) { work[i+off_DE2] = D[i]*E[i]*E[i]; } nthreads = max_nthreads; while (nthreads > 1 && isize/nthreads < 2) { nthreads--; } if (nthreads > 1) { int rf_begin=0, rf_end; chunk = isize / nthreads; for (i=1; i<nthreads; i++) { rf_end = rf_begin + chunk - 1; auxarg2 = create_auxarg2(n, D, &work[off_DE2], rf_begin, rf_end, W, Werr, Wgap, Windex, tolstruct->rtol1, tolstruct->rtol2, pivmin, spdiam); info = pthread_create(&threads[i], &attr, eigval_subset_thread_r, (void *) auxarg2); assert(info == 0); rf_begin = rf_end + 1; } rf_end = isize-1; auxarg2 = create_auxarg2(n, D, &work[off_DE2], rf_begin, rf_end, W, Werr, Wgap, Windex, tolstruct->rtol1, tolstruct->rtol2, pivmin, spdiam); void *status = eigval_subset_thread_r( (void *) auxarg2 ); assert(status == NULL); /* join threads */ for (i=1; i<nthreads; i++) { info = pthread_join(threads[i], &status); assert(info == 0 && status == NULL); } /* should update gaps at splitting points here, but the gaps * will be recomputed anyway */ } else { int offset = i_low-1; /* refine eigenvalues found by odrrb for i_low:i_upp */ odrrb(&n, D, &work[off_DE2], &i_low, &i_upp, &tolstruct->rtol1, &tolstruct->rtol2, &offset, W, Wgap, Werr, work, iwork, &pivmin, &spdiam, &n, &info); assert(info == 0); /* needs work of dim(2*n) and iwork of dim(2*n) */ } /* odrrb computes gaps correctly, but not last one; * this is ignored since the gaps are recomputed anyway */ /* clean up */ free(threads); free(isplit); if (max_nthreads > 1) { pthread_attr_destroy(&attr); } return 0; }