/*
 * SETAUXVAR: fill in the Aux field of every body, as selected by the
 * "auxvar" parameter:
 *
 *   "mass"   Aux = mass_gsp(ggsp, |Pos|)
 *   "rperi"  Aux = midpoint of a bisection bracket, started at [0, |Pos|],
 *            on the test  sqrt(2 (E - phi_gsp(ggsp, r))) > J / r,  where
 *            E = 0.5 Vel.Vel + Phi and J = |Pos x Vel|; the bracket is
 *            narrowed until its width is no more than TOL * r.
 *
 * Any other value of "auxvar" is reported through error().
 */
void setauxvar(bodyptr btab, int nbody)
{
    bodyptr body;
    vector angmom;
    real jmag, energy, rlo, rhi, rmid;

    if (streq(getparam("auxvar"), "mass")) {
        for (body = btab; body < NthBody(btab, nbody); body = NextBody(body)) {
            Aux(body) = mass_gsp(ggsp, absv(Pos(body)));
        }
    } else if (streq(getparam("auxvar"), "rperi")) {
        for (body = btab; body < NthBody(btab, nbody); body = NextBody(body)) {
            CROSSVP(angmom, Pos(body), Vel(body));
            jmag = absv(angmom);
            energy = 0.5 * dotvp(Vel(body), Vel(body)) + Phi(body);
            rlo = 0.0;
            rhi = absv(Pos(body));
            /* bisect: keep the half of the bracket picked by the test */
            for (rmid = 0.5 * (rlo + rhi);
                 (rhi - rlo) > TOL * rmid;
                 rmid = 0.5 * (rlo + rhi)) {
                if (rsqrt(2 * (energy - phi_gsp(ggsp, rmid))) > jmag/rmid)
                    rhi = rmid;
                else
                    rlo = rmid;
            }
            Aux(body) = rmid;
        }
    } else {
        error("%s: unknown auxvar option %s\n",
              getargv0(), getparam("auxvar"));
    }
}
// -------- begin of function Music::play ---------// // <int> songId // <int> playType 0 = non-looped, 1 = looped int Music::play(int songId, int playType) { if( !init_flag ) return 0; stop(); if( audio.wav_init_flag ) { String waveFileStr(DIR_MUSIC); waveFileStr += music_file[songId-1]; if( !DIR_MUSIC[0] || !m.is_file_exist(waveFileStr) || !audio.wav_init_flag ) return 0; if( playType & MUSIC_PLAY_LOOPED ) { AbsVolume absv(config.wav_music_volume,0); music_channel = audio.play_loop_wav(waveFileStr, absv ); } else { AbsVolume absv(config.wav_music_volume,0); music_channel = audio.play_long_wav(waveFileStr, absv ); } play_type = playType; song_id = songId; return music_channel >= 0; } return 0; }
/*
 * RADRANK: qsort comparison function which orders bodies by |Pos|,
 * smallest first.  Both arguments are treated as body pointers.
 */
int radrank(const void *a, const void *b)
{
    real rad1 = absv(Pos((bodyptr) a));
    real rad2 = absv(Pos((bodyptr) b));

    if (rad1 < rad2)
        return (-1);
    if (rad1 > rad2)
        return (1);
    return (0);
}
/*
 * len4: approximate length of the 4D integer vector (a, b, c, d).
 *
 * The components are made non-negative and partially sorted so that a
 * holds the largest and d the smallest, then combined linearly.
 *
 * The parameters are now declared explicitly as int; the former
 * old-style definition relied on implicit int, which C99 and C++ reject.
 *
 * NOTE(review): absv() and inorder() are used as statements and are
 * presumably macros that modify their arguments in place -- confirm
 * against their definitions.
 */
int len4(int a, int b, int c, int d)
{
    absv(a); absv(b);           /* get the absolute values */
    absv(c); absv(d);           /* (component magnitudes) */
    inorder(a, b); inorder(c, d);   /* everyone has a chance to play */
    inorder(a, c); inorder(b, d);   /* (a,d) are big (winner, loser) */
    inorder(b, c);              /* playoff for 2nd and 3rd slots */
    a += (25*b + 19*c + 16*d)/60;   /* compute 4D approximate length */
    /* a += (5*b + 4*c + 3*d)/12;   .. only .1% worse; easy to eval */
    a++;                        /* Roundoff -> underestimation */
    return(a);                  /* omit the above one bit jitter */
}
// -------- begin of function Music::play ---------// // <int> songId // <int> playType 0 = non-looped, 1 = looped int Music::play(int songId, int playType) { if( !init_flag ) return 0; stop(); #ifdef BUNDLE // disable CD music playType &= ~MUSIC_CD_THEN_WAV & ~MUSIC_PLAY_CD; #endif if( playType & MUSIC_CD_THEN_WAV ) { return play(songId, playType & ~MUSIC_CD_THEN_WAV | MUSIC_PLAY_CD) || play(songId, playType & ~MUSIC_CD_THEN_WAV & ~MUSIC_PLAY_CD); } else if( playType & MUSIC_PLAY_CD ) { if( audio.cd_init_flag && audio.play_cd(songId +1, config.cd_music_volume) ) // skip the first data track { play_type = playType; song_id = songId; music_channel = 1; return 1; } return 0; } else { if( audio.wav_init_flag ) { String waveFileStr(DIR_MUSIC); waveFileStr += music_file[songId-1]; if( !DIR_MUSIC[0] || !misc.is_file_exist(waveFileStr) || !audio.wav_init_flag ) return 0; if( playType & MUSIC_PLAY_LOOPED ) { AbsVolume absv(config.wav_music_volume,0); music_channel = audio.play_loop_wav(waveFileStr, 0, absv ); } else { AbsVolume absv(config.wav_music_volume,0); music_channel = audio.play_long_wav(waveFileStr, absv ); } play_type = playType; song_id = songId; return music_channel != 0; } return 0; } }
int multimut (int oribase) { int mutbase; double _uni; _uni = ran1 (idum); if (oribase == 3) _uni *= mutmatrix[3][2]; else _uni *= mutmatrix[oribase][3]; for (mutbase = 0; mutbase < 4; mutbase++) { //printf ("(%f,%f)\t",_uni,mutmatrix[oribase][mutbase]); if (_uni <= mutmatrix[oribase][mutbase]) { if ((absv (mutbase - oribase)) == 2) numutstrans++; else numutstranv++; return (mutbase); } } printf ("error in multimut\n"); myassert (0); exit (-1); }
void polymodel(void) { gsl_interp_accel *pmsplacc = gsl_interp_accel_alloc(); bodyptr p; real rad, phi, vel, psi, vr, vp, a, E, J; vector rhat, vtmp, vper; for (p = btab; p < NthBody(btab, nbody); p = NextBody(p)) { rad = rad_m(xrandom(0.0, mtot)); phi = gsl_spline_eval(pmspline, (double) rad, pmsplacc); vel = pick_v(phi); psi = pick_psi(); vr = vel * rcos(psi); vp = vel * rsin(psi); Mass(p) = mtot / nbody; pickshell(rhat, NDIM, 1.0); MULVS(Pos(p), rhat, rad); pickshell(vtmp, NDIM, 1.0); a = dotvp(vtmp, rhat); MULVS(vper, rhat, - a); ADDV(vper, vper, vtmp); a = absv(vper); MULVS(vper, vper, vp / a); MULVS(Vel(p), rhat, vr); ADDV(Vel(p), Vel(p), vper); Phi(p) = phi; E = phi + 0.5 * rsqr(vel); J = rad * ABS(vp); Aux(p) = Kprime * rpow(phi1 - E, npol - 1.5) * rpow(J, 2 * mpol); } gsl_interp_accel_free(pmsplacc); }
/*
 * MAIN: read a general spherical profile ("gsp") and a snapshot ("in"),
 * set the Aux field of each body to the profile quantity selected by
 * "option" (rho, drho, mass or phi) evaluated at |Pos|, and write the
 * snapshot to "out" if that parameter is not empty.
 */
int main(int argc, string argv[])
{
    stream gspstr, instr, outstr;
    gsprof *gsp;
    bodyptr btab = NULL, bp;
    int nbody;
    real tnow;
    string intags[MaxBodyFields];

    initparam(argv, defv);
    layout_body(bodyfields, Precision, NDIM);

    /* read the profile */
    gspstr = stropen(getparam("gsp"), "r");
    get_history(gspstr);
    gsp = get_gsprof(gspstr);

    /* read the snapshot; positions are required */
    instr = stropen(getparam("in"), "r");
    get_history(instr);
    if (! get_snap(instr, &btab, &nbody, &tnow, intags, TRUE))
        error("%s: snapshot input failed\n", getargv0());
    if (! set_member(intags, PosTag))
        error("%s: position data missing\n", getargv0());

    /* evaluate the selected quantity for every body */
    if (streq(getparam("option"), "rho")) {
        for (bp = btab; bp < NthBody(btab, nbody); bp = NextBody(bp)) {
            Aux(bp) = rho_gsp(gsp, absv(Pos(bp)));
        }
    } else if (streq(getparam("option"), "drho")) {
        for (bp = btab; bp < NthBody(btab, nbody); bp = NextBody(bp)) {
            Aux(bp) = drho_gsp(gsp, absv(Pos(bp)));
        }
    } else if (streq(getparam("option"), "mass")) {
        for (bp = btab; bp < NthBody(btab, nbody); bp = NextBody(bp)) {
            Aux(bp) = mass_gsp(gsp, absv(Pos(bp)));
        }
    } else if (streq(getparam("option"), "phi")) {
        for (bp = btab; bp < NthBody(btab, nbody); bp = NextBody(bp)) {
            Aux(bp) = phi_gsp(gsp, absv(Pos(bp)));
        }
    } else {
        error("%s: unknown option %s\n", getargv0(), getparam("option"));
    }

    /* write the result, if an output file was named */
    if (! strnull(getparam("out"))) {
        outstr = stropen(getparam("out"), "w");
        put_history(outstr);
        put_snap(outstr, &btab, &nbody, &tnow,
                 set_union(bodyfields, intags));
        strclose(outstr);
    }
    return (0);
}
// -------- begin of function Music::change_volume --------// void Music::change_volume(int vol) { if( !init_flag ) return; if( is_playing() ) { AbsVolume absv(vol,0); audio.volume_long_wav(music_channel, DsVolume(absv)); } }
/*
 * PICKTRIAD: fill x, y, z with a randomly oriented triad of vectors.
 * x and a trial z are drawn with pickshell(); y is the normalized cross
 * product of x and the trial z, and z is then replaced by x cross y.
 */
void picktriad(vector x, vector y, vector z)
{
    real ylen;

    pickshell(x, NDIM, 1.0);
    pickshell(z, NDIM, 1.0);        /* trial vector, replaced below */

    CROSSVP(y, x, z);
    ylen = absv(y);                 /* length taken before y is scaled */
    DIVVS(y, y, ylen);

    CROSSVP(z, x, y);
}
/*
 * GSPFORCES: set the potential and acceleration of every body from the
 * profile gravgsp:  Phi = phi_gsp(r),  Acc = - Pos * mass_gsp(r) / r^3,
 * where r = |Pos|.
 */
local void gspforces(bodyptr btab, int nbody, gsprof *gravgsp)
{
    bodyptr body = btab;
    real rad, coef;

    while (body < NthBody(btab, nbody)) {
        rad = absv(Pos(body));
        Phi(body) = phi_gsp(gravgsp, rad);
        coef = mass_gsp(gravgsp, rad) / rqbe(rad);
        MULVS(Acc(body), Pos(body), -coef);
        body = NextBody(body);
    }
}
/*
 * HQMFORCES: set the potential and acceleration of every body for a model
 * of mass M with scale parameters a and b.
 *
 * When a == b the closed forms Phi = -M / (a + r) and
 * Acc = -Pos * M (r / (a + r))^2 / r^3 are used.  Otherwise the potential
 * and the cylindrical acceleration components are obtained by numerical
 * integration (gsl_integration_qagiu) of intPhi, int_aR and int_az with
 * tolerance tol.  Integration errors are reported once per new maximum of
 * the absolute error estimate.
 *
 * A body at the origin (spherical case) or on the symmetry axis (R == 0,
 * flattened case) used to get NaN accelerations from a 0/0 division; the
 * affected components are now set to zero, as required by symmetry.
 */
local void hqmforces(bodyptr btab, int nbody, real M, real a, real b,
		     real tol)
{
  bodyptr bp;
  double r, mr3i, params[4], phi0, aR0, az0, abserr[3];
  static gsl_integration_workspace *wksp = NULL;
  gsl_function FPhi, F_aR, F_az;
  static double maxerr = 0.0;
  int stat[3];

  if (a == b) {					// spherical case is easy!
    for (bp = btab; bp < NthBody(btab, nbody); bp = NextBody(bp)) {
      r = absv(Pos(bp));
      Phi(bp) = - M / (a + r);
      // at r == 0 the expression is 0/0; Pos is zero there, so is Acc
      mr3i = (r > 0.0 ? M * rsqr(r / (a + r)) / rqbe(r) : 0.0);
      MULVS(Acc(bp), Pos(bp), - mr3i);
    }
  } else {					// flattened case is harder
    if (wksp == NULL) {				// on first call, initialize
      wksp = gsl_integration_workspace_alloc(1000);
      gsl_set_error_handler_off();		// handle errors below
    }
    FPhi.function = &intPhi;
    F_aR.function = &int_aR;
    F_az.function = &int_az;
    FPhi.params = F_aR.params = F_az.params = params;
    a2(params) = rsqr(a);
    b2(params) = rsqr(b);
    for (bp = btab; bp < NthBody(btab, nbody); bp = NextBody(bp)) {
      R(params) = rsqrt(rsqr(Pos(bp)[0]) + rsqr(Pos(bp)[1]));
      z(params) = Pos(bp)[2];
      stat[0] = gsl_integration_qagiu(&FPhi, 0.0, tol, 0.0,
				      1000, wksp, &phi0, &abserr[0]);
      stat[1] = gsl_integration_qagiu(&F_aR, 0.0, tol, 0.0,
				      1000, wksp, &aR0, &abserr[1]);
      stat[2] = gsl_integration_qagiu(&F_az, 0.0, tol, 0.0,
				      1000, wksp, &az0, &abserr[2]);
      if (stat[0] || stat[1] || stat[2])	// any errors reported?
	for (int i = 0; i < 3; i++)
	  if (stat[i] != 0 && abserr[i] > maxerr) {
	    eprintf("[%s.hqmforces: warning: %s  abserr[%d] = %g]\n",
		    getprog(), gsl_strerror(stat[i]), i+1, abserr[i]);
	    maxerr = abserr[i];			// adjust reporting threshold
	  }
      Phi(bp) = - M * phi0;
      if (R(params) > 0.0) {
	Acc(bp)[0] = - M * (Pos(bp)[0] / R(params)) * aR0;
	Acc(bp)[1] = - M * (Pos(bp)[1] / R(params)) * aR0;
      } else {					// on axis: no radial force
	Acc(bp)[0] = 0.0;
	Acc(bp)[1] = 0.0;
      }
      Acc(bp)[2] = - M * az0;
    }
  }
}
/*
 * crossn: store the cross product of a and b in c (via cross()), then
 * scale c to unit length.  The scaling is skipped when the length of c
 * is below 1e-30.  Always returns 0.
 *
 * The `register` storage class was dropped: it is ignored by current
 * compilers and is no longer valid in C++17.
 */
static int crossn (double *a, double *b, double *c)
{
    double norm;
    int k;

    cross (a, b, c);
    norm = absv (c);
    if (norm >= 1e-30) {
        for (k = 0; k < 3; k++)
            c[k] /= norm;
    }
    return (0);
}
// -------- begin of function Music::change_volume --------// void Music::change_volume(int vol) { if( !init_flag ) return; if( is_playing() ) { if( play_type & MUSIC_PLAY_CD ) { audio.set_cd_volume(vol); } else { AbsVolume absv(vol,0); audio.volume_long_wav(music_channel, DsVolume(absv)); } } }
/*
 * SETPROFILE: accumulate body masses into logarithmically spaced radial
 * bins.
 *
 * prof1[j] receives the sum of Mass and prof2[j] the sum of Mass^2 of the
 * bodies in bin j.  Bins 1 .. nprof divide log10(r) evenly between
 * log10(rrange[0]) and log10(rrange[1]); bin 0 collects bodies below the
 * range and bin nprof+1 those above it, so both arrays must hold
 * nprof + 2 elements.  The arrays are added to, not cleared.
 *
 * The bin number is clamped while still in floating point.  Converting
 * first, as was done before, is undefined for values an int cannot hold,
 * e.g. the -infinity produced by a body at the origin.  Such bodies, and
 * any NaN, now land in bin 0.
 */
void setprofile(real *prof1, real *prof2, int nprof, real rrange[],
                bodyptr btab, int nbody)
{
    real logrmin, logrdif, logr;
    double bin;
    bodyptr bp;
    int j;

    logrmin = rlog10(rrange[0]);
    logrdif = rlog10(rrange[1] / rrange[0]);
    for (bp = btab; bp < NthBody(btab, nbody); bp = NextBody(bp)) {
        logr = rlog10(absv(Pos(bp)));
        bin = 1 + floor(nprof * (logr - logrmin) / logrdif);
        if (!(bin > 0))                 /* below range, or NaN */
            j = 0;
        else if (bin > nprof + 1)       /* above range */
            j = nprof + 1;
        else
            j = (int) bin;
        prof1[j] += Mass(bp);
        prof2[j] += rsqr(Mass(bp));
    }
}
// treeforce: supervise force calculation. // _______________________________________ local void treeforce(void) { bodyptr p1, p2, p; real r, mr3i; p1 = bodytab + MAX(nstatic, 0); // set dynamic body range p2 = bodytab + nbody + MIN(nstatic, 0); for (p = bodytab; p < bodytab+nbody; p++) // loop over all bodies Update(p) = (testcalc ? p1 <= p && p < p2 : TRUE); // flag bodies to update maketree(bodytab, nbody); // construct tree structure gravcalc(); // compute current forces forcereport(); // print force statistics #if defined(EXTGRAV) for (p = bodytab; p < bodytab+nbody; p++) // loop over all bodies if (Update(p) && gravgsp != NULL) { // update in extern field? r = absv(Pos(p)); // get distance from origin mr3i = - mass_gsp(gravgsp, r) / (r*r*r); ADDMULVS(Acc(p), Pos(p), mr3i); // add extern acc and phi Phi(p) += phi_gsp(gravgsp, r); } #endif }
/*
 * SNAPGSP: build a general spherical profile with npoint radii from the
 * bodies of a snapshot.
 *
 * The body array is sorted in place by radius (radrank).  One table entry
 * is made for each group of nbody/npoint consecutive bodies: the radius of
 * the last body in the group and the mass summed up to and including it.
 * The density table is the derivative of a spline through the mass table,
 * divided by 4 pi r^2.  alpha and beta are stored unchanged.
 *
 * npoint must divide nbody, otherwise error() is called.
 *
 * NOTE(review): the spline coefficient array is not stored in the result
 * and is not released here -- confirm whether that is intended.
 */
gsprof *snapgsp(bodyptr btab, int nbody, int npoint, real alpha, real beta)
{
    gsprof *prof;
    real *radtab, *dentab, *mastab, *coef, msum;
    int stride, ibody, ipoint;

    if (nbody % npoint != 0)
        error("%s: npoint must divide nbody\n", getargv0());

    prof = (gsprof *) allocate(sizeof(gsprof));
    radtab = (real *) allocate(npoint * sizeof(real));
    dentab = (real *) allocate(npoint * sizeof(real));
    mastab = (real *) allocate(npoint * sizeof(real));
    coef = (real *) allocate(3 * npoint * sizeof(real));

    qsort(btab, nbody, SizeofBody, radrank);

    /* tabulate radius and enclosed mass at the end of each group */
    stride = nbody / npoint;
    msum = 0.0;
    ipoint = 0;
    for (ibody = 0; ibody < nbody; ibody++) {
        msum += Mass(NthBody(btab, ibody));
        if (ibody % stride != stride - 1)
            continue;
        radtab[ipoint] = absv(Pos(NthBody(btab, ibody)));
        mastab[ipoint] = msum;
        ipoint++;
    }

    /* density from the derivative of the mass profile */
    spline(coef, radtab, mastab, npoint);
    for (ipoint = 0; ipoint < npoint; ipoint++) {
        dentab[ipoint] =
            spldif(radtab[ipoint], radtab, mastab, coef, npoint) /
              (FOUR_PI * rsqr(radtab[ipoint]));
    }

    prof->npoint = npoint;
    prof->radius = radtab;
    prof->density = dentab;
    prof->alpha = alpha;
    prof->beta = beta;
    prof->mass = mastab;
    prof->mtot = msum;
    return (prof);
}
void sphrprof(void) { bodyptr *rsort; int skip, j, i; real r; if (nspheroid > 1) { rsort = (bodyptr *) allocate(nspheroid * sizeof(bodyptr)); for (i = 0; i < nspheroid; i++) rsort[i] = NthBody(spheroid, i); qsort(rsort, nspheroid, sizeof(bodyptr), rankrad); skip = (int) rceil(MAX(((real) nspheroid) / (NTAB - 1), 1.0)); msph[0] = rsph[0] = 0.0; for (j = 1; j < NTAB; j++) { i = skip * j - 1; rsph[j] = (i < nspheroid ? absv(Pos(rsort[i])) : 1.0 + rsph[j-1]); msph[j] = 0.0; } for (i = 0; i < nspheroid; i++) { j = i / skip + 1; if (j > NTAB-1) error("%s.sphrprof: table overflow: i = %d skip = %d\n", getargv0(), i, skip); msph[j] = msph[j] + Mass(rsort[i]); } for (j = 1; j < NTAB; j++) msph[j] += msph[j - 1]; } else { for (j = 0; j < NTAB; j++) { rsph[j] = 1.0 * j; msph[j] = Mass(NthBody(spheroid, 0)); } } eprintf("[%s.sphrprof: rsph = %f %f %f ... %f %f]\n", getargv0(), rsph[0], rsph[1], rsph[2], rsph[NTAB-2], rsph[NTAB-1]); spline(mcof, rsph, msph, NTAB); }
/*
 * simplify_angle_180: reduce an angle in radians to the equivalent angle
 * between -PI and PI.
 *
 * Angles above PI or below -PI are shifted by a whole number of turns.
 * A result within ANGLE_TOLERANCE of -PI is returned as PI.
 *
 * The lower test used to be `angle < PI`, which sent every angle between
 * 0 and PI through a subtract-then-add of 2 PI; the value came back
 * changed by rounding error, and very small angles came back as 0.
 * Angles already in range are now returned untouched.
 */
double simplify_angle_180(double angle)
{
    double mult;

    if (angle > PI) {
        mult = angle/(2.0*PI);
        mult = floor(mult);
        angle -= mult*2.0*PI;
        if (angle > PI)
            angle -= 2.0 * PI;
    } else if (angle < -PI) {
        mult = angle/(-2.0*PI);
        mult = floor(mult);
        angle += mult*2.0*PI;
        if (angle < -PI)
            angle += 2.0 * PI;
    }

    /* fold the -PI end of the range onto +PI */
    if (absv(angle+PI) < ANGLE_TOLERANCE)
        angle = PI;
    return angle;
}
/* //////////////////////////////////////////////////////////////////////////// -- Testing cheevd */ int main( int argc, char** argv) { TESTING_CUDA_INIT(); cuFloatComplex *h_A, *h_R, *h_work; float *rwork, *w1, *w2; magma_int_t *iwork; float gpu_time, cpu_time; magma_timestr_t start, end; /* Matrix size */ magma_int_t N=0, n2; magma_int_t size[8] = {1024,2048,3072,4032,5184,6016,7040,8064}; magma_int_t i, info; magma_int_t ione = 1, izero = 0; magma_int_t ISEED[4] = {0,0,0,1}; const char *uplo = MagmaLowerStr; const char *jobz = MagmaVectorsStr; magma_int_t checkres; float result[3], eps = lapackf77_slamch( "E" ); if (argc != 1){ for(i = 1; i<argc; i++){ if (strcmp("-N", argv[i])==0) { N = atoi(argv[++i]); } else if ( strcmp("-JV", argv[i]) == 0 ) { jobz = MagmaVectorsStr; } else if ( strcmp("-JN", argv[i]) == 0 ) { jobz = MagmaNoVectorsStr; } } if (N>0) printf(" testing_cheevd -N %d [-JV] [-JN]\n\n", (int) N); else { printf("\nUsage: \n"); printf(" testing_cheevd -N %d [-JV] [-JN]\n\n", (int) N); exit(1); } } else { printf("\nUsage: \n"); printf(" testing_cheevd -N %d [-JV] [-JN]\n\n", 1024); N = size[7]; } checkres = getenv("MAGMA_TESTINGS_CHECK") != NULL; if ( checkres && jobz[0] == MagmaNoVectors ) { printf( "Cannot check results when vectors are not computed (jobz='N')\n" ); checkres = false; } /* Query for workspace sizes */ cuFloatComplex aux_work[1]; float aux_rwork[1]; magma_int_t aux_iwork[1]; magma_cheevd( jobz[0], uplo[0], N, h_R, N, w1, aux_work, -1, aux_rwork, -1, aux_iwork, -1, &info ); magma_int_t lwork, lrwork, liwork; lwork = (magma_int_t) MAGMA_C_REAL( aux_work[0] ); lrwork = (magma_int_t) aux_rwork[0]; liwork = aux_iwork[0]; /* Allocate host memory for the matrix */ TESTING_MALLOC( h_A, cuFloatComplex, N*N ); TESTING_MALLOC( w1, float , N ); TESTING_MALLOC( w2, float , N ); TESTING_HOSTALLOC( h_R, cuFloatComplex, N*N ); TESTING_HOSTALLOC( h_work, cuFloatComplex, lwork ); TESTING_MALLOC( rwork, float, lrwork ); TESTING_MALLOC( iwork, 
magma_int_t, liwork ); printf(" N CPU Time(s) GPU Time(s) \n"); printf("===================================\n"); for(i=0; i<8; i++){ if (argc==1){ N = size[i]; } n2 = N*N; /* Initialize the matrix */ lapackf77_clarnv( &ione, ISEED, &n2, h_A ); for( int i=0; i<N; i++) { MAGMA_C_SET2REAL( h_A[i*N+i], MAGMA_C_REAL(h_A[i*N+i]) ); } lapackf77_clacpy( MagmaUpperLowerStr, &N, &N, h_A, &N, h_R, &N ); /* warm up run */ magma_cheevd(jobz[0], uplo[0], N, h_R, N, w1, h_work, lwork, rwork, lrwork, iwork, liwork, &info); lapackf77_clacpy( MagmaUpperLowerStr, &N, &N, h_A, &N, h_R, &N ); /* query for optimal workspace sizes */ magma_cheevd(jobz[0], uplo[0], N, h_R, N, w1, h_work, -1, rwork, -1, iwork, -1, &info); int lwork_save = lwork; int lrwork_save = lrwork; int liwork_save = liwork; lwork = min( lwork, (magma_int_t) MAGMA_C_REAL( h_work[0] )); lrwork = min( lrwork, (magma_int_t) rwork[0] ); liwork = min( liwork, iwork[0] ); //printf( "lwork %d, query %d, used %d; liwork %d, query %d, used %d\n", // lwork_save, (magma_int_t) h_work[0], lwork, // liwork_save, iwork[0], liwork ); /* ==================================================================== Performs operation using MAGMA =================================================================== */ start = get_current_time(); magma_cheevd(jobz[0], uplo[0], N, h_R, N, w1, h_work, lwork, rwork, lrwork, iwork, liwork, &info); end = get_current_time(); gpu_time = GetTimerValue(start,end)/1000.; lwork = lwork_save; lrwork = lrwork_save; liwork = liwork_save; if ( checkres ) { /* ===================================================================== Check the results following the LAPACK's [zcds]drvst routine. 
A is factored as A = U S U' and the following 3 tests computed: (1) | A - U S U' | / ( |A| N ) (2) | I - U'U | / ( N ) (3) | S(with U) - S(w/o U) | / | S | =================================================================== */ float temp1, temp2; cuFloatComplex *tau; lapackf77_chet21(&ione, uplo, &N, &izero, h_A, &N, w1, w1, h_R, &N, h_R, &N, tau, h_work, rwork, &result[0]); lapackf77_clacpy( MagmaUpperLowerStr, &N, &N, h_A, &N, h_R, &N ); magma_cheevd('N', uplo[0], N, h_R, N, w2, h_work, lwork, rwork, lrwork, iwork, liwork, &info); temp1 = temp2 = 0; for(int j=0; j<N; j++){ temp1 = max(temp1, absv(w1[j])); temp1 = max(temp1, absv(w2[j])); temp2 = max(temp2, absv(w1[j]-w2[j])); } result[2] = temp2 / temp1; } /* ===================================================================== Performs operation using LAPACK =================================================================== */ start = get_current_time(); lapackf77_cheevd(jobz, uplo, &N, h_A, &N, w2, h_work, &lwork, rwork, &lrwork, iwork, &liwork, &info); end = get_current_time(); if (info < 0) printf("Argument %d of cheevd had an illegal value.\n", (int) -info); cpu_time = GetTimerValue(start,end)/1000.; /* ===================================================================== Print execution time =================================================================== */ printf("%5d %6.2f %6.2f\n", (int) N, cpu_time, gpu_time); if ( checkres ){ printf("Testing the factorization A = U S U' for correctness:\n"); printf("(1) | A - U S U' | / (|A| N) = %e\n", result[0]*eps); printf("(2) | I - U'U | / N = %e\n", result[1]*eps); printf("(3) | S(w/ U)-S(w/o U)|/ |S| = %e\n\n", result[2]); } if (argc != 1) break; } /* Memory clean up */ TESTING_FREE( h_A); TESTING_FREE( w1); TESTING_FREE( w2); TESTING_FREE( rwork); TESTING_FREE( iwork); TESTING_HOSTFREE(h_work); TESTING_HOSTFREE( h_R); /* Shutdown */ TESTING_CUDA_FINALIZE(); }
/* ////////////////////////////////////////////////////////////////////////////
   -- Testing ssygvd

   For every size and iteration selected by the command-line options,
   times magma_ssygvd_m.  When result checking is enabled, the residual of
   the computed eigen-decomposition is evaluated, magma_ssygvd and
   lapackf77_ssygvd are timed as well, and the eigenvalues from
   magma_ssygvd_m are compared with those left in w2 by the LAPACK run.
*/
int main( int argc, char** argv)
{
    TESTING_INIT_MGPU();

    float *h_A, *h_Ainit, *h_B, *h_Binit, *h_work;
    #if defined(PRECISION_z) || defined(PRECISION_c)
    float *rwork;
    #endif
    float *w1, *w2, result;
    magma_int_t *iwork;
    float mgpu_time, gpu_time, cpu_time;

    /* Matrix size */
    magma_int_t N=0, n2;

    magma_int_t info;
    magma_int_t ione = 1;

    float c_zero = MAGMA_S_ZERO;
    float c_one = MAGMA_S_ONE;
    float c_neg_one = MAGMA_S_NEG_ONE;

    magma_int_t ISEED[4] = {0,0,0,1};

    magma_timestr_t start, end;

    magma_opts opts;
    parse_opts( argc, argv, &opts );

    /* thresholds used for the " failed" markers printed below */
    float tol = opts.tolerance * lapackf77_slamch("E");
    float tolulp = opts.tolerance * lapackf77_slamch("P");

    char jobz = opts.jobz;
    int checkres = opts.check;

    char uplo = opts.uplo;
    magma_int_t itype = opts.itype;

    /* the residual check needs the eigenvectors */
    if ( checkres && jobz == MagmaNoVec ) {
        fprintf( stderr, "checking results requires vectors; setting jobz=V (option -JV)\n" );
        jobz = MagmaVec;
    }

    printf("using: nrgpu = %d, itype = %d, jobz = %c, uplo = %c, checkres = %d\n", (int) opts.ngpu, (int) itype, jobz, uplo, (int) checkres);

    printf(" N M nr GPU MGPU Time(s) \n");
    printf("====================================\n");
    for( int i = 0; i < opts.ntest; ++i ) {
        for( int iter = 0; iter < opts.niter; ++iter ) {
            N = opts.nsize[i];
            n2 = N*N;
            #if defined(PRECISION_z) || defined(PRECISION_c)
            magma_int_t lwork = 2*N + N*N;
            magma_int_t lrwork = 1 + 5*N +2*N*N;
            // MKL's ssygvd has a bug for small N - it looks like what is returned by a
            // query (consistent with LAPACK's number above) is different from the memory
            // requirement check (that returns info -11). The lwork increase below is needed
            // to pass this check.
            if (N<32)
                lwork = 34*32;
            #else
            magma_int_t lwork = 1 + 6*N + 2*N*N;
            #endif
            magma_int_t liwork = 3 + 5*N;

            TESTING_MALLOC_PIN( h_A, float, n2 );
            TESTING_MALLOC_PIN( h_B, float, n2 );
            TESTING_MALLOC_PIN( h_work, float, lwork );
            #if defined(PRECISION_z) || defined(PRECISION_c)
            TESTING_MALLOC_PIN( rwork, float, lrwork );
            #endif
            TESTING_MALLOC_CPU( w1, float, N );
            TESTING_MALLOC_CPU( w2, float, N );
            TESTING_MALLOC_CPU( iwork, magma_int_t, liwork );

            /* NOTE(review): this header is inside the loops, so it is
               printed again for every size and iteration, in addition to
               the header printed before the loops -- confirm intended */
            printf(" N CPU Time(s) GPU Time(s) MGPU Time(s) \n");
            printf("==================================================\n");

            /* Initialize the matrix */
            lapackf77_slarnv( &ione, ISEED, &n2, h_A );
            lapackf77_slarnv( &ione, ISEED, &n2, h_B );
            magma_smake_hpd( N, h_B, N );
            magma_smake_symmetric( N, h_A, N );

            /* keep copies of the inputs; the solver calls overwrite h_A and h_B */
            if((opts.warmup)||( checkres )){
                TESTING_MALLOC_CPU( h_Ainit, float, n2 );
                TESTING_MALLOC_CPU( h_Binit, float, n2 );
                lapackf77_slacpy( MagmaUpperLowerStr, &N, &N, h_A, &N, h_Ainit, &N );
                lapackf77_slacpy( MagmaUpperLowerStr, &N, &N, h_B, &N, h_Binit, &N );
            }

            if(opts.warmup){
                // ==================================================================
                // Warmup using MAGMA.
                // ==================================================================
                magma_ssygvd_m( opts.ngpu, itype, jobz, uplo, N, h_A, N, h_B, N, w1, h_work, lwork,
                                #if defined(PRECISION_z) || defined(PRECISION_c)
                                rwork, lrwork,
                                #endif
                                iwork, liwork, &info);
                /* restore the inputs for the timed run */
                lapackf77_slacpy( MagmaUpperLowerStr, &N, &N, h_Ainit, &N, h_A, &N );
                lapackf77_slacpy( MagmaUpperLowerStr, &N, &N, h_Binit, &N, h_B, &N );
            }

            // ===================================================================
            // Performs operation using MAGMA
            // ===================================================================
            start = get_current_time();
            magma_ssygvd_m( opts.ngpu, itype, jobz, uplo, N, h_A, N, h_B, N, w1, h_work, lwork,
                            #if defined(PRECISION_z) || defined(PRECISION_c)
                            rwork, lrwork,
                            #endif
                            iwork, liwork, &info);
            end = get_current_time();

            if(info != 0)
                printf("magma_ssygvd_m returned error %d: %s.\n", (int) info, magma_strerror( info ));

            mgpu_time = GetTimerValue(start,end)/1000.;

            if ( checkres ) {
                /* =====================================================================
                   Check the results following the LAPACK's [zc]hegvd routine.
                   A x = lambda B x is solved
                   and the following 3 tests computed:
                   (1)    | A Z - B Z D | / ( |A||Z| N )  (itype = 1)
                          | A B Z - Z D | / ( |A||Z| N )  (itype = 2)
                          | B A Z - Z D | / ( |A||Z| N )  (itype = 3)
                   =================================================================== */
                #if defined(PRECISION_d) || defined(PRECISION_s)
                /* in real precision the norm workspace is taken from h_work, past the first N*N entries */
                float *rwork = h_work + N*N;
                #endif

                /* result = | residual | / ( |A| |Z| N ), built up factor by factor */
                result = 1.;
                result /= lapackf77_slansy("1",&uplo, &N, h_Ainit, &N, rwork);
                result /= lapackf77_slange("1",&N , &N, h_A, &N, rwork);

                /* in each case the columns of Z (in h_A) are scaled by the eigenvalues w1 */
                if (itype == 1){
                    blasf77_ssymm("L", &uplo, &N, &N, &c_one, h_Ainit, &N, h_A, &N, &c_zero, h_work, &N);
                    for(int i=0; i<N; ++i)
                        blasf77_sscal(&N, &w1[i], &h_A[i*N], &ione);
                    blasf77_ssymm("L", &uplo, &N, &N, &c_neg_one, h_Binit, &N, h_A, &N, &c_one, h_work, &N);
                    result *= lapackf77_slange("1", &N, &N, h_work, &N, rwork)/N;
                }
                else if (itype == 2){
                    blasf77_ssymm("L", &uplo, &N, &N, &c_one, h_Binit, &N, h_A, &N, &c_zero, h_work, &N);
                    for(int i=0; i<N; ++i)
                        blasf77_sscal(&N, &w1[i], &h_A[i*N], &ione);
                    blasf77_ssymm("L", &uplo, &N, &N, &c_one, h_Ainit, &N, h_work, &N, &c_neg_one, h_A, &N);
                    result *= lapackf77_slange("1", &N, &N, h_A, &N, rwork)/N;
                }
                else if (itype == 3){
                    blasf77_ssymm("L", &uplo, &N, &N, &c_one, h_Ainit, &N, h_A, &N, &c_zero, h_work, &N);
                    for(int i=0; i<N; ++i)
                        blasf77_sscal(&N, &w1[i], &h_A[i*N], &ione);
                    blasf77_ssymm("L", &uplo, &N, &N, &c_one, h_Binit, &N, h_work, &N, &c_neg_one, h_A, &N);
                    result *= lapackf77_slange("1", &N, &N, h_A, &N, rwork)/N;
                }

                /* restore the inputs for the single-GPU run */
                lapackf77_slacpy( MagmaUpperLowerStr, &N, &N, h_Ainit, &N, h_A, &N );
                lapackf77_slacpy( MagmaUpperLowerStr, &N, &N, h_Binit, &N, h_B, &N );

                /* ====================================================================
                   Performs operation using MAGMA
                   =================================================================== */
                start = get_current_time();
                magma_ssygvd(itype, jobz, uplo, N, h_A, N, h_B, N, w2, h_work, lwork,
                             #if defined(PRECISION_z) || defined(PRECISION_c)
                             rwork, lrwork,
                             #endif
                             iwork, liwork, &info);
                end = get_current_time();

                if(info != 0)
                    printf("magma_ssygvd returned error %d: %s.\n", (int) info, magma_strerror( info ));

                gpu_time = GetTimerValue(start,end)/1000.;

                /* =====================================================================
                   Performs operation using LAPACK
                   =================================================================== */
                /* runs on the saved copies and overwrites w2 */
                start = get_current_time();
                lapackf77_ssygvd(&itype, &jobz, &uplo, &N, h_Ainit, &N, h_Binit, &N, w2, h_work, &lwork,
                                 #if defined(PRECISION_z) || defined(PRECISION_c)
                                 rwork, &lrwork,
                                 #endif
                                 iwork, &liwork, &info);
                end = get_current_time();
                if (info != 0)
                    printf("lapackf77_ssygvd returned error %d: %s.\n", (int) info, magma_strerror( info ));

                cpu_time = GetTimerValue(start,end)/1000.;

                /* largest eigenvalue magnitude (temp1) and largest difference (temp2) */
                float temp1 = 0;
                float temp2 = 0;
                for(int j=0; j<N; j++){
                    temp1 = max(temp1, absv(w1[j]));
                    temp1 = max(temp1, absv(w2[j]));
                    temp2 = max(temp2, absv(w1[j]-w2[j]));
                }
                float result2 = temp2 / (((float)N)*temp1);

                /* =====================================================================
                   Print execution time
                   =================================================================== */
                printf("%5d %6.2f %6.2f %6.2f\n", (int) N, cpu_time, gpu_time, mgpu_time);
                printf("Testing the eigenvalues and eigenvectors for correctness:\n");
                if(itype==1)
                    printf("(1) | A Z - B Z D | / (|A| |Z| N) = %8.2e%s\n", result, (result < tol ? "" : " failed") );
                else if(itype==2)
                    printf("(1) | A B Z - Z D | / (|A| |Z| N) = %8.2e%s\n", result, (result < tol ? "" : " failed") );
                else if(itype==3)
                    printf("(1) | B A Z - Z D | / (|A| |Z| N) = %8.2e%s\n", result, (result < tol ? "" : " failed") );

                printf( "(3) | D(MGPU)-D(LAPACK) |/ |D| = %8.2e%s\n\n", result2, (result2 < tolulp ? "" : " failed") );
            }
            else {
                /* no check: only the multi-GPU time is available */
                printf("%5d ------ ------ %6.2f\n", (int) N, mgpu_time);
            }

            /* Memory clean up */
            TESTING_FREE_PIN( h_A );
            TESTING_FREE_PIN( h_B );
            TESTING_FREE_PIN( h_work );
            #if defined(PRECISION_z) || defined(PRECISION_c)
            TESTING_FREE_PIN( rwork );
            #endif

            TESTING_FREE_CPU( w1 );
            TESTING_FREE_CPU( w2 );
            TESTING_FREE_CPU( iwork );

            if((opts.warmup)||( checkres )){
                TESTING_FREE_CPU( h_Ainit );
                TESTING_FREE_CPU( h_Binit );
            }
        }
        if ( opts.niter > 1 ) {
            printf( "\n" );
        }
    }

    /* Shutdown */
    TESTING_FINALIZE_MGPU();
}
int main(int argc, char **argv) { int nxtarg = 0; int i; char c; char tmpstring[TMPSTRINGLEN+1]; char filename[TMPSTRINGLEN+1]; char outfilename[TMPSTRINGLEN+1]; FILE *cpudsptimefile = NULL; FILE *cpupostimefile = NULL; FILE *possynctimefile = NULL; FILE *outfile = NULL; u32 cpudsp1[3], cpudsp2[3]; // this assumes that each timecheck contains three numbers, the cpu time, the dsp time, and the subsequent cpu time u32 cpupos; u32 lastcpupos; u32 dspframetime; u32 possynctime = 0; int ngaps = 0; // the number of > 40 ms gaps int ncomputed = 0; // the number of times we used the computed timestamp instead of the sync edge time u32 tmpdiff, lastdiff, maxframetimediff = 0; while (++nxtarg < argc) { if (strcmp(argv[nxtarg], "-cd") == 0) { strcpy(filename, argv[++nxtarg]); /* open the file */ if ((cpudsptimefile = fopen(filename, "r")) == NULL) { fprintf(stderr, "Error opening %s for reading\n", filename); exit(-1); } } else if (strcmp(argv[nxtarg], "-cp") == 0) { /* open cpu pos time file */ strcpy(filename, argv[++nxtarg]); if ((cpupostimefile = fopen(filename, "r")) == NULL) { fprintf(stderr, "Error opening %s for reading\n", filename); exit(-1); } } else if (strcmp(argv[nxtarg], "-ps") == 0) { /* open pos sync times file */ strcpy(filename, argv[++nxtarg]); if ((possynctimefile = fopen(filename, "r")) == NULL) { fprintf(stderr, "Error opening %s for reading\n", filename); exit(-1); } } else if (strcmp(argv[nxtarg], "-o") == 0) { /* open the output file */ strcpy(outfilename, argv[++nxtarg]); if ((outfile = fopen(outfilename, "w")) == NULL) { fprintf(stderr, "Error opening %s for writing\n", outfilename); exit(-1); } } else { fprintf(stderr, "Usage: nspike_postimestamp -cd cpudsptimefile -cp cpupostimefile -ps possynctimesfile -o outputfile\n"); exit(1); } } /* check to make sure that all necessary arguments have been specified */ if ((cpudsptimefile == NULL) || (cpupostimefile == NULL) || (possynctimefile == NULL) || (outfile == NULL)) { fprintf(stderr, "Usage: 
nspike_postimestamp -cd cpudsptimefile -cp cpupostimefile -ps possynctimesfile -o outputfile\n"); exit(1); } /* write out an uncompressed header to the output file */ fprintf(outfile, "%%%%BEGINHEADER\n"); fprintf(outfile, "%% File type:\tBinary\n"); fprintf(outfile, "%% Extraction type:\tdsp position frame time stamps \n"); fprintf(outfile, "%% Fields:\t timestamp (unsigned int)\n"); fprintf(outfile, "%%%%ENDHEADER\n"); /* read past the headers of the input files */ do { fgets(tmpstring, TMPSTRINGLEN, cpudsptimefile); } while ((strncmp(tmpstring, "%%ENDHEADER", 10) != 0) && (strncmp(tmpstring, "%%ENDCONFIG", 10) != 0)); do { fgets(tmpstring, TMPSTRINGLEN, cpupostimefile); } while ((strncmp(tmpstring, "%%ENDHEADER", 10) != 0) && (strncmp(tmpstring, "%%ENDCONFIG", 10) != 0)); do { fgets(tmpstring, TMPSTRINGLEN, possynctimefile); } while ((strncmp(tmpstring, "%%ENDHEADER", 10) != 0) && (strncmp(tmpstring, "%%ENDCONFIG", 10) != 0)); if (nextvalidcpudsptime(cpudsptimefile, cpudsp1) == 0) { exit(-1); } cpudsp2[0] = 0; /* Now we go through the cpupostimefile and find the possync time that * corresponds to each frame */ lastcpupos = 0; while (!feof(cpupostimefile)) { /* read in the next cpupostimefile entry */ if (fread(&cpupos, sizeof(u32), 1, cpupostimefile) != 1) { fprintf(stderr, "Error reading cpu - position frame time from cpupostimefile at offset %ld, eof = %d\n", ftell(cpupostimefile), feof(cpupostimefile)); break; } if (lastcpupos && (cpupos - lastcpupos > 400)) { ngaps++; } lastcpupos = cpupos; /* make sure that the next cpudsp time is after the current cpu frame * time */ while ((cpudsp2[0] < cpupos) && nextvalidcpudsptime(cpudsptimefile, cpudsp2)) { /* move cpudsp2 to cpudsp1 and move on */ memcpy(cpudsp1, cpudsp2, 3 * sizeof(u32)); } /* use the closest cpudsp time to convert the current frame time to a * dsp clock time */ if (absv(cpudsp1[0] - cpupos) < absv(cpudsp2[0] - cpupos)) { dspframetime = cpupos + (cpudsp1[1] - cpudsp1[0]); } else { dspframetime = 
cpupos + (cpudsp2[1] - cpudsp2[0]); } /* initialize the first possync time so that we read in the correct * first value */ if (possynctime == 0) { possynctime = dspframetime; } /* find the pos sync time closest to the dspframe time and write it out * to the file */ tmpdiff = UINT_MAX; /* check to see if the possynctime is much greater than the current * dspframetime as can occur when a time offset has been added */ if (possynctime < dspframetime + 10000) { do { lastdiff = tmpdiff; /* get the timestamp */ if (fread(&possynctime, sizeof(u32), 1, possynctimefile) != 1) { fprintf(stderr, "Error reading position frame sync time from possynctime file\n"); exit(-1); } /* get the transition */ if (fread(&c, sizeof(char), 1, possynctimefile) != 1) { fprintf(stderr, "Error reading position frame sync type from possynctime file\n"); exit(-1); } } while (((tmpdiff = abs(possynctime - dspframetime)) > 50) && (tmpdiff < lastdiff)); } if (tmpdiff >= 1000) { /* we should use the computed time instead of the read in time, as * we may have missed the edge of a frame */ fwrite(&dspframetime, sizeof(u32), 1, outfile); ncomputed++; } else { fwrite(&possynctime, sizeof(u32), 1, outfile); if (tmpdiff > maxframetimediff) { maxframetimediff = tmpdiff; } } } fprintf(stderr, "Timestamp file created, %d gaps > 40 ms\nLargest discrepancy between computed and actual dsp frame times = %2.1f ms\nNumber of frames for which the computed timestamp was used = %d\n", ngaps, (float) maxframetimediff / 10, ncomputed); fclose(outfile); fclose(possynctimefile); fclose(cpudsptimefile); fclose(cpupostimefile); }
/* //////////////////////////////////////////////////////////////////////////// -- Testing dsyevd */ int main( int argc, char** argv) { TESTING_INIT(); real_Double_t gpu_time, cpu_time; double *h_A, *h_R, *d_R, *h_work; double *w1, *w2; magma_int_t *iwork; magma_int_t N, n2, info, lwork, liwork, lda, ldda, aux_iwork[1]; magma_int_t izero = 0; magma_int_t ione = 1; magma_int_t ISEED[4] = {0,0,0,1}; double result[3], eps, aux_work[1]; eps = lapackf77_dlamch( "E" ); magma_opts opts; parse_opts( argc, argv, &opts ); double tol = opts.tolerance * lapackf77_dlamch("E"); double tolulp = opts.tolerance * lapackf77_dlamch("P"); if ( opts.check && opts.jobz == MagmaNoVec ) { fprintf( stderr, "checking results requires vectors; setting jobz=V (option -JV)\n" ); opts.jobz = MagmaVec; } printf(" N CPU Time (sec) GPU Time (sec)\n"); printf("=======================================\n"); for( int i = 0; i < opts.ntest; ++i ) { for( int iter = 0; iter < opts.niter; ++iter ) { N = opts.nsize[i]; n2 = N*N; lda = N; ldda = ((N + 31)/32)*32; // query for workspace sizes magma_dsyevd_gpu( opts.jobz, opts.uplo, N, NULL, ldda, NULL, NULL, lda, aux_work, -1, aux_iwork, -1, &info ); lwork = (magma_int_t) aux_work[0]; liwork = aux_iwork[0]; /* Allocate host memory for the matrix */ TESTING_MALLOC_CPU( h_A, double, N*lda ); TESTING_MALLOC_CPU( w1, double, N ); TESTING_MALLOC_CPU( w2, double, N ); TESTING_MALLOC_CPU( iwork, magma_int_t, liwork ); TESTING_MALLOC_PIN( h_R, double, N*lda ); TESTING_MALLOC_PIN( h_work, double, lwork ); TESTING_MALLOC_DEV( d_R, double, N*ldda ); /* Initialize the matrix */ lapackf77_dlarnv( &ione, ISEED, &n2, h_A ); magma_dsetmatrix( N, N, h_A, lda, d_R, ldda ); /* warm up run */ if ( opts.warmup ) { magma_dsyevd_gpu( opts.jobz, opts.uplo, N, d_R, ldda, w1, h_R, lda, h_work, lwork, iwork, liwork, &info ); if (info != 0) printf("magma_dsyevd_gpu returned error %d: %s.\n", (int) info, magma_strerror( info )); magma_dsetmatrix( N, N, h_A, lda, d_R, ldda ); } /* 
==================================================================== Performs operation using MAGMA =================================================================== */ gpu_time = magma_wtime(); magma_dsyevd_gpu( opts.jobz, opts.uplo, N, d_R, ldda, w1, h_R, lda, h_work, lwork, iwork, liwork, &info ); gpu_time = magma_wtime() - gpu_time; if (info != 0) printf("magma_dsyevd_gpu returned error %d: %s.\n", (int) info, magma_strerror( info )); if ( opts.check ) { /* ===================================================================== Check the results following the LAPACK's [zcds]drvst routine. A is factored as A = U S U' and the following 3 tests computed: (1) | A - U S U' | / ( |A| N ) (2) | I - U'U | / ( N ) (3) | S(with U) - S(w/o U) | / | S | =================================================================== */ double temp1, temp2; // tau=NULL is unused since itype=1 magma_dgetmatrix( N, N, d_R, ldda, h_R, lda ); lapackf77_dsyt21( &ione, &opts.uplo, &N, &izero, h_A, &lda, w1, h_work, h_R, &lda, h_R, &lda, NULL, h_work, &result[0] ); magma_dsetmatrix( N, N, h_A, lda, d_R, ldda ); magma_dsyevd_gpu( MagmaNoVec, opts.uplo, N, d_R, ldda, w2, h_R, lda, h_work, lwork, iwork, liwork, &info ); if (info != 0) printf("magma_dsyevd_gpu returned error %d: %s.\n", (int) info, magma_strerror( info )); temp1 = temp2 = 0; for( int j=0; j<N; j++ ) { temp1 = max(temp1, absv(w1[j])); temp1 = max(temp1, absv(w2[j])); temp2 = max(temp2, absv(w1[j]-w2[j])); } result[2] = temp2 / (((double)N)*temp1); } /* ===================================================================== Performs operation using LAPACK =================================================================== */ if ( opts.lapack ) { cpu_time = magma_wtime(); lapackf77_dsyevd( &opts.jobz, &opts.uplo, &N, h_A, &lda, w2, h_work, &lwork, iwork, &liwork, &info ); cpu_time = magma_wtime() - cpu_time; if (info != 0) printf("lapackf77_dsyevd returned error %d: %s.\n", (int) info, magma_strerror( info )); printf("%5d %7.2f 
%7.2f\n", (int) N, cpu_time, gpu_time); } else { printf("%5d --- %7.2f\n", (int) N, gpu_time); } /* ===================================================================== Print execution time =================================================================== */ if ( opts.check ) { printf("Testing the factorization A = U S U' for correctness:\n"); printf("(1) | A - U S U' | / (|A| N) = %8.2e%s\n", result[0]*eps, (result[0]*eps < tol ? "" : " failed") ); printf("(2) | I - U'U | / N = %8.2e%s\n", result[1]*eps, (result[1]*eps < tol ? "" : " failed") ); printf("(3) | S(w/ U) - S(w/o U) | / |S| = %8.2e%s\n\n", result[2] , (result[2] < tolulp ? "" : " failed") ); } TESTING_FREE_CPU( h_A ); TESTING_FREE_CPU( w1 ); TESTING_FREE_CPU( w2 ); TESTING_FREE_CPU( iwork ); TESTING_FREE_PIN( h_R ); TESTING_FREE_PIN( h_work ); TESTING_FREE_DEV( d_R ); } if ( opts.niter > 1 ) { printf( "\n" ); } } TESTING_FINALIZE(); return 0; }
/* //////////////////////////////////////////////////////////////////////////// -- Testing zhegvd */ int main( int argc, char** argv) { TESTING_INIT(); real_Double_t gpu_time, cpu_time; magmaDoubleComplex *h_A, *h_R, *h_B, *h_S, *h_work; double *rwork, *w1, *w2; double result[4] = {0}; magma_int_t *iwork; magma_int_t N, n2, info, nb, lwork, liwork, lda, lrwork; magmaDoubleComplex c_zero = MAGMA_Z_ZERO; magmaDoubleComplex c_one = MAGMA_Z_ONE; magmaDoubleComplex c_neg_one = MAGMA_Z_NEG_ONE; double d_one = 1.; double d_neg_one = -1.; //double d_ten = 10.; //magma_int_t izero = 0; magma_int_t ione = 1; magma_int_t ISEED[4] = {0,0,0,1}; magma_int_t status = 0; magma_opts opts; parse_opts( argc, argv, &opts ); double tol = opts.tolerance * lapackf77_dlamch("E"); double tolulp = opts.tolerance * lapackf77_dlamch("P"); if ( opts.check && opts.jobz == MagmaNoVec ) { fprintf( stderr, "checking results requires vectors; setting jobz=V (option -JV)\n" ); opts.jobz = MagmaVec; } printf("using: itype = %d, jobz = %s, uplo = %s\n", (int) opts.itype, lapack_vec_const(opts.jobz), lapack_uplo_const(opts.uplo)); printf(" N CPU Time (sec) GPU Time(sec)\n"); printf("======================================\n"); for( int itest = 0; itest < opts.ntest; ++itest ) { for( int iter = 0; iter < opts.niter; ++iter ) { N = opts.nsize[itest]; lda = N; n2 = N*lda; nb = magma_get_zhetrd_nb(N); lwork = 2*N*nb + N*N; lrwork = 1 + 5*N +2*N*N; liwork = 3 + 5*N; TESTING_MALLOC_CPU( h_A, magmaDoubleComplex, n2 ); TESTING_MALLOC_CPU( h_B, magmaDoubleComplex, n2 ); TESTING_MALLOC_CPU( w1, double, N ); TESTING_MALLOC_CPU( w2, double, N ); TESTING_MALLOC_CPU( rwork, double, lrwork ); TESTING_MALLOC_CPU( iwork, magma_int_t, liwork ); TESTING_MALLOC_PIN( h_R, magmaDoubleComplex, n2 ); TESTING_MALLOC_PIN( h_S, magmaDoubleComplex, n2 ); TESTING_MALLOC_PIN( h_work, magmaDoubleComplex, lwork ); /* Initialize the matrix */ lapackf77_zlarnv( &ione, ISEED, &n2, h_A ); //lapackf77_zlatms( &N, &N, "U", ISEED, "P", w1, 
&five, &d_ten, // &d_one, &N, &N, lapack_uplo_const(opts.uplo), h_B, &lda, h_work, &info); //lapackf77_zlaset( "A", &N, &N, &c_zero, &c_one, h_B, &lda); lapackf77_zlarnv( &ione, ISEED, &n2, h_B ); magma_zmake_hpd( N, h_B, lda ); lapackf77_zlacpy( MagmaUpperLowerStr, &N, &N, h_A, &lda, h_R, &lda ); lapackf77_zlacpy( MagmaUpperLowerStr, &N, &N, h_B, &lda, h_S, &lda ); /* warmup */ if ( opts.warmup ) { magma_zhegvd( opts.itype, opts.jobz, opts.uplo, N, h_R, lda, h_S, lda, w1, h_work, lwork, rwork, lrwork, iwork, liwork, &info ); if (info != 0) printf("magma_zhegvd returned error %d: %s.\n", (int) info, magma_strerror( info )); lapackf77_zlacpy( MagmaUpperLowerStr, &N, &N, h_A, &lda, h_R, &lda ); lapackf77_zlacpy( MagmaUpperLowerStr, &N, &N, h_B, &lda, h_S, &lda ); } /* ==================================================================== Performs operation using MAGMA =================================================================== */ gpu_time = magma_wtime(); magma_zhegvd( opts.itype, opts.jobz, opts.uplo, N, h_R, lda, h_S, lda, w1, h_work, lwork, rwork, lrwork, iwork, liwork, &info ); gpu_time = magma_wtime() - gpu_time; if (info != 0) printf("magma_zhegvd returned error %d: %s.\n", (int) info, magma_strerror( info )); if ( opts.check ) { /* ===================================================================== Check the results following the LAPACK's [zc]hegvd routine. 
A x = lambda B x is solved and the following 3 tests computed: (1) | A Z - B Z D | / ( |A||Z| N ) (itype = 1) | A B Z - Z D | / ( |A||Z| N ) (itype = 2) | B A Z - Z D | / ( |A||Z| N ) (itype = 3) (2) | I - V V' B | / ( N ) (itype = 1,2) | B - V V' | / ( |B| N ) (itype = 3) (3) | S(with V) - S(w/o V) | / | S | =================================================================== */ double temp1, temp2; //magmaDoubleComplex *tau; if ( opts.itype == 1 || opts.itype == 2 ) { lapackf77_zlaset( "A", &N, &N, &c_zero, &c_one, h_S, &lda); blasf77_zgemm("N", "C", &N, &N, &N, &c_one, h_R, &lda, h_R, &lda, &c_zero, h_work, &N); blasf77_zhemm("R", lapack_uplo_const(opts.uplo), &N, &N, &c_neg_one, h_B, &lda, h_work, &N, &c_one, h_S, &lda); result[1] = lapackf77_zlange("1", &N, &N, h_S, &lda, rwork) / N; } else if ( opts.itype == 3 ) { lapackf77_zlacpy( MagmaUpperLowerStr, &N, &N, h_B, &lda, h_S, &lda); blasf77_zherk(lapack_uplo_const(opts.uplo), "N", &N, &N, &d_neg_one, h_R, &lda, &d_one, h_S, &lda); result[1] = lapackf77_zlanhe("1", lapack_uplo_const(opts.uplo), &N, h_S, &lda, rwork) / N / lapackf77_zlanhe("1", lapack_uplo_const(opts.uplo), &N, h_B, &lda, rwork); } result[0] = 1.; result[0] /= lapackf77_zlanhe("1", lapack_uplo_const(opts.uplo), &N, h_A, &lda, rwork); result[0] /= lapackf77_zlange("1", &N, &N, h_R, &lda, rwork); if ( opts.itype == 1 ) { blasf77_zhemm("L", lapack_uplo_const(opts.uplo), &N, &N, &c_one, h_A, &lda, h_R, &lda, &c_zero, h_work, &N); for(int i=0; i<N; ++i) blasf77_zdscal(&N, &w1[i], &h_R[i*N], &ione); blasf77_zhemm("L", lapack_uplo_const(opts.uplo), &N, &N, &c_neg_one, h_B, &lda, h_R, &lda, &c_one, h_work, &N); result[0] *= lapackf77_zlange("1", &N, &N, h_work, &lda, rwork)/N; } else if ( opts.itype == 2 ) { blasf77_zhemm("L", lapack_uplo_const(opts.uplo), &N, &N, &c_one, h_B, &lda, h_R, &lda, &c_zero, h_work, &N); for(int i=0; i<N; ++i) blasf77_zdscal(&N, &w1[i], &h_R[i*N], &ione); blasf77_zhemm("L", lapack_uplo_const(opts.uplo), &N, &N, &c_one, h_A, 
&lda, h_work, &N, &c_neg_one, h_R, &lda); result[0] *= lapackf77_zlange("1", &N, &N, h_R, &lda, rwork)/N; } else if ( opts.itype == 3 ) { blasf77_zhemm("L", lapack_uplo_const(opts.uplo), &N, &N, &c_one, h_A, &lda, h_R, &lda, &c_zero, h_work, &N); for(int i=0; i<N; ++i) blasf77_zdscal(&N, &w1[i], &h_R[i*N], &ione); blasf77_zhemm("L", lapack_uplo_const(opts.uplo), &N, &N, &c_one, h_B, &lda, h_work, &N, &c_neg_one, h_R, &lda); result[0] *= lapackf77_zlange("1", &N, &N, h_R, &lda, rwork)/N; } /* lapackf77_zhet21( &ione, lapack_uplo_const(opts.uplo), &N, &izero, h_A, &lda, w1, w1, h_R, &lda, h_R, &lda, tau, h_work, rwork, &result[0] ); */ lapackf77_zlacpy( MagmaUpperLowerStr, &N, &N, h_A, &lda, h_R, &lda ); lapackf77_zlacpy( MagmaUpperLowerStr, &N, &N, h_B, &lda, h_S, &lda ); magma_zhegvd( opts.itype, MagmaNoVec, opts.uplo, N, h_R, lda, h_S, lda, w2, h_work, lwork, rwork, lrwork, iwork, liwork, &info ); if (info != 0) printf("magma_zhegvd returned error %d: %s.\n", (int) info, magma_strerror( info )); temp1 = temp2 = 0; for(int j=0; j<N; j++) { temp1 = max(temp1, absv(w1[j])); temp1 = max(temp1, absv(w2[j])); temp2 = max(temp2, absv(w1[j]-w2[j])); } result[2] = temp2 / (((double)N)*temp1); } /* ===================================================================== Performs operation using LAPACK =================================================================== */ if ( opts.lapack ) { cpu_time = magma_wtime(); lapackf77_zhegvd( &opts.itype, lapack_vec_const(opts.jobz), lapack_uplo_const(opts.uplo), &N, h_A, &lda, h_B, &lda, w2, h_work, &lwork, rwork, &lrwork, iwork, &liwork, &info ); cpu_time = magma_wtime() - cpu_time; if (info != 0) printf("lapackf77_zhegvd returned error %d: %s.\n", (int) info, magma_strerror( info )); printf("%5d %7.2f %7.2f\n", (int) N, cpu_time, gpu_time); } else { printf("%5d --- %7.2f\n", (int) N, gpu_time); } /* ===================================================================== Print execution time 
=================================================================== */ if ( opts.check ) { printf("Testing the eigenvalues and eigenvectors for correctness:\n"); if ( opts.itype==1 ) { printf("(1) | A Z - B Z D | / (|A| |Z| N) = %8.2e %s\n", result[0], (result[0] < tol ? "ok" : "failed") ); } else if ( opts.itype==2 ) { printf("(1) | A B Z - Z D | / (|A| |Z| N) = %8.2e %s\n", result[0], (result[0] < tol ? "ok" : "failed") ); } else if ( opts.itype==3 ) { printf("(1) | B A Z - Z D | / (|A| |Z| N) = %8.2e %s\n", result[0], (result[0] < tol ? "ok" : "failed") ); } if ( opts.itype==1 || opts.itype==2 ) { printf("(2) | I - Z Z' B | / N = %8.2e %s\n", result[1], (result[1] < tol ? "ok" : "failed") ); } else { printf("(2) | B - Z Z' | / (|B| N) = %8.2e %s\n", result[1], (result[1] < tol ? "ok" : "failed") ); } printf( "(3) | D(w/ Z) - D(w/o Z) | / |D| = %8.2e %s\n\n", result[2], (result[2] < tolulp ? "ok" : "failed") ); status += ! (result[0] < tol && result[1] < tol && result[2] < tolulp); } TESTING_FREE_CPU( h_A ); TESTING_FREE_CPU( h_B ); TESTING_FREE_CPU( w1 ); TESTING_FREE_CPU( w2 ); TESTING_FREE_CPU( rwork ); TESTING_FREE_CPU( iwork ); TESTING_FREE_PIN( h_R ); TESTING_FREE_PIN( h_S ); TESTING_FREE_PIN( h_work ); fflush( stdout ); } if ( opts.niter > 1 ) { printf( "\n" ); } } TESTING_FINALIZE(); return status; }
/* //////////////////////////////////////////////////////////////////////////// -- Testing zhegvdx */ int main( int argc, char** argv) { //#define USE_MGPU #ifdef USE_MGPU TESTING_CUDA_INIT_MGPU(); #else TESTING_CUDA_INIT(); #endif magma_int_t nrgpu =1; cuDoubleComplex *h_A, *h_R, *h_B, *h_S, *h_work; double *rwork, *w1, *w2; magma_int_t *iwork; double gpu_time, cpu_time; magma_timestr_t start, end; /* Matrix size */ magma_int_t N=0, n2; magma_int_t size[4] = {1024,2048,4100,6001}; magma_int_t i, itype, info; magma_int_t ione = 1, izero = 0; magma_int_t five = 5; cuDoubleComplex c_zero = MAGMA_Z_ZERO; cuDoubleComplex c_one = MAGMA_Z_ONE; cuDoubleComplex c_neg_one = MAGMA_Z_NEG_ONE; double d_one = 1.; double d_neg_one = -1.; double d_ten = 10.; magma_int_t ISEED[4] = {0,0,0,1}; magma_int_t il,iu,m1,m2; double vl,vu; double fraction_ev = 0; //const char *uplo = MagmaLowerStr; char *uplo = (char*)MagmaLowerStr; //char *uplo = (char*)MagmaUpperStr; char *jobz = (char*)MagmaVectorsStr; char range = 'A'; itype = 1; magma_int_t checkres; double result[2]; int flagN = 0; if (argc != 1){ for(i = 1; i<argc; i++){ if (strcmp("-N", argv[i])==0){ N = atoi(argv[++i]); if (N>0){ printf(" testing_zhegvdx -N %d\n\n", (int) N); flagN=1; } else { printf("\nUsage: \n"); printf(" testing_zhegvdx -N %d\n\n", (int) N); exit(1); } } if (strcmp("-ngpu", argv[i])==0){ nrgpu = atoi(argv[++i]); if (nrgpu>0){ printf(" testing_zhegvdx -ngpu %d\n\n", (int) nrgpu); } else { printf("\nUsage: \n"); printf(" testing_zhegvdx -ngpu %d\n\n", (int) nrgpu); exit(1); } } if (strcmp("-itype", argv[i])==0){ itype = atoi(argv[++i]); if (itype>0 && itype <= 3){ printf(" testing_zhegvdx -itype %d\n\n", (int) itype); } else { printf("\nUsage: \n"); printf(" testing_zhegvdx -itype %d\n\n", (int) itype); exit(1); } } if (strcmp("-FE", argv[i])==0){ fraction_ev = atof(argv[++i]); if (fraction_ev > 0 && fraction_ev <= 1){ printf(" testing_zhegvdx -FE %f\n\n", fraction_ev); } else { fraction_ev = 0; } } if 
(strcmp("-L", argv[i])==0){ uplo = (char*)MagmaLowerStr; printf(" testing_zhegvdx -L"); } if (strcmp("-U", argv[i])==0){ uplo = (char*)MagmaUpperStr; printf(" testing_zhegvdx -U"); } } } else { printf("\nUsage: \n"); printf(" testing_zhegvdx -L/U -N %d -itype %d\n\n", 1024, 1); } if(!flagN) N = size[3]; checkres = getenv("MAGMA_TESTINGS_CHECK") != NULL; n2 = N * N; /* Allocate host memory for the matrix */ TESTING_MALLOC( h_A, cuDoubleComplex, n2); TESTING_MALLOC( h_B, cuDoubleComplex, n2); TESTING_MALLOC( w1, double , N); TESTING_MALLOC( w2, double , N); TESTING_HOSTALLOC(h_R, cuDoubleComplex, n2); TESTING_HOSTALLOC(h_S, cuDoubleComplex, n2); magma_int_t nb = magma_get_zhetrd_nb(N); magma_int_t lwork = magma_zbulge_get_lq2(N) + 2*N + N*N; magma_int_t lrwork = 1 + 5*N +2*N*N; magma_int_t liwork = 3 + 5*N; TESTING_HOSTALLOC(h_work, cuDoubleComplex, lwork); TESTING_HOSTALLOC( rwork, double, lrwork); TESTING_MALLOC( iwork, magma_int_t, liwork); printf(" N M GPU Time(s) \n"); printf("==========================\n"); for(i=0; i<4; i++){ if (!flagN){ N = size[i]; n2 = N*N; } if (fraction_ev == 0){ il = N / 10; iu = N / 5+il; } else { il = 1; iu = (int)(fraction_ev*N); if (iu < 1) iu = 1; } /* Initialize the matrix */ lapackf77_zlarnv( &ione, ISEED, &n2, h_A ); //lapackf77_zlatms( &N, &N, "U", ISEED, "P", w1, &five, &d_ten, // &d_one, &N, &N, uplo, h_B, &N, h_work, &info); //lapackf77_zlaset( "A", &N, &N, &c_zero, &c_one, h_B, &N); lapackf77_zlarnv( &ione, ISEED, &n2, h_B ); /* increase the diagonal */ { magma_int_t i, j; for(i=0; i<N; i++) { MAGMA_Z_SET2REAL( h_B[i*N+i], ( MAGMA_Z_REAL(h_B[i*N+i]) + 1.*N ) ); } } lapackf77_zlacpy( MagmaUpperLowerStr, &N, &N, h_A, &N, h_R, &N ); lapackf77_zlacpy( MagmaUpperLowerStr, &N, &N, h_B, &N, h_S, &N ); #ifdef USE_MGPU magma_zhegvdx_2stage_m(nrgpu, itype, jobz[0], range, uplo[0], N, h_R, N, h_S, N, vl, vu, il, iu, &m1, w1, h_work, lwork, rwork, lrwork, iwork, liwork, &info); #else magma_zhegvdx_2stage(itype, jobz[0], range, uplo[0], 
N, h_R, N, h_S, N, vl, vu, il, iu, &m1, w1, h_work, lwork, rwork, lrwork, iwork, liwork, &info); #endif lapackf77_zlacpy( MagmaUpperLowerStr, &N, &N, h_A, &N, h_R, &N ); lapackf77_zlacpy( MagmaUpperLowerStr, &N, &N, h_B, &N, h_S, &N ); /* ==================================================================== Performs operation using MAGMA =================================================================== */ start = get_current_time(); #ifdef USE_MGPU magma_zhegvdx_2stage_m(nrgpu, itype, jobz[0], range, uplo[0], N, h_R, N, h_S, N, vl, vu, il, iu, &m1, w1, h_work, lwork, rwork, lrwork, iwork, liwork, &info); #else magma_zhegvdx_2stage(itype, jobz[0], range, uplo[0], N, h_R, N, h_S, N, vl, vu, il, iu, &m1, w1, h_work, lwork, rwork, lrwork, iwork, liwork, &info); #endif end = get_current_time(); gpu_time = GetTimerValue(start,end)/1000.; if ( checkres ) { /* ===================================================================== Check the results following the LAPACK's [zc]hegvdx routine. A x = lambda B x is solved and the following 3 tests computed: (1) | A Z - B Z D | / ( |A||Z| N ) (itype = 1) | A B Z - Z D | / ( |A||Z| N ) (itype = 2) | B A Z - Z D | / ( |A||Z| N ) (itype = 3) (2) | S(with V) - S(w/o V) | / | S | =================================================================== */ double temp1, temp2; cuDoubleComplex *tau; result[0] = 1.; result[0] /= lapackf77_zlanhe("1",uplo, &N, h_A, &N, rwork); result[0] /= lapackf77_zlange("1",&N , &m1, h_R, &N, rwork); if (itype == 1){ blasf77_zhemm("L", uplo, &N, &m1, &c_one, h_A, &N, h_R, &N, &c_zero, h_work, &N); for(int i=0; i<m1; ++i) blasf77_zdscal(&N, &w1[i], &h_R[i*N], &ione); blasf77_zhemm("L", uplo, &N, &m1, &c_neg_one, h_B, &N, h_R, &N, &c_one, h_work, &N); result[0] *= lapackf77_zlange("1", &N, &m1, h_work, &N, rwork)/N; } else if (itype == 2){ blasf77_zhemm("L", uplo, &N, &m1, &c_one, h_B, &N, h_R, &N, &c_zero, h_work, &N); for(int i=0; i<m1; ++i) blasf77_zdscal(&N, &w1[i], &h_R[i*N], &ione); blasf77_zhemm("L", 
uplo, &N, &m1, &c_one, h_A, &N, h_work, &N, &c_neg_one, h_R, &N); result[0] *= lapackf77_zlange("1", &N, &m1, h_R, &N, rwork)/N; } else if (itype == 3){ blasf77_zhemm("L", uplo, &N, &m1, &c_one, h_A, &N, h_R, &N, &c_zero, h_work, &N); for(int i=0; i<m1; ++i) blasf77_zdscal(&N, &w1[i], &h_R[i*N], &ione); blasf77_zhemm("L", uplo, &N, &m1, &c_one, h_B, &N, h_work, &N, &c_neg_one, h_R, &N); result[0] *= lapackf77_zlange("1", &N, &m1, h_R, &N, rwork)/N; } lapackf77_zlacpy( MagmaUpperLowerStr, &N, &N, h_A, &N, h_R, &N ); lapackf77_zlacpy( MagmaUpperLowerStr, &N, &N, h_B, &N, h_S, &N ); magma_zhegvdx(itype, 'N', range, uplo[0], N, h_R, N, h_S, N, vl, vu, il, iu, &m2, w2, h_work, lwork, rwork, lrwork, iwork, liwork, &info); temp1 = temp2 = 0; for(int j=0; j<m2; j++){ temp1 = max(temp1, absv(w1[j])); temp1 = max(temp1, absv(w2[j])); temp2 = max(temp2, absv(w1[j]-w2[j])); } result[1] = temp2 / temp1; } /* ===================================================================== Print execution time =================================================================== */ printf("%5d %5d %6.2f\n", (int) N, (int) m1, gpu_time); if ( checkres ){ printf("Testing the eigenvalues and eigenvectors for correctness:\n"); if(itype==1) printf("(1) | A Z - B Z D | / (|A| |Z| N) = %e\n", result[0]); else if(itype==2) printf("(1) | A B Z - Z D | / (|A| |Z| N) = %e\n", result[0]); else if(itype==3) printf("(1) | B A Z - Z D | / (|A| |Z| N) = %e\n", result[0]); printf("(2) | D(w/ Z)-D(w/o Z)|/ |D| = %e\n\n", result[1]); } if (flagN) break; } cudaSetDevice(0); /* Memory clean up */ TESTING_FREE( h_A); TESTING_FREE( h_B); TESTING_FREE( w1); TESTING_FREE( w2); TESTING_HOSTFREE( rwork); TESTING_FREE( iwork); TESTING_HOSTFREE(h_work); TESTING_HOSTFREE( h_R); TESTING_HOSTFREE( h_S); /* Shutdown */ #ifdef USE_MGPU TESTING_CUDA_FINALIZE_MGPU(); #else TESTING_CUDA_FINALIZE(); #endif }
/*
 * line_segment_distance: closest approach between two 3-D line segments.
 *
 * seg1_a, seg1_b  end points of the first segment
 * seg2_a, seg2_b  end points of the second segment
 *
 * Returns vec_dot(w, w) of the vector joining the two closest points, i.e.
 * the SQUARED separation (no square root is taken) -- assuming vec_dot is
 * the ordinary dot product.
 *
 * The closest points are parameterised as seg1_a + sc*u and seg2_a + tc*v,
 * where sc = sn/sd and tc = tn/td are each clamped to [0, 1].
 */
double line_segment_distance(double seg1_a[3], double seg1_b[3], double seg2_a[3], double seg2_b[3])
{
    double sc, sd, sn, tc, td, tn;
    double uu, uv, vv, uw, vw, det;
    double u[3], v[3], w[3];

    vec_subt(u, seg1_b, seg1_a);    /* direction of segment 1 */
    vec_subt(v, seg2_b, seg2_a);    /* direction of segment 2 */
    vec_subt(w, seg1_a, seg2_a);    /* offset between the start points */

    uu = vec_dot(u, u);
    uv = vec_dot(u, v);
    vv = vec_dot(v, v);
    uw = vec_dot(u, w);
    vw = vec_dot(v, w);

    det = uu*vv - sqr(uv);
    sd = det;
    td = det;

    if (vv < ALMOST_ZERO) {
        /* Segment 2 has collapsed to a point, so only s can vary: take the
         * clamped projection of that point onto segment 1.  Without this
         * branch the parallel case below would pin s to 0 and measure from
         * seg1_a instead of from the closest point of segment 1. */
        tn = 0.0;
        td = 1.0;
        if (uu < ALMOST_ZERO || -uw < 0.0) {
            sn = 0.0;               /* segment 1 is a point too, or s clamps to 0 */
            sd = 1.0;
        } else if (-uw > uu) {
            sn = 1.0;               /* s clamps to 1 */
            sd = 1.0;
        } else {
            sn = -uw;
            sd = uu;
        }
    } else if (det < ALMOST_ZERO) {
        /* (nearly) parallel: fix s = 0 and solve for t alone */
        sn = 0.0;
        sd = 1.0;
        tn = vw;
        td = vv;
    } else {
        /* general case: unconstrained minimum, then clamp s to [0, 1] */
        sn = uv*vw - vv*uw;
        tn = uu*vw - uv*uw;
        if (sn < 0.0) {
            sn = 0.0;
            tn = vw;
            td = vv;
        } else if (sn > sd) {
            sn = sd;
            tn = vw + uv;
            td = vv;
        }
    }

    /* clamp t to [0, 1]; whenever t moves, s is recomputed for the new t */
    if (tn < 0.0) {
        tn = 0.0;
        if (-uw < 0.0) {
            sn = 0.0;
        } else if (-uw > uu) {
            sn = sd;
        } else {
            sn = -uw;
            sd = uu;
        }
    } else if (tn > td) {
        tn = td;
        if (uv-uw < 0.0) {
            sn = 0.0;
        } else if (uv-uw > uu) {
            sn = sd;
        } else {
            sn = uv-uw;
            sd = uu;
        }
    }

    /* a vanishing numerator maps to exactly 0, which also avoids 0/0 */
    sc = (absv(sn) < ALMOST_ZERO ? 0.0 : sn / sd);
    tc = (absv(tn) < ALMOST_ZERO ? 0.0 : tn / td);

    /* w + sc*u - tc*v is the vector between the two closest points */
    vs_mult(u, sc);
    vs_mult(v, tc);
    vec_plus(w, w, u);
    vec_subt(w, w, v);

    return vec_dot(w, w);
}
/*
 * igrfmodel
 *
 * Evaluates the harmonic expansion whose coefficients live in the
 * file-level table Gh at the Cartesian position geo[0..2] and stores the
 * three resulting components in bv->ve[0..2].
 *
 *   bv        output vector; elements 0, 1 and 2 of bv->ve are overwritten.
 *   geo       position, 3 doubles (read only).  Presumably expressed in
 *             Earth radii -- TODO confirm against the caller.
 *   Nmax_loc  pointer to the maximum expansion order; the pointed-to value
 *             is truncated to int.
 *
 * NOTE(review): if absv(geo) is 0 the code below divides by zero; callers
 * must pass a non-zero position.
 * NOTE(review): h[] has NCOEF entries and indices up to
 * nmax*nmax + 2*nmax are written; NCOEF must be large enough for the
 * largest order callers request.
 */
void igrfmodel(VEC *bv, double *geo, double *Nmax_loc)
{
    /* Local variables ('register' dropped: it is not valid in C++17). */
    double *d1, *d2;
    int imax, nmax, N;
    double f, h[NCOEF];     /* KV */
    int i, k, m;
    double s, x, y, z;
    int ihmax, ih, il;
    double xi[3], rq;
    int ihm, ilm;
    double srq;

    srq = absv(geo);        /* presumably |geo|, the radial distance     */
    rq  = srq*srq;          /* its square                                */
    N   = (int) *Nmax_loc;  /* requested maximum order                   */

    /* A radius-squared below 0.8 used to be flagged here as a call from
     * below the surface; the diagnostic was already disabled, so the
     * empty branch has been removed. */

    rq  = 1. / rq;          /* from here on rq is 1 / |geo|^2            */
    /* TODO static INLINE arm_status arm_sqrt_f32 */
    srq = sqrt(rq);         /* and srq is 1 / |geo|                      */

    /* The number of harmonics depends on the distance: when 1/|geo|^2 is
     * below 0.25 the order is scaled down linearly in rq towards 3,
     * otherwise the full order N is used. */
    if (rq < 0.25)
        nmax = (int) ((N - 3) * 4.0 * rq + 3);
    else
        nmax = N;

    /* xi = geo / |geo|^2 (a scaled position, not a unit vector). */
    for (d1 = xi, d2 = geo; d1 < xi + 3; )
        *d1++ = *d2++ * rq;

    ihmax = nmax * nmax;            /* first index of the highest-order row */
    imax  = nmax + nmax - 2;        /* starting value of the row counter i  */
    il    = ihmax + nmax + nmax;    /* last index of the highest-order row  */

    /* Seed the recursion: copy the 2*nmax + 1 entries Gh[ihmax..il]
     * into h. */
    d1 = h + ihmax;
    d2 = Gh + ihmax;
    for ( ; d1 <= h + il; )
        *d1++ = *d2++;

    /* Two sweeps (k = 0 and k = 2), each folding h downwards one row at a
     * time (ih is the row read, il the row written) until the row counter
     * i drops below k. */
    for (k = 0; k < 3; k += 2) {
        i  = imax;
        ih = ihmax;
        while (i >= k) {
            il = ih - i - 1;
            f = 2. / (double) (i - k + 2);
            x = xi[0] * f;
            y = xi[1] * f;
            z = xi[2] * (f + f);
            i += -2;
            if (i >= 2) {
                for (m = 3; m <= i + 1; m += 2) {
                    ihm = ih + m;
                    ilm = il + m;
                    h[ilm+1] = Gh[ilm+1]+z*h[ihm+1]+x*(h[ihm+3]-h[ihm-1])-y*(h[ihm+2]+h[ihm-2]);
                    h[ilm] = Gh[ilm]+z*h[ihm]+x*(h[ihm+2]-h[ihm-2])+y*(h[ihm+3]+h[ihm-1]);
                }
                h[il+2] = Gh[il+2]+z*h[ih+2]+x*h[ih+4]-y*(h[ih+3]+h[ih]);
                h[il+1] = Gh[il+1]+z*h[ih+1]+y*h[ih+4]+x*(h[ih+3]-h[ih]);
            } else if (i == 0) {
                h[il + 2] = Gh[il+2]+z*h[ih+2]+x*h[ih+4]-y*(h[ih+3]+h[ih]);
                h[il+1] = Gh[il+1]+z*h[ih+1]+y*h[ih+4]+x*(h[ih+3]-h[ih]);
            }
            h[il] = Gh[il]+z*h[ih]+(x*h[ih+1]+y*h[ih+2])*2.;
            ih = il;
        }
    }

    /* Combine the four lowest entries of h into the output components. */
    s = h[0]*.5+(h[1]*xi[2]+h[2]*xi[0]+h[3]*xi[1])*2.;
    f = (rq+rq)*srq;        /* 2 / |geo|^3 */

    /* The 1e-9 factor is a unit scaling (presumably nT -> T; TODO confirm). */
    x = f*(h[2]-s*(*(geo+0)))*1e-9;
    y = f*(h[3]-s*(*(geo+1)))*1e-9;
    z = f*(h[1]-s*(*(geo+2)))*1e-9;

    bv->ve[0] = x;
    bv->ve[1] = y;
    bv->ve[2] = z;
    /* *(bv+3) = sqrt(x*x+y*y+z*z); */
}
/* //////////////////////////////////////////////////////////////////////////// -- Testing dsygvdx */ int main( int argc, char** argv) { TESTING_INIT(); real_Double_t gpu_time /*cpu_time*/; double *h_A, *h_R, *h_B, *h_S, *h_work; double *w1, *w2, vl=0, vu=0; double result[2] = {0}; magma_int_t *iwork; magma_int_t N, n2, info, il, iu, m1, m2, nb, lwork, liwork; double c_zero = MAGMA_D_ZERO; double c_one = MAGMA_D_ONE; double c_neg_one = MAGMA_D_NEG_ONE; #if defined(PRECISION_z) || defined(PRECISION_c) double *rwork; magma_int_t lrwork; #endif //double d_one = 1.; //double d_ten = 10.; magma_int_t ione = 1; magma_int_t ISEED[4] = {0,0,0,1}; magma_int_t status = 0; magma_opts opts; parse_opts( argc, argv, &opts ); double tol = opts.tolerance * lapackf77_dlamch("E"); double tolulp = opts.tolerance * lapackf77_dlamch("P"); if ( opts.check && opts.jobz == MagmaNoVec ) { fprintf( stderr, "checking results requires vectors; setting jobz=V (option -JV)\n" ); opts.jobz = MagmaVec; } printf("using: itype = %d, jobz = %s, uplo = %s, check = %d, fraction = %6.4f\n", (int) opts.itype, lapack_vec_const(opts.jobz), lapack_uplo_const(opts.uplo), (int) opts.check, opts.fraction); printf(" N M GPU Time (sec)\n"); printf("============================\n"); for( int itest = 0; itest < opts.ntest; ++itest ) { for( int iter = 0; iter < opts.niter; ++iter ) { N = opts.nsize[itest]; n2 = N*N; nb = magma_get_dsytrd_nb(N); #if defined(PRECISION_z) || defined(PRECISION_c) lwork = 2*N*nb + N*N; lrwork = 1 + 5*N +2*N*N; #else lwork = 1 + 6*N*nb + 2* N*N; #endif liwork = 3 + 5*N; if ( opts.fraction == 0 ) { il = N / 10; iu = N / 5+il; } else { il = 1; iu = (int) (opts.fraction*N); if (iu < 1) iu = 1; } TESTING_MALLOC_CPU( h_A, double, n2 ); TESTING_MALLOC_CPU( h_B, double, n2 ); TESTING_MALLOC_CPU( w1, double, N ); TESTING_MALLOC_CPU( w2, double, N ); TESTING_MALLOC_CPU( iwork, magma_int_t, liwork ); TESTING_MALLOC_PIN( h_R, double, n2 ); TESTING_MALLOC_PIN( h_S, double, n2 ); TESTING_MALLOC_PIN( 
h_work, double, lwork ); #if defined(PRECISION_z) || defined(PRECISION_c) TESTING_MALLOC_PIN( rwork, double, lrwork); #endif /* Initialize the matrix */ lapackf77_dlarnv( &ione, ISEED, &n2, h_A ); lapackf77_dlarnv( &ione, ISEED, &n2, h_B ); magma_dmake_hpd( N, h_B, N ); magma_dmake_symmetric( N, h_A, N ); // ================================================================== // Warmup using MAGMA // ================================================================== if(opts.warmup){ lapackf77_dlacpy( MagmaUpperLowerStr, &N, &N, h_A, &N, h_R, &N ); lapackf77_dlacpy( MagmaUpperLowerStr, &N, &N, h_B, &N, h_S, &N ); magma_dsygvdx( opts.itype, opts.jobz, MagmaRangeI, opts.uplo, N, h_R, N, h_S, N, vl, vu, il, iu, &m1, w1, h_work, lwork, #if defined(PRECISION_z) || defined(PRECISION_c) rwork, lrwork, #endif iwork, liwork, &info ); if (info != 0) printf("magma_dsygvdx returned error %d: %s.\n", (int) info, magma_strerror( info )); } /* ==================================================================== Performs operation using MAGMA =================================================================== */ lapackf77_dlacpy( MagmaUpperLowerStr, &N, &N, h_A, &N, h_R, &N ); lapackf77_dlacpy( MagmaUpperLowerStr, &N, &N, h_B, &N, h_S, &N ); gpu_time = magma_wtime(); magma_dsygvdx( opts.itype, opts.jobz, MagmaRangeI, opts.uplo, N, h_R, N, h_S, N, vl, vu, il, iu, &m1, w1, h_work, lwork, #if defined(PRECISION_z) || defined(PRECISION_c) rwork, lrwork, #endif iwork, liwork, &info ); gpu_time = magma_wtime() - gpu_time; if (info != 0) printf("magma_dsygvdx returned error %d: %s.\n", (int) info, magma_strerror( info )); if ( opts.check ) { /* ===================================================================== Check the results following the LAPACK's [zc]hegvdx routine. 
A x = lambda B x is solved and the following 3 tests computed: (1) | A Z - B Z D | / ( |A||Z| N ) (itype = 1) | A B Z - Z D | / ( |A||Z| N ) (itype = 2) | B A Z - Z D | / ( |A||Z| N ) (itype = 3) (2) | S(with V) - S(w/o V) | / | S | =================================================================== */ #if defined(PRECISION_d) || defined(PRECISION_s) double *rwork = h_work + N*N; #endif double temp1, temp2; result[0] = 1.; result[0] /= lapackf77_dlansy("1", lapack_uplo_const(opts.uplo), &N, h_A, &N, rwork); result[0] /= lapackf77_dlange("1", &N, &m1, h_R, &N, rwork); if (opts.itype == 1) { blasf77_dsymm("L", lapack_uplo_const(opts.uplo), &N, &m1, &c_one, h_A, &N, h_R, &N, &c_zero, h_work, &N); for(int i=0; i < m1; ++i) blasf77_dscal(&N, &w1[i], &h_R[i*N], &ione); blasf77_dsymm("L", lapack_uplo_const(opts.uplo), &N, &m1, &c_neg_one, h_B, &N, h_R, &N, &c_one, h_work, &N); result[0] *= lapackf77_dlange("1", &N, &m1, h_work, &N, rwork)/N; } else if (opts.itype == 2) { blasf77_dsymm("L", lapack_uplo_const(opts.uplo), &N, &m1, &c_one, h_B, &N, h_R, &N, &c_zero, h_work, &N); for(int i=0; i < m1; ++i) blasf77_dscal(&N, &w1[i], &h_R[i*N], &ione); blasf77_dsymm("L", lapack_uplo_const(opts.uplo), &N, &m1, &c_one, h_A, &N, h_work, &N, &c_neg_one, h_R, &N); result[0] *= lapackf77_dlange("1", &N, &m1, h_R, &N, rwork)/N; } else if (opts.itype == 3) { blasf77_dsymm("L", lapack_uplo_const(opts.uplo), &N, &m1, &c_one, h_A, &N, h_R, &N, &c_zero, h_work, &N); for(int i=0; i < m1; ++i) blasf77_dscal(&N, &w1[i], &h_R[i*N], &ione); blasf77_dsymm("L", lapack_uplo_const(opts.uplo), &N, &m1, &c_one, h_B, &N, h_work, &N, &c_neg_one, h_R, &N); result[0] *= lapackf77_dlange("1", &N, &m1, h_R, &N, rwork)/N; } lapackf77_dlacpy( MagmaUpperLowerStr, &N, &N, h_A, &N, h_R, &N ); lapackf77_dlacpy( MagmaUpperLowerStr, &N, &N, h_B, &N, h_S, &N ); magma_dsygvdx( opts.itype, MagmaNoVec, MagmaRangeI, opts.uplo, N, h_R, N, h_S, N, vl, vu, il, iu, &m2, w2, h_work, lwork, #if defined(PRECISION_z) || 
defined(PRECISION_c) rwork, lrwork, #endif iwork, liwork, &info ); if (info != 0) printf("magma_dsygvdx returned error %d: %s.\n", (int) info, magma_strerror( info )); temp1 = temp2 = 0; for(int j=0; j < m2; j++) { temp1 = max(temp1, absv(w1[j])); temp1 = max(temp1, absv(w2[j])); temp2 = max(temp2, absv(w1[j]-w2[j])); } result[1] = temp2 / (((double)m2)*temp1); } /* ===================================================================== Print execution time =================================================================== */ printf("%5d %5d %7.2f\n", (int) N, (int) m1, gpu_time); if ( opts.check ) { printf("Testing the eigenvalues and eigenvectors for correctness:\n"); if (opts.itype == 1) { printf("(1) | A Z - B Z D | / (|A| |Z| N) = %8.2e %s\n", result[0], (result[0] < tol ? "ok" : "failed")); } else if (opts.itype == 2) { printf("(1) | A B Z - Z D | / (|A| |Z| N) = %8.2e %s\n", result[0], (result[0] < tol ? "ok" : "failed")); } else if (opts.itype == 3) { printf("(1) | B A Z - Z D | / (|A| |Z| N) = %8.2e %s\n", result[0], (result[0] < tol ? "ok" : "failed")); } printf( "(2) | D(w/ Z) - D(w/o Z) | / |D| = %8.2e %s\n\n", result[1], (result[1] < tolulp ? "ok" : "failed")); status += ! (result[0] < tol && result[1] < tolulp); } TESTING_FREE_CPU( h_A ); TESTING_FREE_CPU( h_B ); TESTING_FREE_CPU( w1 ); TESTING_FREE_CPU( w2 ); TESTING_FREE_CPU( iwork ); TESTING_FREE_PIN( h_R ); TESTING_FREE_PIN( h_S ); TESTING_FREE_PIN( h_work ); #if defined(PRECISION_z) || defined(PRECISION_c) TESTING_FREE_PIN( rwork ); #endif fflush( stdout ); } if ( opts.niter > 1 ) { printf( "\n" ); } } TESTING_FINALIZE(); return status; }
/* //////////////////////////////////////////////////////////////////////////// -- Testing ssygvdx */ int main( int argc, char** argv) { TESTING_INIT(); real_Double_t gpu_time /*cpu_time*/; float *h_A, *h_R, *h_B, *h_S, *h_work; float *w1, *w2, vl=0, vu=0; float result[2] = {0}; magma_int_t *iwork; magma_int_t N, n2, info, il, iu, m1, m2, nb, lwork, liwork; float c_zero = MAGMA_S_ZERO; float c_one = MAGMA_S_ONE; float c_neg_one = MAGMA_S_NEG_ONE; #if defined(PRECISION_z) || defined(PRECISION_c) float *rwork; magma_int_t lrwork; #endif //float d_one = 1.; //float d_ten = 10.; magma_int_t ione = 1; magma_int_t ISEED[4] = {0,0,0,1}; magma_opts opts; parse_opts( argc, argv, &opts ); float tol = opts.tolerance * lapackf77_slamch("E"); float tolulp = opts.tolerance * lapackf77_slamch("P"); if ( opts.check && opts.jobz == MagmaNoVec ) { fprintf( stderr, "checking results requires vectors; setting jobz=V (option -JV)\n" ); opts.jobz = MagmaVec; } printf(" N M GPU Time (sec)\n"); printf("============================\n"); for( int i = 0; i < opts.ntest; ++i ) { for( int iter = 0; iter < opts.niter; ++iter ) { N = opts.nsize[i]; n2 = N*N; nb = magma_get_ssytrd_nb(N); #if defined(PRECISION_z) || defined(PRECISION_c) lwork = 2*N*nb + N*N; lrwork = 1 + 5*N +2*N*N; #else lwork = 1 + 6*N*nb + 2* N*N; #endif liwork = 3 + 5*N; if ( opts.fraction == 0 ) { il = N / 10; iu = N / 5+il; } else { il = 1; iu = (int) (opts.fraction*N); if (iu < 1) iu = 1; } TESTING_MALLOC( h_A, float, n2 ); TESTING_MALLOC( h_B, float, n2 ); TESTING_MALLOC( w1, float, N ); TESTING_MALLOC( w2, float, N ); TESTING_MALLOC( iwork, magma_int_t, liwork ); TESTING_HOSTALLOC( h_R, float, n2 ); TESTING_HOSTALLOC( h_S, float, n2 ); TESTING_HOSTALLOC( h_work, float, lwork ); #if defined(PRECISION_z) || defined(PRECISION_c) TESTING_HOSTALLOC( rwork, float, lrwork); #endif /* Initialize the matrix */ lapackf77_slarnv( &ione, ISEED, &n2, h_A ); lapackf77_slarnv( &ione, ISEED, &n2, h_B ); /* increase the diagonal */ 
for(int i=0; i<N; i++) { MAGMA_S_SET2REAL( h_B[i*N+i], ( MAGMA_S_REAL(h_B[i*N+i]) + 1.*N ) ); MAGMA_S_SET2REAL( h_A[i*N+i], MAGMA_S_REAL(h_A[i*N+i]) ); } // ================================================================== // Warmup using MAGMA // ================================================================== if(opts.warmup){ lapackf77_slacpy( MagmaUpperLowerStr, &N, &N, h_A, &N, h_R, &N ); lapackf77_slacpy( MagmaUpperLowerStr, &N, &N, h_B, &N, h_S, &N ); magma_ssygvdx( opts.itype, opts.jobz, 'I', opts.uplo, N, h_R, N, h_S, N, vl, vu, il, iu, &m1, w1, h_work, lwork, #if defined(PRECISION_z) || defined(PRECISION_c) rwork, lrwork, #endif iwork, liwork, &info ); if (info != 0) printf("magma_ssygvdx returned error %d: %s.\n", (int) info, magma_strerror( info )); } /* ==================================================================== Performs operation using MAGMA =================================================================== */ lapackf77_slacpy( MagmaUpperLowerStr, &N, &N, h_A, &N, h_R, &N ); lapackf77_slacpy( MagmaUpperLowerStr, &N, &N, h_B, &N, h_S, &N ); gpu_time = magma_wtime(); magma_ssygvdx( opts.itype, opts.jobz, 'I', opts.uplo, N, h_R, N, h_S, N, vl, vu, il, iu, &m1, w1, h_work, lwork, #if defined(PRECISION_z) || defined(PRECISION_c) rwork, lrwork, #endif iwork, liwork, &info ); gpu_time = magma_wtime() - gpu_time; if (info != 0) printf("magma_ssygvdx returned error %d: %s.\n", (int) info, magma_strerror( info )); if ( opts.check ) { /* ===================================================================== Check the results following the LAPACK's [zc]hegvdx routine. 
A x = lambda B x is solved and the following 3 tests computed: (1) | A Z - B Z D | / ( |A||Z| N ) (itype = 1) | A B Z - Z D | / ( |A||Z| N ) (itype = 2) | B A Z - Z D | / ( |A||Z| N ) (itype = 3) (2) | S(with V) - S(w/o V) | / | S | =================================================================== */ #if defined(PRECISION_d) || defined(PRECISION_s) float *rwork = h_work + N*N; #endif float temp1, temp2; result[0] = 1.; result[0] /= lapackf77_slansy("1", &opts.uplo, &N, h_A, &N, rwork); result[0] /= lapackf77_slange("1", &N, &m1, h_R, &N, rwork); if (opts.itype == 1) { blasf77_ssymm("L", &opts.uplo, &N, &m1, &c_one, h_A, &N, h_R, &N, &c_zero, h_work, &N); for(int i=0; i < m1; ++i) blasf77_sscal(&N, &w1[i], &h_R[i*N], &ione); blasf77_ssymm("L", &opts.uplo, &N, &m1, &c_neg_one, h_B, &N, h_R, &N, &c_one, h_work, &N); result[0] *= lapackf77_slange("1", &N, &m1, h_work, &N, rwork)/N; } else if (opts.itype == 2) { blasf77_ssymm("L", &opts.uplo, &N, &m1, &c_one, h_B, &N, h_R, &N, &c_zero, h_work, &N); for(int i=0; i < m1; ++i) blasf77_sscal(&N, &w1[i], &h_R[i*N], &ione); blasf77_ssymm("L", &opts.uplo, &N, &m1, &c_one, h_A, &N, h_work, &N, &c_neg_one, h_R, &N); result[0] *= lapackf77_slange("1", &N, &m1, h_R, &N, rwork)/N; } else if (opts.itype == 3) { blasf77_ssymm("L", &opts.uplo, &N, &m1, &c_one, h_A, &N, h_R, &N, &c_zero, h_work, &N); for(int i=0; i < m1; ++i) blasf77_sscal(&N, &w1[i], &h_R[i*N], &ione); blasf77_ssymm("L", &opts.uplo, &N, &m1, &c_one, h_B, &N, h_work, &N, &c_neg_one, h_R, &N); result[0] *= lapackf77_slange("1", &N, &m1, h_R, &N, rwork)/N; } lapackf77_slacpy( MagmaUpperLowerStr, &N, &N, h_A, &N, h_R, &N ); lapackf77_slacpy( MagmaUpperLowerStr, &N, &N, h_B, &N, h_S, &N ); magma_ssygvdx( opts.itype, 'N', 'I', opts.uplo, N, h_R, N, h_S, N, vl, vu, il, iu, &m2, w2, h_work, lwork, #if defined(PRECISION_z) || defined(PRECISION_c) rwork, lrwork, #endif iwork, liwork, &info ); if (info != 0) printf("magma_ssygvdx returned error %d: %s.\n", (int) info, 
magma_strerror( info )); temp1 = temp2 = 0; for(int j=0; j < m2; j++) { temp1 = max(temp1, absv(w1[j])); temp1 = max(temp1, absv(w2[j])); temp2 = max(temp2, absv(w1[j]-w2[j])); } result[1] = temp2 / (((float)m2)*temp1); } /* ===================================================================== Print execution time =================================================================== */ printf("%5d %5d %7.2f\n", (int) N, (int) m1, gpu_time); if ( opts.check ) { printf("Testing the eigenvalues and eigenvectors for correctness:\n"); if (opts.itype==1) printf("(1) | A Z - B Z D | / (|A| |Z| N) = %8.2e%s\n", result[0], (result[0] < tol ? "" : " failed")); else if (opts.itype==2) printf("(1) | A B Z - Z D | / (|A| |Z| N) = %8.2e%s\n", result[0], (result[0] < tol ? "" : " failed")); else if (opts.itype==3) printf("(1) | B A Z - Z D | / (|A| |Z| N) = %8.2e%s\n", result[0], (result[0] < tol ? "" : " failed")); printf( "(2) | D(w/ Z) - D(w/o Z) | / |D| = %8.2e%s\n\n", result[1], (result[1] < tolulp ? "" : " failed")); } TESTING_FREE( h_A ); TESTING_FREE( h_B ); TESTING_FREE( w1 ); TESTING_FREE( w2 ); #if defined(PRECISION_z) || defined(PRECISION_c) TESTING_HOSTFREE( rwork); #endif TESTING_FREE( iwork ); TESTING_HOSTFREE( h_work ); TESTING_HOSTFREE( h_R ); TESTING_HOSTFREE( h_S ); } if ( opts.niter > 1 ) { printf( "\n" ); } } TESTING_FINALIZE(); return 0; }
int berank(const void *p1, const void *p2) { real e1 = 0.5 * rsqr(absv(Vel((bodyptr) p1))) + Phi((bodyptr) p1); real e2 = 0.5 * rsqr(absv(Vel((bodyptr) p2))) + Phi((bodyptr) p2); return (e1 < e2 ? -1 : e1 > e2 ? 1 : 0); }