/* Return value: 0 - test is passed, non-zero - test is failed */ static int compare_csteqr( float *d, float *d_i, float *e, float *e_i, lapack_complex_float *z, lapack_complex_float *z_i, lapack_int info, lapack_int info_i, char compz, lapack_int ldz, lapack_int n ) { lapack_int i; int failed = 0; for( i = 0; i < n; i++ ) { failed += compare_floats(d[i],d_i[i]); } for( i = 0; i < (n-1); i++ ) { failed += compare_floats(e[i],e_i[i]); } if( LAPACKE_lsame( compz, 'i' ) || LAPACKE_lsame( compz, 'v' ) ) { for( i = 0; i < ldz*n; i++ ) { failed += compare_complex_floats(z[i],z_i[i]); } } failed += (info == info_i) ? 0 : 1; if( info != 0 || info_i != 0 ) { printf( "info=%d, info_i=%d\n",(int)info,(int)info_i ); } return failed; }
/* Return value: 0 - test is passed, non-zero - test is failed */ static int compare_shseqr( float *h, float *h_i, float *wr, float *wr_i, float *wi, float *wi_i, float *z, float *z_i, lapack_int info, lapack_int info_i, char compz, lapack_int ldh, lapack_int ldz, lapack_int n ) { lapack_int i; int failed = 0; for( i = 0; i < ldh*n; i++ ) { failed += compare_floats(h[i],h_i[i]); } for( i = 0; i < n; i++ ) { failed += compare_floats(wr[i],wr_i[i]); } for( i = 0; i < n; i++ ) { failed += compare_floats(wi[i],wi_i[i]); } if( LAPACKE_lsame( compz, 'i' ) || LAPACKE_lsame( compz, 'v' ) ) { for( i = 0; i < ldz*n; i++ ) { failed += compare_floats(z[i],z_i[i]); } } failed += (info == info_i) ? 0 : 1; if( info != 0 || info_i != 0 ) { printf( "info=%d, info_i=%d\n",(int)info,(int)info_i ); } return failed; }
/* Return value: 0 - test is passed, non-zero - test is failed */ static int compare_ssbtrd( float *ab, float *ab_i, float *d, float *d_i, float *e, float *e_i, float *q, float *q_i, lapack_int info, lapack_int info_i, lapack_int ldab, lapack_int ldq, lapack_int n, char vect ) { lapack_int i; int failed = 0; for( i = 0; i < ldab*n; i++ ) { failed += compare_floats(ab[i],ab_i[i]); } for( i = 0; i < n; i++ ) { failed += compare_floats(d[i],d_i[i]); } for( i = 0; i < (n-1); i++ ) { failed += compare_floats(e[i],e_i[i]); } if( LAPACKE_lsame( vect, 'u' ) || LAPACKE_lsame( vect, 'v' ) ) { for( i = 0; i < ldq*n; i++ ) { failed += compare_floats(q[i],q_i[i]); } } failed += (info == info_i) ? 0 : 1; if( info != 0 || info_i != 0 ) { printf( "info=%d, info_i=%d\n",(int)info,(int)info_i ); } return failed; }
/* Return value: 0 - test is passed, non-zero - test is failed */ static int compare_sgebrd( float *a, float *a_i, float *d, float *d_i, float *e, float *e_i, float *tauq, float *tauq_i, float *taup, float *taup_i, lapack_int info, lapack_int info_i, lapack_int lda, lapack_int m, lapack_int n ) { lapack_int i; int failed = 0; for( i = 0; i < lda*n; i++ ) { failed += compare_floats(a[i],a_i[i]); } for( i = 0; i < (MIN(m,n)); i++ ) { failed += compare_floats(d[i],d_i[i]); } for( i = 0; i < (MIN(m,n)-1); i++ ) { failed += compare_floats(e[i],e_i[i]); } for( i = 0; i < (MIN(m,n)); i++ ) { failed += compare_floats(tauq[i],tauq_i[i]); } for( i = 0; i < (MIN(m,n)); i++ ) { failed += compare_floats(taup[i],taup_i[i]); } failed += (info == info_i) ? 0 : 1; if( info != 0 || info_i != 0 ) { printf( "info=%d, info_i=%d\n",(int)info,(int)info_i ); } return failed; }
/* Return value: 0 - test is passed, non-zero - test is failed */ static int compare_chetrd( lapack_complex_float *a, lapack_complex_float *a_i, float *d, float *d_i, float *e, float *e_i, lapack_complex_float *tau, lapack_complex_float *tau_i, lapack_int info, lapack_int info_i, lapack_int lda, lapack_int n ) { lapack_int i; int failed = 0; for( i = 0; i < lda*n; i++ ) { failed += compare_complex_floats(a[i],a_i[i]); } for( i = 0; i < n; i++ ) { failed += compare_floats(d[i],d_i[i]); } for( i = 0; i < (n-1); i++ ) { failed += compare_floats(e[i],e_i[i]); } for( i = 0; i < (n-1); i++ ) { failed += compare_complex_floats(tau[i],tau_i[i]); } failed += (info == info_i) ? 0 : 1; if( info != 0 || info_i != 0 ) { printf( "info=%d, info_i=%d\n",(int)info,(int)info_i ); } return failed; }
bool Expression::equal(const Expression& expression) const { if (!compare_floats(expression.getValue(), getValue())) { return false; } std::set<std::string> expNameSet = expression.getNameSet(); for (auto const& expName : expNameSet) { if (!compare_floats(expression.getValue(expName), getValue(expName))) { return false; } } return true; }
/* Return value: 0 - test is passed, non-zero - test is failed */ static int compare_stprfs( float *ferr, float *ferr_i, float *berr, float *berr_i, lapack_int info, lapack_int info_i, lapack_int nrhs ) { lapack_int i; int failed = 0; for( i = 0; i < nrhs; i++ ) { failed += compare_floats(ferr[i],ferr_i[i]); } for( i = 0; i < nrhs; i++ ) { failed += compare_floats(berr[i],berr_i[i]); } failed += (info == info_i) ? 0 : 1; if( info != 0 || info_i != 0 ) { printf( "info=%d, info_i=%d\n",(int)info,(int)info_i ); } return failed; }
/* Return value: 0 - test is passed, non-zero - test is failed */ static int compare_ctbcon( float rcond, float rcond_i, lapack_int info, lapack_int info_i ) { int failed = 0; failed += compare_floats(rcond,rcond_i); failed += (info == info_i) ? 0 : 1; if( info != 0 || info_i != 0 ) { printf( "info=%d, info_i=%d\n",(int)info,(int)info_i ); } return failed; }
/* Return value: 0 - test is passed, non-zero - test is failed */ static int compare_cbdsqr( float *d, float *d_i, float *e, float *e_i, lapack_complex_float *vt, lapack_complex_float *vt_i, lapack_complex_float *u, lapack_complex_float *u_i, lapack_complex_float *c, lapack_complex_float *c_i, lapack_int info, lapack_int info_i, lapack_int ldc, lapack_int ldu, lapack_int ldvt, lapack_int n, lapack_int ncc, lapack_int ncvt, lapack_int nru ) { lapack_int i; int failed = 0; for( i = 0; i < n; i++ ) { failed += compare_floats(d[i],d_i[i]); } for( i = 0; i < n; i++ ) { failed += compare_floats(e[i],e_i[i]); } if( ncvt != 0 ) { for( i = 0; i < ldvt*ncvt; i++ ) { failed += compare_complex_floats(vt[i],vt_i[i]); } } if( nru != 0 ) { for( i = 0; i < ldu*n; i++ ) { failed += compare_complex_floats(u[i],u_i[i]); } } if( ncc != 0 ) { for( i = 0; i < ldc*ncc; i++ ) { failed += compare_complex_floats(c[i],c_i[i]); } } failed += (info == info_i) ? 0 : 1; if( info != 0 || info_i != 0 ) { printf( "info=%d, info_i=%d\n",(int)info,(int)info_i ); } return failed; }
/* Return value: 0 - test is passed, non-zero - test is failed */ static int compare_sgeqpf( float *a, float *a_i, lapack_int *jpvt, lapack_int *jpvt_i, float *tau, float *tau_i, lapack_int info, lapack_int info_i, lapack_int lda, lapack_int m, lapack_int n ) { lapack_int i; int failed = 0; for( i = 0; i < lda*n; i++ ) { failed += compare_floats(a[i],a_i[i]); } for( i = 0; i < n; i++ ) { failed += (jpvt[i] == jpvt_i[i]) ? 0 : 1; } for( i = 0; i < (MIN(m,n)); i++ ) { failed += compare_floats(tau[i],tau_i[i]); } failed += (info == info_i) ? 0 : 1; if( info != 0 || info_i != 0 ) { printf( "info=%d, info_i=%d\n",(int)info,(int)info_i ); } return failed; }
/* Return value: 0 - test is passed, non-zero - test is failed */ static int compare_spptri( float *ap, float *ap_i, lapack_int info, lapack_int info_i, lapack_int n ) { lapack_int i; int failed = 0; for( i = 0; i < (n*(n+1)/2); i++ ) { failed += compare_floats(ap[i],ap_i[i]); } failed += (info == info_i) ? 0 : 1; if( info != 0 || info_i != 0 ) { printf( "info=%d, info_i=%d\n",(int)info,(int)info_i ); } return failed; }
/* Return value: 0 - test is passed, non-zero - test is failed */ static int compare_ssptrs( float *b, float *b_i, lapack_int info, lapack_int info_i, lapack_int ldb, lapack_int nrhs ) { lapack_int i; int failed = 0; for( i = 0; i < ldb*nrhs; i++ ) { failed += compare_floats(b[i],b_i[i]); } failed += (info == info_i) ? 0 : 1; if( info != 0 || info_i != 0 ) { printf( "info=%d, info_i=%d\n",(int)info,(int)info_i ); } return failed; }
/* Return value: 0 - test is passed, non-zero - test is failed */ static int compare_spotrf( float *a, float *a_i, lapack_int info, lapack_int info_i, lapack_int lda, lapack_int n ) { lapack_int i; int failed = 0; for( i = 0; i < lda*n; i++ ) { failed += compare_floats(a[i],a_i[i]); } failed += (info == info_i) ? 0 : 1; if( info != 0 || info_i != 0 ) { printf( "info=%d, info_i=%d\n",(int)info,(int)info_i ); } return failed; }
/* Return value: 0 - test is passed, non-zero - test is failed */ static int compare_sormbr( float *c, float *c_i, lapack_int info, lapack_int info_i, lapack_int ldc, lapack_int n ) { lapack_int i; int failed = 0; for( i = 0; i < ldc*n; i++ ) { failed += compare_floats(c[i],c_i[i]); } failed += (info == info_i) ? 0 : 1; if( info != 0 || info_i != 0 ) { printf( "info=%d, info_i=%d\n",(int)info,(int)info_i ); } return failed; }
static void gauge_force_test(void) { int max_length = 6; initQuda(device); setVerbosityQuda(QUDA_VERBOSE,"",stdout); qudaGaugeParam = newQudaGaugeParam(); qudaGaugeParam.X[0] = xdim; qudaGaugeParam.X[1] = ydim; qudaGaugeParam.X[2] = zdim; qudaGaugeParam.X[3] = tdim; setDims(qudaGaugeParam.X); qudaGaugeParam.anisotropy = 1.0; qudaGaugeParam.cpu_prec = link_prec; qudaGaugeParam.cuda_prec = link_prec; qudaGaugeParam.cuda_prec_sloppy = link_prec; qudaGaugeParam.reconstruct = link_recon; qudaGaugeParam.reconstruct_sloppy = link_recon; qudaGaugeParam.type = QUDA_SU3_LINKS; // in this context, just means these are site links qudaGaugeParam.gauge_order = gauge_order; qudaGaugeParam.t_boundary = QUDA_PERIODIC_T; qudaGaugeParam.gauge_fix = QUDA_GAUGE_FIXED_NO; qudaGaugeParam.ga_pad = 0; qudaGaugeParam.mom_ga_pad = 0; size_t gSize = qudaGaugeParam.cpu_prec; void* sitelink; void* sitelink_1d; #ifdef GPU_DIRECT sitelink_1d = pinned_malloc(4*V*gaugeSiteSize*gSize); #else sitelink_1d = safe_malloc(4*V*gaugeSiteSize*gSize); #endif // this is a hack to have site link generated in 2d // then copied to 1d array in "MILC" format void* sitelink_2d[4]; #ifdef GPU_DIRECT for(int i=0;i<4;i++) sitelink_2d[i] = pinned_malloc(V*gaugeSiteSize*qudaGaugeParam.cpu_prec); #else for(int i=0;i<4;i++) sitelink_2d[i] = safe_malloc(V*gaugeSiteSize*qudaGaugeParam.cpu_prec); #endif // fills the gauge field with random numbers createSiteLinkCPU(sitelink_2d, qudaGaugeParam.cpu_prec, 0); //copy the 2d sitelink to 1d milc format for(int dir = 0; dir < 4; dir++){ for(int i=0; i < V; i++){ char* src = ((char*)sitelink_2d[dir]) + i * gaugeSiteSize* qudaGaugeParam.cpu_prec; char* dst = ((char*)sitelink_1d) + (4*i+dir)*gaugeSiteSize*qudaGaugeParam.cpu_prec ; memcpy(dst, src, gaugeSiteSize*qudaGaugeParam.cpu_prec); } } if (qudaGaugeParam.gauge_order == QUDA_MILC_GAUGE_ORDER){ sitelink = sitelink_1d; }else if (qudaGaugeParam.gauge_order == QUDA_QDP_GAUGE_ORDER) { sitelink = (void**)sitelink_2d; } else { errorQuda("Unsupported gauge order %d", qudaGaugeParam.gauge_order); } #ifdef MULTI_GPU void* sitelink_ex_2d[4]; void* sitelink_ex_1d; sitelink_ex_1d = pinned_malloc(4*V_ex*gaugeSiteSize*gSize); for(int i=0;i < 4;i++) sitelink_ex_2d[i] = pinned_malloc(V_ex*gaugeSiteSize*gSize); int X1= Z[0]; int X2= Z[1]; int X3= Z[2]; int X4= Z[3]; for(int i=0; i < V_ex; i++){ int sid = i; int oddBit=0; if(i >= Vh_ex){ sid = i - Vh_ex; oddBit = 1; } int za = sid/E1h; int x1h = sid - za*E1h; int zb = za/E2; int x2 = za - zb*E2; int x4 = zb/E3; int x3 = zb - x4*E3; int x1odd = (x2 + x3 + x4 + oddBit) & 1; int x1 = 2*x1h + x1odd; if( x1< 2 || x1 >= X1 +2 || x2< 2 || x2 >= X2 +2 || x3< 2 || x3 >= X3 +2 || x4< 2 || x4 >= X4 +2){ continue; } x1 = (x1 - 2 + X1) % X1; x2 = (x2 - 2 + X2) % X2; x3 = (x3 - 2 + X3) % X3; x4 = (x4 - 2 + X4) % X4; int idx = (x4*X3*X2*X1+x3*X2*X1+x2*X1+x1)>>1; if(oddBit){ idx += Vh; } for(int dir= 0; dir < 4; dir++){ char* src = (char*)sitelink_2d[dir]; char* dst = (char*)sitelink_ex_2d[dir]; memcpy(dst+i*gaugeSiteSize*gSize, src+idx*gaugeSiteSize*gSize, gaugeSiteSize*gSize); }//dir }//i for(int dir = 0; dir < 4; dir++){ for(int i=0; i < V_ex; i++){ char* src = ((char*)sitelink_ex_2d[dir]) + i * gaugeSiteSize* qudaGaugeParam.cpu_prec; char* dst = ((char*)sitelink_ex_1d) + (4*i+dir)*gaugeSiteSize*qudaGaugeParam.cpu_prec ; memcpy(dst, src, gaugeSiteSize*qudaGaugeParam.cpu_prec); } } #endif void* mom = safe_malloc(4*V*momSiteSize*gSize); void* refmom = safe_malloc(4*V*momSiteSize*gSize); memset(mom, 0, 4*V*momSiteSize*gSize); //initialize some data in cpuMom createMomCPU(mom, qudaGaugeParam.cpu_prec); memcpy(refmom, mom, 4*V*momSiteSize*gSize); double loop_coeff_d[sizeof(loop_coeff_f)/sizeof(float)]; for(unsigned int i=0;i < sizeof(loop_coeff_f)/sizeof(float); i++){ loop_coeff_d[i] = loop_coeff_f[i]; } void* loop_coeff; if(qudaGaugeParam.cuda_prec == QUDA_SINGLE_PRECISION){ loop_coeff = (void*)&loop_coeff_f[0]; }else{ loop_coeff = loop_coeff_d; } double eb3 = 0.3; int num_paths = sizeof(path_dir_x)/sizeof(path_dir_x[0]); int** input_path_buf[4]; for(int dir =0; dir < 4; dir++){ input_path_buf[dir] = (int**)safe_malloc(num_paths*sizeof(int*)); for(int i=0;i < num_paths;i++){ input_path_buf[dir][i] = (int*)safe_malloc(length[i]*sizeof(int)); if(dir == 0) memcpy(input_path_buf[dir][i], path_dir_x[i], length[i]*sizeof(int)); else if(dir ==1) memcpy(input_path_buf[dir][i], path_dir_y[i], length[i]*sizeof(int)); else if(dir ==2) memcpy(input_path_buf[dir][i], path_dir_z[i], length[i]*sizeof(int)); else if(dir ==3) memcpy(input_path_buf[dir][i], path_dir_t[i], length[i]*sizeof(int)); } } if (tune) { printfQuda("Tuning...\n"); setTuning(QUDA_TUNE_YES); } struct timeval t0, t1; double timeinfo[3]; /* Multiple execution to exclude warmup time in the first run*/ for (int i =0;i < attempts; i++){ gettimeofday(&t0, NULL); computeGaugeForceQuda(mom, sitelink, input_path_buf, length, loop_coeff_d, num_paths, max_length, eb3, &qudaGaugeParam, timeinfo); gettimeofday(&t1, NULL); } double total_time = t1.tv_sec - t0.tv_sec + 0.000001*(t1.tv_usec - t0.tv_usec); //The number comes from CPU implementation in MILC, gauge_force_imp.c int flops=153004; if (verify_results){ for(int i = 0;i < attempts;i++){ #ifdef MULTI_GPU //last arg=0 means no optimization for communication, i.e. exchange data in all directions //even they are not partitioned int R[4] = {2, 2, 2, 2}; exchange_cpu_sitelink_ex(qudaGaugeParam.X, R, (void**)sitelink_ex_2d, QUDA_QDP_GAUGE_ORDER, qudaGaugeParam.cpu_prec, 0, 4); gauge_force_reference(refmom, eb3, sitelink_2d, sitelink_ex_2d, qudaGaugeParam.cpu_prec, input_path_buf, length, loop_coeff, num_paths); #else gauge_force_reference(refmom, eb3, sitelink_2d, NULL, qudaGaugeParam.cpu_prec, input_path_buf, length, loop_coeff, num_paths); #endif } int res; res = compare_floats(mom, refmom, 4*V*momSiteSize, 1e-3, qudaGaugeParam.cpu_prec); strong_check_mom(mom, refmom, 4*V, qudaGaugeParam.cpu_prec); printf("Test %s\n",(1 == res) ? "PASSED" : "FAILED"); } double perf = 1.0* flops*V/(total_time*1e+9); double kernel_perf = 1.0*flops*V/(timeinfo[1]*1e+9); printf("init and cpu->gpu time: %.2f ms, kernel time: %.2f ms, gpu->cpu and cleanup time: %.2f total time =%.2f ms\n", timeinfo[0]*1e+3, timeinfo[1]*1e+3, timeinfo[2]*1e+3, total_time*1e+3); printf("kernel performance: %.2f GFLOPS, overall performance : %.2f GFLOPS\n", kernel_perf, perf); for(int dir = 0; dir < 4; dir++){ for(int i=0;i < num_paths; i++) host_free(input_path_buf[dir][i]); host_free(input_path_buf[dir]); } host_free(sitelink_1d); for(int dir=0;dir < 4;dir++) host_free(sitelink_2d[dir]); #ifdef MULTI_GPU host_free(sitelink_ex_1d); for(int dir=0; dir < 4; dir++) host_free(sitelink_ex_2d[dir]); #endif host_free(mom); host_free(refmom); endQuda(); }
static void llfat_test(int test) { QudaGaugeParam qudaGaugeParam; #ifdef MULTI_GPU void* ghost_sitelink[4]; void* ghost_sitelink_diag[16]; #endif initQuda(device); cpu_prec = prec; gSize = cpu_prec; qudaGaugeParam = newQudaGaugeParam(); qudaGaugeParam.anisotropy = 1.0; qudaGaugeParam.X[0] = xdim; qudaGaugeParam.X[1] = ydim; qudaGaugeParam.X[2] = zdim; qudaGaugeParam.X[3] = tdim; setDims(qudaGaugeParam.X); qudaGaugeParam.cpu_prec = cpu_prec; qudaGaugeParam.cuda_prec = prec; qudaGaugeParam.gauge_order = gauge_order; qudaGaugeParam.type=QUDA_WILSON_LINKS; qudaGaugeParam.reconstruct = link_recon; /* qudaGaugeParam.flag = QUDA_FAT_PRESERVE_CPU_GAUGE | QUDA_FAT_PRESERVE_GPU_GAUGE | QUDA_FAT_PRESERVE_COMM_MEM; */ qudaGaugeParam.preserve_gauge =0; void* fatlink; if (cudaMallocHost((void**)&fatlink, 4*V*gaugeSiteSize*gSize) == cudaErrorMemoryAllocation) { errorQuda("ERROR: cudaMallocHost failed for fatlink\n"); } void* longlink; if (cudaMallocHost((void**)&longlink, 4*V*gaugeSiteSize*gSize) == cudaErrorMemoryAllocation) { errorQuda("ERROR: cudaMallocHost failed for longlink\n"); } // page-locked memory void* sitelink[4]; for(int i=0;i < 4;i++){ if (cudaMallocHost((void**)&sitelink[i], V*gaugeSiteSize*gSize) == cudaErrorMemoryAllocation) { errorQuda("ERROR: cudaMallocHost failed for sitelink\n"); } } void* sitelink_ex[4]; for(int i=0;i < 4;i++){ if (cudaMallocHost((void**)&sitelink_ex[i], V_ex*gaugeSiteSize*gSize) == cudaErrorMemoryAllocation) { errorQuda("ERROR: cudaMallocHost failed for sitelink_ex\n"); } } void* milc_sitelink; milc_sitelink = (void*)malloc(4*V*gaugeSiteSize*gSize); if(milc_sitelink == NULL){ errorQuda("ERROR: allocating milc_sitelink failed\n"); } void* milc_sitelink_ex; milc_sitelink_ex = (void*)malloc(4*V_ex*gaugeSiteSize*gSize); if(milc_sitelink_ex == NULL){ errorQuda("Error: allocating milc_sitelink failed\n"); } createSiteLinkCPU(sitelink, qudaGaugeParam.cpu_prec, 1); if(gauge_order == QUDA_MILC_GAUGE_ORDER){ for(int i=0; i<V; ++i){ for(int dir=0; dir<4; ++dir){ char* src = (char*)sitelink[dir]; memcpy((char*)milc_sitelink + (i*4 + dir)*gaugeSiteSize*gSize, src+i*gaugeSiteSize*gSize, gaugeSiteSize*gSize); } } } int X1=Z[0]; int X2=Z[1]; int X3=Z[2]; int X4=Z[3]; for(int i=0; i < V_ex; i++){ int sid = i; int oddBit=0; if(i >= Vh_ex){ sid = i - Vh_ex; oddBit = 1; } int za = sid/E1h; int x1h = sid - za*E1h; int zb = za/E2; int x2 = za - zb*E2; int x4 = zb/E3; int x3 = zb - x4*E3; int x1odd = (x2 + x3 + x4 + oddBit) & 1; int x1 = 2*x1h + x1odd; if( x1< 2 || x1 >= X1 +2 || x2< 2 || x2 >= X2 +2 || x3< 2 || x3 >= X3 +2 || x4< 2 || x4 >= X4 +2){ #ifdef MULTI_GPU continue; #endif } x1 = (x1 - 2 + X1) % X1; x2 = (x2 - 2 + X2) % X2; x3 = (x3 - 2 + X3) % X3; x4 = (x4 - 2 + X4) % X4; int idx = (x4*X3*X2*X1+x3*X2*X1+x2*X1+x1)>>1; if(oddBit){ idx += Vh; } for(int dir= 0; dir < 4; dir++){ char* src = (char*)sitelink[dir]; char* dst = (char*)sitelink_ex[dir]; memcpy(dst+i*gaugeSiteSize*gSize, src+idx*gaugeSiteSize*gSize, gaugeSiteSize*gSize); // milc ordering memcpy((char*)milc_sitelink_ex + (i*4 + dir)*gaugeSiteSize*gSize, src+idx*gaugeSiteSize*gSize, gaugeSiteSize*gSize); }//dir }//i double act_path_coeff[6]; for(int i=0;i < 6;i++){ act_path_coeff[i]= 0.1*i; } //only record the last call's performance //the first one is for creating the cpu/cuda data structures struct timeval t0, t1; void** sitelink_ptr; QudaComputeFatMethod method = (test) ? QUDA_COMPUTE_FAT_EXTENDED_VOLUME : QUDA_COMPUTE_FAT_STANDARD; if(gauge_order == QUDA_QDP_GAUGE_ORDER){ sitelink_ptr = (test) ? (void**)sitelink_ex : (void**)sitelink; }else{ sitelink_ptr = (test) ? (void**)milc_sitelink_ex : (void**)milc_sitelink; } void* longlink_ptr = longlink; #ifdef MULTI_GPU if(!test) longlink_ptr = NULL; // Have to have an extended volume for the long-link calculation #endif gettimeofday(&t0, NULL); computeKSLinkQuda(fatlink, longlink_ptr, NULL, milc_sitelink, act_path_coeff, &qudaGaugeParam, method); gettimeofday(&t1, NULL); double secs = TDIFF(t0,t1); void* fat_reflink[4]; void* long_reflink[4]; for(int i=0;i < 4;i++){ fat_reflink[i] = malloc(V*gaugeSiteSize*gSize); if(fat_reflink[i] == NULL){ errorQuda("ERROR; allocate fat_reflink[%d] failed\n", i); } long_reflink[i] = malloc(V*gaugeSiteSize*gSize); if(long_reflink[i] == NULL) errorQuda("ERROR; allocate long_reflink[%d] failed\n", i); } if (verify_results){ //FIXME: we have this compplication because references takes coeff as float/double // depending on the precision while the GPU code aways take coeff as double void* coeff; double coeff_dp[6]; float coeff_sp[6]; for(int i=0;i < 6;i++){ coeff_sp[i] = coeff_dp[i] = act_path_coeff[i]; } if(prec == QUDA_DOUBLE_PRECISION){ coeff = coeff_dp; }else{ coeff = coeff_sp; } #ifdef MULTI_GPU int optflag = 0; //we need x,y,z site links in the back and forward T slice // so it is 3*2*Vs_t int Vs[4] = {Vs_x, Vs_y, Vs_z, Vs_t}; for(int i=0;i < 4; i++){ ghost_sitelink[i] = malloc(8*Vs[i]*gaugeSiteSize*gSize); if (ghost_sitelink[i] == NULL){ printf("ERROR: malloc failed for ghost_sitelink[%d] \n",i); exit(1); } } /* nu | | |_____| mu */ for(int nu=0;nu < 4;nu++){ for(int mu=0; mu < 4;mu++){ if(nu == mu){ ghost_sitelink_diag[nu*4+mu] = NULL; }else{ //the other directions int dir1, dir2; for(dir1= 0; dir1 < 4; dir1++){ if(dir1 !=nu && dir1 != mu){ break; } } for(dir2=0; dir2 < 4; dir2++){ if(dir2 != nu && dir2 != mu && dir2 != dir1){ break; } } ghost_sitelink_diag[nu*4+mu] = malloc(Z[dir1]*Z[dir2]*gaugeSiteSize*gSize); if(ghost_sitelink_diag[nu*4+mu] == NULL){ errorQuda("malloc failed for ghost_sitelink_diag\n"); } memset(ghost_sitelink_diag[nu*4+mu], 0, Z[dir1]*Z[dir2]*gaugeSiteSize*gSize); } } } exchange_cpu_sitelink(qudaGaugeParam.X, sitelink, ghost_sitelink, ghost_sitelink_diag, qudaGaugeParam.cpu_prec, &qudaGaugeParam, optflag); llfat_reference_mg(fat_reflink, sitelink, ghost_sitelink, ghost_sitelink_diag, qudaGaugeParam.cpu_prec, coeff); { int R[4] = {2,2,2,2}; exchange_cpu_sitelink_ex(qudaGaugeParam.X, R, sitelink_ex, QUDA_QDP_GAUGE_ORDER, qudaGaugeParam.cpu_prec, 0, 4); computeLongLinkCPU(long_reflink, sitelink_ex, qudaGaugeParam.cpu_prec, coeff); } #else llfat_reference(fat_reflink, sitelink, qudaGaugeParam.cpu_prec, coeff); computeLongLinkCPU(long_reflink, sitelink, qudaGaugeParam.cpu_prec, coeff); #endif }//verify_results //format change for fatlink and longlink void* myfatlink[4]; void* mylonglink[4]; for(int i=0;i < 4;i++){ myfatlink[i] = malloc(V*gaugeSiteSize*gSize); if(myfatlink[i] == NULL){ printf("Error: malloc failed for myfatlink[%d]\n", i); exit(1); } mylonglink[i] = malloc(V*gaugeSiteSize*gSize); if(mylonglink[i] == NULL){ printf("Error: malloc failed for mylonglink[%d]\n", i); exit(1); } memset(myfatlink[i], 0, V*gaugeSiteSize*gSize); memset(mylonglink[i], 0, V*gaugeSiteSize*gSize); } for(int i=0;i < V; i++){ for(int dir=0; dir< 4; dir++){ char* src = ((char*)fatlink)+ (4*i+dir)*gaugeSiteSize*gSize; char* dst = ((char*)myfatlink[dir]) + i*gaugeSiteSize*gSize; memcpy(dst, src, gaugeSiteSize*gSize); src = ((char*)longlink)+ (4*i+dir)*gaugeSiteSize*gSize; dst = ((char*)mylonglink[dir]) + i*gaugeSiteSize*gSize; memcpy(dst, src, gaugeSiteSize*gSize); } } if (verify_results) { printfQuda("Checking fat links...\n"); int res=1; for(int dir=0; dir<4; dir++){ res &= compare_floats(fat_reflink[dir], myfatlink[dir], V*gaugeSiteSize, 1e-3, qudaGaugeParam.cpu_prec); } strong_check_link(myfatlink, "GPU results: ", fat_reflink, "CPU reference results:", V, qudaGaugeParam.cpu_prec); printfQuda("Fat-link test %s\n\n",(1 == res) ? "PASSED" : "FAILED"); #ifdef MULTI_GPU if(test){ #endif printfQuda("Checking long links...\n"); res = 1; for(int dir=0; dir<4; ++dir){ res &= compare_floats(long_reflink[dir], mylonglink[dir], V*gaugeSiteSize, 1e-3, qudaGaugeParam.cpu_prec); } strong_check_link(mylonglink, "GPU results: ", long_reflink, "CPU reference results:", V, qudaGaugeParam.cpu_prec); printfQuda("Long-link test %s\n\n",(1 == res) ? "PASSED" : "FAILED"); #ifdef MULTI_GPU }else{ // !test printfQuda("Extended volume is required for multi-GPU long-link construction\n"); } #endif } int volume = qudaGaugeParam.X[0]*qudaGaugeParam.X[1]*qudaGaugeParam.X[2]*qudaGaugeParam.X[3]; int flops= 61632; #ifdef MULTI_GPU if(test) flops += (252*4); // long-link contribution #else flops += (252*4); // 2*117 + 18 (two matrix-matrix multiplications and a matrix rescale) #endif double perf = 1.0* flops*volume/(secs*1024*1024*1024); printfQuda("link computation time =%.2f ms, flops= %.2f Gflops\n", secs*1000, perf); for(int i=0;i < 4;i++){ free(myfatlink[i]); } #ifdef MULTI_GPU if (verify_results){ int i; for(i=0;i < 4;i++){ free(ghost_sitelink[i]); } for(i=0;i <4; i++){ for(int j=0;j <4; j++){ if (i==j){ continue; } free(ghost_sitelink_diag[i*4+j]); } } } #endif for(int i=0;i < 4; i++){ cudaFreeHost(sitelink[i]); cudaFreeHost(sitelink_ex[i]); free(fat_reflink[i]); } cudaFreeHost(fatlink); cudaFreeHost(longlink); if(milc_sitelink) free(milc_sitelink); if(milc_sitelink_ex) free(milc_sitelink_ex); #ifdef MULTI_GPU exchange_llfat_cleanup(); #endif endQuda(); }
void ColorSpaceManager::is_builtin_colorspace(ustring colorspace, bool &is_scene_linear, bool &is_srgb) { #ifdef WITH_OCIO const OCIO::Processor *processor = (const OCIO::Processor *)get_processor(colorspace); if (!processor) { is_scene_linear = false; is_srgb = false; return; } is_scene_linear = true; is_srgb = true; for (int i = 0; i < 256; i++) { float v = i / 255.0f; float cR[3] = {v, 0, 0}; float cG[3] = {0, v, 0}; float cB[3] = {0, 0, v}; float cW[3] = {v, v, v}; processor->applyRGB(cR); processor->applyRGB(cG); processor->applyRGB(cB); processor->applyRGB(cW); /* Make sure that there is no channel crosstalk. */ if (fabsf(cR[1]) > 1e-5f || fabsf(cR[2]) > 1e-5f || fabsf(cG[0]) > 1e-5f || fabsf(cG[2]) > 1e-5f || fabsf(cB[0]) > 1e-5f || fabsf(cB[1]) > 1e-5f) { is_scene_linear = false; is_srgb = false; break; } /* Make sure that the three primaries combine linearly. */ if (!compare_floats(cR[0], cW[0], 1e-6f, 64) || !compare_floats(cG[1], cW[1], 1e-6f, 64) || !compare_floats(cB[2], cW[2], 1e-6f, 64)) { is_scene_linear = false; is_srgb = false; break; } /* Make sure that the three channels behave identically. */ if (!compare_floats(cW[0], cW[1], 1e-6f, 64) || !compare_floats(cW[1], cW[2], 1e-6f, 64)) { is_scene_linear = false; is_srgb = false; break; } float out_v = average(make_float3(cW[0], cW[1], cW[2])); if (!compare_floats(v, out_v, 1e-6f, 64)) { is_scene_linear = false; } if (!compare_floats(color_srgb_to_linear(v), out_v, 1e-6f, 64)) { is_srgb = false; } } #else (void)colorspace; is_scene_linear = false; is_srgb = false; #endif }