void invertQuda(void *hp_x, void *hp_b, QudaInvertParam *param) { // check the gauge fields have been created cudaGaugeField *cudaGauge = checkGauge(param); checkInvertParam(param); if (param->cuda_prec_sloppy != param->prec_precondition && param->inv_type_precondition != QUDA_INVALID_INVERTER) errorQuda("Sorry, cannot yet use different sloppy and preconditioner precisions"); verbosity = param->verbosity; bool pc_solve = (param->solve_type == QUDA_DIRECT_PC_SOLVE || param->solve_type == QUDA_NORMEQ_PC_SOLVE); bool pc_solution = (param->solution_type == QUDA_MATPC_SOLUTION || param->solution_type == QUDA_MATPCDAG_MATPC_SOLUTION); param->spinorGiB = cudaGauge->VolumeCB() * spinorSiteSize; if (!pc_solve) param->spinorGiB *= 2; param->spinorGiB *= (param->cuda_prec == QUDA_DOUBLE_PRECISION ? sizeof(double) : sizeof(float)); if (param->preserve_source == QUDA_PRESERVE_SOURCE_NO) { param->spinorGiB *= (param->inv_type == QUDA_CG_INVERTER ? 5 : 7)/(double)(1<<30); } else { param->spinorGiB *= (param->inv_type == QUDA_CG_INVERTER ? 
8 : 9)/(double)(1<<30); } param->secs = 0; param->gflops = 0; param->iter = 0; // create the dirac operator DiracParam diracParam; createDirac(diracParam, *param, pc_solve); Dirac &dirac = *d; Dirac &diracSloppy = *dSloppy; Dirac &diracPre = *dPre; cpuColorSpinorField *h_b = NULL; cpuColorSpinorField *h_x = NULL; cudaColorSpinorField *b = NULL; cudaColorSpinorField *x = NULL; cudaColorSpinorField *in = NULL; cudaColorSpinorField *out = NULL; const int *X = cudaGauge->X(); // wrap CPU host side pointers ColorSpinorParam cpuParam(hp_b, *param, X, pc_solution); h_b = new cpuColorSpinorField(cpuParam); cpuParam.v = hp_x; h_x = new cpuColorSpinorField(cpuParam); // download source ColorSpinorParam cudaParam(cpuParam, *param); cudaParam.create = QUDA_COPY_FIELD_CREATE; b = new cudaColorSpinorField(*h_b, cudaParam); if (param->use_init_guess == QUDA_USE_INIT_GUESS_YES) { // download initial guess x = new cudaColorSpinorField(*h_x, cudaParam); // solution } else { // zero initial guess cudaParam.create = QUDA_ZERO_FIELD_CREATE; x = new cudaColorSpinorField(cudaParam); // solution } if (param->verbosity >= QUDA_VERBOSE) { double nh_b = norm2(*h_b); double nb = norm2(*b); printfQuda("Source: CPU = %f, CUDA copy = %f\n", nh_b, nb); } tuneDirac(*param, pc_solution ? 
*x : x->Even()); dirac.prepare(in, out, *x, *b, param->solution_type); if (param->verbosity >= QUDA_VERBOSE) { double nin = norm2(*in); printfQuda("Prepared source = %f\n", nin); } massRescale(param->dslash_type, diracParam.kappa, param->solution_type, param->mass_normalization, *in); switch (param->inv_type) { case QUDA_CG_INVERTER: if (param->solution_type != QUDA_MATDAG_MAT_SOLUTION && param->solution_type != QUDA_MATPCDAG_MATPC_SOLUTION) { copyCuda(*out, *in); dirac.Mdag(*in, *out); } { DiracMdagM m(dirac), mSloppy(diracSloppy); CG cg(m, mSloppy, *param); cg(*out, *in); } break; case QUDA_BICGSTAB_INVERTER: if (param->solution_type == QUDA_MATDAG_MAT_SOLUTION || param->solution_type == QUDA_MATPCDAG_MATPC_SOLUTION) { DiracMdag m(dirac), mSloppy(diracSloppy), mPre(diracPre); BiCGstab bicg(m, mSloppy, mPre, *param); bicg(*out, *in); copyCuda(*in, *out); } { DiracM m(dirac), mSloppy(diracSloppy), mPre(diracPre); BiCGstab bicg(m, mSloppy, mPre, *param); bicg(*out, *in); } break; case QUDA_GCR_INVERTER: if (param->solution_type == QUDA_MATDAG_MAT_SOLUTION || param->solution_type == QUDA_MATPCDAG_MATPC_SOLUTION) { DiracMdag m(dirac), mSloppy(diracSloppy), mPre(diracPre); GCR gcr(m, mSloppy, mPre, *param); gcr(*out, *in); copyCuda(*in, *out); } { DiracM m(dirac), mSloppy(diracSloppy), mPre(diracPre); GCR gcr(m, mSloppy, mPre, *param); gcr(*out, *in); } break; default: errorQuda("Inverter type %d not implemented", param->inv_type); } if (param->verbosity >= QUDA_VERBOSE){ double nx = norm2(*x); printfQuda("Solution = %f\n",nx); } dirac.reconstruct(*x, *b, param->solution_type); x->saveCPUSpinorField(*h_x); // since this is a reference, this won't work: h_x = x; if (param->verbosity >= QUDA_VERBOSE){ double nx = norm2(*x); double nh_x = norm2(*h_x); printfQuda("Reconstructed: CUDA solution = %f, CPU copy = %f\n", nx, nh_x); } if (!param->preserve_dirac) { delete d; delete dSloppy; delete dPre; diracCreation = false; diracTune = false; } delete h_b; delete h_x; delete b; 
delete x; return; }
/*! * * Generic version of the multi-shift solver. Should work for * most fermions. Note, offset[0] is not folded into the mass parameter */ void invertMultiShiftQuda(void **_hp_x, void *_hp_b, QudaInvertParam *param, double* offsets, int num_offsets, double* residue_sq) { // check the gauge fields have been created cudaGaugeField *cudaGauge = checkGauge(param); checkInvertParam(param); param->num_offset = num_offsets; if (param->num_offset > QUDA_MAX_MULTI_SHIFT) errorQuda("Number of shifts %d requested greater than QUDA_MAX_MULTI_SHIFT %d", param->num_offset, QUDA_MAX_MULTI_SHIFT); for (int i=0; i<param->num_offset; i++) { param->offset[i] = offsets[i]; param->tol_offset[i] = residue_sq[i]; } verbosity = param->verbosity; // Are we doing a preconditioned solve */ /* What does NormEq solve mean in the shifted case? */ if (param->solve_type != QUDA_NORMEQ_PC_SOLVE && param->solve_type != QUDA_NORMEQ_SOLVE) { errorQuda("Direct solve_type is not supported in invertMultiShiftQuda()\n"); } bool pc_solve = (param->solve_type == QUDA_NORMEQ_PC_SOLVE); // In principle one can do a MATPC Solution for a hermitian M_pc // In practice most of the time I guess one will do a M^\dagger_pc M_pc solution. bool pc_solution = (param->solution_type == QUDA_MATPC_SOLUTION || param->solution_type == QUDA_MATPCDAG_MATPC_SOLUTION ); // No of GiB in a checkerboard of a spinor param->spinorGiB = cudaGauge->VolumeCB() * spinorSiteSize; if( !pc_solve) param->spinorGiB *= 2; // Double volume for non PC solve // **** WARNING *** this may not match implementation... 
if( param->inv_type == QUDA_CG_INVERTER ) { // CG-M needs 5 vectors for the smallest shift + 2 for each additional shift param->spinorGiB *= (5 + 2*(param->num_offset-1))/(double)(1<<30); } else { // BiCGStab-M needs 7 for the original shift + 2 for each additional shift + 1 auxiliary // (Jegerlehner hep-lat/9612014 eq (3.13) param->spinorGiB *= (7 + 2*(param->num_offset-1))/(double)(1<<30); } // Timing and FLOP counters param->secs = 0; param->gflops = 0; param->iter = 0; // Find the smallest shift and its offset. double low_offset = param->offset[0]; int low_index = 0; for (int i=1;i < param->num_offset;i++){ if (param->offset[i] < low_offset){ low_offset = param->offset[i]; low_index = i; } } // Host pointers for x, take a copy of the input host pointers void** hp_x; hp_x = new void* [ param->num_offset ]; void* hp_b = _hp_b; for(int i=0;i < param->num_offset;i++){ hp_x[i] = _hp_x[i]; } // Now shift things so that the vector with the smallest shift // is in the first position of the array if (low_index != 0){ void* tmp = hp_x[0]; hp_x[0] = hp_x[low_index] ; hp_x[low_index] = tmp; double tmp1 = param->offset[0]; param->offset[0]= param->offset[low_index]; param->offset[low_index] =tmp1; } // Create the matrix. // The way this works is that createDirac will create 'd' and 'dSloppy' // which are global. We then grab these with references... // // Balint: Isn't there a nice construction pattern we could use here? This is // expedient but yucky. DiracParam diracParam; if (param->dslash_type == QUDA_ASQTAD_DSLASH){ param->mass = sqrt(param->offset[0]/4); } createDirac(diracParam, *param, pc_solve); Dirac &dirac = *d; Dirac &diracSloppy = *dSloppy; cpuColorSpinorField *h_b = NULL; // Host RHS cpuColorSpinorField **h_x = NULL; cudaColorSpinorField *b = NULL; // Cuda RHS cudaColorSpinorField **x = NULL; // Cuda Solutions // Grab the dimension array of the input gauge field. const int *X = ( param->dslash_type == QUDA_ASQTAD_DSLASH ) ? 
gaugeFatPrecise->X() : gaugeFatPrecise->X(); // Wrap CPU host side pointers // // Balint: This creates a ColorSpinorParam struct, from the host data pointer, // the definitions in param, the dimensions X, and whether the solution is on // a checkerboard instruction or not. These can then be used as 'instructions' // to create the actual colorSpinorField ColorSpinorParam cpuParam(hp_b, *param, X, pc_solution); h_b = new cpuColorSpinorField(cpuParam); h_x = new cpuColorSpinorField* [ param->num_offset ]; // DYNAMIC ALLOCATION for(int i=0; i < param->num_offset; i++) { cpuParam.v = hp_x[i]; h_x[i] = new cpuColorSpinorField(cpuParam); } // Now I need a colorSpinorParam for the device ColorSpinorParam cudaParam(cpuParam, *param); // This setting will download a host vector cudaParam.create = QUDA_COPY_FIELD_CREATE; b = new cudaColorSpinorField(*h_b, cudaParam); // Creates b and downloads h_b to it // Create the solution fields filled with zero x = new cudaColorSpinorField* [ param->num_offset ]; cudaParam.create = QUDA_ZERO_FIELD_CREATE; for(int i=0; i < param->num_offset; i++) { x[i] = new cudaColorSpinorField(cudaParam); } // Check source norms if( param->verbosity >= QUDA_VERBOSE ) { double nh_b = norm2(*h_b); double nb = norm2(*b); printfQuda("Source: CPU= %f, CUDA copy = %f\n", nh_b,nb); } // tune the Dirac Kernel tuneDirac(*param, pc_solution ? 
*(x[0]) : (x[0])->Even()); massRescale(param->dslash_type, diracParam.kappa, param->solution_type, param->mass_normalization, *b); double *rescaled_shifts = new double [param->num_offset]; for(int i=0; i < param->num_offset; i++){ rescaled_shifts[i] = param->offset[i]; massRescaleCoeff(param->dslash_type, diracParam.kappa, param->solution_type, param->mass_normalization, rescaled_shifts[i]); } { DiracMdagM m(dirac), mSloppy(diracSloppy); MultiShiftCG cg_m(m, mSloppy, *param); cg_m(x, *b); } delete [] rescaled_shifts; for(int i=0; i < param->num_offset; i++) { x[i]->saveCPUSpinorField(*h_x[i]); } for(int i=0; i < param->num_offset; i++){ delete h_x[i]; delete x[i]; } delete h_b; delete b; delete [] h_x; delete [] x; delete [] hp_x; if (!param->preserve_dirac) { delete d; d =NULL; delete dSloppy; dSloppy = NULL; delete dPre; dPre = NULL; diracCreation = false; diracTune = false; } return; }
/*!
 * Return the operator produced by createDirac() for the given configuration.
 *
 * @param input Configuration forwarded unchanged to createDirac().
 * @return The pointer returned by createDirac(input).
 */
Dirac_BFM_Wrapper* DiracBFMoperatorFactory::getDirac(const InputConfig& input)
{
  Dirac_BFM_Wrapper* const op = createDirac(input);
  return op;
}