int main(int argc,char *argv[]) { FILE *parameterfile = NULL; char datafilename[206]; char parameterfilename[206]; char conf_filename[50]; char scalar_filename[50]; char * input_filename = NULL; char * filename = NULL; double plaquette_energy; #ifdef _USE_HALFSPINOR #undef _USE_HALFSPINOR printf("# WARNING: USE_HALFSPINOR will be ignored (not supported here).\n"); #endif if(even_odd_flag) { even_odd_flag=0; printf("# WARNING: even_odd_flag will be ignored (not supported here).\n"); } int j,j_max,k,k_max = 2; _Complex double * drvsc; #ifdef HAVE_LIBLEMON paramsXlfInfo *xlfInfo; #endif int status = 0; static double t1,t2,dt,sdt,dts,qdt,sqdt; double antioptaway=0.0; #ifdef MPI static double dt2; DUM_DERI = 6; DUM_SOLVER = DUM_DERI+2; DUM_MATRIX = DUM_SOLVER+6; NO_OF_SPINORFIELDS = DUM_MATRIX+2; #ifdef OMP int mpi_thread_provided; MPI_Init_thread(&argc, &argv, MPI_THREAD_SERIALIZED, &mpi_thread_provided); #else MPI_Init(&argc, &argv); #endif MPI_Comm_rank(MPI_COMM_WORLD, &g_proc_id); #else g_proc_id = 0; #endif g_rgi_C1 = 1.; process_args(argc,argv,&input_filename,&filename); set_default_filenames(&input_filename, &filename); /* Read the input file */ if( (j = read_input(input_filename)) != 0) { fprintf(stderr, "Could not find input file: %s\nAborting...\n", input_filename); exit(-1); } if(g_proc_id==0) { printf("parameter rho_BSM set to %f\n", rho_BSM); printf("parameter eta_BSM set to %f\n", eta_BSM); printf("parameter m0_BSM set to %f\n", m0_BSM); } #ifdef OMP init_openmp(); #endif tmlqcd_mpi_init(argc, argv); if(g_proc_id==0) { #ifdef SSE printf("# The code was compiled with SSE instructions\n"); #endif #ifdef SSE2 printf("# The code was compiled with SSE2 instructions\n"); #endif #ifdef SSE3 printf("# The code was compiled with SSE3 instructions\n"); #endif #ifdef P4 printf("# The code was compiled for Pentium4\n"); #endif #ifdef OPTERON printf("# The code was compiled for AMD Opteron\n"); #endif #ifdef _GAUGE_COPY printf("# The code was compiled with 
-D_GAUGE_COPY\n"); #endif #ifdef BGL printf("# The code was compiled for Blue Gene/L\n"); #endif #ifdef BGP printf("# The code was compiled for Blue Gene/P\n"); #endif #ifdef _USE_HALFSPINOR printf("# The code was compiled with -D_USE_HALFSPINOR\n"); #endif #ifdef _USE_SHMEM printf("# The code was compiled with -D_USE_SHMEM\n"); #ifdef _PERSISTENT printf("# The code was compiled for persistent MPI calls (halfspinor only)\n"); #endif #endif #ifdef MPI #ifdef _NON_BLOCKING printf("# The code was compiled for non-blocking MPI calls (spinor and gauge)\n"); #endif #endif printf("\n"); fflush(stdout); } #ifdef _GAUGE_COPY init_gauge_field(VOLUMEPLUSRAND + g_dbw2rand, 1); #else init_gauge_field(VOLUMEPLUSRAND + g_dbw2rand, 0); #endif init_geometry_indices(VOLUMEPLUSRAND + g_dbw2rand); j = init_bispinor_field(VOLUMEPLUSRAND, 12); if ( j!= 0) { fprintf(stderr, "Not enough memory for bispinor fields! Aborting...\n"); exit(0); } j = init_spinor_field(VOLUMEPLUSRAND, 12); if ( j!= 0) { fprintf(stderr, "Not enough memory for spinor fields! Aborting...\n"); exit(0); } int numbScalarFields = 4; j = init_scalar_field(VOLUMEPLUSRAND, numbScalarFields); if ( j!= 0) { fprintf(stderr, "Not enough memory for scalar fields! Aborting...\n"); exit(0); } drvsc = malloc(18*VOLUMEPLUSRAND*sizeof(_Complex double)); if(g_proc_id == 0) { fprintf(stdout,"# The number of processes is %d \n",g_nproc); printf("# The lattice size is %d x %d x %d x %d\n", (int)(T*g_nproc_t), (int)(LX*g_nproc_x), (int)(LY*g_nproc_y), (int)(g_nproc_z*LZ)); printf("# The local lattice size is %d x %d x %d x %d\n", (int)(T), (int)(LX), (int)(LY),(int) LZ); fflush(stdout); } /* define the geometry */ geometry(); j = init_bsm_2hop_lookup(VOLUME); if ( j!= 0) { // this should not be reached since the init function calls fatal_error anyway fprintf(stderr, "Not enough memory for BSM2b 2hop lookup table! 
Aborting...\n"); exit(0); } /* define the boundary conditions for the fermion fields */ /* for the actual inversion, this is done in invert.c as the operators are iterated through */ // // For the BSM operator we don't use kappa normalisation, // as a result, when twisted boundary conditions are applied this needs to be unity. // In addition, unlike in the Wilson case, the hopping term comes with a plus sign. // However, in boundary(), the minus sign for the Wilson case is implicitly included. // We therefore use -1.0 here. boundary(-1.0); status = check_geometry(); if (status != 0) { fprintf(stderr, "Checking of geometry failed. Unable to proceed.\nAborting....\n"); exit(1); } #if (defined MPI && !(defined _USE_SHMEM)) // fails, we're not using spinor fields // check_xchange(); #endif start_ranlux(1, 123456); // read gauge field if( strcmp(gauge_input_filename, "create_random_gaugefield") == 0 ) { random_gauge_field(reproduce_randomnumber_flag, g_gauge_field); } else { sprintf(conf_filename, "%s.%.4d", gauge_input_filename, nstore); if (g_cart_id == 0) { printf("#\n# Trying to read gauge field from file %s in %s precision.\n", conf_filename, (gauge_precision_read_flag == 32 ? "single" : "double")); fflush(stdout); } int i; if( (i = read_gauge_field(conf_filename,g_gauge_field)) !=0) { fprintf(stderr, "Error %d while reading gauge field from %s\n Aborting...\n", i, conf_filename); exit(-2); } if (g_cart_id == 0) { printf("# Finished reading gauge field.\n"); fflush(stdout); } } // read scalar field if( strcmp(scalar_input_filename, "create_random_scalarfield") == 0 ) { for( int s=0; s<numbScalarFields; s++ ) ranlxd(g_scalar_field[s], VOLUME); } else { sprintf(scalar_filename, "%s.%d", scalar_input_filename, nscalar); if (g_cart_id == 0) { printf("#\n# Trying to read scalar field from file %s in %s precision.\n", scalar_filename, (scalar_precision_read_flag == 32 ? 
"single" : "double")); fflush(stdout); } int i; if( (i = read_scalar_field(scalar_filename,g_scalar_field)) !=0) { fprintf(stderr, "Error %d while reading scalar field from %s\n Aborting...\n", i, scalar_filename); exit(-2); } if (g_cart_id == 0) { printf("# Finished reading scalar field.\n"); fflush(stdout); } } #ifdef MPI xchange_gauge(g_gauge_field); #endif /*compute the energy of the gauge field*/ plaquette_energy = measure_plaquette( (const su3**) g_gauge_field); if (g_cart_id == 0) { printf("# The computed plaquette value is %e.\n", plaquette_energy / (6.*VOLUME*g_nproc)); fflush(stdout); } #ifdef MPI for( int s=0; s<numbScalarFields; s++ ) generic_exchange(g_scalar_field[s], sizeof(scalar)); #endif /*initialize the bispinor fields*/ j_max=1; sdt=0.; // w random_spinor_field_lexic( (spinor*)(g_bispinor_field[4]), reproduce_randomnumber_flag, RN_GAUSS); random_spinor_field_lexic( (spinor*)(g_bispinor_field[4])+VOLUME, reproduce_randomnumber_flag, RN_GAUSS); // for the D^\dagger test: // v random_spinor_field_lexic( (spinor*)(g_bispinor_field[5]), reproduce_randomnumber_flag, RN_GAUSS); random_spinor_field_lexic( (spinor*)(g_bispinor_field[5])+VOLUME, reproduce_randomnumber_flag, RN_GAUSS); #if defined MPI generic_exchange(g_bispinor_field[4], sizeof(bispinor)); #endif // print L2-norm of source: double squarenorm = square_norm((spinor*)g_bispinor_field[4], 2*VOLUME, 1); if(g_proc_id==0) { printf("\n# square norm of the source: ||w||^2 = %e\n\n", squarenorm); fflush(stdout); } double t_MG, t_BK; /* inversion needs to be done first because it uses loads of the g_bispinor_fields internally */ #if TEST_INVERSION if(g_proc_id==1) printf("Testing inversion\n"); // Bartek's operator t1 = gettime(); cg_her_bi(g_bispinor_field[9], g_bispinor_field[4], 25000, 1.0e-14, 0, VOLUME, &Q2_psi_BSM2b); t_BK = gettime() - t1; // Marco's operator t1 = gettime(); cg_her_bi(g_bispinor_field[8], g_bispinor_field[4], 25000, 1.0e-14, 0, VOLUME, &Q2_psi_BSM2m); t_MG = gettime() - t1; 
if(g_proc_id==0) printf("Operator inversion time: t_MG = %f sec \t t_BK = %f sec\n\n", t_MG, t_BK); #endif /* now apply the operators to the same bispinor field and do various comparisons */ // Marco's operator #ifdef MPI MPI_Barrier(MPI_COMM_WORLD); #endif t_MG = 0.0; t1 = gettime(); D_psi_BSM2m(g_bispinor_field[0], g_bispinor_field[4]); t1 = gettime() - t1; #ifdef MPI MPI_Allreduce (&t1, &t_MG, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD); #else t_MG = t1; #endif // Bartek's operator #ifdef MPI MPI_Barrier(MPI_COMM_WORLD); #endif t_BK = 0.0; t1 = gettime(); D_psi_BSM2b(g_bispinor_field[1], g_bispinor_field[4]); t1 = gettime() - t1; #ifdef MPI MPI_Allreduce (&t1, &t_BK, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD); #else t_BK = t1; #endif if(g_proc_id==0) printf("Operator application time: t_MG = %f sec \t t_BK = %f sec\n\n", t_MG, t_BK); squarenorm = square_norm((spinor*)g_bispinor_field[0], 2*VOLUME, 1); if(g_proc_id==0) { printf("# || D_MG w ||^2 = %.16e\n", squarenorm); fflush(stdout); } squarenorm = square_norm((spinor*)g_bispinor_field[1], 2*VOLUME, 1); if(g_proc_id==0) { printf("# || D_BK w ||^2 = %.16e\n\n\n", squarenorm); fflush(stdout); } diff( (spinor*)g_bispinor_field[3], (spinor*)g_bispinor_field[0], (spinor*)g_bispinor_field[1], 2*VOLUME); printf("element-wise difference between (D_BK w) and (D_MG w)\n"); printf("( D_MG w - M_BK w )->sp_up.s0.c0= %.16e + I*(%.16e)\n\n", creal(g_bispinor_field[3][0].sp_up.s0.c0), cimag(g_bispinor_field[3][0].sp_up.s0.c0) ); double diffnorm = square_norm( (spinor*) g_bispinor_field[3], 2*VOLUME, 1 ); if(g_proc_id==0){ printf("Square norm of the difference\n"); printf("|| D_MG w - D_BK w ||^2 = %.16e \n\n\n", diffnorm); } // < D w, v > printf("Check consistency of D and D^dagger\n"); _Complex double prod1_MG = scalar_prod( (spinor*)g_bispinor_field[0], (spinor*)g_bispinor_field[5], 2*VOLUME, 1 ); if(g_proc_id==0) printf("< D_MG w, v > = %.16e + I*(%.16e)\n", creal(prod1_MG), cimag(prod1_MG)); _Complex double prod1_BK = 
scalar_prod( (spinor*)g_bispinor_field[1], (spinor*)g_bispinor_field[5], 2*VOLUME, 1 ); if(g_proc_id==0) printf("< D_BK w, v > = %.16e + I*(%.16e)\n\n", creal(prod1_BK), cimag(prod1_BK)); // < w, D^\dagger v > t_MG = gettime(); D_psi_dagger_BSM2m(g_bispinor_field[6], g_bispinor_field[5]); t_MG = gettime()-t_MG; t_BK = gettime(); D_psi_dagger_BSM2b(g_bispinor_field[7], g_bispinor_field[5]); t_BK = gettime() - t_BK; if(g_proc_id==0) printf("Operator dagger application time: t_MG = %f sec \t t_BK = %f sec\n\n", t_MG, t_BK); _Complex double prod2_MG = scalar_prod((spinor*)g_bispinor_field[4], (spinor*)g_bispinor_field[6], 2*VOLUME, 1); _Complex double prod2_BK = scalar_prod((spinor*)g_bispinor_field[4], (spinor*)g_bispinor_field[7], 2*VOLUME, 1); if( g_proc_id == 0 ){ printf("< w, D_MG^dagger v > = %.16e + I*(%.16e)\n", creal(prod2_MG), cimag(prod2_MG)); printf("< w, D_BK^dagger v > = %.16e + I*(%.16e)\n", creal(prod2_BK), cimag(prod2_BK)); printf("\n| < D_MG w, v > - < w, D_MG^dagger v > | = %.16e\n",cabs(prod2_MG-prod1_MG)); printf("| < D_BK w, v > - < w, D_BK^dagger v > | = %.16e\n\n",cabs(prod2_BK-prod1_BK)); } #if TEST_INVERSION // check result of inversion Q2_psi_BSM2m(g_bispinor_field[10], g_bispinor_field[8]); Q2_psi_BSM2b(g_bispinor_field[11], g_bispinor_field[8]); assign_diff_mul((spinor*)g_bispinor_field[10], (spinor*)g_bispinor_field[4], 1.0, 2*VOLUME); assign_diff_mul((spinor*)g_bispinor_field[11], (spinor*)g_bispinor_field[4], 1.0, 2*VOLUME); double squarenorm_MGMG = square_norm((spinor*)g_bispinor_field[10], 2*VOLUME, 1); double squarenorm_BKMG = square_norm((spinor*)g_bispinor_field[11], 2*VOLUME, 1); if(g_proc_id==0) { printf("# ||Q2_MG*(Q2_MG)^-1*(b)-b||^2 = %.16e\n\n", squarenorm_MGMG); printf("# ||Q2_BK*(Q2_MG)^-1*(b)-b||^2 = %.16e\n\n", squarenorm_BKMG); fflush(stdout); } Q2_psi_BSM2b(g_bispinor_field[10], g_bispinor_field[9]); Q2_psi_BSM2m(g_bispinor_field[11], g_bispinor_field[9]); assign_diff_mul((spinor*)g_bispinor_field[10], 
(spinor*)g_bispinor_field[4], 1.0, 2*VOLUME); assign_diff_mul((spinor*)g_bispinor_field[11], (spinor*)g_bispinor_field[4], 1.0, 2*VOLUME); double squarenorm_BKBK = square_norm((spinor*)g_bispinor_field[10], 2*VOLUME, 1); double squarenorm_MGBK = square_norm((spinor*)g_bispinor_field[11], 2*VOLUME, 1); if(g_proc_id==0) { printf("# ||Q2_BK*(Q2_BK)^-1*(b)-b||^2 = %.16e\n\n", squarenorm_BKBK); printf("# ||Q2_MG*(Q2_BK)^-1*(b)-b||^2 = %.16e\n\n", squarenorm_MGBK); fflush(stdout); } #endif #ifdef OMP free_omp_accumulators(); #endif free_gauge_field(); free_geometry_indices(); free_bispinor_field(); free_scalar_field(); #ifdef MPI MPI_Barrier(MPI_COMM_WORLD); MPI_Finalize(); #endif return(0); }
/*
 * Block Jacobi-Davidson eigensolver for a hermitian operator acting on
 * bispinor fields ("Solving A*x = lambda*x").
 *
 * Parameters
 *   n            vector length in complex elements
 *   lda          leading dimension (column stride) of V0, Q and the work matrices
 *   tau          target shift used in the correction equation
 *   tol          residual-norm threshold below which an eigenpair counts as converged
 *   kmax         number of eigenpairs wanted (1..n)
 *   jmax, jmin   maximal search-space size and size kept on restart
 *   itmax        maximal number of outer iterations
 *   blksize      number of Ritz pairs treated per outer iteration
 *   blkwise      1: strip off only complete converged blocks, 0: strip off singly
 *   V0dim, V0    number of start vectors and the start vectors themselves
 *   solver_flag  CG selects cg_her_bi, anything else bicgstab_complex_bi
 *   linitmax     iteration limit of the inner linear solver
 *   eps_tr       residual threshold below which the Ritz value replaces tau as shift
 *   toldecay     base (> 1) of the inner tolerance toldecay^(-solvestep)
 *   verbosity    > 1 prints a header, >= 1 prints final statistics
 *   k_conv       in: number of already converged pairs in Q; out: number converged
 *   Q, lambda    out: converged eigenvectors (columns of stride lda) and eigenvalues
 *   it           out: number of outer iterations performed
 *   maxmin       1: maximal eigenvalues, otherwise minimal
 *   shift_mode   1: move tau to the extreme converged eigenvalue
 *   A_psi        the operator
 *
 * Invalid parameters and allocation failures are reported through
 * jderrorhandler().  The routine sets the file-level projection state
 * (p_n, p_n2, p_Q_bi, p_A_psi_bi, p_lda, p_theta, p_k, p_work_bi) and
 * toggles g_sloppy_precision around the inner solves.
 */
void jdher_bi(int n, int lda, double tau, double tol,
              int kmax, int jmax, int jmin, int itmax,
              int blksize, int blkwise,
              int V0dim, complex *V0,
              int solver_flag,
              int linitmax, double eps_tr, double toldecay,
              int verbosity,
              int *k_conv, complex *Q, double *lambda, int *it,
              int maxmin, const int shift_mode,
              matrix_mult_bi A_psi){

  /* allocatables:
   * initialize with NULL, so we can free even unallocated ptrs */
  double *s = NULL, *resnrm = NULL, *resnrm_old = NULL, *dtemp = NULL, *rwork = NULL;

  complex *V_ = NULL, *V, *Vtmp = NULL, *U = NULL, *M = NULL,
    *Res_ = NULL, *Res,
    *eigwork = NULL, *temp1_ = NULL, *temp1;

  int *idx1 = NULL, *idx2 = NULL,
    *convind = NULL, *keepind = NULL, *solvestep = NULL,
    *actcorrits = NULL;

  /* non-allocated ptrs */
  complex *q, *v, *u, *r = NULL;

  /* scalar vars */
  double theta, alpha, it_tol;

  int i, k, j, actblksize, eigworklen, found, conv, keep, n2, N = n*sizeof(complex)/sizeof(bispinor);
  int act, cnt, idummy, info, CntCorrIts=0, endflag=0;

  /* variables for random number generator */
  int IDIST = 1;
  int ISEED[4] = {2, 3, 5, 7};
  /* Per-process seed; LAPACK requires every seed element to lie in 0..4095. */
  ISEED[0] = g_proc_id % 4096;

  /* On the CRAY, Fortran character arguments need explicit descriptors. */
#ifdef CRAY
  fupl_u = _cptofcd(cupl_u, strlen(cupl_u));
  fupl_c = _cptofcd(cupl_c, strlen(cupl_c));
  fupl_n = _cptofcd(cupl_n, strlen(cupl_n));
  fupl_a = _cptofcd(cupl_a, strlen(cupl_a));
  fupl_v = _cptofcd(cupl_v, strlen(cupl_v));
  filaenv = _cptofcd(cilaenv, strlen(cilaenv));
  fvu = _cptofcd(cvu, strlen(cvu));
#endif

  /****************************************************************************
   *  Execution starts here...                                                *
   ****************************************************************************/
  /* NEW PART FOR GAUGE_COPY */
#ifdef _GAUGE_COPY
  update_backward_gauge();
#endif
  /* END NEW PART */

  /* print info header */
  if (verbosity > 1 && g_proc_id == 0) {
    printf("Jacobi-Davidson method for hermitian Matrices\n");
    printf("Solving A*x = lambda*x \n\n");
    printf(" N= %10d ITMAX=%4d\n", n, itmax);
    printf(" KMAX=%3d JMIN=%3d JMAX=%3d V0DIM=%3d\n", kmax, jmin, jmax, V0dim);
    printf(" BLKSIZE= %2d BLKWISE= %5s\n", blksize, blkwise ? "TRUE" : "FALSE");
    printf(" TOL= %11.4e TAU= %11.4e\n", tol, tau);
    printf(" LINITMAX= %5d EPS_TR= %10.3e TOLDECAY=%9.2e\n", linitmax, eps_tr, toldecay);
    printf("\n Computing %s eigenvalues\n", maxmin ? "maximal" : "minimal");
    printf("\n");
    fflush( stdout );
  }

  /* validate input parameters */
  if(tol <= 0) jderrorhandler(401,"");
  if(kmax <= 0 || kmax > n) jderrorhandler(402,"");
  if(jmax <= 0 || jmax > n) jderrorhandler(403,"");
  if(jmin <= 0 || jmin > jmax) jderrorhandler(404,"");
  if(itmax < 0) jderrorhandler(405,"");
  if(blksize > jmin || blksize > (jmax - jmin)) jderrorhandler(406,"");
  if(blksize <= 0 || blksize > kmax) jderrorhandler(406,"");
  if(blkwise < 0 || blkwise > 1) jderrorhandler(407,"");
  if(V0dim < 0 || V0dim >= jmax) jderrorhandler(408,"");
  if(linitmax < 0) jderrorhandler(409,"");
  if(eps_tr < 0.) jderrorhandler(500,"");
  if(toldecay <= 1.0) jderrorhandler(501,"");

  CONE.re=1.; CONE.im=0.;
  CZERO.re=0.; CZERO.im=0.;
  CMONE.re=-1.; CMONE.im=0.;

  /* Get hardware-dependent values:
   * Opt size of workspace for ZHEEV is (NB+1)*j, where NB is the opt.
   * block size... */
  eigworklen = (2 + _FT(ilaenv)(&ONE, filaenv, fvu, &jmax, &MONE, &MONE, &MONE, 6, 2)) * jmax;

  /* Allocating memory for matrices & vectors.
   * The "+4" elements leave room for the manual alignment done under SSE. */
  if((void*)(V_ = (complex *)malloc((lda * jmax + 4) * sizeof(complex))) == NULL) {
    errno = 0;
    jderrorhandler(300,"V in jdher_bi");
  }
#if (defined SSE || defined SSE2 || defined SSE3)
  V = (complex*)(((unsigned long int)(V_)+ALIGN_BASE)&~ALIGN_BASE);
#else
  V = V_;
#endif
  if((void*)(U = (complex *)malloc(jmax * jmax * sizeof(complex))) == NULL) {
    jderrorhandler(300,"U in jdher_bi");
  }
  if((void*)(s = (double *)malloc(jmax * sizeof(double))) == NULL) {
    jderrorhandler(300,"s in jdher_bi");
  }
  if((void*)(Res_ = (complex *)malloc((lda * blksize+4) * sizeof(complex))) == NULL) {
    jderrorhandler(300,"Res in jdher_bi");
  }
#if (defined SSE || defined SSE2 || defined SSE3)
  Res = (complex*)(((unsigned long int)(Res_)+ALIGN_BASE)&~ALIGN_BASE);
#else
  Res = Res_;
#endif
  /* Zero-initialised: resnrm[] is copied into resnrm_old[] before its first
   * assignment, so it must not hold indeterminate values. */
  if((void*)(resnrm = (double *)calloc(blksize, sizeof(double))) == NULL) {
    jderrorhandler(300,"resnrm in jdher_bi");
  }
  if((void*)(resnrm_old = (double *)calloc(blksize,sizeof(double))) == NULL) {
    jderrorhandler(300,"resnrm_old in jdher_bi");
  }
  if((void*)(M = (complex *)malloc(jmax * jmax * sizeof(complex))) == NULL) {
    jderrorhandler(300,"M in jdher_bi");
  }
  if((void*)(Vtmp = (complex *)malloc(jmax * jmax * sizeof(complex))) == NULL) {
    jderrorhandler(300,"Vtmp in jdher_bi");
  }
  if((void*)(p_work_bi = (complex *)malloc(lda * sizeof(complex))) == NULL) {
    jderrorhandler(300,"p_work_bi in jdher_bi");
  }

  /* ... */
  if((void*)(idx1 = (int *)malloc(jmax * sizeof(int))) == NULL) {
    jderrorhandler(300,"idx1 in jdher_bi");
  }
  if((void*)(idx2 = (int *)malloc(jmax * sizeof(int))) == NULL) {
    jderrorhandler(300,"idx2 in jdher_bi");
  }

  /* Indices for (non-)converged approximations */
  if((void*)(convind = (int *)malloc(blksize * sizeof(int))) == NULL) {
    jderrorhandler(300,"convind in jdher_bi");
  }
  if((void*)(keepind = (int *)malloc(blksize * sizeof(int))) == NULL) {
    jderrorhandler(300,"keepind in jdher_bi");
  }
  if((void*)(solvestep = (int *)malloc(blksize * sizeof(int))) == NULL) {
    jderrorhandler(300,"solvestep in jdher_bi");
  }
  if((void*)(actcorrits = (int *)malloc(blksize * sizeof(int))) == NULL) {
    jderrorhandler(300,"actcorrits in jdher_bi");
  }

  if((void*)(eigwork = (complex *)malloc(eigworklen * sizeof(complex))) == NULL) {
    jderrorhandler(300,"eigwork in jdher_bi");
  }
  if((void*)(rwork = (double *)malloc(3*jmax * sizeof(double))) == NULL) {
    jderrorhandler(300,"rwork in jdher_bi");
  }
  if((void*)(temp1_ = (complex *)malloc((lda+4) * sizeof(complex))) == NULL) {
    jderrorhandler(300,"temp1 in jdher_bi");
  }
#if (defined SSE || defined SSE2 || defined SSE3)
  temp1 = (complex*)(((unsigned long int)(temp1_)+ALIGN_BASE)&~ALIGN_BASE);
#else
  temp1 = temp1_;
#endif
  if((void*)(dtemp = (double *)malloc(lda * sizeof(complex))) == NULL) {
    jderrorhandler(300,"dtemp in jdher_bi");
  }

  /* Set variables for Projection routines */
  n2 = 2*n;
  p_n = n;
  p_n2 = n2;
  p_Q_bi = Q;
  p_A_psi_bi = A_psi;
  p_lda = lda;

  /**************************************************************************
   * Generate initial search subspace V. Vectors are taken from V0 and if   *
   * necessary randomly generated.                                          *
   **************************************************************************/

  /* copy V0 to V */
  _FT(zlacpy)(fupl_a, &n, &V0dim, V0, &lda, V, &lda, 1);
  j = V0dim;
  /* if V0dim < blksize: generate additional random vectors */
  if (V0dim < blksize) {
    idummy = (blksize - V0dim)*n; /* nof random numbers */
    _FT(zlarnv)(&IDIST, ISEED, &idummy, V + V0dim*lda);
    j = blksize;
  }
  /* Orthonormalise the start vectors */
  for (cnt = 0; cnt < j; cnt ++) {
    ModifiedGS_bi(V + cnt*lda, n, cnt, V, lda);
    alpha = sqrt(square_norm_bi((bispinor*)(V+cnt*lda), N));
    alpha = 1.0 / alpha;
    _FT(dscal)(&n2, &alpha, (double *)(V + cnt*lda), &ONE);
  }

  /* Generate interaction matrix M = V^dagger*A*V. Only the upper triangle
     is computed. */
  for (cnt = 0; cnt < j; cnt++){
    A_psi((bispinor*) temp1, (bispinor*) (V+cnt*lda));
    idummy = cnt+1;
    for(i = 0; i < idummy; i++) {
      M[cnt*jmax+i] = scalar_prod_bi((bispinor*)(V+i*lda), (bispinor*) temp1, N);
    }
  }

  /* Other initializations */
  k = 0; (*it) = 0;
  if((*k_conv) > 0) {
    k = (*k_conv);
  }

  actblksize = blksize;
  for(act = 0; act < blksize; act ++){
    solvestep[act] = 1;
  }

  /****************************************************************************
   *  Main JD-iteration loop                                                  *
   ****************************************************************************/
  while((*it) < itmax) {

    /**************************************************************************
     * Solving the projected eigenproblem                                     *
     *                                                                        *
     * M*u = V^dagger*A*V*u = s*u                                             *
     * M is hermitian, only the upper triangle is stored                      *
     **************************************************************************/
    _FT(zlacpy)(fupl_u, &j, &j, M, &jmax, U, &jmax, 1);
    _FT(zheev)(fupl_v, fupl_u, &j, U, &jmax, s, eigwork, &eigworklen, rwork, &info, 1, 1);

    if (info != 0) {
      printf("error solving the projected eigenproblem.");
      printf(" zheev: info = %d\n", info);
    }
    if(info != 0) jderrorhandler(502,"problem in zheev for jdher_bi");

    /* Reverse order of eigenvalues if maximal value is needed */
    if(maxmin == 1){
      sorteig(j, s, U, jmax, s[j-1], dtemp, idx1, idx2, 0);
    }
    else{
      sorteig(j, s, U, jmax, 0., dtemp, idx1, idx2, 0);
    }

    /**************************************************************************
     * Convergence/Restart Check                                              *
     *                                                                        *
     * In case of convergence, strip off a whole block or just the converged  *
     * ones and put 'em into Q. Update the matrices Q, V, U, s                *
     *                                                                        *
     * In case of a restart update the V, U and M matrices and recompute the  *
     * Eigenvectors                                                           *
     **************************************************************************/
    found = 1;
    while(found) {

      /* conv/keep = Number of converged/non-converged Approximations */
      conv = 0; keep = 0;

      for(act=0; act < actblksize; act++){

        /* Setting pointers for single vectors */
        q = Q + (act+k)*lda;
        u = U + act*jmax;
        r = Res + act*lda;

        /* Compute Ritz-Vector Q[:,k+cnt1]=V*U[:,cnt1] */
        theta = s[act];
        _FT(zgemv)(fupl_n, &n, &j, &CONE, V, &lda, u, &ONE, &CZERO, q, &ONE, 1);

        /* Compute the residual r = A*q - theta*q */
        A_psi((bispinor*) r, (bispinor*) q);
        theta = -theta;
        _FT(daxpy)(&n2, &theta, (double*) q, &ONE, (double*) r, &ONE);

        /* Compute norm of the residual and update arrays convind/keepind*/
        resnrm_old[act] = resnrm[act];
        resnrm[act] = sqrt(square_norm_bi((bispinor*) r, N));
        if (resnrm[act] < tol){
          convind[conv] = act;
          conv = conv + 1;
        }
        else{
          keepind[keep] = act;
          keep = keep + 1;
        }

      } /* for(act = 0; act < actblksize; act ++) */

      /* Check whether the blkwise-mode is chosen and ALL the
         approximations converged, or whether the strip-off mode is
         active and SOME of the approximations converged */
      found = ((blkwise==1 && conv==actblksize) || (blkwise==0 && conv!=0))
        && (j > actblksize || k == kmax - actblksize);

      /************************************************************************
       * Convergence Case                                                     *
       *                                                                      *
       * In case of convergence, strip off a whole block or just the          *
       * converged ones and put 'em into Q. Update the matrices Q, V, U, s    *
       ************************************************************************/
      if (found) {

        /* Store Eigenvalues */
        for(act = 0; act < conv; act++)
          lambda[k+act] = s[convind[act]];

        /* Re-use non approximated Ritz-Values */
        for(act = 0; act < keep; act++)
          s[act] = s[keepind[act]];

        /* Shift the others in the right position */
        for(act = 0; act < (j-actblksize); act ++)
          s[act+keep] = s[act+actblksize];

        /* Update V. Re-use the V-Vectors not looked at yet. */
        idummy = j - actblksize;
        for (act = 0; act < n; act = act + jmax) {
          cnt = act + jmax > n ? n-act : jmax;
          _FT(zlacpy)(fupl_a, &cnt, &j, V+act, &lda, Vtmp, &jmax, 1);
          _FT(zgemm)(fupl_n, fupl_n, &cnt, &idummy, &j, &CONE, Vtmp,
                     &jmax, U+actblksize*jmax, &jmax, &CZERO, V+act+keep*lda, &lda, 1, 1);
        }

        /* Insert the not converged approximations as first columns in V */
        for(act = 0; act < keep; act++){
          _FT(zlacpy)(fupl_a,&n,&ONE,Q+(k+keepind[act])*lda,&lda,V+act*lda,&lda,1);
        }

        /* Store Eigenvectors */
        for(act = 0; act < conv; act++){
          _FT(zlacpy)(fupl_a,&n,&ONE,Q+(k+convind[act])*lda,&lda,Q+(k+act)*lda,&lda,1);
        }

        /* Update SearchSpaceSize j */
        j = j - conv;

        /* Let M become a diagonalmatrix with the Ritzvalues as entries ... */
        _FT(zlaset)(fupl_u, &j, &j, &CZERO, &CZERO, M, &jmax, 1);
        for (act = 0; act < j; act++){
          M[act*jmax + act].re = s[act];
        }

        /* ... and U the Identity(jnew,jnew) */
        _FT(zlaset)(fupl_a, &j, &j, &CZERO, &CONE, U, &jmax, 1);

        /* Optionally move the shift to the extreme converged eigenvalue */
        if(shift_mode == 1){
          if(maxmin == 0){
            for(act = 0; act < conv; act ++){
              if (lambda[k+act] > tau){
                tau = lambda[k+act];
              }
            }
          }
          else{
            for(act = 0; act < conv; act ++){
              if (lambda[k+act] < tau){
                tau = lambda[k+act];
              }
            }
          }
        }

        /* Update Converged-Eigenpair-counter and Pro_k */
        k = k + conv;

        /* Update the new blocksize */
        actblksize=min(blksize, kmax-k);

        /* Exit main iteration loop when kmax eigenpairs have been
           approximated */
        if (k == kmax){
          endflag = 1;
          break;
        }
        /* Counter for the linear-solver-accuracy */
        for(act = 0; act < keep; act++)
          solvestep[act] = solvestep[keepind[act]];

        /* Now we expect to have the next eigenvalues */
        /* already with some accuracy */
        /* So we do not need to start from scratch... */
        for(act = keep; act < blksize; act++)
          solvestep[act] = 1;

      } /* if(found) */
      if(endflag == 1){
        break;
      }

      /************************************************************************
       * Restart                                                              *
       *                                                                      *
       * When the search space would exceed jmax, only the eigenvector        *
       * approximations corresponding to the first jmin Petrov vectors are    *
       * kept.                                                                *
       ************************************************************************/
      if (j+actblksize > jmax) {

        idummy = j; j = jmin;

        for (act = 0; act < n; act = act + jmax) { /* V = V * U(:,1:j) */
          cnt = act+jmax > n ? n-act : jmax;
          _FT(zlacpy)(fupl_a, &cnt, &idummy, V+act, &lda, Vtmp, &jmax, 1);
          _FT(zgemm)(fupl_n, fupl_n, &cnt, &j, &idummy, &CONE, Vtmp,
                     &jmax, U, &jmax, &CZERO, V+act, &lda, 1, 1);
        }

        _FT(zlaset)(fupl_a, &j, &j, &CZERO, &CONE, U, &jmax, 1);
        _FT(zlaset)(fupl_u, &j, &j, &CZERO, &CZERO, M, &jmax, 1);
        for (act = 0; act < j; act++)
          M[act*jmax + act].re = s[act];
      }

    } /* while(found) */

    if(endflag == 1){
      break;
    }

    /**************************************************************************
     * Solving the correction equations                                       *
     **************************************************************************/

    /* Solve actblksize times the correction equation ... */
    for (act = 0; act < actblksize; act ++) {

      /* Setting start-value for vector v as zeros(n,1). Guarantees
         orthogonality */
      v = V + j*lda;
      for (cnt = 0; cnt < n; cnt ++){
        v[cnt].re = 0.;
        v[cnt].im = 0.;
      }

      /* Adaptive accuracy and shift for the lin.solver. In case the
         residual is big, we don't need a too precise solution for the
         correction equation, since even in exact arithmetic the
         solution wouldn't be too useful for the Eigenproblem. */
      r = Res + act*lda;

      if (resnrm[act] < eps_tr && resnrm[act] < s[act] && resnrm_old[act] > resnrm[act]){
        p_theta = s[act];
      }
      else{
        p_theta = tau;
      }
      p_k = k + actblksize;

      /* if we are in blockwise mode, we do not want to */
      /* iterate solutions much more, if they have */
      /* already the desired precision */
      if(blkwise == 1 && resnrm[act] < tol) {
        it_tol = pow(toldecay, (double)(-5));
      }
      else {
        it_tol = pow(toldecay, (double)(-solvestep[act]));
      }
      solvestep[act] = solvestep[act] + 1;

      /* equation and project if necessary */
      ModifiedGS_bi(r, n, k + actblksize, Q, lda);

      g_sloppy_precision = 1;
      /* Solve the correction equation ... */
      if (solver_flag == BICGSTAB){
        info = bicgstab_complex_bi((bispinor*) v, (bispinor*) r, linitmax,
                                   it_tol*it_tol, g_relative_precision_flag, VOLUME/2, &Proj_A_psi_bi);
      }
      else if(solver_flag == CG){
        info = cg_her_bi((bispinor*) v, (bispinor*) r, linitmax,
                         it_tol*it_tol, g_relative_precision_flag, VOLUME/2, &Proj_A_psi_bi);
      }
      else{
        /* any other flag falls back to BiCGstab */
        info = bicgstab_complex_bi((bispinor*) v, (bispinor*) r, linitmax,
                                   it_tol*it_tol, g_relative_precision_flag, VOLUME/2, &Proj_A_psi_bi);
      }
      g_sloppy_precision = 0;

      /* Actualizing profiling data; -1 is counted as linitmax iterations */
      if (info == -1){
        CntCorrIts += linitmax;
      }
      else{
        CntCorrIts += info;
      }
      actcorrits[act] = info;

      /* orthonormalize v to Q, cause the implicit
         orthogonalization in the solvers may be too inaccurate. Then
         apply "IteratedCGS" to prevent numerical breakdown
         in order to orthogonalize v to V */
      ModifiedGS_bi(v, n, k+actblksize, Q, lda);
      IteratedClassicalGS_bi(v, &alpha, n, j, V, temp1, lda);

      alpha = 1.0 / alpha;
      _FT(dscal)(&n2, &alpha, (double*) v, &ONE);

      /* update interaction matrix M */
      A_psi((bispinor*) temp1, (bispinor*) v);
      idummy = j+1;
      for(i = 0; i < idummy; i++){
        M[j*jmax+i] = scalar_prod_bi((bispinor*) (V+i*lda), (bispinor*) temp1, N);
      }

      /* Increasing SearchSpaceSize j */
      j ++;
    } /* for (act = 0;act < actblksize; act ++) */

    /* Print information line */
    if(g_proc_id == 0) {
      print_status(verbosity, *it, k, j - blksize, kmax, blksize, actblksize,
                   s, resnrm, actcorrits);
    }

    /* Increase iteration-counter for outer loop */
    (*it) = (*it) + 1;

  } /* Main iteration loop */

  /******************************************************************
   * Eigensolutions converged or iteration limit reached            *
   *                                                                *
   * Print statistics. Free memory. Return.                         *
   ******************************************************************/

  *k_conv = k;
  if (verbosity >= 1) {
    if(g_proc_id == 0) {
      printf("\nJDHER execution statistics\n\n");
      /* Guard the average against *it == 0 (no outer iteration was run). */
      printf("IT_OUTER=%d IT_INNER_TOT=%d IT_INNER_AVG=%8.2f\n",
             (*it), CntCorrIts, (*it) > 0 ? (double)CntCorrIts/(*it) : 0.0);
      printf("\nConverged eigensolutions in order of convergence:\n");
      printf("\n I LAMBDA(I) RES(I)\n");
      printf("---------------------------------------\n");
    }

    /* Use the start of the residual workspace as scratch; r may still be
       NULL here if the main loop never executed. */
    r = Res;
    for (act = 0; act < *k_conv; act ++) {
      /* Compute the residual for solution act */
      q = Q + act*lda;
      theta = -lambda[act];
      A_psi((bispinor*) r, (bispinor*) q);
      _FT(daxpy)(&n2, &theta, (double*) q, &ONE, (double*) r, &ONE);
      alpha = sqrt(square_norm_bi((bispinor*) r, N));
      if(g_proc_id == 0) {
        printf("%3d %22.15e %12.5e\n", act+1, lambda[act], alpha);
      }
    }
    if(g_proc_id == 0) {
      printf("\n");
      fflush( stdout );
    }
  }

  free(V_); free(Vtmp); free(U);
  free(s); free(Res_);
  free(resnrm); free(resnrm_old);
  free(M);
  free(eigwork); free(temp1_);
  free(dtemp); free(rwork);
  free(p_work_bi);
  p_work_bi = NULL; /* file-level pointer: do not leave it dangling */
  free(idx1); free(idx2);
  free(convind); free(keepind); free(solvestep); free(actcorrits);

} /* jdher_bi(.....) */