/*
 * lis_quad_sqrt - apply the quad square-root macro to *b, storing into *a.
 *
 * a : destination; its hi and lo members are passed to the macro as outputs.
 * b : source; its hi and lo members are passed to the macro as inputs.
 *
 * The SSE2 variant of the macro is selected when USE_SSE2 is defined,
 * the plain variant otherwise.  Always returns LIS_SUCCESS.
 */
int lis_quad_sqrt(LIS_QUAD *a, const LIS_QUAD *b)
{
	/* Temporaries required by the LIS_QUAD_* macros. */
	LIS_QUAD_DECLAR;

#ifdef USE_SSE2
	LIS_QUAD_SQRT_SSE2(a->hi,a->lo,b->hi,b->lo);
#else
	LIS_QUAD_SQRT(a->hi,a->lo,b->hi,b->lo);
#endif

	return LIS_SUCCESS;
}
/*
 * lis_vector_nrm2ex_mm - accumulate LIS_QUAD_FSA over every element of vx
 * and store LIS_QUAD_SQRT of the accumulated value in val.
 *
 * NOTE(review): judging by the function and macro names this is the 2-norm
 * of vx in double-double precision; the macros are defined elsewhere, so
 * confirm against their definitions.
 *
 * vx  : input vector.  Reads vx->n, vx->value and vx->value_lo (presumably
 *       the high and low parts of each element - confirm), and uses
 *       vx->work[] as scratch space (slots 0..3 and 8..11 are addressed).
 * val : result; val->hi[0] and val->lo[0] are written.
 *
 * Build variants:
 *   _OPENMP       - each thread accumulates its [is,ie) slice into its own
 *                   slot of lis_vec_tmp (slots are LIS_VEC_TMP_PADD apart);
 *                   the slots are then combined serially.
 *   USE_FMA2_SSE2 - elements are processed two at a time with a scalar
 *                   tail loop for the leftover element.
 *   USE_MPI       - the local value is combined across vx->comm with
 *                   MPI_Allreduce before the square root is taken.
 *
 * Always returns LIS_SUCCESS.
 */
LIS_INT lis_vector_nrm2ex_mm(LIS_VECTOR vx, LIS_QUAD_PTR *val)
{
	LIS_INT i,n;
	LIS_SCALAR *x,*xl;
	LIS_QUAD_PTR dotm2,dotm,tmpm;
#ifdef _OPENMP
	LIS_INT is,ie,nprocs,my_rank;
	LIS_SCALAR *gt;
#endif
#ifdef USE_MPI
	MPI_Comm comm;
#endif
	/* Temporaries required by the LIS_QUAD_* macros. */
	LIS_QUAD_DECLAR;

	LIS_DEBUG_FUNC_IN;

	n  = vx->n;
	x  = vx->value;
	xl = vx->value_lo;

	/* Scratch layout inside vx->work:
	 *   dotm2 : hi[0..1] -> work[0..1], lo[0..1] -> work[2..3]
	 *   dotm  : hi[0]    -> work[8],    lo[0]    -> work[9]
	 *   tmpm  : hi[0]    -> work[10],   lo[0]    -> work[11]
	 */
	dotm2.hi = &vx->work[0];
	dotm2.lo = &vx->work[2];
	dotm.hi  = &vx->work[8];
	dotm.lo  = &vx->work[9];
	tmpm.hi  = &vx->work[10];
	tmpm.lo  = &vx->work[11];
#ifdef USE_MPI
	comm = vx->comm;
#endif

#ifdef _OPENMP
	gt     = lis_vec_tmp;
	nprocs = omp_get_max_threads();
	/* The private list names the loop/slice variables plus the macro
	 * temporaries; which temporaries exist depends on USE_SSE2. */
#ifdef USE_SSE2
	#pragma omp parallel private(i,bh,ch,sh,wh,th,bl,cl,sl,wl,tl,p1,p2,t0,t1,t2,eh,is,ie,my_rank)
#else
	#pragma omp parallel private(i,is,ie,my_rank,p1,p2,tq,bhi,blo,chi,clo,sh,eh,sl,el,th,tl)
#endif
	{
		/* This thread's slot in the shared lis_vec_tmp buffer. */
		LIS_SCALAR *acc;

		my_rank = omp_get_thread_num();
		LIS_GET_ISIE(my_rank,nprocs,n,is,ie);
		acc = &gt[my_rank*LIS_VEC_TMP_PADD];
#ifdef USE_FMA2_SSE2
		/* acc[0],acc[2] receive the paired accumulation; acc[1],acc[3]
		 * are the second operand pair of the combining add below. */
		acc[0] = acc[1] = 0.0;
		acc[2] = acc[3] = 0.0;
#ifdef USE_VEC_COMP
		#pragma cdir nodep
#endif
		for(i=is;i<ie-1;i+=2)
		{
			LIS_QUAD_FSA2_SSE2(acc[0],acc[2],acc[0],acc[2],x[i],xl[i]);
		}
		LIS_QUAD_ADD_SSE2(acc[0],acc[1],acc[0],acc[2],acc[1],acc[3]);
		/* Tail: at most one element left when the slice length is odd. */
		for(;i<ie;i++)
		{
			LIS_QUAD_FSA_SSE2(acc[0],acc[1],acc[0],acc[1],x[i],xl[i]);
		}
#else
		acc[0] = acc[1] = 0.0;
		#pragma cdir nodep
		for(i=is;i<ie;i++)
		{
			LIS_QUAD_FSA(acc[0],acc[1],acc[0],acc[1],x[i],xl[i]);
		}
#endif
	}

	/* Serially combine the per-thread slots into dotm. */
	dotm.hi[0] = dotm.lo[0] = 0.0;
	for(i=0;i<nprocs;i++)
	{
#ifdef USE_SSE2
		LIS_QUAD_ADD_SSE2(dotm.hi[0],dotm.lo[0],dotm.hi[0],dotm.lo[0],gt[i*LIS_VEC_TMP_PADD],gt[i*LIS_VEC_TMP_PADD+1]);
#else
		LIS_QUAD_ADD(dotm.hi[0],dotm.lo[0],dotm.hi[0],dotm.lo[0],gt[i*LIS_VEC_TMP_PADD],gt[i*LIS_VEC_TMP_PADD+1]);
#endif
	}
#else /* !_OPENMP */
#ifdef USE_FMA2_SSE2
	dotm2.hi[0] = dotm2.hi[1] = 0.0;
	dotm2.lo[0] = dotm2.lo[1] = 0.0;
	for(i=0;i<n-1;i+=2)
	{
		LIS_QUAD_FSA2_SSE2(dotm2.hi[0],dotm2.lo[0],dotm2.hi[0],dotm2.lo[0],x[i],xl[i]);
	}
	/* Fold the two paired accumulators into dotm. */
	LIS_QUAD_ADD_SSE2(dotm.hi[0],dotm.lo[0],dotm2.hi[0],dotm2.lo[0],dotm2.hi[1],dotm2.lo[1]);
	/* Tail: at most one element left when n is odd. */
	for(;i<n;i++)
	{
		LIS_QUAD_FSA_SSE2(dotm.hi[0],dotm.lo[0],dotm.hi[0],dotm.lo[0],x[i],xl[i]);
	}
#else
	dotm.hi[0] = dotm.lo[0] = 0.0;
	#pragma cdir nodep
	for(i=0;i<n;i++)
	{
		LIS_QUAD_FSA(dotm.hi[0],dotm.lo[0],dotm.hi[0],dotm.lo[0],x[i],xl[i]);
	}
#endif
#endif /* _OPENMP */

#ifdef USE_MPI
	/* One LIS_MPI_MSCALAR element is reduced from dotm.hi into tmpm.hi;
	 * the square root below then reads both tmpm.hi[0] and tmpm.lo[0].
	 * NOTE(review): presumably LIS_MPI_MSCALAR spans the adjacent hi/lo
	 * pair - confirm against its definition. */
	MPI_Allreduce(dotm.hi,tmpm.hi,1,LIS_MPI_MSCALAR,LIS_MPI_MSUM,comm);
#ifdef USE_SSE2
	LIS_QUAD_SQRT_SSE2(val->hi[0],val->lo[0],tmpm.hi[0],tmpm.lo[0]);
#else
	LIS_QUAD_SQRT(val->hi[0],val->lo[0],tmpm.hi[0],tmpm.lo[0]);
#endif
#else /* !USE_MPI */
#ifdef USE_SSE2
	LIS_QUAD_SQRT_SSE2(val->hi[0],val->lo[0],dotm.hi[0],dotm.lo[0]);
#else
	LIS_QUAD_SQRT(val->hi[0],val->lo[0],dotm.hi[0],dotm.lo[0]);
#endif
#endif /* USE_MPI */

	LIS_DEBUG_FUNC_OUT;
	return LIS_SUCCESS;
}