/*
 * lis_quad_mul
 *
 * Applies the quad-precision multiply macro to the (hi, lo) pairs of
 * `b` and `c`, with the (hi, lo) pair of `a` passed as the macro's
 * first operand pair (presumably the destination, i.e. a = b * c --
 * confirm against the macro definition).
 *
 * The SSE2 flavour of the macro is selected at compile time when
 * USE_SSE2 is defined; otherwise the plain variant is used.
 *
 * LIS_QUAD_DECLAR is expanded first because the multiply macros rely
 * on names it introduces.
 */
void lis_quad_mul(LIS_QUAD *a, const LIS_QUAD *b, const LIS_QUAD *c)
{
    LIS_QUAD_DECLAR;

#ifdef USE_SSE2
    LIS_QUAD_MUL_SSE2(a->hi, a->lo, b->hi, b->lo, c->hi, c->lo);
#else
    LIS_QUAD_MUL(a->hi, a->lo, b->hi, b->lo, c->hi, c->lo);
#endif
}
/*
 * lis_vector_scaleex_mm
 *
 * Runs the quad-precision multiply macro over every element of `vx`,
 * using (vx->value[i], vx->value_lo[i]) as both the first and second
 * operand pair and (alpha.hi[0], alpha.lo[0]) as the third -- i.e. the
 * vector is updated in place (presumably x[i] = x[i] * alpha; confirm
 * against the macro definitions).
 *
 * Two compile-time paths exist:
 *   - USE_FMA2_SSE2 defined: alpha is duplicated into vx->work[0..3],
 *     the index range is split per thread with LIS_GET_ISIE, and each
 *     thread walks its range two elements at a time with a scalar
 *     tail loop for a leftover element.
 *   - otherwise: a single loop over 0..n-1 annotated for OpenMP.
 *
 * Parameters:
 *   alpha - quad value; only alpha.hi[0] and alpha.lo[0] are read.
 *   vx    - vector whose value / value_lo arrays are overwritten.
 *           On the USE_FMA2_SSE2 path vx->work[0..3] is also written.
 *
 * Returns LIS_SUCCESS unconditionally.
 */
LIS_INT lis_vector_scaleex_mm(LIS_QUAD_PTR alpha, LIS_VECTOR vx)
{
    LIS_INT i, n;
#ifdef USE_FMA2_SSE2
    LIS_INT is, ie, nprocs, my_rank;
    LIS_QUAD_PTR aa;
#endif
    LIS_SCALAR *x, *xl;
    LIS_QUAD_DECLAR;

    LIS_DEBUG_FUNC_IN;

    n  = vx->n;
    x  = vx->value;
    xl = vx->value_lo;

#ifdef USE_FMA2_SSE2
    /* Scratch layout in vx->work: [0],[1] hold the high part of alpha,
     * [2],[3] hold the low part. */
    aa.hi = &vx->work[0];
    aa.lo = &vx->work[2];

#ifdef _OPENMP
    nprocs = omp_get_max_threads();
#else
    nprocs = 1;
#endif

    /* Store alpha twice in adjacent slots for the two-element macro. */
    aa.hi[0] = aa.hi[1] = alpha.hi[0];
    aa.lo[0] = aa.lo[1] = alpha.lo[0];

#ifdef _OPENMP
    #pragma omp parallel private(i,bh,ch,sh,wh,th,bl,cl,sl,wl,tl,p1,p2,t0,t1,t2,eh,is,ie,my_rank)
#endif
    {
#ifdef _OPENMP
        my_rank = omp_get_thread_num();
#else
        my_rank = 0;
#endif
        /* Obtain this thread's range bounds is/ie from the project macro. */
        LIS_GET_ISIE(my_rank, nprocs, n, is, ie);

        /* Main part: advance two elements per iteration. */
        i = is;
        while (i < ie - 1)
        {
            LIS_QUAD_MUL2_SSE2(x[i], xl[i], x[i], xl[i], aa.hi[0], aa.lo[0]);
            i += 2;
        }

        /* Tail: at most one element remains when the range length is odd. */
        while (i < ie)
        {
            LIS_QUAD_MUL_SSE2(x[i], xl[i], x[i], xl[i], aa.hi[0], aa.lo[0]);
            i++;
        }
    }
#else
    #pragma cdir nodep
    #pragma omp parallel for private(i,p1,p2,tq,bhi,blo,chi,clo,sh,eh,sl,el,th,tl)
    for (i = 0; i < n; i++)
    {
        LIS_QUAD_MUL(x[i], xl[i], x[i], xl[i], alpha.hi[0], alpha.lo[0]);
    }
#endif

    LIS_DEBUG_FUNC_OUT;
    return LIS_SUCCESS;
}