dcovector Van_Der_Pol::Drift_Function(const dcovector & X)
{
  dcovector dX(X.l);
  dX(0) = X(1);                                       // x0' = x1
  dX(1) = lambda * (1. - X(0) * X(0)) * X(1) - X(0);  // x1' = lambda*(1 - x0^2)*x1 - x0
  return dX;
}
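For orientation, Drift_Function evaluates the Van der Pol vector field, so a caller can drive it with any explicit integrator. A minimal sketch, assuming CPPLapack's dcovector API and an already-constructed Van_Der_Pol object vdp with its lambda member set (the initial state, step size, and horizon are illustrative):

// Explicit-Euler integration of the Van der Pol oscillator:
// X_{n+1} = X_n + h * f(X_n), with f given by Drift_Function.
dcovector X(2);
X(0) = 2.0;   // initial position
X(1) = 0.0;   // initial velocity
const double h = 1.0e-3;
for (long k = 0; k < 10000; ++k)
  X = X + vdp.Drift_Function(X) * h;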
hMatrix Inverse_Kinematics(hMatrix Initial_T, hMatrix Goal_T, double *Initial_t,
                           double *DH_alpha, double *DH_a, double *DH_d, int joint)
{
  // NOTE: the work matrices below are hard-coded for a 7-joint arm, so
  // `joint` is expected to be <= 7. Initial_theta is a buffer defined
  // outside this function.
  for (int i = 0; i < joint; i++) {
    Initial_theta[i] = *Initial_t;
    Initial_t++;
  }
  hMatrix Initial_Theta(7,1);
  hMatrix J(6,7), Pinv_J(7,6);
  hMatrix n_a(3,1), s_a(3,1), a_a(3,1), n_t(3,1), s_t(3,1), a_t(3,1), p_del(3,1);
  double x, y, z, rx, ry, rz;
  double error_position[3] = {Goal_T.element(0,3) - Initial_T.element(0,3),
                              Goal_T.element(1,3) - Initial_T.element(1,3),
                              Goal_T.element(2,3) - Initial_T.element(2,3)}; // (currently unused)
  hMatrix P(3,1), R(3,1), Rotation(3,3), dx_temp1(3,1), dx_temp2(3,1),
          dX(6,1), del_Theta(7,1), Temp(7,1);
  Initial_Theta.SET(7,1,Initial_theta);
  Initial_T = T_hMatrix(&Initial_theta[0], &DH_alpha[0], &DH_a[0], &DH_d[0], joint);
  J = Jacobian_hMatrix(&Initial_theta[0], &DH_alpha[0], &DH_a[0], &DH_d[0]);
  Pinv_J = Pseudo_Inverse(J);
  // Extract the rotation columns (n,s,a) of both poses and the position difference
  for (int i = 0; i < 3; i++) {
    n_a.SetElement(i,0,Initial_T.element(i,0));
    s_a.SetElement(i,0,Initial_T.element(i,1));
    a_a.SetElement(i,0,Initial_T.element(i,2));
    n_t.SetElement(i,0,Goal_T.element(i,0));
    s_t.SetElement(i,0,Goal_T.element(i,1));
    a_t.SetElement(i,0,Goal_T.element(i,2));
    p_del.SetElement(i,0,Goal_T.element(i,3) - Initial_T.element(i,3));
  }
  // Position error expressed in the current end-effector frame
  x = dot(n_a, p_del);
  y = dot(s_a, p_del);
  z = dot(a_a, p_del);
  // Orientation error (small-angle approximation)
  rx = (dot(a_a,s_t) - dot(a_t,s_a)) / 2;
  ry = (dot(n_a,a_t) - dot(n_t,a_a)) / 2;
  rz = (dot(s_a,n_t) - dot(s_t,n_a)) / 2;
  double dx_P[3] = {x,y,z}, dx_R[3] = {rx,ry,rz};
  P.SET(3,1,&dx_P[0]);
  R.SET(3,1,&dx_R[0]);
  Rotation = T_Rotation(Initial_T);
  dx_temp1 = Rotation * P;
  dx_temp2 = Rotation * R;
  for (int i = 0; i < 3; i++) {
    dX.SetElement(i,0,dx_temp1.element(i,0));
    dX.SetElement(i+3,0,dx_temp2.element(i,0));
  }
  // Resolved-rate step: del_Theta = J^+ * dX
  del_Theta = Pinv_J * dX;
  for (int i = 0; i < joint; i++)
    Temp.SetElement(i,0,Initial_Theta.element(i,0) + del_Theta.element(i,0));
  Initial_Theta = Temp;
  return Initial_Theta;
}
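The routine above performs a single resolved-rate update, so a caller would normally iterate it until the pose error becomes small. A hedged sketch of such an outer loop (the tolerance, iteration cap, and the way joint angles are fed back are illustrative, not part of the original API):

// Illustrative outer loop around the one-step IK update above.
double theta[7] = {0};                 // current joint angles
hMatrix T = T_hMatrix(theta, DH_alpha, DH_a, DH_d, 7);
for (int it = 0; it < 100; ++it) {
  hMatrix Theta = Inverse_Kinematics(T, Goal_T, theta, DH_alpha, DH_a, DH_d, 7);
  for (int i = 0; i < 7; ++i) theta[i] = Theta.element(i,0);
  T = T_hMatrix(theta, DH_alpha, DH_a, DH_d, 7);
  double e = 0;                        // squared translational error
  for (int i = 0; i < 3; ++i) {
    double d = Goal_T.element(i,3) - T.element(i,3);
    e += d*d;
  }
  if (e < 1e-12) break;                // converged (illustrative tolerance)
}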
bool AABB::overlaps(AABB& other)
{
  // 2-D separating-axis test: only the X and Y axes are compared,
  // so any Z extent is ignored.
  if (dX(other) > other.R.x + R.x) return false;
  if (dY(other) > other.R.y + R.y) return false;
  return true;
}
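This is the standard separating-axis test for axis-aligned boxes stored as a center plus half-extents R: assuming dX and dY return the absolute center distances along each axis, the boxes overlap exactly when

|c_{1,x}-c_{2,x}| \le R_{1,x}+R_{2,x} \quad\text{and}\quad |c_{1,y}-c_{2,y}| \le R_{1,y}+R_{2,y},

i.e. an overlap on every tested axis; a gap along any single axis is enough to separate them.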
/*!
 * \brief backward
 * cache: [N, C, Hx, Wx]
 * dout:  [N, C, Hy, Wy]   (pooling preserves the channel count)
 * \param[in]  const Blob* dout             dout
 * \param[in]  const vector<Blob*>& cache   cache[0]:X
 * \param[out] vector<Blob*>& grads         grads[0]:dX
 */
void PoolLayer::backward(shared_ptr<Blob>& dout,
                         const vector<shared_ptr<Blob>>& cache,
                         vector<shared_ptr<Blob>>& grads,
                         Param& param)
{
    int N = cache[0]->get_N();
    int C = cache[0]->get_C();
    int Hx = cache[0]->get_H();
    int Wx = cache[0]->get_W();
    int Hy = dout->get_H();
    int Wy = dout->get_W();
    int height = param.pool_height;
    int width = param.pool_width;
    int stride = param.pool_stride;

    shared_ptr<Blob> dX(new Blob(cache[0]->size(), TZEROS));
    for (int n = 0; n < N; ++n) {
        for (int c = 0; c < C; ++c) {
            for (int hh = 0; hh < Hy; ++hh) {
                for (int ww = 0; ww < Wy; ++ww) {
                    // Route the upstream gradient to the max element of each window
                    mat window = (*cache[0])[n](span(hh * stride, hh * stride + height - 1),
                                                span(ww * stride, ww * stride + width - 1),
                                                span(c, c));
                    double maxv = window.max();
                    mat mask = conv_to<mat>::from(maxv == window);
                    (*dX)[n](span(hh * stride, hh * stride + height - 1),
                             span(ww * stride, ww * stride + width - 1),
                             span(c, c)) += mask * (*dout)[n](hh, ww, c);
                }
            }
        }
    }
    grads[0] = dX;
    return;
}
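The gradient routing above hinges on building a 0/1 mask that is nonzero only where the window attains its maximum. A standalone sketch of just that trick (plain Armadillo, independent of the Blob class; note that ties would route the gradient to every maximal element):

#include <armadillo>
using namespace arma;

int main()
{
    mat window = { {1.0, 3.0},
                   {2.0, 0.5} };
    // 1 where an entry equals the window maximum, 0 elsewhere
    mat mask = conv_to<mat>::from(window == window.max());
    double dout = 0.7;             // upstream gradient of this pooled output
    mat dwindow = mask * dout;     // gradient flows only to the argmax
    dwindow.print("dwindow =");
    return 0;
}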
bool ElasticCable::evalSol (Vector& s, const FiniteElement& fe, const Vec3& X,
                            const std::vector<int>& MNPC) const
{
  // Extract element displacements
  Vector eV;
  int ierr = 0;
  if (!primsol.empty() && !primsol.front().empty())
    if ((ierr = utl::gather(MNPC,3,primsol.front(),eV)))
    {
      std::cerr <<" *** ElasticCable::evalSol: Detected "<< ierr
                <<" node numbers out of range."<< std::endl;
      return false;
    }

  // Set up reference and current configuration
  Vec3 dX(fe.G.getColumn(1));
  Vec3 ddX(fe.G.getColumn(2));

  Vec3 x(X);
  Vec3 dx(dX);
  Vec3 ddx(ddX);
  for (size_t i = 0; i < 3; i++)
  {
    x[i]   += eV.dot(fe.N,i,3);
    dx[i]  += eV.dot(fe.dNdX,i,3);
    ddx[i] += eV.dot(fe.d2NdX2,i,3);
  }
#if INT_DEBUG > 1
  std::cout <<"ElasticCable: X = "<< X <<" u = "<< X-x <<"\n";
  std::cout <<"ElasticCable: x = "<< x <<" dx = "<< dx <<" ddx = "<< ddx <<"\n";
#endif

  // Compute local coordinate systems of the reference and current configuration
  Vec3 B_unit, N_unit;
  double B_len, N_len;
  if (!evalLocalAxes(dX,ddX,B_unit,N_unit,B_len,N_len))
    return false;

  Vec3 b_unit, n_unit;
  double b_len, n_len;
  if (!evalLocalAxes(dx,ddx,b_unit,n_unit,b_len,n_len))
    return false;
#if INT_DEBUG > 1
  std::cout <<"ElasticCable: N_unit = "<< N_unit <<" n_unit = "<< n_unit << std::endl;
#endif

  s.resize(2);
  s[0] = EA*0.5*(dx*dx - dX*dX);       // Axial force
  s[1] = EI*(ddx*n_unit - ddX*N_unit); // Bending moment
  return true;
}
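For reference, the two components written to s are the sectional resultants of the cable, with uppercase quantities taken from the reference configuration and lowercase from the current one:

N = EA\,\varepsilon, \qquad \varepsilon = \tfrac12\,(\mathbf{x}'\!\cdot\mathbf{x}' - \mathbf{X}'\!\cdot\mathbf{X}')

M = EI\,\kappa, \qquad \kappa = \mathbf{x}''\!\cdot\mathbf{n} - \mathbf{X}''\!\cdot\mathbf{N}

where ε is the Green-Lagrange axial strain of the centre line and n, N are the unit normals returned by evalLocalAxes.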
/*!
 * \brief backward, dX = dout .* (X > 0)
 * in:   [N, C, Hx, Wx]
 * dout: [N, C, Hx, Wx]   (ReLU preserves the shape)
 * \param[in]  const Blob* dout             dout
 * \param[in]  const vector<Blob*>& cache   cache[0]:X
 * \param[out] vector<Blob*>& grads         grads[0]:dX
 */
void ReluLayer::backward(shared_ptr<Blob>& dout,
                         const vector<shared_ptr<Blob>>& cache,
                         vector<shared_ptr<Blob>>& grads)
{
    shared_ptr<Blob> dX(new Blob(*cache[0]));
    int N = cache[0]->get_N();
    for (int i = 0; i < N; ++i) {
        // Build the 0/1 mask of positive inputs
        (*dX)[i].transform([](double e) {return e > 0 ? 1 : 0;});
    }
    (*dX) = (*dout) * (*dX);
    grads[0] = dX;
    return;
}
void vpTemplateTrackerWarp::findWarp(const double *ut0, const double *vt0,
                                     const double *u, const double *v,
                                     int nb_pt, vpColVector& p)
{
  vpMatrix dW_(2,nbParam);
  vpMatrix dX(2,1);
  vpMatrix H(nbParam,nbParam), HLM(nbParam,nbParam);
  vpMatrix G(nbParam,1);

  int cpt=0;
  vpColVector X1(2);
  vpColVector fX1(2);
  vpColVector X2(2);
  double erreur=0;
  double erreur_prec;
  double lambda=0.01;
  do
  {
    erreur_prec=erreur;
    H=0;
    G=0;
    erreur=0;
    computeCoeff(p);
    for(int i=0;i<nb_pt;i++)
    {
      X1[0]=ut0[i];
      X1[1]=vt0[i];
      computeDenom(X1,p);
      warpX(X1,fX1,p);
      dWarp(X1,fX1,p,dW_);
      H+=dW_.AtA();
      X2[0]=u[i];
      X2[1]=v[i];
      dX=X2-fX1;
      G+=dW_.t()*dX;
      erreur+=((u[i]-fX1[0])*(u[i]-fX1[0])+(v[i]-fX1[1])*(v[i]-fX1[1]));
    }
    vpMatrix::computeHLM(H, lambda, HLM);
    try{
      p+=HLM.inverseByLU()*G;
    }
    catch(vpException &e)
    {
      //std::cout<<"Cannot inverse the matrix by LU " << std::endl;
      throw(e);
    }
    cpt++;
  }
  while((cpt<150)&&(sqrt((erreur_prec-erreur)*(erreur_prec-erreur))>1e-20));
  //std::cout<<"erreur apres transformation="<<erreur<<std::endl;
}
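The loop above is a Levenberg-Marquardt fit of the warp parameters p to the point correspondences (ut0,vt0) -> (u,v). Assuming computeHLM forms the usual damped system, each pass accumulates

H = \sum_i W_i^{\top} W_i, \qquad g = \sum_i W_i^{\top}\bigl(X_2^{(i)} - f(X_1^{(i)};\,p)\bigr), \qquad p \leftarrow p + (H + \lambda\,\mathrm{diag}\,H)^{-1} g

with W_i the 2 x nbParam warp Jacobian dW_, and iterates until the change in the squared reprojection error drops below 1e-20 (the sqrt((a-b)^2) in the exit test is simply |a-b|) or 150 iterations are reached.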
void cheb(int N, Array<double, 1> &x, Array<double, 2> &D)
{
  int i,j;
  int sign;
  Array<double, 1> dsum(N+2);
  Array<double, 1> c(N+2);
  Array<double, 2> dX(N+2, N+2);

  //Resize output arrays
  x.resize(N+2);
  D.resize(N+2, N+2);

  /* Start initialize and checking input*/
  if (N==0) return;
  sign = 1;

  /* Start computing */
  for (i=0; i<=N+1; i++) {
    x(i) = cos(M_PI*i/(N+1));   // Chebyshev-Gauss-Lobatto points
    c(i) = sign;                // alternating sign (-1)^i folded into c
    sign = -sign;
  }
  c(0) = 2;
  c(N+1) = 2*c(N+1);            // endpoint weights are doubled

  for (i=0; i<=N+1; i++) {
    for (j=0; j<=N+1; j++) {
      dX(i,j) = x(i) - x(j);
      if (i==j) dX(i,j)++;      // avoid division by zero on the diagonal
    }
  }

  for (i=0; i<=N+1; i++) {
    dsum(i) = 0.0;
    for (j=0; j<=N+1; j++) {
      D(i,j) = c(i)/c(j)/dX(i,j);
      dsum(i) += D(i,j);
    }
  }

  // Diagonal entries via the negative-sum trick
  for (i=0; i<=N+1; i++)
    D(i,i) -= dsum(i);
} // Done
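This is the classical construction of the Chebyshev differentiation matrix (cf. Trefethen, Spectral Methods in MATLAB) on the M+1 = N+2 Gauss-Lobatto points. With endpoint weights c_0 = c_M = 2 and c_i = 1 otherwise,

x_i = \cos\frac{\pi i}{M}, \qquad D_{ij} = \frac{c_i}{c_j}\,\frac{(-1)^{i+j}}{x_i - x_j}\ (i \ne j), \qquad D_{ii} = -\sum_{j \ne i} D_{ij},

where the code folds the alternating sign (-1)^i into its c array and obtains the diagonal from the negative row sums, which is more accurate than the analytic diagonal formula.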
/*!
 * \brief backward
 * in:   [N, C, Hx, Wx]
 * dout: [N, C, Hx, Wx]   (dropout preserves the shape)
 * \param[in]  const Blob* dout             dout
 * \param[in]  const vector<Blob*>& cache   cache[0]:X
 * \param[in]  Param& param                 int mode, double p, int seed, Blob *mask
 * \param[out] vector<Blob*>& grads         grads[0]:dX
 */
void DropoutLayer::backward(shared_ptr<Blob>& dout,
                            const vector<shared_ptr<Blob>>& cache,
                            vector<shared_ptr<Blob>>& grads,
                            Param& param)
{
    shared_ptr<Blob> dX(new Blob((*dout)));
    int mode = param.drop_mode;
    assert(0 <= mode && mode <= 3);
    if ((mode & 1) == 1) {
        // Training mode: apply the same mask and 1/p scaling as the forward pass
        Blob dx_mask = (*dX) * (*param.drop_mask);
        *dX = dx_mask / param.drop_p;
    }
    grads[0] = dX;
    return;
}
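The division by drop_p implements inverted dropout. Assuming drop_p is the keep probability (which the 1/p scaling in both passes suggests), the forward pass computes Y = (M ⊙ X)/p with a Bernoulli(p) mask M, so E[Y] = X at training time and the layer can be an identity at test time; the backward pass must therefore apply the same mask and scaling:

\frac{\partial L}{\partial X} = \frac{\partial L}{\partial Y} \odot \frac{M}{p}.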
returnValue Integrator::integrateSensitivities( ){

    uint run1;
    returnValue returnvalue;

    if( ( nBDirs > 0 || nBDirs2 > 0 ) && transition != 0 ){
        int order;
        if( nBDirs2 > 0 ) order = 2;
        else              order = 1;
        returnvalue = diffTransitionBackward( dXb, dPb, dUb, dWb, order );
        setBackwardSeed( order, dXb );
        if( returnvalue != SUCCESSFUL_RETURN )
            return ACADOERROR(returnvalue);
    }

    returnvalue = evaluateSensitivities();
    if( returnvalue != SUCCESSFUL_RETURN )
        return ACADOERROR(returnvalue);

    if( nBDirs > 0 || nBDirs2 > 0 )
        return SUCCESSFUL_RETURN;

    int order = 1;
    if( nFDirs2 > 0 ) order = 2;

    Matrix tmp( rhs->getDim(), 1 );
    returnvalue = getProtectedForwardSensitivities(&tmp,order);

    Vector components = rhs->getDifferentialStateComponents();

    dX.init(rhs->getDim()-ma);
    dX.setZero();

    for( run1 = 0; run1 < components.getDim(); run1++ )
        dX((int) components(run1)) = tmp(run1,0);

    if( returnvalue != SUCCESSFUL_RETURN )
        return ACADOERROR(returnvalue);

    if( transition != 0 )
        returnvalue = diffTransitionForward( dX, dP, dU, dW, order );

    return returnvalue;
}
/*!
 * \brief backward
 * in:     [N, C, Hx, Wx]
 * weight: [F, C, Hw, Ww]
 * bias:   [F, 1, 1, 1]
 * out:    [N, F, (Hx+pad*2-Hw)/stride+1, (Wx+pad*2-Ww)/stride+1]
 * \param[in]  const Blob* dout             dout
 * \param[in]  const vector<Blob*>& cache   cache[0]:X, cache[1]:weights, cache[2]:bias
 * \param[out] vector<Blob*>& grads         grads[0]:dX, grads[1]:dW, grads[2]:db
 */
void ConvLayer::backward(shared_ptr<Blob>& dout,
                         const vector<shared_ptr<Blob>>& cache,
                         vector<shared_ptr<Blob>>& grads,
                         Param& param)
{
    int N = cache[0]->get_N();
    int F = cache[1]->get_N();
    int C = cache[0]->get_C();
    int Hx = cache[0]->get_H();
    int Wx = cache[0]->get_W();
    int Hw = cache[1]->get_H();
    int Ww = cache[1]->get_W();
    int Hy = dout->get_H();
    int Wy = dout->get_W();
    assert(C == cache[1]->get_C());
    assert(F == cache[2]->get_N());

    shared_ptr<Blob> dX(new Blob(cache[0]->size(), TZEROS));
    shared_ptr<Blob> dW(new Blob(cache[1]->size(), TZEROS));
    shared_ptr<Blob> db(new Blob(cache[2]->size(), TZEROS));

    Blob pad_dX(N, C, Hx + param.conv_pad*2, Wx + param.conv_pad*2, TZEROS);
    // Pad by the layer's configured amount; the original hard-coded pad(1)
    // only matched pad_dX when conv_pad == 1.
    Blob pad_X = (*cache[0]).pad(param.conv_pad);

    for (int n = 0; n < N; ++n) {
        for (int f = 0; f < F; ++f) {
            for (int hh = 0; hh < Hy; ++hh) {
                for (int ww = 0; ww < Wy; ++ww) {
                    cube window = pad_X[n](span(hh * param.conv_stride, hh * param.conv_stride + Hw - 1),
                                           span(ww * param.conv_stride, ww * param.conv_stride + Ww - 1),
                                           span::all);
                    (*db)[f](0, 0, 0) += (*dout)[n](hh, ww, f);
                    (*dW)[f] += window * (*dout)[n](hh, ww, f);
                    pad_dX[n](span(hh * param.conv_stride, hh * param.conv_stride + Hw - 1),
                              span(ww * param.conv_stride, ww * param.conv_stride + Ww - 1),
                              span::all) += (*cache[1])[f] * (*dout)[n](hh, ww, f);
                }
            }
        }
    }
    // Strip the padding to recover the gradient w.r.t. the unpadded input
    *dX = pad_dX.dePad(param.conv_pad);
    grads[0] = dX;
    grads[1] = dW;
    grads[2] = db;
    return;
}
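Written out per sample n, filter f and output position (h,w), the three accumulations in the loop are the standard convolution gradients over the zero-padded input (s = conv_stride, δ = dout):

\frac{\partial L}{\partial b_f} \mathrel{+}= \delta_{n,f,h,w}, \qquad \frac{\partial L}{\partial W_f} \mathrel{+}= X^{\mathrm{pad}}_{n}[hs{:}hs{+}H_w,\ ws{:}ws{+}W_w]\;\delta_{n,f,h,w}, \qquad \frac{\partial L}{\partial X^{\mathrm{pad}}_{n}}[hs{:}hs{+}H_w,\ ws{:}ws{+}W_w] \mathrel{+}= W_f\,\delta_{n,f,h,w},

after which dePad crops the borders to recover the gradient with respect to the unpadded input.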
void Obstacle::buildDefault() {
    Point3D left(lX, lY, lZ), right(rX, rY, rZ);
    Vector3D dX(rX - lX, 0, 0), dY(0, rY - lY, 0), dZ(0, 0, rZ - lZ);
    tops = {left, left + dX, left + dY, left + dZ,
            right - dZ, right - dX, right - dY, right};
    edges = { Edge(0,1), Edge(0,2), Edge(0,3),
              Edge(1,4), Edge(1,6), Edge(2,4), Edge(2,5), Edge(3,5), Edge(3,6),
              Edge(4,7), Edge(5,7), Edge(6,7) };
    facets = { Facet({ 0, 1, 4, 2 }), Facet({ 0, 1, 6, 3 }), Facet({ 0, 2, 5, 3 }),
               Facet({ 1, 4, 7, 6 }), Facet({ 2, 4, 7, 5 }), Facet({ 3, 6, 7, 5 }) };
}
void evader::TranslateEvader(void)
{
    const double kDeltaPos = 0.01;
    vector3d dX( kDeltaPos, 0.0, 0.0 );
    vector3d dY( 0.0, kDeltaPos, 0.0 );   // currently unused: no key maps to a Y translation
    vector3d dZ( 0.0, 0.0, kDeltaPos );

    Uint8* state = SDL_GetKeyState(NULL);
    if (state[SDLK_j]) transConfigE.Translate(dX*(-1.0));
    if (state[SDLK_l]) transConfigE.Translate(dX);
    if (state[SDLK_i]) transConfigE.Translate(dZ*(-1.0));
    if (state[SDLK_k]) transConfigE.Translate(dZ);
}
std::vector<double> TCNN_opt_function::calcFunc(std::vector<double> const &X)
{
    std::vector<double> dX(X.size());
    std::vector<double> chaoticValue;
    for (auto *p_chaos : chaos_fuctions) {
        std::vector<double> tmp = p_chaos->solve_get_next();
        chaoticValue.push_back(tmp[3]);
    }
    double exp_chaotic_coeff = chaotic_coeff / std::exp(X[0] * chaotic_reduce_coeff);
    dX[0] = 1;
    for (unsigned i = 1; i < X.size(); ++i) {
        dX[i] = exp_chaotic_coeff*chaoticValue[i-1]
              - alpha * optimized_function->dF(X,i,0.001); //0.00001
    }
    // chaotic_coeff *= chaotic_reduce_coeff;
    return dX;
}
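With dX[0] = 1 acting as a time-like coordinate, the remaining components implement gradient descent perturbed by an exponentially annealed chaotic drive, roughly

\dot{x}_i = c\,e^{-\beta x_0}\, z_i \;-\; \alpha\,\frac{\partial F}{\partial x_i}, \qquad i \ge 1,

where z_i is the i-th chaos generator's output, c = chaotic_coeff, β = chaotic_reduce_coeff and α the descent rate; as x_0 grows the chaotic term decays and the dynamics relax to plain gradient descent on F (the commented-out line is an alternative, multiplicative annealing of c).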
bool QFFitFunctionGeneral2LogNormal::estimateInitial(double *params, const double *dataX, const double *dataY, long N, const bool* /*fix*/) const
{
    if (params && dataX && dataY) {
        // Peak-find on log(x), since the model is a sum of log-normals
        StatisticsScopedPointer<double> dX(statisticsDuplicateAndApply(dataX, N, log));
        double pW=0;
        double pB=0;
        double pH=0;
        double pP=0;
        double pW2=0;
        double pH2=0;
        double pP2=0;
        if (statistics2PeakFind(pP, pW, pP2, pW2, dX.data(), dataY, N, 0.0, (double)NAN, &pB, &pH, &pH2)) {
            double dx=0;
            statisticsMinDistance(dataX, N, &dx);
            if (dx>0) {
                // Widths should span at least a few samples
                pW=qMax(pW,6.0*dx);
                pW2=qMax(pW2,6.0*dx);
            }
            params[PARAM_OFFSET]=pB;
            params[PARAM_AMPLITUDE]=pH;
            params[PARAM_POSITION]=pP;
            params[PARAM_WIDTH]=pW/2.3548;      // FWHM -> Gaussian sigma
            if (statisticsFloatIsOK(pP2)) {
                params[PARAM_AMPLITUDE2]=pH2;
                params[PARAM_POSITION2]=pP2;
                params[PARAM_WIDTH2]=pW2/2.3548;
            }
            return true;
        } else {
            return false;
        }
    }
    return true;
}
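The constant 2.3548 converts the detected peak's full width at half maximum into a Gaussian standard deviation,

\mathrm{FWHM} = 2\sqrt{2\ln 2}\;\sigma \approx 2.3548\,\sigma \quad\Longrightarrow\quad \sigma = \mathrm{FWHM}/2.3548,

which is why both width estimates are divided by it before being stored; the peak search itself runs on log(dataX) because the model is log-normal in x.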
std::vector<double> TSPsolver::calcFunc(std::vector<double> const &X)
{
    std::vector<double> dX(X.size());
    std::vector<double> chaoticValue;
    for (auto elem : chaos) {
        for (auto *p_chaos : elem) {
            std::vector<double> tmp = p_chaos->solve_get_next();
            chaoticValue.push_back(tmp[3]);
        }
    }
    dX[0] = 1;
    for (unsigned i = 1; i < X.size(); ++i) {
        dX[i] = chaotic_coeff*chaoticValue[(i-1)] - alpha * dF(X,i-1);
    }
    chaotic_coeff *= chaotic_reduce_coeff;
    return dX;
}
bool ElasticCable::evalInt (LocalIntegral& elmInt, const FiniteElement& fe, const Vec3& X) const { size_t a, aa, b, bb; unsigned char i, j, k, l, o; const size_t nen = fe.N.size(); // Set up reference configuration Vec3 dX(fe.G.getColumn(1)); Vec3 ddX(fe.G.getColumn(2)); #if INT_DEBUG > 1 std::cout <<"ElasticCable: X = "<< X <<" dX = "<< dX <<" ddX = "<< ddX <<"\n"; #endif // Compute current configuration ElmMats& elMat = static_cast<ElmMats&>(elmInt); const Vector& eV = elMat.vec.front(); // Current displacement vector Vec3 x(X); Vec3 dx(dX); Vec3 ddx(ddX); for (i = 0; i < 3; i++) { x[i] += eV.dot(fe.N,i,3); dx[i] += eV.dot(fe.dNdX,i,3); ddx[i] += eV.dot(fe.d2NdX2,i,3); } #if INT_DEBUG > 1 std::cout <<"ElasticCable: x = "<< x <<" dx = "<< dx <<" ddx = "<< ddx <<"\n"; #endif // Compute local coordinate systems of the reference and current configuration Vec3 B_unit, N_unit; double B_len, N_len; if (!evalLocalAxes(dX,ddX,B_unit,N_unit,B_len,N_len)) return false; #if INT_DEBUG > 1 std::cout <<"ElasticCable: B_unit = "<< B_unit <<" N_unit = "<< N_unit <<"\n"; #endif Vec3 b_unit, n_unit; double b_len, n_len; if (!evalLocalAxes(dx,ddx,b_unit,n_unit,b_len,n_len)) return false; Vec3 bin = b_unit * b_len; double b_len2 = b_len * b_len; Vec3 n = n_unit * n_len; double n_len2 = n_len * n_len; #if INT_DEBUG > 1 std::cout <<"ElasticCable: b = "<< bin <<" b_unit = "<< b_unit <<"\n n = "<< n <<" n_unit = "<< n_unit << std::endl; #endif // Calculate derivative of b_unit std::vector<Matrix> db(nen,Matrix(3,3)), db_unit(nen,Matrix(3,3)); std::vector<Vec3> db_normal(nen); for (i = 1; i <= 3; i++) for (k = 1; k <= 3; k++) for (l = 1; l <= 3; l++) { double eps_kli = 0.5*(k-l)*(l-i)*(i-k); double eps_kil = 0.5*(k-i)*(i-l)*(l-k); for (a = 1; a <= nen; a++) db[a-1](k,i) += (eps_kil*fe.dNdX(a,1)*ddx[l-1] + eps_kli*dx[l-1]*fe.d2NdX2(a,1,1)); } for (i = 1; i <= 3; i++) for (a = 0; a < nen; a++) for (k = 1; k <= 3; k++) db_normal[a][i-1] += b_unit[k-1]*db[a](k,i); for (i = 1; i <= 3; i++) for (k = 1; k <= 3; k++) for (a = 0; a < nen; a++) db_unit[a](k,i) += (db[a](k,i) - b_unit[k-1]*db_normal[a][i-1])/b_len; #if INT_DEBUG > 2 std::cout <<"ElasticCable: db_unit:\n"; for (a = 0; a < nen; a++) std::cout <<"node "<< a+1 << db_unit[a]; #endif // Calculate second derivative of b_unit std::vector< std::vector<Matrix3D> > ddb(nen), ddb_unit(nen); std::vector< std::vector<Matrix> > ddb_normal(nen); for (a = 0; a < nen; a++) { ddb[a].resize(nen,Matrix3D(3,3,3)); ddb_unit[a].resize(nen,Matrix3D(3,3,3)); ddb_normal[a].resize(nen,Matrix(3,3)); } for (i = 1; i <= 3; i++) for (j = 1; j <= 3; j++) for (k = 1; k <= 3; k++) { double eps_kij = 0.5*(k-i)*(i-j)*(j-k); double eps_kji = 0.5*(k-j)*(j-i)*(i-k); for (a = 1; a <= nen; a++) for (b = 1; b <= nen; b++) ddb[a-1][b-1](k,i,j) = (eps_kji*fe.d2NdX2(a,1,1)*fe.dNdX(b,1) + eps_kij*fe.d2NdX2(b,1,1)*fe.dNdX(a,1)); } #if INT_DEBUG > 3 std::cout <<"ElasticCable: ddb:\n"; for (a = 0; a < nen; a++) for (b = 0; b < nen; b++) std::cout <<"nodes "<< a+1 <<","<< b+1 << ddb[a][b]; #endif for (i = 1; i <= 3; i++) for (j = 1; j <= 3; j++) for (a = 0; a < nen; a++) for (b = 0; b < nen; b++) for (k = 1; k <= 3; k++) ddb_normal[a][b](i,j) += (ddb[a][b](k,i,j)*bin[k-1] + db[a](k,i)*db[b](k,j) - bin[k-1]*db[a](k,i)*bin[k-1]*db[b](k,j) / b_len2) / b_len; #if INT_DEBUG > 3 std::cout <<"ElasticCable: ddb_normal:\n"; for (a = 0; a < nen; a++) for (b = 0; b < nen; b++) std::cout <<"nodes "<< a+1 <<","<< b+1 << ddb_normal[a][b]; #endif for (i = 1; i <= 3; i++) for (j = 1; j <= 3; j++) for (a = 0; a < nen; a++) 
for (b = 0; b < nen; b++) for (k = 1; k <= 3; k++) ddb_unit[a][b](k,i,j) = (ddb[a][b](k,i,j)/b_len - db[a](k,i)*db_normal[b][j-1]/b_len2 - db[b](k,j)*db_normal[a][i-1]/b_len2 - bin[k-1]*(ddb_normal[a][b](i,j) - db_normal[a][i-1]* db_normal[b][j-1]*2.0 / b_len) / b_len2); #if INT_DEBUG > 2 std::cout <<"ElasticCable: ddb_unit:\n"; for (a = 0; a < nen; a++) for (b = 0; b < nen; b++) std::cout <<"nodes "<< a+1 <<","<< b+1 << ddb_unit[a][b]; #endif // Calculate derivative of n_unit std::vector<Matrix> dn(nen,Matrix(3,3)), dn_unit(nen,Matrix(3,3)); std::vector<Vec3> dn_normal(nen); for (i = 1; i <= 3; i++) for (k = 1; k <= 3; k++) for (l = 1; l <= 3; l++) { double eps_kli = 0.5*(k-l)*(l-i)*(i-k); for (a = 0; a < nen; a++) { dn[a](k,i) += eps_kli*b_unit[l-1]*fe.dNdX(1+a,1); for (o = 1; o <= 3; o++) { double eps_kol = 0.5*(k-o)*(o-l)*(l-k); dn[a](k,i) += eps_kol*db_unit[a](o,i)*dx[l-1]; } } } for (i = 1; i <= 3; i++) for (a = 0; a < nen; a++) for (k = 1; k <= 3; k++) dn_normal[a][i-1] += n_unit[k-1]*dn[a](k,i); for (i = 1; i <= 3; i++) for (k = 1; k <= 3; k++) for (a = 0; a < nen; a++) dn_unit[a](k,i) += (dn[a](k,i) - n_unit[k-1]*dn_normal[a][i-1])/n_len; #if INT_DEBUG > 2 std::cout <<"\nElasticCable: dn_unit:\n"; for (a = 0; a < nen; a++) std::cout <<"node "<< a+1 << dn_unit[a]; #endif // Calculate second derivative of n_unit std::vector< std::vector<Matrix3D> > ddn(nen), ddn_unit(nen); std::vector< std::vector<Matrix> > ddn_normal(nen); for (a = 0; a < nen; a++) { ddn[a].resize(nen,Matrix3D(3,3,3)); ddn_unit[a].resize(nen,Matrix3D(3,3,3)); ddn_normal[a].resize(nen,Matrix(3,3)); } for (i = 1; i <= 3; i++) for (j = 1; j <= 3; j++) for (a = 0; a < nen; a++) for (b = 0; b < nen; b++) for (k = 1; k <= 3; k++) for (o = 1; o <= 3; o++) { double eps_koj = 0.5*(k-o)*(o-j)*(j-k); double eps_koi = 0.5*(k-o)*(o-i)*(i-k); ddn[a][b](k,i,j) += (eps_koj*db_unit[a](o,i)*fe.dNdX(1+b,1) + eps_koi*db_unit[b](o,j)*fe.dNdX(1+a,1)); for (l = 1; l <= 3; l++) { double eps_kol = 0.5*(k-o)*(o-l)*(l-k); ddn[a][b](k,i,j) += eps_kol*ddb_unit[a][b](o,i,j)*dx[l-1]; } } for (i = 1; i <= 3; i++) for (j = 1; j <= 3; j++) for (a = 0; a < nen; a++) for (b = 0; b < nen; b++) for (k = 1; k <= 3; k++) ddn_normal[a][b](i,j) += (ddn[a][b](k,i,j)*n[k-1] + dn[a](k,i)*dn[b](k,j) - n[k-1]*dn[a](k,i)* n[k-1]*dn[b](k,j)/n_len2) / n_len; for (i = 1; i <= 3; i++) for (j = 1; j <= 3; j++) for (a = 0; a < nen; a++) for (b = 0; b < nen; b++) for (k = 1; k <= 3; k++) ddn_unit[a][b](k,i,j) = (ddn[a][b](k,i,j)/n_len - dn[a](k,i)*dn_normal[b][j-1]/n_len2 - dn[b](k,j)*dn_normal[a][i-1]/n_len2 - n[k-1]*(ddn_normal[a][b](i,j) - dn_normal[a][i-1]* dn_normal[b][j-1]*2.0 / n_len) / n_len2); #if INT_DEBUG > 2 std::cout <<"ElasticCable: ddn_unit:\n"; for (a = 0; a < nen; a++) for (b = 0; b < nen; b++) std::cout <<"nodes "<< a+1 <<","<< b+1 << ddn_unit[a][b]; #endif // Axial strain double eps = 0.5*(dx*dx - dX*dX); // Derivative of the axial strain Vector deps(3*nen); for (a = aa = 1; a <= nen; a++) for (i = 1; i <= 3; i++, aa++) deps(aa) = fe.dNdX(a,1)*dx[i-1]; // Second derivative of the axial strain Matrix ddeps(3*nen,3*nen); for (a = 1; a <= nen; a++) for (b = 1; b <= nen; b++) for (i = 1; i <= 3; i++) ddeps(3*(a-1)+i,3*(b-1)+i) = fe.dNdX(a,1)*fe.dNdX(b,1); // Curvature double kappa = (ddx*n_unit - ddX*N_unit); // Derivative of the curvature Vector dkappa(3*nen); for (a = aa = 1; a <= nen; a++) for (i = 1; i <= 3; i++, aa++) { dkappa(aa) = fe.d2NdX2(a,1,1)*n_unit[i-1]; for (k = 1; k <= 3; k++) dkappa(aa) += ddx[k-1]*dn_unit[a-1](k,i); } // Second 
derivative of the curvature Matrix ddkappa(3*nen,3*nen); for (a = 0, aa = 1; a < nen; a++) for (i = 1; i <= 3; i++, aa++) for (b = 0, bb = 1; b < nen; b++) for (j = 1; j <= 3; j++, bb++) { ddkappa(aa,bb) = (fe.d2NdX2(1+a,1,1)*dn_unit[b](i,j) + fe.d2NdX2(1+b,1,1)*dn_unit[a](j,i)); for (k = 1; k <= 3; k++) ddkappa(aa,bb) += ddx[k-1]*ddn_unit[a][b](k,i,j); } #if INT_DEBUG > 1 std::cout <<"ElasticCable: eps = "<< eps <<" kappa = "<< kappa <<"\ndeps:"<< deps <<"dkappa:"<< dkappa <<"ddeps:"<< ddeps <<"ddkappa:"<< ddkappa; #endif // Norm of initial contravariant basis (G^1) double normG1contr2 = 1.0 / (dX.x*dX.x + dX.y*dX.y + dX.z*dX.z); double normG1contr4JW = normG1contr2 * normG1contr2 * fe.detJxW; double EAxJW = EA * normG1contr4JW; // volume-weighted axial stiffness double EIxJW = EI * normG1contr4JW; // volume-weighted bending stiffness if (iS) { // Integrate the internal forces (note the negative sign here) elMat.b[iS-1].add(deps,-eps*EAxJW); elMat.b[iS-1].add(dkappa,-kappa*EIxJW); } if (eKm) { // Integrate the material stiffness matrix elMat.A[eKm-1].outer_product(deps,deps*EAxJW,true); elMat.A[eKm-1].outer_product(dkappa,dkappa*EIxJW,true); } if (eKg) { // Integrate the geometric stiffness matrix elMat.A[eKg-1].add(ddeps,eps*EAxJW); elMat.A[eKg-1].add(ddkappa,kappa*EIxJW); } if (lineMass > 0.0) { double dMass = lineMass*fe.detJxW; if (eM) { // Integrate the mass matrix Matrix& M = elMat.A[eM-1]; for (a = 1; a <= nen; a++) for (b = 1; b <= nen; b++) for (i = 1; i <= 3; i++) M(3*(a-1)+i,3*(b-1)+i) += fe.N(a)*fe.N(b)*dMass; } if (eS && !gravity.isZero()) { // Integrate the external (gravitation) forces Vector& S = elMat.b[eS-1]; for (a = 1; a <= nen; a++) for (i = 1; i <= 3; i++) S(3*(a-1)+i) += fe.N(a)*gravity[i-1]*dMass; } } return true; }
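Reading the assembly at the end of evalInt together with the quantities built above: with axial strain ε = ½(x'·x' − X'·X'), curvature change κ = x''·n − X''·N, deps/dkappa their first variations and ddeps/ddkappa their second variations, the element contributions are (up to the contravariant-basis weight normG1contr⁴ detJ·W)

f^{\mathrm{int}} = -\!\int \bigl(EA\,\varepsilon\,\delta\varepsilon + EI\,\kappa\,\delta\kappa\bigr), \qquad K_m = \int \bigl(EA\,\delta\varepsilon\otimes\delta\varepsilon + EI\,\delta\kappa\otimes\delta\kappa\bigr), \qquad K_g = \int \bigl(EA\,\varepsilon\,\Delta\delta\varepsilon + EI\,\kappa\,\Delta\delta\kappa\bigr),

plus, when lineMass > 0, the consistent mass matrix ∫ m N_a N_b and the gravity load ∫ m N_a g.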
extern "C" magma_int_t magma_dsgesv_gpu(char trans, magma_int_t n, magma_int_t nrhs, double *dA, magma_int_t ldda, magma_int_t *ipiv, magma_int_t *dipiv, double *dB, magma_int_t lddb, double *dX, magma_int_t lddx, double *dworkd, float *dworks, magma_int_t *iter, magma_int_t *info) { /* -- MAGMA (version 1.4.0) -- Univ. of Tennessee, Knoxville Univ. of California, Berkeley Univ. of Colorado, Denver August 2013 Purpose ======= DSGESV computes the solution to a real system of linear equations A * X = B or A' * X = B where A is an N-by-N matrix and X and B are N-by-NRHS matrices. DSGESV first attempts to factorize the matrix in real SINGLE PRECISION and use this factorization within an iterative refinement procedure to produce a solution with real DOUBLE PRECISION norm-wise backward error quality (see below). If the approach fails the method switches to a real DOUBLE PRECISION factorization and solve. The iterative refinement is not going to be a winning strategy if the ratio real SINGLE PRECISION performance over real DOUBLE PRECISION performance is too small. A reasonable strategy should take the number of right-hand sides and the size of the matrix into account. This might be done with a call to ILAENV in the future. Up to now, we always try iterative refinement. The iterative refinement process is stopped if ITER > ITERMAX or for all the RHS we have: RNRM < SQRT(N)*XNRM*ANRM*EPS*BWDMAX where o ITER is the number of the current iteration in the iterative refinement process o RNRM is the infinity-norm of the residual o XNRM is the infinity-norm of the solution o ANRM is the infinity-operator-norm of the matrix A o EPS is the machine epsilon returned by DLAMCH('Epsilon') The value ITERMAX and BWDMAX are fixed to 30 and 1.0D+00 respectively. Arguments ========= TRANS (input) CHARACTER*1 Specifies the form of the system of equations: = 'N': A * X = B (No transpose) = 'T': A'* X = B (Transpose) = 'C': A'* X = B (Conjugate transpose = Transpose) N (input) INTEGER The number of linear equations, i.e., the order of the matrix A. N >= 0. NRHS (input) INTEGER The number of right hand sides, i.e., the number of columns of the matrix B. NRHS >= 0. dA (input or input/output) DOUBLE PRECISION array on the GPU, dimension (ldda,N) On entry, the N-by-N coefficient matrix A. On exit, if iterative refinement has been successfully used (info.EQ.0 and ITER.GE.0, see description below), A is unchanged. If double precision factorization has been used (info.EQ.0 and ITER.LT.0, see description below), then the array dA contains the factors L and U from the factorization A = P*L*U; the unit diagonal elements of L are not stored. ldda (input) INTEGER The leading dimension of the array dA. ldda >= max(1,N). IPIV (output) INTEGER array, dimension (N) The pivot indices that define the permutation matrix P; row i of the matrix was interchanged with row IPIV(i). Corresponds either to the single precision factorization (if info.EQ.0 and ITER.GE.0) or the double precision factorization (if info.EQ.0 and ITER.LT.0). dIPIV (output) INTEGER array on the GPU, dimension (min(M,N)) The pivot indices; for 1 <= i <= min(M,N), row i of the matrix was moved to row IPIV(i). dB (input) DOUBLE PRECISION array on the GPU, dimension (lddb,NRHS) The N-by-NRHS right hand side matrix B. lddb (input) INTEGER The leading dimension of the array dB. lddb >= max(1,N). dX (output) DOUBLE PRECISION array on the GPU, dimension (lddx,NRHS) If info = 0, the N-by-NRHS solution matrix X. lddx (input) INTEGER The leading dimension of the array dX. 
lddx >= max(1,N). dworkd (workspace) DOUBLE PRECISION array on the GPU, dimension (N*NRHS) This array is used to hold the residual vectors. dworks (workspace) SINGLE PRECISION array on the GPU, dimension (N*(N+NRHS)) This array is used to store the real single precision matrix and the right-hand sides or solutions in single precision. iter (output) INTEGER < 0: iterative refinement has failed, double precision factorization has been performed -1 : the routine fell back to full precision for implementation- or machine-specific reasons -2 : narrowing the precision induced an overflow, the routine fell back to full precision -3 : failure of SGETRF -31: stop the iterative refinement after the 30th iteration > 0: iterative refinement has been successfully used. Returns the number of iterations info (output) INTEGER = 0: successful exit < 0: if info = -i, the i-th argument had an illegal value > 0: if info = i, U(i,i) computed in DOUBLE PRECISION is exactly zero. The factorization has been completed, but the factor U is exactly singular, so the solution could not be computed. ===================================================================== */ #define dB(i,j) (dB + (i) + (j)*lddb) #define dX(i,j) (dX + (i) + (j)*lddx) #define dR(i,j) (dR + (i) + (j)*lddr) double c_neg_one = MAGMA_D_NEG_ONE; double c_one = MAGMA_D_ONE; magma_int_t ione = 1; double *dR; float *dSA, *dSX; double Xnrmv, Rnrmv; double Anrm, Xnrm, Rnrm, cte, eps; magma_int_t i, j, iiter, lddsa, lddr; /* Check arguments */ *iter = 0; *info = 0; if ( n < 0 ) *info = -1; else if ( nrhs < 0 ) *info = -2; else if ( ldda < max(1,n)) *info = -4; else if ( lddb < max(1,n)) *info = -8; else if ( lddx < max(1,n)) *info = -10; if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } if ( n == 0 || nrhs == 0 ) return *info; lddsa = n; lddr = n; dSA = dworks; dSX = dSA + lddsa*n; dR = dworkd; eps = lapackf77_dlamch("Epsilon"); Anrm = magmablas_dlange('I', n, n, dA, ldda, (double*)dworkd ); cte = Anrm * eps * pow((double)n, 0.5) * BWDMAX; /* * Convert to single precision */ //magmablas_dlag2s( n, nrhs, dB, lddb, dSX, lddsx, info ); // done inside dsgetrs with pivots if (*info != 0) { *iter = -2; goto FALLBACK; } magmablas_dlag2s( n, n, dA, ldda, dSA, lddsa, info ); if (*info != 0) { *iter = -2; goto FALLBACK; } // factor dSA in single precision magma_sgetrf_gpu( n, n, dSA, lddsa, ipiv, info ); if (*info != 0) { *iter = -3; goto FALLBACK; } // Generate parallel pivots { magma_int_t *newipiv; magma_imalloc_cpu( &newipiv, n ); if ( newipiv == NULL ) { *iter = -3; goto FALLBACK; } swp2pswp( trans, n, ipiv, newipiv ); magma_setvector( n, sizeof(magma_int_t), newipiv, 1, dipiv, 1 ); magma_free_cpu( newipiv ); } // solve dSA*dSX = dB in single precision // converts dB to dSX and applies pivots, solves, then converts result back to dX magma_dsgetrs_gpu( trans, n, nrhs, dSA, lddsa, dipiv, dB, lddb, dX, lddx, dSX, info ); // residual dR = dB - dA*dX in double precision magmablas_dlacpy( MagmaUpperLower, n, nrhs, dB, lddb, dR, lddr ); if ( nrhs == 1 ) { magma_dgemv( trans, n, n, c_neg_one, dA, ldda, dX, 1, c_one, dR, 1 ); } else { magma_dgemm( trans, MagmaNoTrans, n, nrhs, n, c_neg_one, dA, ldda, dX, lddx, c_one, dR, lddr ); } // TODO: use MAGMA_D_ABS( dX(i,j) ) instead of dlange? 
for( j=0; j < nrhs; j++ ) { i = magma_idamax( n, dX(0,j), 1) - 1; magma_dgetmatrix( 1, 1, dX(i,j), 1, &Xnrmv, 1 ); Xnrm = lapackf77_dlange( "F", &ione, &ione, &Xnrmv, &ione, NULL ); i = magma_idamax ( n, dR(0,j), 1 ) - 1; magma_dgetmatrix( 1, 1, dR(i,j), 1, &Rnrmv, 1 ); Rnrm = lapackf77_dlange( "F", &ione, &ione, &Rnrmv, &ione, NULL ); if ( Rnrm > Xnrm*cte ) { goto REFINEMENT; } } *iter = 0; return *info; REFINEMENT: for( iiter=1; iiter < ITERMAX; ) { *info = 0; // convert residual dR to single precision dSX // solve dSA*dSX = R in single precision // convert result back to double precision dR // it's okay that dR is used for both dB input and dX output. magma_dsgetrs_gpu( trans, n, nrhs, dSA, lddsa, dipiv, dR, lddr, dR, lddr, dSX, info ); if (*info != 0) { *iter = -3; goto FALLBACK; } // Add correction and setup residual // dX += dR --and-- // dR = dB // This saves going through dR a second time (if done with one more kernel). // -- not really: first time is read, second time is write. for( j=0; j < nrhs; j++ ) { magmablas_daxpycp( n, dR(0,j), dX(0,j), dB(0,j) ); } // residual dR = dB - dA*dX in double precision if ( nrhs == 1 ) { magma_dgemv( trans, n, n, c_neg_one, dA, ldda, dX, 1, c_one, dR, 1 ); } else { magma_dgemm( trans, MagmaNoTrans, n, nrhs, n, c_neg_one, dA, ldda, dX, lddx, c_one, dR, lddr ); } /* Check whether the nrhs normwise backward errors satisfy the * stopping criterion. If yes, set ITER=IITER>0 and return. */ for( j=0; j < nrhs; j++ ) { i = magma_idamax( n, dX(0,j), 1) - 1; magma_dgetmatrix( 1, 1, dX(i,j), 1, &Xnrmv, 1 ); Xnrm = lapackf77_dlange( "F", &ione, &ione, &Xnrmv, &ione, NULL ); i = magma_idamax ( n, dR(0,j), 1 ) - 1; magma_dgetmatrix( 1, 1, dR(i,j), 1, &Rnrmv, 1 ); Rnrm = lapackf77_dlange( "F", &ione, &ione, &Rnrmv, &ione, NULL ); if ( Rnrm > Xnrm*cte ) { goto L20; } } /* If we are here, the nrhs normwise backward errors satisfy * the stopping criterion, we are good to exit. */ *iter = iiter; return *info; L20: iiter++; } /* If we are at this place of the code, this is because we have * performed ITER=ITERMAX iterations and never satisfied the * stopping criterion. Set up the ITER flag accordingly and follow * up on double precision routine. */ *iter = -ITERMAX - 1; FALLBACK: /* Single-precision iterative refinement failed to converge to a * satisfactory solution, so we resort to double precision. */ magma_dgetrf_gpu( n, n, dA, ldda, ipiv, info ); if (*info == 0) { magmablas_dlacpy( MagmaUpperLower, n, nrhs, dB, lddb, dX, lddx ); magma_dgetrs_gpu( trans, n, nrhs, dA, ldda, ipiv, dX, lddx, info ); } return *info; }
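In outline, the driver above (and its complex analogues below) is classical mixed-precision iterative refinement: factor A once in fast single precision, then recover double-precision accuracy with cheap correction solves. A sketch of the logic, with the stopping test as coded (cte = Anrm * eps * sqrt(n) * BWDMAX):

1. Factor A ≈ P·L·U in single precision (sgetrf) and solve for an initial x in single precision.
2. Repeat up to ITERMAX times: form r = b − A·x in double precision (gemv/gemm); solve A·d = r in single precision; update x ← x + d.
3. Exit as soon as every right-hand side satisfies

\|r\|_\infty \le \sqrt{N}\,\|x\|_\infty\,\|A\|_\infty\,\varepsilon\cdot\mathrm{BWDMAX},

falling back to a full double-precision factorization (reported via *iter < 0) if the criterion is never met or any single-precision step fails.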
extern "C" magma_int_t magma_zcgeqrsv_gpu(magma_int_t m, magma_int_t n, magma_int_t nrhs, magmaDoubleComplex *dA, magma_int_t ldda, magmaDoubleComplex *dB, magma_int_t lddb, magmaDoubleComplex *dX, magma_int_t lddx, magma_int_t *iter, magma_int_t *info) { /* -- MAGMA (version 1.4.0) -- Univ. of Tennessee, Knoxville Univ. of California, Berkeley Univ. of Colorado, Denver August 2013 Purpose ======= ZCGEQRSV solves the least squares problem min || A*X - B ||, where A is an M-by-N matrix and X and B are M-by-NRHS matrices. ZCGEQRSV first attempts to factorize the matrix in complex SINGLE PRECISION and use this factorization within an iterative refinement procedure to produce a solution with complex DOUBLE PRECISION norm-wise backward error quality (see below). If the approach fails the method switches to a complex DOUBLE PRECISION factorization and solve. The iterative refinement is not going to be a winning strategy if the ratio complex SINGLE PRECISION performance over complex DOUBLE PRECISION performance is too small. A reasonable strategy should take the number of right-hand sides and the size of the matrix into account. This might be done with a call to ILAENV in the future. Up to now, we always try iterative refinement. The iterative refinement process is stopped if ITER > ITERMAX or for all the RHS we have: RNRM < SQRT(N)*XNRM*ANRM*EPS*BWDMAX where o ITER is the number of the current iteration in the iterative refinement process o RNRM is the infinity-norm of the residual o XNRM is the infinity-norm of the solution o ANRM is the infinity-operator-norm of the matrix A o EPS is the machine epsilon returned by DLAMCH('Epsilon') The value ITERMAX and BWDMAX are fixed to 30 and 1.0D+00 respectively. Arguments ========= M (input) INTEGER The number of rows of the matrix A. M >= 0. N (input) INTEGER The number of columns of the matrix A. M >= N >= 0. NRHS (input) INTEGER The number of right hand sides, i.e., the number of columns of the matrix B. NRHS >= 0. dA (input or input/output) COMPLEX_16 array on the GPU, dimension (LDDA,N) On entry, the M-by-N coefficient matrix A. On exit, if iterative refinement has been successfully used (info.EQ.0 and ITER.GE.0, see description below), A is unchanged. If double precision factorization has been used (info.EQ.0 and ITER.LT.0, see description below), then the array dA contains the QR factorization of A as returned by function DGEQRF_GPU. LDDA (input) INTEGER The leading dimension of the array dA. LDDA >= max(1,M). dB (input or input/output) COMPLEX_16 array on the GPU, dimension (LDDB,NRHS) The M-by-NRHS right hand side matrix B. May be overwritten (e.g., if refinement fails). LDDB (input) INTEGER The leading dimension of the array dB. LDDB >= max(1,M). dX (output) COMPLEX_16 array on the GPU, dimension (LDDX,NRHS) If info = 0, the N-by-NRHS solution matrix X. LDDX (input) INTEGER The leading dimension of the array dX. LDDX >= max(1,N). ITER (output) INTEGER < 0: iterative refinement has failed, double precision factorization has been performed -1 : the routine fell back to full precision for implementation- or machine-specific reasons -2 : narrowing the precision induced an overflow, the routine fell back to full precision -3 : failure of SGEQRF -31: stop the iterative refinement after the 30th iteration > 0: iterative refinement has been successfully used. 
Returns the number of iterations INFO (output) INTEGER = 0: successful exit < 0: if info = -i, the i-th argument had an illegal value ===================================================================== */ #define dB(i,j) (dB + (i) + (j)*lddb) #define dX(i,j) (dX + (i) + (j)*lddx) #define dR(i,j) (dR + (i) + (j)*lddr) #define dSX(i,j) (dSX + (i) + (j)*lddsx) magmaDoubleComplex c_neg_one = MAGMA_Z_NEG_ONE; magmaDoubleComplex c_one = MAGMA_Z_ONE; magma_int_t ione = 1; magmaDoubleComplex *dworkd, *hworkd; magmaFloatComplex *dworks, *hworks; magmaDoubleComplex *dR, *tau, *dT; magmaFloatComplex *dSA, *dSX, *dST, *stau; magmaDoubleComplex Xnrmv, Rnrmv; double Anrm, Xnrm, Rnrm, cte, eps; magma_int_t i, j, iiter, lddsa, lddsx, lddr, nb, lhwork, minmn, size, ldworkd; /* Check arguments */ *iter = 0; *info = 0; if ( m < 0 ) *info = -1; else if ( n < 0 || n > m ) *info = -2; else if ( nrhs < 0 ) *info = -3; else if ( ldda < max(1,m)) *info = -5; else if ( lddb < max(1,m)) *info = -7; else if ( lddx < max(1,n)) *info = -9; if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } if ( m == 0 || n == 0 || nrhs == 0 ) return *info; nb = magma_get_cgeqrf_nb(m); minmn= min(m, n); /* dSX contains both B and X, so must be max(m or lddb,n). */ lddsa = ldda; lddsx = max(lddb,n); lddr = lddb; /* * Allocate temporary buffers */ /* dworks(dSA + dSX + dST) */ size = lddsa*n + lddsx*nrhs + ( 2*minmn + ((n+31)/32)*32 )*nb; if (MAGMA_SUCCESS != magma_cmalloc( &dworks, size )) { fprintf(stderr, "Allocation of dworks failed (%d)\n", (int) size); *info = MAGMA_ERR_DEVICE_ALLOC; return *info; } dSA = dworks; dSX = dSA + lddsa*n; dST = dSX + lddsx*nrhs; /* dworkd(dR) = lddr*nrhs */ ldworkd = lddr*nrhs; if (MAGMA_SUCCESS != magma_zmalloc( &dworkd, ldworkd )) { magma_free( dworks ); fprintf(stderr, "Allocation of dworkd failed\n"); *info = MAGMA_ERR_DEVICE_ALLOC; return *info; } dR = dworkd; /* hworks(workspace for cgeqrs + stau) = min(m,n) + lhworks */ lhwork = (m - n + nb)*(nrhs + nb) + nrhs*nb; size = lhwork + minmn; magma_cmalloc_cpu( &hworks, size ); if ( hworks == NULL ) { magma_free( dworks ); magma_free( dworkd ); fprintf(stderr, "Allocation of hworks failed\n"); *info = MAGMA_ERR_HOST_ALLOC; return *info; } stau = hworks + lhwork; eps = lapackf77_dlamch("Epsilon"); Anrm = magmablas_zlange('I', m, n, dA, ldda, (double*)dworkd ); cte = Anrm * eps * pow((double)n, 0.5) * BWDMAX; /* * Convert to single precision */ magmablas_zlag2c( m, nrhs, dB, lddb, dSX, lddsx, info ); if (*info != 0) { *iter = -2; goto FALLBACK; } magmablas_zlag2c( m, n, dA, ldda, dSA, lddsa, info ); if (*info != 0) { *iter = -2; goto FALLBACK; } // factor dSA in single precision magma_cgeqrf_gpu( m, n, dSA, lddsa, stau, dST, info ); if (*info != 0) { *iter = -3; goto FALLBACK; } // solve dSA*dSX = dB in single precision magma_cgeqrs_gpu( m, n, nrhs, dSA, lddsa, stau, dST, dSX, lddsx, hworks, lhwork, info ); if (*info != 0) { *iter = -3; goto FALLBACK; } // residual dR = dB - dA*dX in double precision magmablas_clag2z( n, nrhs, dSX, lddsx, dX, lddx, info ); magmablas_zlacpy( MagmaUpperLower, m, nrhs, dB, lddb, dR, lddr ); if ( nrhs == 1 ) { magma_zgemv( MagmaNoTrans, m, n, c_neg_one, dA, ldda, dX, 1, c_one, dR, 1 ); } else { magma_zgemm( MagmaNoTrans, MagmaNoTrans, m, nrhs, n, c_neg_one, dA, ldda, dX, lddx, c_one, dR, lddr ); } // TODO: use MAGMA_Z_ABS( dX(i,j) ) instead of zlange? 
for( j=0; j < nrhs; j++ ) { i = magma_izamax( n, dX(0,j), 1) - 1; magma_zgetmatrix( 1, 1, dX(i,j), 1, &Xnrmv, 1 ); Xnrm = lapackf77_zlange( "F", &ione, &ione, &Xnrmv, &ione, NULL ); i = magma_izamax ( m, dR(0,j), 1 ) - 1; magma_zgetmatrix( 1, 1, dR(i,j), 1, &Rnrmv, 1 ); Rnrm = lapackf77_zlange( "F", &ione, &ione, &Rnrmv, &ione, NULL ); if ( Rnrm > Xnrm*cte ) { goto REFINEMENT; } } *iter = 0; /* Free workspaces */ magma_free( dworks ); magma_free( dworkd ); magma_free_cpu( hworks ); return *info; REFINEMENT: /* TODO: this iterative refinement algorithm works only for compatible * systems (B in colspan of A). * See Matrix Computations (3rd ed) p. 267 for correct algorithm. */ for( iiter=1; iiter < ITERMAX; ) { *info = 0; // convert residual dR to single precision dSX magmablas_zlag2c( m, nrhs, dR, lddr, dSX, lddsx, info ); if (*info != 0) { *iter = -2; goto FALLBACK; } // solve dSA*dSX = R in single precision magma_cgeqrs_gpu( m, n, nrhs, dSA, lddsa, stau, dST, dSX, lddsx, hworks, lhwork, info ); if (*info != 0) { *iter = -3; goto FALLBACK; } // Add correction and setup residual // dX += dSX [including conversion] --and-- // dR[1:n] = dB[1:n] (only n rows, not whole m rows! -- useless if m > n) for( j=0; j < nrhs; j++ ) { magmablas_zcaxpycp( n, dSX(0,j), dX(0,j), dB(0,j), dR(0,j) ); } // dR = dB (whole m rows) magmablas_zlacpy( MagmaUpperLower, m, nrhs, dB, lddb, dR, lddr ); // residual dR = dB - dA*dX in double precision if ( nrhs == 1 ) { magma_zgemv( MagmaNoTrans, m, n, c_neg_one, dA, ldda, dX, 1, c_one, dR, 1 ); } else { magma_zgemm( MagmaNoTrans, MagmaNoTrans, m, nrhs, n, c_neg_one, dA, ldda, dX, lddx, c_one, dR, lddr ); } /* Check whether the nrhs normwise backward errors satisfy the * stopping criterion. If yes, set ITER=IITER>0 and return. */ for( j=0; j < nrhs; j++ ) { i = magma_izamax( n, dX(0,j), 1) - 1; magma_zgetmatrix( 1, 1, dX(i,j), 1, &Xnrmv, 1 ); Xnrm = lapackf77_zlange( "F", &ione, &ione, &Xnrmv, &ione, NULL ); i = magma_izamax ( m, dR(0,j), 1 ) - 1; magma_zgetmatrix( 1, 1, dR(i,j), 1, &Rnrmv, 1 ); Rnrm = lapackf77_zlange( "F", &ione, &ione, &Rnrmv, &ione, NULL ); if ( Rnrm > Xnrm*cte ) { goto L20; } } /* If we are here, the nrhs normwise backward errors satisfy * the stopping criterion, we are good to exit. */ *iter = iiter; /* Free workspaces */ magma_free( dworks ); magma_free( dworkd ); magma_free_cpu( hworks ); return *info; L20: iiter++; } /* If we are at this place of the code, this is because we have * performed ITER=ITERMAX iterations and never satisfied the * stopping criterion. Set up the ITER flag accordingly and follow * up on double precision routine. */ *iter = -ITERMAX - 1; FALLBACK: /* Single-precision iterative refinement failed to converge to a * satisfactory solution, so we resort to double precision. 
*/ magma_free( dworks ); magma_free_cpu( hworks ); /* * Allocate temporary buffers */ /* dworkd = dT for zgeqrf */ nb = magma_get_zgeqrf_nb( m ); size = (2*min(m, n) + (n+31)/32*32 )*nb; if ( size > ldworkd ) { magma_free( dworkd ); if (MAGMA_SUCCESS != magma_zmalloc( &dworkd, size )) { fprintf(stderr, "Allocation of dworkd2 failed\n"); *info = MAGMA_ERR_DEVICE_ALLOC; return *info; } } dT = dworkd; /* hworkd(dtau + workspace for zgeqrs) = min(m,n) + lhwork */ size = lhwork + minmn; magma_zmalloc_cpu( &hworkd, size ); if ( hworkd == NULL ) { magma_free( dworkd ); fprintf(stderr, "Allocation of hworkd2 failed\n"); *info = MAGMA_ERR_HOST_ALLOC; return *info; } tau = hworkd + lhwork; magma_zgeqrf_gpu( m, n, dA, ldda, tau, dT, info ); if (*info == 0) { // if m > n, then dB won't fit in dX, so solve with dB and copy n rows to dX magma_zgeqrs_gpu( m, n, nrhs, dA, ldda, tau, dT, dB, lddb, hworkd, lhwork, info ); magmablas_zlacpy( MagmaUpperLower, n, nrhs, dB, lddb, dX, lddx ); } magma_free( dworkd ); magma_free_cpu( hworkd ); return *info; }
extern "C" magma_int_t magma_zcposv_gpu(char uplo, magma_int_t n, magma_int_t nrhs, magmaDoubleComplex *dA, magma_int_t ldda, magmaDoubleComplex *dB, magma_int_t lddb, magmaDoubleComplex *dX, magma_int_t lddx, magmaDoubleComplex *dworkd, magmaFloatComplex *dworks, magma_int_t *iter, magma_int_t *info) { /* -- MAGMA (version 1.4.0) -- Univ. of Tennessee, Knoxville Univ. of California, Berkeley Univ. of Colorado, Denver August 2013 Purpose ======= ZCPOSV computes the solution to a complex system of linear equations A * X = B, where A is an N-by-N Hermitian positive definite matrix and X and B are N-by-NRHS matrices. ZCPOSV first attempts to factorize the matrix in complex SINGLE PRECISION and use this factorization within an iterative refinement procedure to produce a solution with complex DOUBLE PRECISION norm-wise backward error quality (see below). If the approach fails the method switches to a complex DOUBLE PRECISION factorization and solve. The iterative refinement is not going to be a winning strategy if the ratio complex SINGLE PRECISION performance over complex DOUBLE PRECISION performance is too small. A reasonable strategy should take the number of right-hand sides and the size of the matrix into account. This might be done with a call to ILAENV in the future. Up to now, we always try iterative refinement. The iterative refinement process is stopped if ITER > ITERMAX or for all the RHS we have: RNRM < SQRT(N)*XNRM*ANRM*EPS*BWDMAX where o ITER is the number of the current iteration in the iterative refinement process o RNRM is the infinity-norm of the residual o XNRM is the infinity-norm of the solution o ANRM is the infinity-operator-norm of the matrix A o EPS is the machine epsilon returned by DLAMCH('Epsilon') The value ITERMAX and BWDMAX are fixed to 30 and 1.0D+00 respectively. Arguments ========= UPLO (input) CHARACTER = 'U': Upper triangle of A is stored; = 'L': Lower triangle of A is stored. N (input) INTEGER The number of linear equations, i.e., the order of the matrix A. N >= 0. NRHS (input) INTEGER The number of right hand sides, i.e., the number of columns of the matrix B. NRHS >= 0. dA (input or input/output) COMPLEX_16 array on the GPU, dimension (LDDA,N) On entry, the Hermitian matrix A. If UPLO = 'U', the leading N-by-N upper triangular part of A contains the upper triangular part of the matrix A, and the strictly lower triangular part of A is not referenced. If UPLO = 'L', the leading N-by-N lower triangular part of A contains the lower triangular part of the matrix A, and the strictly upper triangular part of A is not referenced. On exit, if iterative refinement has been successfully used (INFO.EQ.0 and ITER.GE.0, see description below), then A is unchanged; if double precision factorization has been used (INFO.EQ.0 and ITER.LT.0, see description below), then the array dA contains the factor U or L from the Cholesky factorization A = U**H*U or A = L*L**H. LDDA (input) INTEGER The leading dimension of the array dA. LDDA >= max(1,N). dB (input) COMPLEX_16 array on the GPU, dimension (LDDB,NRHS) The N-by-NRHS right hand side matrix B. LDDB (input) INTEGER The leading dimension of the array dB. LDDB >= max(1,N). dX (output) COMPLEX_16 array on the GPU, dimension (LDDX,NRHS) If INFO = 0, the N-by-NRHS solution matrix X. LDDX (input) INTEGER The leading dimension of the array dX. LDDX >= max(1,N). dworkd (workspace) COMPLEX_16 array on the GPU, dimension (N*NRHS) This array is used to hold the residual vectors. 
dworks (workspace) COMPLEX array on the GPU, dimension (N*(N+NRHS)) This array is used to store the complex single precision matrix and the right-hand sides or solutions in single precision. ITER (output) INTEGER < 0: iterative refinement has failed, double precision factorization has been performed -1 : the routine fell back to full precision for implementation- or machine-specific reasons -2 : narrowing the precision induced an overflow, the routine fell back to full precision -3 : failure of SPOTRF -31: stop the iterative refinement after the 30th iteration > 0: iterative refinement has been successfully used. Returns the number of iterations INFO (output) INTEGER = 0: successful exit < 0: if INFO = -i, the i-th argument had an illegal value > 0: if INFO = i, the leading minor of order i of (DOUBLE PRECISION) A is not positive definite, so the factorization could not be completed, and the solution has not been computed. ===================================================================== */ #define dB(i,j) (dB + (i) + (j)*lddb) #define dX(i,j) (dX + (i) + (j)*lddx) #define dR(i,j) (dR + (i) + (j)*lddr) #define dSX(i,j) (dSX + (i) + (j)*lddsx) magmaDoubleComplex c_neg_one = MAGMA_Z_NEG_ONE; magmaDoubleComplex c_one = MAGMA_Z_ONE; magma_int_t ione = 1; magmaDoubleComplex *dR; magmaFloatComplex *dSA, *dSX; magmaDoubleComplex Xnrmv, Rnrmv; double Anrm, Xnrm, Rnrm, cte, eps; magma_int_t i, j, iiter, lddsa, lddsx, lddr; /* Check arguments */ *iter = 0; *info = 0; if ( n < 0 ) *info = -1; else if ( nrhs < 0 ) *info = -2; else if ( ldda < max(1,n)) *info = -4; else if ( lddb < max(1,n)) *info = -7; else if ( lddx < max(1,n)) *info = -9; if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } if ( n == 0 || nrhs == 0 ) return *info; lddsa = n; lddsx = n; lddr = n; dSA = dworks; dSX = dSA + lddsa*n; dR = dworkd; eps = lapackf77_dlamch("Epsilon"); Anrm = magmablas_zlanhe('I', uplo, n, dA, ldda, (double*)dworkd ); cte = Anrm * eps * pow((double)n, 0.5) * BWDMAX; /* * Convert to single precision */ magmablas_zlag2c( n, nrhs, dB, lddb, dSX, lddsx, info ); if (*info != 0) { *iter = -2; goto FALLBACK; } magmablas_zlat2c( uplo, n, dA, ldda, dSA, lddsa, info ); if (*info != 0) { *iter = -2; goto FALLBACK; } // factor dSA in single precision magma_cpotrf_gpu( uplo, n, dSA, lddsa, info ); if (*info != 0) { *iter = -3; goto FALLBACK; } // solve dSA*dSX = dB in single precision magma_cpotrs_gpu( uplo, n, nrhs, dSA, lddsa, dSX, lddsx, info ); // residual dR = dB - dA*dX in double precision magmablas_clag2z( n, nrhs, dSX, lddsx, dX, lddx, info ); magmablas_zlacpy( MagmaUpperLower, n, nrhs, dB, lddb, dR, lddr ); if ( nrhs == 1 ) { magma_zhemv( uplo, n, c_neg_one, dA, ldda, dX, 1, c_one, dR, 1 ); } else { magma_zhemm( MagmaLeft, uplo, n, nrhs, c_neg_one, dA, ldda, dX, lddx, c_one, dR, lddr ); } // TODO: use MAGMA_Z_ABS( dX(i,j) ) instead of zlange? 
for( j=0; j < nrhs; j++ ) { i = magma_izamax( n, dX(0,j), 1) - 1; magma_zgetmatrix( 1, 1, dX(i,j), 1, &Xnrmv, 1 ); Xnrm = lapackf77_zlange( "F", &ione, &ione, &Xnrmv, &ione, NULL ); i = magma_izamax ( n, dR(0,j), 1 ) - 1; magma_zgetmatrix( 1, 1, dR(i,j), 1, &Rnrmv, 1 ); Rnrm = lapackf77_zlange( "F", &ione, &ione, &Rnrmv, &ione, NULL ); if ( Rnrm > Xnrm*cte ) { goto REFINEMENT; } } *iter = 0; return *info; REFINEMENT: for( iiter=1; iiter < ITERMAX; ) { *info = 0; // convert residual dR to single precision dSX magmablas_zlag2c( n, nrhs, dR, lddr, dSX, lddsx, info ); if (*info != 0) { *iter = -2; goto FALLBACK; } // solve dSA*dSX = R in single precision magma_cpotrs_gpu( uplo, n, nrhs, dSA, lddsa, dSX, lddsx, info ); // Add correction and setup residual // dX += dSX [including conversion] --and-- // dR = dB for( j=0; j < nrhs; j++ ) { magmablas_zcaxpycp( n, dSX(0,j), dX(0,j), dB(0,j), dR(0,j) ); } // residual dR = dB - dA*dX in double precision if ( nrhs == 1 ) { magma_zhemv( uplo, n, c_neg_one, dA, ldda, dX, 1, c_one, dR, 1 ); } else { magma_zhemm( MagmaLeft, uplo, n, nrhs, c_neg_one, dA, ldda, dX, lddx, c_one, dR, lddr ); } /* Check whether the nrhs normwise backward errors satisfy the * stopping criterion. If yes, set ITER=IITER>0 and return. */ for( j=0; j < nrhs; j++ ) { i = magma_izamax( n, dX(0,j), 1) - 1; magma_zgetmatrix( 1, 1, dX(i,j), 1, &Xnrmv, 1 ); Xnrm = lapackf77_zlange( "F", &ione, &ione, &Xnrmv, &ione, NULL ); i = magma_izamax ( n, dR(0,j), 1 ) - 1; magma_zgetmatrix( 1, 1, dR(i,j), 1, &Rnrmv, 1 ); Rnrm = lapackf77_zlange( "F", &ione, &ione, &Rnrmv, &ione, NULL ); if ( Rnrm > Xnrm*cte ) { goto L20; } } /* If we are here, the nrhs normwise backward errors satisfy * the stopping criterion, we are good to exit. */ *iter = iiter; return *info; L20: iiter++; } /* If we are at this place of the code, this is because we have * performed ITER=ITERMAX iterations and never satisfied the * stopping criterion. Set up the ITER flag accordingly and follow * up on double precision routine. */ *iter = -ITERMAX - 1; FALLBACK: /* Single-precision iterative refinement failed to converge to a * satisfactory solution, so we resort to double precision. */ magma_zpotrf_gpu( uplo, n, dA, ldda, info ); if (*info == 0) { magmablas_zlacpy( MagmaUpperLower, n, nrhs, dB, lddb, dX, lddx ); magma_zpotrs_gpu( uplo, n, nrhs, dA, ldda, dX, lddx, info ); } return *info; }
/* >>> start tutorial code >>> */
int main( ){

    USING_NAMESPACE_ACADO

    // Define a Right-Hand-Side:
    // -------------------------
    DifferentialState x;
    DifferentialEquation f;
    Transition j;
    TIME t;

    f << dot(x) == -x - 1.0;
    j << x == x*x;

    // Define an integrator:
    // ---------------------
    IntegratorRK45 integrator;
    integrator.init( f, j );

    // Define an initial value:
    // ------------------------
    double x_start[1] = { 0.0 };
    double t_start = 0.0;
    double t_end   = 1.0;

    // START THE INTEGRATION
    // ----------------------
    integrator.set( INTEGRATOR_PRINTLEVEL, MEDIUM );
    integrator.set( INTEGRATOR_TOLERANCE, 1.0e-6 );
    integrator.freezeAll();
    integrator.integrate( t_start, t_end, x_start );

    Vector xEnd;
    integrator.getX(xEnd);
    xEnd.print();

    Vector seed(1);
    seed(0) = 1.0;
    integrator.setBackwardSeed( 1, seed );
    integrator.integrateSensitivities();

    Vector dX(1), dP, dU, dW;
    integrator.getBackwardSensitivities( dX, dP, dU, dW, 1 );
    dX.print("dX ");

    return 0;
}
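For orientation: with a backward seed λ set through setBackwardSeed(1, seed), getBackwardSensitivities returns the adjoint sensitivity dX = λᵀ ∂(output)/∂x₀, where the output here includes the transition x² applied to the integrated state (as the integrateSensitivities implementation earlier in this section suggests). Since ẋ = −x − 1 gives x(t) = (x₀+1)e^{−t} − 1, one would expect for x₀ = 0 and λ = 1

dX = \frac{d\,x(1)^2}{d\,x_0} = 2\,x(1)\,e^{-1} = 2\,(e^{-1}-1)\,e^{-1} \approx -0.465,

a useful hand check on the printed value; this derivation assumes the transition is part of the backward sweep.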
/**
    Purpose
    -------
    ZCGESV computes the solution to a complex system of linear equations
        A * X = B,  A**T * X = B,  or  A**H * X = B,
    where A is an N-by-N matrix and X and B are N-by-NRHS matrices.

    ZCGESV first attempts to factorize the matrix in complex SINGLE PRECISION
    and uses this factorization within an iterative refinement procedure to
    produce a solution with complex DOUBLE PRECISION norm-wise backward error
    quality (see below). If the approach fails, the method switches to a
    complex DOUBLE PRECISION factorization and solve.

    Iterative refinement is not going to be a winning strategy if the ratio
    of complex SINGLE PRECISION performance to complex DOUBLE PRECISION
    performance is too small. A reasonable strategy would take the number of
    right-hand sides and the size of the matrix into account; this might be
    automated via a call to ILAENV in the future. For now, iterative
    refinement is always attempted.

    The iterative refinement process is stopped if
        ITER > ITERMAX
    or for all the RHS we have:
        RNRM < SQRT(N) * XNRM * ANRM * EPS * BWDMAX
    where
      - ITER is the number of the current iteration in the iterative refinement process
      - RNRM is the infinity-norm of the residual
      - XNRM is the infinity-norm of the solution
      - ANRM is the infinity-operator-norm of the matrix A
      - EPS  is the machine epsilon returned by DLAMCH('Epsilon')
    The values ITERMAX and BWDMAX are fixed to 30 and 1.0D+00, respectively.

    Arguments
    ---------
    @param[in]
    trans   magma_trans_t
            Specifies the form of the system of equations:
      -     = MagmaNoTrans:    A    * X = B  (No transpose)
      -     = MagmaTrans:      A**T * X = B  (Transpose)
      -     = MagmaConjTrans:  A**H * X = B  (Conjugate transpose)

    @param[in]
    n       INTEGER
            The number of linear equations, i.e., the order of the
            matrix A.  N >= 0.

    @param[in]
    nrhs    INTEGER
            The number of right hand sides, i.e., the number of columns
            of the matrix B.  NRHS >= 0.

    @param[in,out]
    dA      COMPLEX_16 array on the GPU, dimension (ldda,N)
            On entry, the N-by-N coefficient matrix A.
            On exit, if iterative refinement has been successfully used
            (info.EQ.0 and ITER.GE.0, see description below), A is unchanged.
            If double precision factorization has been used
            (info.EQ.0 and ITER.LT.0, see description below), then the
            array dA contains the factors L and U from the factorization
            A = P*L*U; the unit diagonal elements of L are not stored.

    @param[in]
    ldda    INTEGER
            The leading dimension of the array dA.  ldda >= max(1,N).

    @param[out]
    ipiv    INTEGER array, dimension (N)
            The pivot indices that define the permutation matrix P;
            row i of the matrix was interchanged with row IPIV(i).
            Corresponds either to the single precision factorization
            (if info.EQ.0 and ITER.GE.0) or the double precision
            factorization (if info.EQ.0 and ITER.LT.0).

    @param[out]
    dipiv   INTEGER array on the GPU, dimension (N)
            The pivot indices; for 1 <= i <= N, after permuting, row i of the
            matrix was moved to row dIPIV(i). Note this is different than
            IPIV, where interchanges are applied one-after-another.

    @param[in]
    dB      COMPLEX_16 array on the GPU, dimension (lddb,NRHS)
            The N-by-NRHS right hand side matrix B.

    @param[in]
    lddb    INTEGER
            The leading dimension of the array dB.  lddb >= max(1,N).

    @param[out]
    dX      COMPLEX_16 array on the GPU, dimension (lddx,NRHS)
            If info = 0, the N-by-NRHS solution matrix X.

    @param[in]
    lddx    INTEGER
            The leading dimension of the array dX.  lddx >= max(1,N).

    @param
    dworkd  (workspace) COMPLEX_16 array on the GPU, dimension (N*NRHS)
            This array is used to hold the residual vectors.

    @param
    dworks  (workspace) COMPLEX array on the GPU, dimension (N*(N+NRHS))
            This array is used to store the complex single precision matrix
            and the right-hand sides or solutions in single precision.

    @param[out]
    iter    INTEGER
      -     < 0: iterative refinement has failed, double precision
                 factorization has been performed
        +       -1 : the routine fell back to full precision for
                     implementation- or machine-specific reasons
        +       -2 : narrowing the precision induced an overflow,
                     the routine fell back to full precision
        +       -3 : failure of SGETRF
        +       -31: stop the iterative refinement after the 30th iteration
      -     > 0: iterative refinement has been successfully used.
                 Returns the number of iterations

    @param[out]
    info    INTEGER
      -     = 0: successful exit
      -     < 0: if info = -i, the i-th argument had an illegal value
      -     > 0: if info = i, U(i,i) computed in DOUBLE PRECISION is
                 exactly zero. The factorization has been completed, but
                 the factor U is exactly singular, so the solution
                 could not be computed.

    @ingroup magma_zgesv_driver
    ********************************************************************/
extern "C" magma_int_t
magma_zcgesv_gpu(magma_trans_t trans, magma_int_t n, magma_int_t nrhs,
                 magmaDoubleComplex *dA, magma_int_t ldda,
                 magma_int_t *ipiv, magma_int_t *dipiv,
                 magmaDoubleComplex *dB, magma_int_t lddb,
                 magmaDoubleComplex *dX, magma_int_t lddx,
                 magmaDoubleComplex *dworkd, magmaFloatComplex *dworks,
                 magma_int_t *iter, magma_int_t *info)
{
    #define dB(i,j)  (dB + (i) + (j)*lddb)
    #define dX(i,j)  (dX + (i) + (j)*lddx)
    #define dR(i,j)  (dR + (i) + (j)*lddr)

    magmaDoubleComplex c_neg_one = MAGMA_Z_NEG_ONE;
    magmaDoubleComplex c_one     = MAGMA_Z_ONE;
    magma_int_t     ione = 1;
    magmaDoubleComplex *dR;
    magmaFloatComplex  *dSA, *dSX;
    magmaDoubleComplex Xnrmv, Rnrmv;
    double          Anrm, Xnrm, Rnrm, cte, eps;
    magma_int_t     i, j, iiter, lddsa, lddr;

    /* Check arguments */
    *iter = 0;
    *info = 0;
    if ( n < 0 )
        *info = -1;
    else if ( nrhs < 0 )
        *info = -2;
    else if ( ldda < max(1,n))
        *info = -4;
    else if ( lddb < max(1,n))
        *info = -8;
    else if ( lddx < max(1,n))
        *info = -10;

    if (*info != 0) {
        magma_xerbla( __func__, -(*info) );
        return *info;
    }

    if ( n == 0 || nrhs == 0 )
        return *info;

    lddsa = n;
    lddr  = n;

    dSA = dworks;
    dSX = dSA + lddsa*n;
    dR  = dworkd;

    eps  = lapackf77_dlamch("Epsilon");
    Anrm = magmablas_zlange(MagmaInfNorm, n, n, dA, ldda, (double*)dworkd );
    cte  = Anrm * eps * sqrt((double)n) * BWDMAX;

    /*
     * Convert to single precision
     */
    // dB is converted to single precision (with the pivots applied) inside
    // magma_zcgetrs_gpu below, so only the matrix A is converted here.
    //magmablas_zlag2c( n, nrhs, dB, lddb, dSX, lddsx, info );
    magmablas_zlag2c( n, n, dA, ldda, dSA, lddsa, info );
    if (*info != 0) {
        *iter = -2;
        goto FALLBACK;
    }

    // factor dSA in single precision
    magma_cgetrf_gpu( n, n, dSA, lddsa, ipiv, info );
    if (*info != 0) {
        *iter = -3;
        goto FALLBACK;
    }

    // Generate parallel pivots
    {
        magma_int_t *newipiv;
        magma_imalloc_cpu( &newipiv, n );
        if ( newipiv == NULL ) {
            *iter = -3;
            goto FALLBACK;
        }
        swp2pswp( trans, n, ipiv, newipiv );
        magma_setvector( n, sizeof(magma_int_t), newipiv, 1, dipiv, 1 );
        magma_free_cpu( newipiv );
    }

    // solve dSA*dSX = dB in single precision;
    // converts dB to dSX and applies pivots, solves,
    // then converts the result back to dX
    magma_zcgetrs_gpu( trans, n, nrhs, dSA, lddsa, dipiv, dB, lddb, dX, lddx, dSX, info );

    // residual dR = dB - dA*dX in double precision
    magmablas_zlacpy( MagmaUpperLower, n, nrhs, dB, lddb, dR, lddr );
    if ( nrhs == 1 ) {
        magma_zgemv( trans, n, n,
                     c_neg_one, dA, ldda,
                                dX, 1,
                     c_one,     dR, 1 );
    }
    else {
        magma_zgemm( trans, MagmaNoTrans, n, nrhs, n,
                     c_neg_one, dA, ldda,
                                dX, lddx,
                     c_one,     dR, lddr );
    }

    // TODO: use MAGMA_Z_ABS( dX(i,j) ) instead of zlange?
    for( j=0; j < nrhs; j++ ) {
        i = magma_izamax( n, dX(0,j), 1) - 1;
        magma_zgetmatrix( 1, 1, dX(i,j), 1, &Xnrmv, 1 );
        Xnrm = lapackf77_zlange( "F", &ione, &ione, &Xnrmv, &ione, NULL );

        i = magma_izamax ( n, dR(0,j), 1 ) - 1;
        magma_zgetmatrix( 1, 1, dR(i,j), 1, &Rnrmv, 1 );
        Rnrm = lapackf77_zlange( "F", &ione, &ione, &Rnrmv, &ione, NULL );

        if ( Rnrm > Xnrm*cte ) {
            goto REFINEMENT;
        }
    }

    *iter = 0;
    return *info;

REFINEMENT:
    for( iiter=1; iiter < ITERMAX; ) {
        *info = 0;
        // convert residual dR to single precision dSX,
        // solve dSA*dSX = dR in single precision,
        // convert result back to double precision dR.
        // it's okay that dR is used for both dB input and dX output.
        magma_zcgetrs_gpu( trans, n, nrhs, dSA, lddsa, dipiv, dR, lddr, dR, lddr, dSX, info );
        if (*info != 0) {
            *iter = -3;
            goto FALLBACK;
        }

        // Add correction and setup residual:
        //   dX += dR  --and--  dR = dB
        // This saves going through dR a second time (if done with one more kernel).
        // -- not really: first time is read, second time is write.
        for( j=0; j < nrhs; j++ ) {
            magmablas_zaxpycp( n, dR(0,j), dX(0,j), dB(0,j) );
        }

        // residual dR = dB - dA*dX in double precision
        if ( nrhs == 1 ) {
            magma_zgemv( trans, n, n,
                         c_neg_one, dA, ldda,
                                    dX, 1,
                         c_one,     dR, 1 );
        }
        else {
            magma_zgemm( trans, MagmaNoTrans, n, nrhs, n,
                         c_neg_one, dA, ldda,
                                    dX, lddx,
                         c_one,     dR, lddr );
        }

        /* Check whether the nrhs normwise backward errors satisfy the
         * stopping criterion. If yes, set ITER=IITER > 0 and return. */
        for( j=0; j < nrhs; j++ ) {
            i = magma_izamax( n, dX(0,j), 1) - 1;
            magma_zgetmatrix( 1, 1, dX(i,j), 1, &Xnrmv, 1 );
            Xnrm = lapackf77_zlange( "F", &ione, &ione, &Xnrmv, &ione, NULL );

            i = magma_izamax ( n, dR(0,j), 1 ) - 1;
            magma_zgetmatrix( 1, 1, dR(i,j), 1, &Rnrmv, 1 );
            Rnrm = lapackf77_zlange( "F", &ione, &ione, &Rnrmv, &ione, NULL );

            if ( Rnrm > Xnrm*cte ) {
                goto L20;
            }
        }

        /* If we are here, the nrhs normwise backward errors satisfy
         * the stopping criterion; we are good to exit. */
        *iter = iiter;
        return *info;

      L20:
        iiter++;
    }

    /* If we reach this point, we have performed ITER=ITERMAX iterations
     * and never satisfied the stopping criterion. Set the ITER flag
     * accordingly and follow up with the double precision routine. */
    *iter = -ITERMAX - 1;

FALLBACK:
    /* Single-precision iterative refinement failed to converge to a
     * satisfactory solution, so we resort to double precision. */
    magma_zgetrf_gpu( n, n, dA, ldda, ipiv, info );
    if (*info == 0) {
        magmablas_zlacpy( MagmaUpperLower, n, nrhs, dB, lddb, dX, lddx );
        magma_zgetrs_gpu( trans, n, nrhs, dA, ldda, ipiv, dX, lddx, info );
    }

    return *info;
}
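/*
 * Usage sketch (illustrative, not part of MAGMA): how a caller might drive
 * magma_zcgesv_gpu. Workspace sizes follow the documentation above: dworkd
 * holds the N*NRHS double-complex residuals and dworks holds the N*(N+NRHS)
 * single-complex copy of A plus the RHS/solution. Data setup, error checks,
 * and cleanup are elided; h_A and h_B are hypothetical host arrays owned by
 * the caller.
 */
void example_zcgesv( magma_int_t n, magma_int_t nrhs,
                     magmaDoubleComplex *h_A, magmaDoubleComplex *h_B )
{
    magma_int_t ldda = n, lddb = n, lddx = n, info, iter;
    magma_int_t *ipiv, *dipiv;
    magmaDoubleComplex *dA, *dB, *dX, *dworkd;
    magmaFloatComplex  *dworks;

    magma_init();
    magma_imalloc_cpu( &ipiv, n );           // host pivots
    magma_imalloc( &dipiv, n );              // device ("parallel") pivots
    magma_zmalloc( &dA, ldda*n );
    magma_zmalloc( &dB, lddb*nrhs );
    magma_zmalloc( &dX, lddx*nrhs );
    magma_zmalloc( &dworkd, n*nrhs );        // double-complex workspace
    magma_cmalloc( &dworks, n*(n+nrhs) );    // single-complex workspace

    magma_zsetmatrix( n, n,    h_A, n, dA, ldda );
    magma_zsetmatrix( n, nrhs, h_B, n, dB, lddb );

    magma_zcgesv_gpu( MagmaNoTrans, n, nrhs,
                      dA, ldda, ipiv, dipiv, dB, lddb, dX, lddx,
                      dworkd, dworks, &iter, &info );
    // iter > 0: converged after `iter` mixed-precision refinement steps;
    // iter < 0: fell back to a full double precision factorization
    // (see the header above for the individual codes).

    // ... fetch dX with magma_zgetmatrix, free buffers, magma_finalize() ...
}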
/**
    Purpose
    -------
    DLABRD reduces the first NB rows and columns of a real general
    m by n matrix A to upper or lower bidiagonal form by an orthogonal
    transformation Q' * A * P, and returns the matrices X and Y which
    are needed to apply the transformation to the unreduced part of A.

    If m >= n, A is reduced to upper bidiagonal form; if m < n, to lower
    bidiagonal form.

    This is an auxiliary routine called by DGEBRD.

    Arguments
    ---------
    @param[in]
    m       INTEGER
            The number of rows in the matrix A.

    @param[in]
    n       INTEGER
            The number of columns in the matrix A.

    @param[in]
    nb      INTEGER
            The number of leading rows and columns of A to be reduced.

    @param[in,out]
    A       DOUBLE_PRECISION array, dimension (LDA,N)
            On entry, the m by n general matrix to be reduced.
            On exit, the first NB rows and columns of the matrix are
            overwritten; the rest of the array is unchanged.
            If m >= n, elements on and below the diagonal in the first NB
            columns, with the array TAUQ, represent the orthogonal
            matrix Q as a product of elementary reflectors; and
            elements above the diagonal in the first NB rows, with the
            array TAUP, represent the orthogonal matrix P as a product
            of elementary reflectors.
    \n
            If m < n, elements below the diagonal in the first NB
            columns, with the array TAUQ, represent the orthogonal
            matrix Q as a product of elementary reflectors, and
            elements on and above the diagonal in the first NB rows,
            with the array TAUP, represent the orthogonal matrix P as
            a product of elementary reflectors.
            See Further Details.

    @param[in]
    lda     INTEGER
            The leading dimension of the array A. LDA >= max(1,M).

    @param[in,out]
    dA      DOUBLE_PRECISION array, dimension (LDDA,N)
            Copy of A on GPU.

    @param[in]
    ldda    INTEGER
            The leading dimension of the array dA. LDDA >= max(1,M).

    @param[out]
    d       DOUBLE_PRECISION array, dimension (NB)
            The diagonal elements of the first NB rows and columns of
            the reduced matrix. D(i) = A(i,i).

    @param[out]
    e       DOUBLE_PRECISION array, dimension (NB)
            The off-diagonal elements of the first NB rows and columns of
            the reduced matrix.

    @param[out]
    tauq    DOUBLE_PRECISION array, dimension (NB)
            The scalar factors of the elementary reflectors which
            represent the orthogonal matrix Q. See Further Details.

    @param[out]
    taup    DOUBLE_PRECISION array, dimension (NB)
            The scalar factors of the elementary reflectors which
            represent the orthogonal matrix P. See Further Details.

    @param[out]
    X       DOUBLE_PRECISION array, dimension (LDX,NB)
            The m-by-nb matrix X required to update the unreduced part
            of A.

    @param[in]
    ldx     INTEGER
            The leading dimension of the array X. LDX >= M.

    @param[out]
    dX      DOUBLE_PRECISION array, dimension (LDDX,NB)
            Copy of X on GPU.

    @param[in]
    lddx    INTEGER
            The leading dimension of the array dX. LDDX >= M.

    @param[out]
    Y       DOUBLE_PRECISION array, dimension (LDY,NB)
            The n-by-nb matrix Y required to update the unreduced part
            of A.

    @param[in]
    ldy     INTEGER
            The leading dimension of the array Y. LDY >= N.

    @param[out]
    dY      DOUBLE_PRECISION array, dimension (LDDY,NB)
            Copy of Y on GPU.

    @param[in]
    lddy    INTEGER
            The leading dimension of the array dY. LDDY >= N.

    Further Details
    ---------------
    The matrices Q and P are represented as products of elementary
    reflectors:

       Q = H(1) H(2) . . . H(nb)  and  P = G(1) G(2) . . . G(nb)

    Each H(i) and G(i) has the form:

       H(i) = I - tauq * v * v'  and  G(i) = I - taup * u * u'

    where tauq and taup are real scalars, and v and u are real vectors.

    If m >= n, v(1:i-1) = 0, v(i) = 1, and v(i:m) is stored on exit in
    A(i:m,i); u(1:i) = 0, u(i+1) = 1, and u(i+1:n) is stored on exit in
    A(i,i+1:n); tauq is stored in TAUQ(i) and taup in TAUP(i).

    If m < n, v(1:i) = 0, v(i+1) = 1, and v(i+1:m) is stored on exit in
    A(i+2:m,i); u(1:i-1) = 0, u(i) = 1, and u(i:n) is stored on exit in
    A(i,i+1:n); tauq is stored in TAUQ(i) and taup in TAUP(i).

    The elements of the vectors v and u together form the m-by-nb matrix
    V and the nb-by-n matrix U' which are needed, with X and Y, to apply
    the transformation to the unreduced part of the matrix, using a block
    update of the form:  A := A - V*Y' - X*U'.

    The contents of A on exit are illustrated by the following examples
    with nb = 2:

    @verbatim
    m = 6 and n = 5 (m > n):          m = 5 and n = 6 (m < n):

      (  1   1   u1  u1  u1 )          (  1   u1  u1  u1  u1  u1 )
      (  v1  1   1   u2  u2 )          (  1   1   u2  u2  u2  u2 )
      (  v1  v2  a   a   a  )          (  v1  1   a   a   a   a  )
      (  v1  v2  a   a   a  )          (  v1  v2  a   a   a   a  )
      (  v1  v2  a   a   a  )          (  v1  v2  a   a   a   a  )
      (  v1  v2  a   a   a  )
    @endverbatim

    where a denotes an element of the original matrix which is unchanged,
    vi denotes an element of the vector defining H(i), and ui an element
    of the vector defining G(i).

    @ingroup magma_dgesvd_aux
    ********************************************************************/
extern "C" magma_int_t
magma_dlabrd_gpu( magma_int_t m, magma_int_t n, magma_int_t nb,
                  double *A,  magma_int_t lda,
                  double *dA, magma_int_t ldda,
                  double *d, double *e, double *tauq, double *taup,
                  double *X,  magma_int_t ldx,
                  double *dX, magma_int_t lddx,
                  double *Y,  magma_int_t ldy,
                  double *dY, magma_int_t lddy)
{
    #define  A(i_,j_) ( A + (i_) + (j_)*lda)
    #define  X(i_,j_) ( X + (i_) + (j_)*ldx)
    #define  Y(i_,j_) ( Y + (i_) + (j_)*ldy)
    #define dA(i_,j_) (dA + (i_) + (j_)*ldda)
    #define dY(i_,j_) (dY + (i_) + (j_)*lddy)
    #define dX(i_,j_) (dX + (i_) + (j_)*lddx)

    double c_neg_one = MAGMA_D_NEG_ONE;
    double c_one     = MAGMA_D_ONE;
    double c_zero    = MAGMA_D_ZERO;
    magma_int_t ione = 1;

    magma_int_t i__2, i__3;
    magma_int_t i;
    double alpha;

    /* Shift pointers to 1-based indexing, matching the original Fortran */
    A  -= 1 + lda;
    X  -= 1 + ldx;
    dX -= 1 + lddx;
    Y  -= 1 + ldy;
    dY -= 1 + lddy;
    --d;
    --e;
    --tauq;
    --taup;

    /* Quick return if possible */
    magma_int_t info = 0;
    if (m <= 0 || n <= 0) {
        return info;
    }

    double *f;
    magma_queue_t stream;
    magma_queue_create( &stream );
    magma_dmalloc_cpu( &f, max(n,m) );
    if ( f == NULL ) {
        magma_queue_destroy( stream );   // don't leak the queue on failure
        info = MAGMA_ERR_HOST_ALLOC;
        return info;
    }

    if (m >= n) {
        /* Reduce to upper bidiagonal form */
        for (i = 1; i <= nb; ++i) {
            /* Update A(i:m,i) */
            i__2 = m - i + 1;
            i__3 = i - 1;
            #if defined(PRECISION_z) || defined(PRECISION_c)
            lapackf77_dlacgv( &i__3, Y(i,1), &ldy );
            #endif
            blasf77_dgemv( "No transpose", &i__2, &i__3, &c_neg_one,
                           A(i,1), &lda,
                           Y(i,1), &ldy, &c_one,
                           A(i,i), &ione );
            #if defined(PRECISION_z) || defined(PRECISION_c)
            lapackf77_dlacgv( &i__3, Y(i,1), &ldy );
            #endif
            blasf77_dgemv( "No transpose", &i__2, &i__3, &c_neg_one,
                           X(i,1), &ldx,
                           A(1,i), &ione, &c_one,
                           A(i,i), &ione );

            /* Generate reflection Q(i) to annihilate A(i+1:m,i) */
            alpha = *A(i,i);
            i__2 = m - i + 1;
            i__3 = i + 1;
            lapackf77_dlarfg( &i__2, &alpha, A(min(i__3,m),i), &ione, &tauq[i] );
            d[i] = MAGMA_D_REAL( alpha );
            if (i < n) {
                *A(i,i) = c_one;

                /* Compute Y(i+1:n,i) */
                i__2 = m - i + 1;
                i__3 = n - i;

                // 1. Send the block reflector  A(i+1:m,i) to the GPU ------
                magma_dsetvector( i__2, A(i,i), 1, dA(i-1,i-1), 1 );

                // 2. Multiply ---------------------------------------------
                magma_dgemv( MagmaConjTrans, i__2, i__3, c_one,
                             dA(i-1,i),   ldda,
                             dA(i-1,i-1), ione, c_zero,
                             dY(i+1,i),   ione );

                // 3. Put the result back ----------------------------------
                magma_dgetmatrix_async( i__3, 1, dY(i+1,i), lddy, Y(i+1,i), ldy, stream );

                i__2 = m - i + 1;
                i__3 = i - 1;
                blasf77_dgemv( MagmaConjTransStr, &i__2, &i__3, &c_one,
                               A(i,1), &lda,
                               A(i,i), &ione, &c_zero,
                               Y(1,i), &ione );

                i__2 = n - i;
                i__3 = i - 1;
                blasf77_dgemv( "N", &i__2, &i__3, &c_neg_one,
                               Y(i+1,1), &ldy,
                               Y(1,i),   &ione, &c_zero,
                               f,        &ione );

                i__2 = m - i + 1;
                i__3 = i - 1;
                blasf77_dgemv( MagmaConjTransStr, &i__2, &i__3, &c_one,
                               X(i,1), &ldx,
                               A(i,i), &ione, &c_zero,
                               Y(1,i), &ione );

                // 4. Sync to make sure the result is back -----------------
                magma_queue_sync( stream );

                if (i__3 != 0) {
                    i__2 = n - i;
                    blasf77_daxpy( &i__2, &c_one, f, &ione, Y(i+1,i), &ione );
                }

                i__2 = i - 1;
                i__3 = n - i;
                blasf77_dgemv( MagmaConjTransStr, &i__2, &i__3, &c_neg_one,
                               A(1,i+1), &lda,
                               Y(1,i),   &ione, &c_one,
                               Y(i+1,i), &ione );
                i__2 = n - i;
                blasf77_dscal( &i__2, &tauq[i], Y(i+1,i), &ione );

                /* Update A(i,i+1:n) */
                i__2 = n - i;
                #if defined(PRECISION_z) || defined(PRECISION_c)
                lapackf77_dlacgv( &i__2, A(i,i+1), &lda );
                lapackf77_dlacgv( &i,    A(i,1),   &lda );
                #endif
                blasf77_dgemv( "No transpose", &i__2, &i, &c_neg_one,
                               Y(i+1,1), &ldy,
                               A(i,1),   &lda, &c_one,
                               A(i,i+1), &lda );
                i__2 = i - 1;
                i__3 = n - i;
                #if defined(PRECISION_z) || defined(PRECISION_c)
                lapackf77_dlacgv( &i,    A(i,1), &lda );
                lapackf77_dlacgv( &i__2, X(i,1), &ldx );
                #endif
                blasf77_dgemv( MagmaConjTransStr, &i__2, &i__3, &c_neg_one,
                               A(1,i+1), &lda,
                               X(i,1),   &ldx, &c_one,
                               A(i,i+1), &lda );
                #if defined(PRECISION_z) || defined(PRECISION_c)
                lapackf77_dlacgv( &i__2, X(i,1), &ldx );
                #endif

                /* Generate reflection P(i) to annihilate A(i,i+2:n) */
                i__2 = n - i;
                i__3 = i + 2;
                alpha = *A(i,i+1);
                lapackf77_dlarfg( &i__2, &alpha, A(i,min(i__3,n)), &lda, &taup[i] );
                e[i] = MAGMA_D_REAL( alpha );
                *A(i,i+1) = c_one;

                /* Compute X(i+1:m,i) */
                i__2 = m - i;
                i__3 = n - i;
                // 1. Send the block reflector  A(i,i+1:n) to the GPU ------
                magma_dsetvector( i__3, A(i,i+1), lda, dA(i-1,i), ldda );

                // 2. Multiply ---------------------------------------------
                //magma_dcopy( i__3, dA(i-1,i), ldda, dY(1,1), 1 );
                magma_dgemv( MagmaNoTrans, i__2, i__3, c_one,
                             dA(i,i),   ldda,
                             dA(i-1,i), ldda,
                             //dY(1,1), 1,
                             c_zero,
                             dX(i+1,i), ione );

                // 3. Put the result back ----------------------------------
                magma_dgetmatrix_async( i__2, 1, dX(i+1,i), lddx, X(i+1,i), ldx, stream );

                i__2 = n - i;
                blasf77_dgemv( MagmaConjTransStr, &i__2, &i, &c_one,
                               Y(i+1,1), &ldy,
                               A(i,i+1), &lda, &c_zero,
                               X(1,i),   &ione );

                i__2 = m - i;
                blasf77_dgemv( "N", &i__2, &i, &c_neg_one,
                               A(i+1,1), &lda,
                               X(1,i),   &ione, &c_zero,
                               f,        &ione );
                i__2 = i - 1;
                i__3 = n - i;
                blasf77_dgemv( "N", &i__2, &i__3, &c_one,
                               A(1,i+1), &lda,
                               A(i,i+1), &lda, &c_zero,
                               X(1,i),   &ione );

                // 4. Sync to make sure the result is back -----------------
                magma_queue_sync( stream );

                if (i != 0) {
                    i__2 = m - i;
                    blasf77_daxpy( &i__2, &c_one, f, &ione, X(i+1,i), &ione );
                }

                i__2 = m - i;
                i__3 = i - 1;
                blasf77_dgemv( "No transpose", &i__2, &i__3, &c_neg_one,
                               X(i+1,1), &ldx,
                               X(1,i),   &ione, &c_one,
                               X(i+1,i), &ione );
                i__2 = m - i;
                blasf77_dscal( &i__2, &taup[i], X(i+1,i), &ione );

                #if defined(PRECISION_z) || defined(PRECISION_c)
                i__2 = n - i;
                lapackf77_dlacgv( &i__2, A(i,i+1), &lda );
                // 4. Send the block reflector  A(i+1:m,i) to the GPU after DLACGV()
                magma_dsetvector( i__2, A(i,i+1), lda, dA(i-1,i), ldda );
                #endif
            }
        }
    }
    else {
        /* Reduce to lower bidiagonal form */
        for (i = 1; i <= nb; ++i) {
            /* Update A(i,i:n) */
            i__2 = n - i + 1;
            i__3 = i - 1;
            #if defined(PRECISION_z) || defined(PRECISION_c)
            lapackf77_dlacgv( &i__2, A(i,i), &lda );
            lapackf77_dlacgv( &i__3, A(i,1), &lda );
            #endif
            blasf77_dgemv( "No transpose", &i__2, &i__3, &c_neg_one,
                           Y(i,1), &ldy,
                           A(i,1), &lda, &c_one,
                           A(i,i), &lda );
            i__2 = i - 1;
            #if defined(PRECISION_z) || defined(PRECISION_c)
            lapackf77_dlacgv( &i__3, A(i,1), &lda );
            lapackf77_dlacgv( &i__3, X(i,1), &ldx );
            #endif
            i__3 = n - i + 1;
            blasf77_dgemv( MagmaConjTransStr, &i__2, &i__3, &c_neg_one,
                           A(1,i), &lda,
                           X(i,1), &ldx, &c_one,
                           A(i,i), &lda );
            #if defined(PRECISION_z) || defined(PRECISION_c)
            lapackf77_dlacgv( &i__2, X(i,1), &ldx );
            #endif

            /* Generate reflection P(i) to annihilate A(i,i+1:n) */
            i__2 = n - i + 1;
            i__3 = i + 1;
            alpha = *A(i,i);
            lapackf77_dlarfg( &i__2, &alpha, A(i,min(i__3,n)), &lda, &taup[i] );
            d[i] = MAGMA_D_REAL( alpha );
            if (i < m) {
                *A(i,i) = c_one;

                /* Compute X(i+1:m,i) */
                i__2 = m - i;
                i__3 = n - i + 1;
                // 1. Send the block reflector  A(i,i+1:n) to the GPU ------
                magma_dsetvector( i__3, A(i,i), lda, dA(i-1,i-1), ldda );

                // 2. Multiply ---------------------------------------------
                //magma_dcopy( i__3, dA(i-1,i-1), ldda, dY(1,1), 1 );
                magma_dgemv( MagmaNoTrans, i__2, i__3, c_one,
                             dA(i,i-1),   ldda,
                             dA(i-1,i-1), ldda,
                             //dY(1,1), 1,
                             c_zero,
                             dX(i+1,i),   ione );

                // 3. Put the result back ----------------------------------
                magma_dgetmatrix_async( i__2, 1, dX(i+1,i), lddx, X(i+1,i), ldx, stream );

                i__2 = n - i + 1;
                i__3 = i - 1;
                blasf77_dgemv( MagmaConjTransStr, &i__2, &i__3, &c_one,
                               Y(i,1), &ldy,
                               A(i,i), &lda, &c_zero,
                               X(1,i), &ione );

                i__2 = m - i;
                i__3 = i - 1;
                blasf77_dgemv( "No transpose", &i__2, &i__3, &c_neg_one,
                               A(i+1,1), &lda,
                               X(1,i),   &ione, &c_zero,
                               f,        &ione );

                i__2 = i - 1;
                i__3 = n - i + 1;
                blasf77_dgemv( "No transpose", &i__2, &i__3, &c_one,
                               A(1,i), &lda,
                               A(i,i), &lda, &c_zero,
                               X(1,i), &ione );

                // 4. Sync to make sure the result is back -----------------
                magma_queue_sync( stream );

                if (i__2 != 0) {
                    i__3 = m - i;
                    blasf77_daxpy( &i__3, &c_one, f, &ione, X(i+1,i), &ione );
                }

                i__2 = m - i;
                i__3 = i - 1;
                blasf77_dgemv( "No transpose", &i__2, &i__3, &c_neg_one,
                               X(i+1,1), &ldx,
                               X(1,i),   &ione, &c_one,
                               X(i+1,i), &ione );
                i__2 = m - i;
                blasf77_dscal( &i__2, &taup[i], X(i+1,i), &ione );
                i__2 = n - i + 1;
                #if defined(PRECISION_z) || defined(PRECISION_c)
                lapackf77_dlacgv( &i__2, A(i,i), &lda );
                magma_dsetvector( i__2, A(i,i), lda, dA(i-1,i-1), ldda );
                #endif

                /* Update A(i+1:m,i) */
                i__2 = m - i;
                i__3 = i - 1;
                #if defined(PRECISION_z) || defined(PRECISION_c)
                lapackf77_dlacgv( &i__3, Y(i,1), &ldy );
                #endif
                blasf77_dgemv( "No transpose", &i__2, &i__3, &c_neg_one,
                               A(i+1,1), &lda,
                               Y(i,1),   &ldy, &c_one,
                               A(i+1,i), &ione );
                i__2 = m - i;
                #if defined(PRECISION_z) || defined(PRECISION_c)
                lapackf77_dlacgv( &i__3, Y(i,1), &ldy );
                #endif
                blasf77_dgemv( "No transpose", &i__2, &i, &c_neg_one,
                               X(i+1,1), &ldx,
                               A(1,i),   &ione, &c_one,
                               A(i+1,i), &ione );

                /* Generate reflection Q(i) to annihilate A(i+2:m,i) */
                i__2 = m - i;
                i__3 = i + 2;
                alpha = *A(i+1,i);
                lapackf77_dlarfg( &i__2, &alpha, A(min(i__3,m),i), &ione, &tauq[i] );
                e[i] = MAGMA_D_REAL( alpha );
                *A(i+1,i) = c_one;

                /* Compute Y(i+1:n,i) */
                i__2 = m - i;
                i__3 = n - i;
                // 1. Send the block reflector  A(i+1:m,i) to the GPU ------
                magma_dsetvector( i__2, A(i+1,i), 1, dA(i,i-1), 1 );

                // 2. Multiply ---------------------------------------------
                magma_dgemv( MagmaConjTrans, i__2, i__3, c_one,
                             dA(i,i),   ldda,
                             dA(i,i-1), ione, c_zero,
                             dY(i+1,i), ione );

                // 3. Put the result back ----------------------------------
                magma_dgetmatrix_async( i__3, 1, dY(i+1,i), lddy, Y(i+1,i), ldy, stream );

                i__2 = m - i;
                i__3 = i - 1;
                blasf77_dgemv( MagmaConjTransStr, &i__2, &i__3, &c_one,
                               A(i+1,1), &lda,
                               A(i+1,i), &ione, &c_zero,
                               Y(1,i),   &ione );
                i__2 = n - i;
                i__3 = i - 1;
                blasf77_dgemv( "No transpose", &i__2, &i__3, &c_neg_one,
                               Y(i+1,1), &ldy,
                               Y(1,i),   &ione, &c_zero,
                               f,        &ione );
                i__2 = m - i;
                blasf77_dgemv( MagmaConjTransStr, &i__2, &i, &c_one,
                               X(i+1,1), &ldx,
                               A(i+1,i), &ione, &c_zero,
                               Y(1,i),   &ione );

                // 4. Sync to make sure the result is back -----------------
                magma_queue_sync( stream );

                if (i__3 != 0) {
                    i__2 = n - i;
                    blasf77_daxpy( &i__2, &c_one, f, &ione, Y(i+1,i), &ione );
                }

                i__2 = n - i;
                blasf77_dgemv( MagmaConjTransStr, &i, &i__2, &c_neg_one,
                               A(1,i+1), &lda,
                               Y(1,i),   &ione, &c_one,
                               Y(i+1,i), &ione );
                i__2 = n - i;
                blasf77_dscal( &i__2, &tauq[i], Y(i+1,i), &ione );
            }
            #if defined(PRECISION_z) || defined(PRECISION_c)
            else {
                i__2 = n - i + 1;
                lapackf77_dlacgv( &i__2, A(i,i), &lda );
                magma_dsetvector( i__2, A(i,i), lda, dA(i-1,i-1), ldda );
            }
            #endif
        }
    }

    magma_queue_destroy( stream );
    magma_free_cpu( f );

    return info;
} /* magma_dlabrd_gpu */
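/*
 * Illustration (not from this file): once magma_dlabrd_gpu has produced the
 * panel factors, the caller applies the block update  A := A - V*Y' - X*U'
 * to the trailing submatrix, just as LAPACK's DGEBRD does after DLABRD.
 * A minimal host-side sketch for the m >= n case with 0-based, column-major
 * storage, assuming V sits in A(nb:m,0:nb) and U' in A(0:nb,nb:n) as laid
 * out in Further Details above (the reflector vectors must still hold their
 * unit entries when this runs; DGEBRD restores the diagonals from d/e
 * afterwards). The helper name is hypothetical.
 */
static void apply_trailing_update( magma_int_t m, magma_int_t n, magma_int_t nb,
                                   double *A, magma_int_t lda,
                                   double *X, magma_int_t ldx,
                                   double *Y, magma_int_t ldy )
{
    double c_neg_one = MAGMA_D_NEG_ONE;
    double c_one     = MAGMA_D_ONE;
    magma_int_t m2 = m - nb;   // rows of the trailing submatrix
    magma_int_t n2 = n - nb;   // columns of the trailing submatrix

    // A(nb:m, nb:n) -= V * Y'   with V = A(nb:m, 0:nb), Y' = Y(nb:n, 0:nb)'
    blasf77_dgemm( "N", "T", &m2, &n2, &nb,
                   &c_neg_one, A + nb,          &lda,
                               Y + nb,          &ldy,
                   &c_one,     A + nb + nb*lda, &lda );

    // A(nb:m, nb:n) -= X * U'   with X = X(nb:m, 0:nb), U' = A(0:nb, nb:n)
    blasf77_dgemm( "N", "N", &m2, &n2, &nb,
                   &c_neg_one, X + nb,          &ldx,
                               A + nb*lda,      &lda,
                   &c_one,     A + nb + nb*lda, &lda );
}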
extern "C" magma_int_t magmablas_dsymv_mgpu( magma_int_t num_gpus, magma_int_t k, magma_uplo_t uplo, magma_int_t n, magma_int_t nb, double alpha, double **dA, magma_int_t ldda, magma_int_t offset, double **dx, magma_int_t incx, double beta, double **dy, magma_int_t incy, double **dwork, magma_int_t ldwork, double *work, double *W, magma_queue_t stream[][10] ) { #define dX(id, i) (dx[(id)]+incx*(i)) #define dY(id, i, j) (dy[(id)]+incy*(i)+n*(j)) magma_int_t id; #ifdef MAGMABLAS_DSYMV_MGPU for( id=0; id < num_gpus; id++ ) { magma_setdevice(id); magmablasSetKernelStream(stream[id][0]); trace_gpu_start( id, 0, "memset", "memset" ); cudaMemset( dwork[id], 0, ldwork*sizeof(double) ); trace_gpu_end( id, 0 ); trace_gpu_start( id, 0, "symv", "symv" ); } if ( nb == 32 ) { magmablas_dsymv_mgpu_32_offset( uplo, offset+n, alpha, dA, ldda, dx, incx, beta, dy, incy, dwork, ldwork, num_gpus, nb, offset, stream ); } else { magmablas_dsymv_mgpu_offset( uplo, offset+n, alpha, dA, ldda, dx, incx, beta, dy, incy, dwork, ldwork, num_gpus, nb, offset, stream ); } for( id=0; id < num_gpus; id++ ) { magma_setdevice(id); trace_gpu_end( id, 0 ); magmablasSetKernelStream(NULL); } //magma_setdevice(0); //magmablasSetKernelStream(stream[0][0]); //magma_dsymv(MagmaLower, n, alpha, &dA[0][offset+offset*ldda], ldda, &dx[0][offset], incx, beta, &dy[0][offset], incy ); //magmablasSetKernelStream(NULL); /* send to CPU */ magma_setdevice(0); trace_gpu_start( 0, 0, "comm", "comm" ); magma_dgetvector_async( n, dY(0, offset, 0), 1, W, 1, stream[0][0] ); trace_gpu_end( 0, 0 ); magmablasSetKernelStream(NULL); for( id=1; id < num_gpus; id++ ) { magma_setdevice(id); trace_gpu_start( id, 0, "comm", "comm" ); magma_dgetvector_async( n, dY(id, offset, 0), 1, &work[id*n], 1, stream[id][0] ); trace_gpu_end( id, 0 ); magmablasSetKernelStream(NULL); } #else double c_one = MAGMA_D_ONE; const char* uplo_ = lapack_uplo_const( uplo ); magma_int_t i, ii, j, kk, ib, ib0, i_1, i_local, idw; magma_int_t i_0=n; magma_int_t loffset0 = nb*(offset/(nb*num_gpus)); magma_int_t loffset1 = offset%nb; magma_int_t loffset; //magma_dsymv(uplo, n, alpha, dA, ldda, dx, incx, beta, dy, incy ); idw = (offset/nb)%num_gpus; for( id=0; id < num_gpus; id++ ) { magma_setdevice(id); magmablasSetKernelStream(stream[id][0]); cudaMemset( dy[id], 0, n*k*sizeof(double) ); } if (uplo == MagmaLower) { /* the first block */ if ( loffset1 > 0 ) { id = idw; kk = 0; magma_setdevice(id); magmablasSetKernelStream(stream[id][kk]); loffset = loffset0+loffset1; ib0 = min(nb-loffset1,n); // diagonal magma_dsymv(MagmaLower, ib0, c_one, dA(id, 0, 0 ), ldda, dX(id, 0), incx, c_one, dY(id, 0, kk), incy); // off-diagonl if ( ib0 < n ) { for( j=ib0; j < n; j += i_0 ) { i_1 = min(i_0, n-j); magma_dgemv(MagmaNoTrans, i_1, ib0, c_one, dA(id, j, 0), ldda, dX(id, 0), incx, c_one, dY(id, j, kk), incy); magma_dgemv(MagmaTrans, i_1, ib0, c_one, dA(id, j, 0), ldda, dX(id, j), incx, c_one, dY(id, 0, kk), incy); } } } else { ib0 = 0; } /* diagonal */ for( i=ib0; i < n; i += nb ) { id = ((i+offset)/nb)%num_gpus; kk = ((i+loffset1)/(nb*num_gpus))%k; magma_setdevice(id); magmablasSetKernelStream(stream[id][kk]); i_local = (i+loffset1)/(nb*num_gpus); ib = min(nb,n-i); ii = nb*i_local; loffset = loffset0; if ( id < idw ) loffset += nb; magma_dsymv(MagmaLower, ib, c_one, dA(id, i, ii), ldda, dX(id, i), incx, c_one, dY(id, i, kk), incy); } /* off-diagonal */ for( i=ib0; i < n-nb; i += nb ) { id = ((i+offset)/nb)%num_gpus; kk = ((i+loffset1)/(nb*num_gpus))%k; magma_setdevice(id); 
magmablasSetKernelStream(stream[id][kk]); i_local = ((i+loffset1)/nb)/num_gpus; ii = nb*i_local; ib = min(nb,n-i); loffset = loffset0; if ( id < idw ) loffset += nb; for( j=i+ib; j < n; j += i_0 ) { i_1 = min(i_0, n-j); magma_dgemv(MagmaNoTrans, i_1, ib, c_one, dA(id, j, ii), ldda, dX(id, i), incx, c_one, dY(id, j, kk), incy); magma_dgemv(MagmaTrans, i_1, ib, c_one, dA(id, j, ii), ldda, dX(id, j), incx, c_one, dY(id, i, kk), incy); } } } else { /* upper-triangular storage */ loffset = 0; /* diagonal */ for( i=0; i < n; i += nb ) { id = (i/nb)%num_gpus; kk = (i/(nb*num_gpus))%k; ib = min(nb,n-i); magma_setdevice(id); magmablasSetKernelStream(stream[id][kk]); i_local = i/(nb*num_gpus); ii = nb*i_local; magma_dsymv(MagmaUpper, ib, c_one, dA(id, i, ii), ldda, dX(id, i), incx, c_one, dY(id, i, kk), incy); } /* off-diagonal */ for( i=nb; i < n; i += nb ) { id = (i/nb)%num_gpus; kk = (i/(nb*num_gpus))%k; magma_setdevice(id); magmablasSetKernelStream(stream[id][kk]); i_local = (i/nb)/num_gpus; ii = nb*i_local; ib = min(nb,n-i); magma_dgemv(MagmaNoTrans, i, ib, c_one, dA(id, 0, ii), ldda, dX(id, i), incx, c_one, dY(id, 0, kk), incy); magma_dgemv(MagmaTrans, i, ib, c_one, dA(id, 0, ii), ldda, dX(id, 0), incx, c_one, dY(id, i, kk), incy); } } /* send to CPU */ magma_setdevice(0); magma_dgetvector_async( n, dY(0, 0, 0), 1, W, 1, stream[0][0] ); for( kk=1; kk < k; kk++ ) { magma_dgetvector_async( n, dY(0, 0, kk), 1, &work[kk*n], 1, stream[0][kk] ); } magmablasSetKernelStream(NULL); for( id=1; id < num_gpus; id++ ) { magma_setdevice(id); for( kk=0; kk < k; kk++ ) { magma_dgetvector_async( n, dY(id, 0, kk), 1, &work[id*k*n + kk*n], 1, stream[id][kk] ); } magmablasSetKernelStream(NULL); } #endif return 0; }
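/*
 * Illustration (not from MAGMA): the ownership rule the loops above rely on.
 * Columns are distributed 1-D block-cyclically with block size nb, so for a
 * 0-based global column index i (ignoring `offset`), the owning GPU and the
 * first column of that block in the owner's local storage follow the same
 * arithmetic as the upper-triangular branch. The helper name is hypothetical.
 */
#include <stdio.h>

static void block_cyclic_owner( int i, int nb, int num_gpus, int *id, int *ii )
{
    int block   = i / nb;             // global block index
    *id         = block % num_gpus;   // GPU owning this block (round-robin)
    int i_local = block / num_gpus;   // index of the block on that GPU
    *ii         = nb * i_local;       // first local column of the block
}

int main( void )
{
    // e.g. nb = 32 on 3 GPUs: global column 100 is in block 3, which lands
    // on GPU 0 as that GPU's second local block, starting at local column 32.
    int id, ii;
    block_cyclic_owner( 100, 32, 3, &id, &ii );
    printf( "gpu %d, local column %d\n", id, ii );   // prints: gpu 0, local column 32
    return 0;
}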