/**
 * Rotates vecSource about the axis rot by phi and stores the result in vecDest.
 *
 * The value written is
 *     (rot . src) * rot  +  cos(phi) * ((rot x src) x rot)  +  sin(phi) * (rot x src)
 * which equals Rodrigues' rotation formula when rot has unit length. This
 * function does not normalise rot itself.
 *
 * @param vecDest    receives the result; it is written only by the final add call
 * @param vecSource  vector to rotate (presumably 3 components, like the scratch buffers)
 * @param rot        rotation axis (presumably 3 components)
 * @param phi        angle in degrees (converted with M_PI / 180)
 * @return vecDest
 */
GLfloat* rotate(GLfloat *vecDest, GLfloat *vecSource, GLfloat *rot, GLfloat phi)
{
    // One scratch buffer per intermediate, so no helper call ever reads and
    // writes the same storage.
    GLfloat axialBuf[3];
    GLfloat crossForCosBuf[3];
    GLfloat doubleCrossBuf[3];
    GLfloat cosTermBuf[3];
    GLfloat partialSumBuf[3];
    GLfloat crossForSinBuf[3];
    GLfloat sinTermBuf[3];

    // Component along the axis: (rot . src) * rot
    GLfloat *axial = scalMult(scalProd(rot, vecSource), rot, axialBuf);

    // cos(phi) * ((rot x src) x rot)
    GLfloat *crossForCos = vecProd(rot, vecSource, crossForCosBuf);
    GLfloat *doubleCross = vecProd(crossForCos, rot, doubleCrossBuf);
    GLfloat *cosTerm = scalMult(cos(phi*M_PI/180), doubleCross, cosTermBuf);

    GLfloat *partialSum = add(axial, cosTerm, partialSumBuf);

    // sin(phi) * (rot x src)
    GLfloat *crossForSin = vecProd(rot, vecSource, crossForSinBuf);
    GLfloat *sinTerm = scalMult(sin(phi*M_PI/180), crossForSin, sinTermBuf);

    add(partialSum, sinTerm, vecDest);
    return vecDest;
}
/**
 * Runs one solver iteration on the device and returns the value read back
 * from Vars::norm.
 *
 * The scalar bookkeeping (ro_new, ro_old, alpha, beta, omega and their negated
 * variants) lives in device-side Vars slots and is read and written through
 * the first command queue. The step structure follows the BiCGStab recurrence.
 * The per-call comments below assume linComb (a, c, b, dst) forms
 * dst = c*a + b (and the longer overload dst = c1*a + c2*b + c); confirm
 * against the LinComb implementation.
 *
 * @param nr            iteration index; 0 selects the initial search direction
 * @param log           unused
 * @param profilingRun  when true, the ASSERT conditions are satisfied
 *                      regardless of the values and the early exit is never taken
 * @param prof          forwarded to matVec ().apply
 * @return the value read from Vars::norm after the last reduce of this call
 */
template <typename F>
F GpuBicgStab<F>::iteration (csize_t nr, UNUSED std::ostream& log, bool profilingRun, Core::ProfilingDataPtr prof) {
  const std::vector<cl::CommandQueue>& allQueues = this->queues ();
  const cl::CommandQueue& mainQueue = allQueues[0];

  DipVector<ftype>& residual = this->rvec ();
  DipVector<ftype>& matVecOut = this->Avecbuffer ();
  DipVector<ftype>& solution = this->xvec ();
  DipVector<ftype>& direction = this->tmpVec1 ();
  DipVector<ftype>& vVec = this->tmpVec2 ();
  DipVector<ftype>& sVec = this->tmpVec3 ();
  DipVector<ftype>& shadow = this->tmpVec4 ();

  // ro_new <- vecProd (r, rtilda)
  (vars + &Vars::ro_new).write (mainQueue, vecProd (allQueues, residual, shadow));

  // Use higher precision to avoid underflow / overflow
  ftype dtmp = static_cast<ftype> (std::abs<ldouble> ((vars + &Vars::ro_new).read (mainQueue)) / this->inprodR ());
  ASSERT (dtmp >= 1e-16 || profilingRun);

  if (nr == 0) {
    // First iteration: two-vector linComb, presumably p <- r.
    this->linComb.linComb (allQueues, residual, direction);
  } else {
    // Use higher precision to avoid underflow / overflow
    cldouble ro_new_alpha = (vars + &Vars::ro_new).read (mainQueue) * (vars + &Vars::alpha).read (mainQueue);
    cldouble ro_old_omega = (vars + &Vars::ro_old).read (mainQueue) * (vars + &Vars::omega).read (mainQueue);
    ASSERT (std::abs (ro_old_omega) / std::abs (ro_new_alpha) >= 10e-10 || profilingRun);

    // beta = (ro_new * alpha) / (ro_old * omega); mBetaOmega = -beta * omega
    (vars + &Vars::beta).write (mainQueue, static_cast<ctype> (ro_new_alpha / ro_old_omega));
    (vars + &Vars::mBetaOmega).write (mainQueue, -(vars + &Vars::beta).read (mainQueue) * (vars + &Vars::omega).read (mainQueue));

    // presumably p <- beta*p + mBetaOmega*v + r
    this->linComb.linComb (allQueues, direction, vars + &Vars::beta, vVec, vars + &Vars::mBetaOmega, residual, direction);
  }

  // v <- matVec applied to p
  this->matVec ().apply (allQueues, direction, vVec, false, prof);

  // Use higher precision to avoid underflow / overflow
  cldouble rho = (vars + &Vars::ro_new).read (mainQueue);
  cldouble vDotShadow = vecProd (allQueues, vVec, shadow);
  (vars + &Vars::alpha).write (mainQueue, static_cast<ctype> (rho / vDotShadow));
  (vars + &Vars::mAlpha).write (mainQueue, -(vars + &Vars::alpha).read (mainQueue));

  // presumably s <- mAlpha*v + r, then reduce s into Vars::norm
  this->linComb.linComb (allQueues, vVec, vars + &Vars::mAlpha, residual, sVec);
  this->linComb.reduce (allQueues, sVec, vars + &Vars::norm);

  const ftype halfStepNorm = (vars + &Vars::norm).read (mainQueue);
  if (halfStepNorm < this->epsB && !profilingRun) {
    // Already below epsB after the half step: only update x
    // (presumably x <- alpha*p + x) and skip the omega half.
    this->linComb.linComb (allQueues, direction, vars + &Vars::alpha, solution, solution);
    return halfStepNorm;
  }

  // Avecbuffer <- matVec applied to s; its reduce result becomes denumOmega
  this->matVec ().apply (allQueues, sVec, matVecOut, false, prof);
  this->linComb.reduce (allQueues, matVecOut, vars + &Vars::denumOmega);

  // Use higher precision to avoid underflow / overflow: both operands are
  // widened before the division and the quotient is narrowed back to ctype.
  (vars + &Vars::omega).write (mainQueue, static_cast<ctype> (static_cast<cldouble> (vecProd (allQueues, sVec, matVecOut)) / static_cast<ldouble> ((vars + &Vars::denumOmega).read (mainQueue))));

  // presumably x <- alpha*p + omega*s + x
  this->linComb.linComb (allQueues, direction, vars + &Vars::alpha, sVec, vars + &Vars::omega, solution, solution);

  // presumably r <- mOmega*Avecbuffer + s, then reduce r into Vars::norm
  (vars + &Vars::mOmega).write (mainQueue, -(vars + &Vars::omega).read (mainQueue));
  this->linComb.linComb (allQueues, matVecOut, vars + &Vars::mOmega, sVec, residual);
  this->linComb.reduce (allQueues, residual, vars + &Vars::norm);
  const ftype fullStepNorm = (vars + &Vars::norm).read (mainQueue);

  // Keep this iteration's ro_new as ro_old for the next call.
  (vars + &Vars::ro_old).write (mainQueue, (vars + &Vars::ro_new).read (mainQueue));

  return fullStepNorm;
}