void SMCTest::test_itw_Lsodar() { initTwisting(); SP::ControlLsodarSimulation simLsodar(new ControlLsodarSimulation(_t0, _T, _h)); simLsodar->setSaveOnlyMainSimulation(true); simLsodar->addDynamicalSystem(_DS); simLsodar->addSensor(_sensor, _h); simLsodar->addActuator(_itw, _h); simLsodar->initialize(); simLsodar->run(); SimpleMatrix& data = *simLsodar->data(); ioMatrix::write("itw_Lsodar.dat", "ascii", data, "noDim"); // Reference Matrix SimpleMatrix dataRef(data); dataRef.zero(); ioMatrix::read("itw.ref", "ascii", dataRef); // it is a bad idea to compare solutions to an AVI that does not admit a unique solution SiconosVector lambda1 = SiconosVector(data.size(0)); SiconosVector lambda2 = SiconosVector(data.size(0)); data.getCol(3, lambda1); data.getCol(4, lambda2); axpy(_beta, lambda2, lambda1); SiconosVector lambda1Ref = SiconosVector(data.size(0)); SiconosVector lambda2Ref = SiconosVector(data.size(0)); dataRef.getCol(3, lambda1Ref); dataRef.getCol(4, lambda2Ref); axpy(_beta, lambda2Ref, lambda1Ref); data.setCol(3, lambda1); dataRef.setCol(3, lambda1Ref); data.resize(data.size(0), 4); dataRef.resize(data.size(0), 4); std::cout << "------- Integration done, error = " << (data - dataRef).normInf() << " -------" <<std::endl; CPPUNIT_ASSERT_EQUAL_MESSAGE("test_itw_Lsodar : ", (data - dataRef).normInf() < _tol, true); }
void backwardOneInput(int layerId, const UpdateCallback& callback) { const MatrixPtr& inputMat = getInputValue(layerId); const MatrixPtr& inputGradMat = getInputGrad(layerId); const MatrixPtr& weightMat = weights_[layerId]->getW(); const MatrixPtr& weightGradMat = weights_[layerId]->getWGrad(); int dim = inputMat->getWidth(); real* sampleGrad = sampleOut_.grad->getData(); if (weightGradMat) { for (size_t i = 0; i < samples_.size(); ++i) { axpy(dim, sampleGrad[i], inputMat->getRowBuf(samples_[i].sampleId), weightGradMat->getRowBuf(samples_[i].labelId)); } weights_[layerId]->incUpdate(callback); } if (inputGradMat) { for (size_t i = 0; i < samples_.size(); ++i) { axpy(dim, sampleGrad[i], weightMat->getRowBuf(samples_[i].labelId), inputGradMat->getRowBuf(samples_[i].sampleId)); } } }
/*
 * Conjugate gradient solve of A x = b with a matrix-free operator.
 *
 *   A          callback applying the operator: A(out, in, ctx)
 *   b          on entry the right-hand side, on exit the solution
 *              (the initial guess is zero)
 *   tolerance  absolute tolerance on the 2-norm of the residual
 *   ctx        opaque pointer forwarded to A
 *
 * The iteration count is printed to stdout.
 */
void cg(eval_t A, Matrix b, double tolerance, void* ctx)
{
  Matrix r = createMatrix(b->rows, b->cols);
  Matrix p = createMatrix(b->rows, b->cols);
  Matrix buffer = createMatrix(b->rows, b->cols);

  /* r = b - A*0 = b, x = 0 (stored in b) */
  copyVector(r->as_vec,b->as_vec);
  fillVector(b->as_vec, 0.0);

  /* Start from the true residual norm rather than a sentinel value: a zero
   * right-hand side then exits immediately instead of taking a 0/0 step, and
   * a tolerance above the old sentinel (1000) no longer skips the solve. */
  double rr = innerproduct(r->as_vec,r->as_vec); /* current r.r */
  double dotp = rr;                              /* r.r used for alpha/beta */
  double rdr = sqrt(rr);

  int i=0;
  while (i < b->as_vec->len && rdr > tolerance) {
    ++i;
    if (i == 1) {
      copyVector(p->as_vec,r->as_vec);
    } else {
      /* p = r + beta*p, beta = (r_new.r_new)/(r_old.r_old) */
      double beta = rr/dotp;
      scaleVector(p->as_vec,beta);
      axpy(p->as_vec,r->as_vec,1.0);
    }
    dotp = rr;

    A(buffer,p,ctx);
    double alpha = dotp/innerproduct(p->as_vec,buffer->as_vec);
    axpy(b->as_vec,p->as_vec,alpha);        /* x += alpha*p   */
    axpy(r->as_vec,buffer->as_vec,-alpha);  /* r -= alpha*A*p */

    rr = innerproduct(r->as_vec,r->as_vec);
    rdr = sqrt(rr);
  }
  printf("%i iterations\n",i);

  freeMatrix(r);
  freeMatrix(p);
  freeMatrix(buffer);
}
void Basic_Agent::set_internal_uptake_constants( double dt ) { // overall form: dp/dt = S*(T-p) - U*p // p(n+1) - p(n) = dt*S(n)*T(n) - dt*( S(n) + U(n))*p(n+1) // p(n+1)*temp2 = p(n) + temp1 // p(n+1) = ( p(n) + temp1 )/temp2 //int nearest_voxel= current_voxel_index; double internal_constant_to_discretize_the_delta_approximation = dt * volume / ( (microenvironment->voxels(current_voxel_index)).volume ) ; // needs a fix // before the fix on September 28, 2015. Also, switched on this day // from Delta function sources/sinks to volumetric /* // temp1 = dt*S*T cell_source_sink_solver_temp1 = *secretion_rates; cell_source_sink_solver_temp1 *= *saturation_densities; cell_source_sink_solver_temp1 *= dt; // temp2 = 1 + dt*( S + U ) cell_source_sink_solver_temp2.assign( (*secretion_rates).size() , 1.0 ); axpy( &(cell_source_sink_solver_temp2) , dt , *secretion_rates ); axpy( &(cell_source_sink_solver_temp2) , dt , *uptake_rates ); */ // temp1 = dt*(V_cell/V_voxel)*S*T cell_source_sink_solver_temp1.assign( (*secretion_rates).size() , 0.0 ); cell_source_sink_solver_temp1 += *secretion_rates; cell_source_sink_solver_temp1 *= *saturation_densities; cell_source_sink_solver_temp1 *= internal_constant_to_discretize_the_delta_approximation; // temp2 = 1 + dt*(V_cell/V_voxel)*( S + U ) cell_source_sink_solver_temp2.assign( (*secretion_rates).size() , 1.0 ); axpy( &(cell_source_sink_solver_temp2) , internal_constant_to_discretize_the_delta_approximation , *secretion_rates ); axpy( &(cell_source_sink_solver_temp2) , internal_constant_to_discretize_the_delta_approximation , *uptake_rates ); }
void CBOW_NS::update(uint64_t cur, const vector<uint64_t>& context, real alpha) { _in.clear(); for (vector<uint64_t>::const_iterator i = context.begin(); i != context.end(); ++i) { _in.push_back(_net.get_input_vec(*i)); } _out.clear(); _out.push_back(_net.get_output_vec(cur)); for (unsigned i = 0; i < _neg_sample_cnt; ++i) { _out.push_back(_net.get_output_vec(_vocab.sampling(&_seed))); } memset(_hidden_vec, 0, sizeof(real) * _sz); for (vector<const real*>::const_iterator i = _in.begin(); i != _in.end(); ++i) { addto(_sz, *i, _hidden_vec); } scale(_sz, static_cast<real>(1.0) / _in.size(), _hidden_vec); memset(_hidden_vec_grad, 0, sizeof(real) * _sz); for (size_t i = 0; i != _out.size(); ++i) { real sigma = sigmoid(dot(_sz, _hidden_vec, _out[i])); int label = (i==0) ? 1 : 0; real coef = (label - sigma) * alpha; axpy(_sz, coef, _out[i], _hidden_vec_grad); axpy(_sz, coef, _hidden_vec, const_cast<real*>(_out[i])); } scale(_sz, static_cast<real>(1.0) / _in.size(), _hidden_vec_grad); for (size_t i = 0; i != _in.size(); ++i) { addto(_sz, _hidden_vec_grad, const_cast<real*>(_in[i])); } }
// Moves the electronic variables along `dir` by step size `alpha`, for every
// q handled here (qStart <= q < qStop). Components of `dir` that are unset
// (evaluate to false) are skipped.
void ElecMinimizer::step(const ElecGradient& dir, double alpha)
{
	myassert(dir.eInfo == &eInfo);
	for(int q=eInfo.qStart; q<eInfo.qStop; q++)
	{
		if(dir.Y[q])
		{
			axpy(alpha, dir.Y[q], eVars.Y[q]);
		}
		if(dir.B[q])
		{
			axpy(alpha, dir.B[q], eVars.B[q]);
		}
	}
}
/*
 * Conjugate gradient solve with a zero initial guess.
 *
 *   A          not referenced in this body; the operator is applied through
 *              MxV(buffer, p)
 *   b          on entry the right-hand side, on exit the solution
 *   tolerance  relative tolerance: iterate while ||r|| > tolerance*||r0||
 *
 * Returns the number of iterations performed.
 */
int cg(Matrix A, Vector b, double tolerance)
{
  int i=0;
  double rl;   /* norm of the initial residual */
  double rr;   /* current r.r                  */
  double dotp; /* r.r used for alpha/beta      */
  double rdr;  /* current residual norm        */
  Vector r = createVector(b->len);
  Vector p = createVector(b->len);
  Vector buffer = createVector(b->len);

  /* r = b, x = 0 (stored in b) */
  copyVector(r,b);
  fillVector(b, 0.0);

  /* Start from the true residual norm instead of a fixed sentinel (1000):
   * the sentinel skipped the solve whenever tolerance*||b|| >= 1000 and
   * produced a 0/0 step for b == 0. */
  rr = dotproduct(r,r);
  dotp = rr;
  rl = sqrt(rr);
  rdr = rl;

  while (i < b->len && rdr > tolerance*rl) {
    ++i;
    if (i == 1) {
      copyVector(p,r);
    } else {
      /* p = r + beta*p */
      double beta = rr/dotp;
      scaleVector(p,beta);
      axpy(p,r,1.0);
    }
    dotp = rr;

    MxV(buffer, p);
    {
      double alpha = dotp/dotproduct(p,buffer);
      axpy(b,p,alpha);        /* x += alpha*p   */
      axpy(r,buffer,-alpha);  /* r -= alpha*A*p */
    }

    rr = dotproduct(r,r);
    rdr = sqrt(rr);
  }

  freeVector(r);
  freeVector(p);
  freeVector(buffer);
  return i;
}
void axpy(double alpha, const ElecGradient& x, ElecGradient& y) { myassert(x.eInfo == y.eInfo); for(int q=x.eInfo->qStart; q<x.eInfo->qStop; q++) { if(x.Y[q]) { if(y.Y[q]) axpy(alpha, x.Y[q], y.Y[q]); else y.Y[q] = alpha*x.Y[q]; } if(x.B[q]) { if(y.B[q]) axpy(alpha, x.B[q], y.B[q]); else y.B[q] = alpha*x.B[q]; } } }
void CBOW_HS::update(uint64_t cur, const vector<uint64_t>& context, real alpha) { _in.clear(); for (vector<uint64_t>::const_iterator i = context.begin(); i != context.end(); ++i) { _in.push_back(_net.get_input_vec(*i)); } memset(_hidden_vec, 0, sizeof(real) * _sz); for (vector<const real*>::const_iterator i = _in.begin(); i != _in.end(); ++i) { addto(_sz, *i, _hidden_vec); } scale(_sz, static_cast<real>(1.0) / _in.size(), _hidden_vec); const vector<char>& code = _huffman_tree.code(cur); const vector<uint64_t>& path = _huffman_tree.path(cur); memset(_hidden_vec_grad, 0, sizeof(real) * _sz); for (size_t i = 0; i != path.size(); ++i) { real* out = _net.get_output_vec(path[i]); real sigma = sigmoid(dot(_sz, _hidden_vec, out)); int label = code[i]; real coef = (label - sigma) * alpha; axpy(_sz, coef, out, _hidden_vec_grad); axpy(_sz, coef, _hidden_vec, out); } scale(_sz, static_cast<real>(1.0) / _in.size(), _hidden_vec_grad); for (size_t i = 0; i != _in.size(); ++i) { addto(_sz, _hidden_vec_grad, const_cast<real*>(_in[i])); } }
/*
 * Writes the mean of the uniform distribution q into mu, as
 * 0.5*lower + 0.5*upper.
 */
void bi::mean(const UniformPdf<V1>& q, V2 mu) {
  /* pre-condition */
  BI_ASSERT(q.size() == mu.size());

  /* NOTE(review): the trailing `true` presumably makes axpy overwrite mu
   * rather than accumulate into it -- confirm against the axpy declaration. */
  axpy(0.5, q.lower(), mu, true);
  axpy(0.5, q.upper(), mu);
}
/*
 * Solves lapl(u) x = b, for x, given b, using Conjugate Gradient.
 *
 *   lp  lattice parameters (lp.L is the linear extent)
 *   x   on entry the initial guess, on exit the solution
 *   b   right-hand side
 *   g   gauge links passed through to lapl()
 *
 * Runs at most 100 iterations, stopping once sqrt(rr/bb) < 1e-9. Prints the
 * residual at every iteration and timing statistics for lapl() at the end.
 */
void cg(latparams lp, field **x, field **b, link **g)
{
  size_t L = lp.L;
  int max_iter = 100;
  float tol = 1e-9;

  /* Temporary fields needed for CG */
  field **r = new_field(lp);
  field **p = new_field(lp);
  field **Ap = new_field(lp);

  /* Initial residual and p-vector */
  lapl(lp, r, x, g);
  xmy(lp, b, r);
  xeqy(lp, p, r);

  /* Initial r-norm and b-norm */
  float rr = xdotx(lp, r);
  float bb = xdotx(lp, b);

  double t_lapl = 0;
  int iter = 0;
  for(iter=0; iter<max_iter; iter++) {
    printf(" %6d, res = %+e\n", iter, rr/bb);
    if(sqrt(rr/bb) < tol)
      break;

    /* Only the operator application is timed */
    double t = stop_watch(0);
    lapl(lp, Ap, p, g);
    t_lapl += stop_watch(t);

    float pAp = xdoty(lp, p, Ap);
    float alpha = rr/pAp;
    axpy(lp, alpha, p, x);
    axpy(lp, -alpha, Ap, r);

    float r1r1 = xdotx(lp, r);
    float beta = r1r1/rr;
    xpay(lp, r, beta, p);
    rr = r1r1;
  }

  /* Recompute residual after convergence */
  lapl(lp, r, x, g);
  xmy(lp, b, r);
  rr = xdotx(lp, r);

  /* lapl() was timed exactly `iter` times. When the initial guess already
   * satisfies the tolerance iter is 0; report zeros instead of dividing 0/0. */
  double t_per_call = (iter > 0) ? t_lapl/(double)iter : 0.0;
  double beta_fp = (iter > 0) ? 50*((double)L*L*L)/t_per_call*1e-9 : 0.0;
  double beta_io = (iter > 0) ? 40*((double)L*L*L)/t_per_call*1e-9 : 0.0;
  printf(" Converged after %6d iterations, res = %+e\n", iter, rr/bb);
  printf(" Time in lapl(): %+6.3e sec/call, %4.2e Gflop/s, %4.2e GB/s\n",
         t_per_call, beta_fp, beta_io);

  del_field(r);
  del_field(p);
  del_field(Ap);
  return;
}
void cgsolve( const CrsMatrix<AScalarType,Device> & A , const View<VScalarType*,LayoutRight,Device> & b , const View<VScalarType*,LayoutRight,Device> & x , size_t & iteration , double & normr , double & iter_time , const size_t maximum_iteration = 200 , const double tolerance = std::numeric_limits<VScalarType>::epsilon() ) { typedef View<VScalarType*,LayoutRight,Device> vector_type ; const size_t count = b.dimension_0(); vector_type p ( "cg::p" , count ); vector_type r ( "cg::r" , count ); vector_type Ap( "cg::Ap", count ); /* r = b - A * x ; */ /* p = x */ deep_copy( p , x ); /* Ap = A * p */ multiply( A , p , Ap ); /* r = b - Ap */ waxpby( count , 1.0 , b , -1.0 , Ap , r ); /* p = r */ deep_copy( p , r ); double old_rdot = dot( count , r ); normr = std::sqrt( old_rdot ); iteration = 0 ; Kokkos::Impl::Timer wall_clock ; while ( tolerance < normr && iteration < maximum_iteration ) { /* pAp_dot = dot( p , Ap = A * p ) */ /* Ap = A * p */ multiply( A , p , Ap ); const double pAp_dot = dot( count , p , Ap ); const double alpha = old_rdot / pAp_dot ; /* x += alpha * p ; */ axpy( count, alpha, p , x ); /* r -= alpha * Ap ; */ axpy( count, -alpha, Ap, r ); const double r_dot = dot( count , r ); const double beta = r_dot / old_rdot ; /* p = r + beta * p ; */ xpby( count , r , beta , p ); normr = std::sqrt( old_rdot = r_dot ); ++iteration ; } iter_time = wall_clock.seconds(); }
/*
 * Solves lapl(u) x = b, for x, given b, using Conjugate Gradient.
 *
 *   L  linear extent of the lattice
 *   x  on entry the initial guess, on exit the solution
 *   b  right-hand side
 *   u  field passed through to lapl()
 *
 * Runs at most 100 iterations, stopping once sqrt(rr/bb) < 1e-6. Prints the
 * residual at every iteration and timing statistics for lapl() at the end.
 */
void cg(size_t L, _Complex float *x, _Complex float *b, _Complex float *u)
{
  int max_iter = 100;
  float tol = 1e-6;

  /* Temporary fields needed for CG */
  _Complex float *r = new_field(L);
  _Complex float *p = new_field(L);
  _Complex float *Ap = new_field(L);

  /* Initial residual and p-vector */
  lapl(L, r, x, u);
  xmy(L, b, r);
  xeqy(L, p, r);

  /* Initial r-norm and b-norm */
  float rr = xdotx(L, r);
  float bb = xdotx(L, b);

  double t_lapl = 0;
  int iter = 0;
  for(iter=0; iter<max_iter; iter++) {
    printf(" %6d, res = %+e\n", iter, rr/bb);
    if(sqrt(rr/bb) < tol)
      break;

    /* Only the operator application is timed */
    double t = stop_watch(0);
    lapl(L, Ap, p, u);
    t_lapl += stop_watch(t);

    float pAp = xdoty(L, p, Ap);
    float alpha = rr/pAp;
    axpy(L, alpha, p, x);
    axpy(L, -alpha, Ap, r);

    float r1r1 = xdotx(L, r);
    float beta = r1r1/rr;
    xpay(L, r, beta, p);
    rr = r1r1;
  }

  /* Recompute residual after convergence */
  lapl(L, r, x, u);
  xmy(L, b, r);
  rr = xdotx(L, r);

  /* lapl() was timed exactly `iter` times. When the initial guess already
   * satisfies the tolerance iter is 0; report zeros instead of dividing 0/0. */
  double t_per_call = (iter > 0) ? t_lapl/(double)iter : 0.0;
  double beta_fp = (iter > 0) ? 34*L*L/t_per_call*1e-9 : 0.0;
  double beta_io = (iter > 0) ? 32*L*L/t_per_call*1e-9 : 0.0;
  printf(" Converged after %6d iterations, res = %+e\n", iter, rr/bb);
  printf(" Time in lapl(): %+6.3e sec/call, %4.2e Gflop/s, %4.2e GB/s\n",
         t_per_call, beta_fp, beta_io);

  free(r);
  free(p);
  free(Ap);
  return;
}
/*
 * Jacobi iteration for the 1D Poisson-type system with diagonal (2+alpha).
 *
 *   u      on entry the right-hand side, on exit the iterate
 *   tol    stop once the max-norm of the update is <= tol
 *   maxit  iteration limit
 *
 * Returns the value of the iteration counter on exit.
 */
int GaussJacobiPoisson1D(Vector u, double tol, int maxit)
{
  int it=0, i;
  Vector rhs = cloneVector(u);
  Vector prev = cloneVector(u);

  copyVector(rhs, u);
  fillVector(u, 0.0);

  double change = tol+1;
  while (change > tol && ++it < maxit) {
    /* Keep the previous iterate and restart u from the right-hand side */
    copyVector(prev, u);
    collectVector(prev);
    copyVector(u, rhs);

#pragma omp parallel for schedule(static)
    for (i=1;i<prev->len-1;++i) {
      u->data[i] = (u->data[i] + prev->data[i-1] + prev->data[i+1]) / (2.0+alpha);
    }

    /* prev becomes the update between iterates; end points are ignored */
    axpy(prev, u, -1.0);
    prev->data[0] = prev->data[prev->len-1] = 0.0;
    change = maxNorm(prev);
  }

  freeVector(rhs);
  freeVector(prev);
  return it;
}
/*
 * Jacobi iteration for the 1D Poisson-type system with diagonal (2+alpha).
 *
 *   u      on entry the right-hand side, on exit the iterate
 *   tol    relative tolerance: stop once the max-norm of the update is
 *          <= tol * maxNorm(right-hand side)
 *   maxit  iteration limit
 *
 * Returns the value of the iteration counter on exit.
 */
int GaussJacobiPoisson1D(Vector u, double tol, int maxit)
{
  int it=0, i;
  double rhsNorm;
  double change = tol+1;
  Vector rhs = createVector(u->len);
  Vector prev = createVector(u->len);

  copyVector(rhs, u);
  fillVector(u, 0.0);
  rhsNorm = maxNorm(rhs);

  while (change > tol*rhsNorm && ++it < maxit) {
    /* Keep the previous iterate and restart u from the right-hand side */
    copyVector(prev, u);
    copyVector(u, rhs);

#pragma omp parallel for schedule(static)
    for (i=0;i<prev->len;++i) {
      double acc = u->data[i];
      /* Neighbours outside the vector contribute nothing */
      if (i > 0)
        acc += prev->data[i-1];
      if (i < prev->len-1)
        acc += prev->data[i+1];
      u->data[i] = acc / (2.0+alpha);
    }

    /* prev becomes the update between iterates */
    axpy(prev, u, -1.0);
    change = maxNorm(prev);
  }

  freeVector(rhs);
  freeVector(prev);
  return it;
}
/* ************************************************************************* */
// Finds the point on the segment from x_u to x_n whose norm equals delta and
// returns the blend (1-tau)*x_u + tau*x_n.
// See doc/trustregion.lyx or doc/trustregion.pdf.
VectorValues DoglegOptimizerImpl::ComputeBlend(double delta, const VectorValues& x_u, const VectorValues& x_n, const bool verbose)
{
  // Inner products
  const double un = dot(x_u, x_n);
  const double uu = dot(x_u, x_u);
  const double nn = dot(x_n, x_n);

  // Coefficients of the quadratic a*tau^2 + b*tau + c = 0
  const double a = uu - 2.*un + nn;
  const double b = 2. * (un - uu);
  const double c = uu - delta*delta;
  const double discriminantRoot = std::sqrt(b*b - 4*a*c);

  // Both roots; exactly one is expected to lie in [0, 1]
  const double tau1 = (-b + discriminantRoot) / (2.*a);
  const double tau2 = (-b - discriminantRoot) / (2.*a);

  double tau;
  if(0.0 <= tau1 && tau1 <= 1.0)
  {
    assert(!(0.0 <= tau2 && tau2 <= 1.0));
    tau = tau1;
  }
  else
  {
    assert(0.0 <= tau2 && tau2 <= 1.0);
    tau = tau2;
  }

  if(verbose)
    cout << "In blend region with fraction " << tau << " of Newton's method point" << endl;

  // Blended point
  VectorValues blend = (1. - tau) * x_u;
  axpy(tau, x_n, blend);
  return blend;
}
void bi::marginalise(const ExpGaussianPdf<V1, M1>& p1, const ExpGaussianPdf<V2,M2>& p2, const M3 C, const ExpGaussianPdf<V4, M4>& q2, ExpGaussianPdf<V5,M5>& p3) { /* pre-conditions */ BI_ASSERT(q2.size() == p2.size()); BI_ASSERT(p3.size() == p1.size()); BI_ASSERT(C.size1() == p1.size() && C.size2() == p2.size()); typename sim_temp_vector<V1>::type z2(p2.size()); typename sim_temp_matrix<M1>::type K(p1.size(), p2.size()); typename sim_temp_matrix<M1>::type A1(p2.size(), p2.size()); typename sim_temp_matrix<M1>::type A2(p2.size(), p2.size()); /** * Compute gain matrix: * * \f[\mathcal{K} = C_{\mathbf{x}_1,\mathbf{x}_2}\Sigma_2^{-1}\,.\f] */ symm(1.0, p2.prec(), C, 0.0, K, 'R', 'U'); /** * Then result is given by \f$\mathcal{N}(\boldsymbol{\mu}', * \Sigma')\f$, where: * * \f[\boldsymbol{\mu}' = \boldsymbol{\mu}_1 + * \mathcal{K}(\boldsymbol{\mu}_3 - \boldsymbol{\mu}_2)\,,\f] */ z2 = q2.mean(); axpy(-1.0, p2.mean(), z2); p3.mean() = p1.mean(); gemv(1.0, K, z2, 1.0, p3.mean()); /** * and: * * \f{eqnarray*} * \Sigma' &=& \Sigma_1 + \mathcal{K}(\Sigma_3 - * \Sigma_2)\mathcal{K}^T \\ * &=& \Sigma_1 + \mathcal{K}\Sigma_3\mathcal{K}^T - * \mathcal{K}\Sigma_2\mathcal{K}^T\,. * \f} */ p3.cov() = p1.cov(); A1 = K; trmm(1.0, q2.std(), A1, 'R', 'U', 'T'); syrk(1.0, A1, 1.0, p3.cov(), 'U'); A2 = K; trmm(1.0, p2.std(), A2, 'R', 'U', 'T'); syrk(-1.0, A2, 1.0, p3.cov(), 'U'); /* make sure correct log-variables set */ p3.setLogs(p2.getLogs()); p3.init(); // redo precalculations }
void BrooksCorey2p::readZones(ParameterDatabase& pd,Petsc::SecondOrderFd& node) { int nZones = pd.i("nZones"); real dx=0.0,dy=0.0,dz=0.0; dx=(pd.r("xRight") - pd.r("xLeft")) / (real(pd.i("nxNodes"))-1); if (pd.i("nyNodes") > 1) dy=(pd.r("yBack") - pd.r("yFront")) / (real(pd.i("nyNodes"))-1); if (pd.i("nzNodes") > 1) dz=(pd.r("zTop") - pd.r("zBottom")) / (real(pd.i("nzNodes"))-1); real gravity=pd.r("gravity"), // m/d^2 density=pd.r("rhoW"), // kg / m^3 viscosity=pd.r("muW"); // kg /m d for (int i=node.local_z0; i<node.local_z0+node.local_nzNodes; i++) for (int j=node.local_y0; j<node.local_y0+node.local_nyNodes; j++) for (int k=node.local_x0; k<node.local_x0+node.local_nxNodes; k++) { node(i,j,k); real x=k*dx,y=j*dy,z=i*dz; for (int n=0; n<nZones; n++) { if (x >= pd.v("zoneLeft")(n) && x < pd.v("zoneRight")(n) && y >= pd.v("zoneFront")(n) && y < pd.v("zoneBack")(n) && z >= pd.v("zoneBottom")(n) && z < pd.v("zoneTop")(n) ) { global_psiD(node.center) = pd.v("pdZone")(n); global_lambda(node.center) = pd.v("lambdaZone")(n); global_KWs(node.center) = pd.v("permZone")(n)*gravity*density/viscosity; global_thetaS(node.center) = pd.v("thetaS_Zone")(n); global_thetaR(node.center) = pd.v("thetaR_Zone")(n); } } } // global_thetaSR = global_thetaS - global_thetaR; global_thetaSR = global_thetaS; axpy(-1.0,global_thetaR,global_thetaSR); // std::cout<<global_psiD<<std::endl // <<global_lambda<<std::endl // <<global_thetaS<<std::endl // <<global_KWs<<std::endl; psiD.startSetFromGlobal(global_psiD); psiD.endSetFromGlobal(global_psiD); lambda.startSetFromGlobal(global_lambda); lambda.endSetFromGlobal(global_lambda); KWs.startSetFromGlobal(global_KWs); KWs.endSetFromGlobal(global_KWs); thetaS.startSetFromGlobal(global_thetaS); thetaS.endSetFromGlobal(global_thetaS); thetaR.startSetFromGlobal(global_thetaR); thetaR.endSetFromGlobal(global_thetaR); thetaSR.startSetFromGlobal(global_thetaSR); thetaSR.endSetFromGlobal(global_thetaSR); }
int main(int argc, char** argv) { size_t pow = read_arg(argc, argv, 1, 16); size_t n = 1 << pow; size_t size_in_bytes = n * sizeof(double); std::cout << "memcopy and daxpy test of size " << n << std::endl; double* x = malloc_host<double>(n, 1.5); double* y = malloc_host<double>(n, 3.0); #ifdef FLUSH_CACHE // use dummy fields to avoid cache effects, which make results harder to interpret // use 1<<24 to ensure that cache is completely purged for all n double* x_ = malloc_host<double>(1<<24, 1.5); double* y_ = malloc_host<double>(1<<24, 3.0); axpy(1<<24, 2.0, x_, y_); #endif double start = get_time(); axpy(n, 2.0, x, y); double time_axpy = get_time() - start; std::cout << "-------\ntimings\n-------" << std::endl; std::cout << "axpy : " << time_axpy << " s" << std::endl; std::cout << std::endl; // check for errors int errors = 0; for(int i=0; i<n; ++i) { if(std::fabs(6.-y[i])>1e-15) { errors++; } } if(errors>0) std::cout << "\n============ FAILED with " << errors << " errors" << std::endl; else std::cout << "\n============ PASSED" << std::endl; free(x); free(y); return 0; }
void SkipGram_HS::update(uint64_t cur, const vector<uint64_t>& context, real alpha) { real* hidden_vec = _net.get_input_vec(cur); memset(_hidden_vec_grad, 0, sizeof(real) * _sz); for (vector<uint64_t>::const_iterator it = context.begin(); it != context.end(); ++it) { const vector<char>& code = _huffman_tree.code(*it); const vector<uint64_t>& path = _huffman_tree.path(*it); for (size_t i = 0; i != path.size(); ++i) { real* out = _net.get_output_vec(path[i]); real sigma = sigmoid(dot(_sz, hidden_vec, out)); int label = code[i]; real coef = (label - sigma) * alpha; axpy(_sz, coef, out, _hidden_vec_grad); // not exactly the same as the original axpy(_sz, coef, hidden_vec, out); } } addto(_sz, _hidden_vec_grad, hidden_vec); }
void SkipGram_NS::update(uint64_t cur, const vector<uint64_t>& context, real alpha) { const real* hidden_vec = _net.get_input_vec(cur); memset(_hidden_vec_grad, 0, sizeof(real) * _sz); for (vector<uint64_t>::const_iterator i = context.begin(); i != context.end(); ++i) { _out.clear(); _out.push_back(_net.get_output_vec(*i)); for (unsigned j = 0; j < _neg_sample_cnt; ++j) { _out.push_back(_net.get_output_vec(_vocab.sampling(&_seed))); } for (size_t i = 0; i != _out.size(); ++i) { real sigma = sigmoid(dot(_sz, hidden_vec, _out[i])); int label = (i==0) ? 1 : 0; real coef = (label - sigma) * alpha; axpy(_sz, coef, _out[i], _hidden_vec_grad); axpy(_sz, coef, hidden_vec, const_cast<real*>(_out[i])); } } addto(_sz, _hidden_vec_grad, const_cast<real*>(hidden_vec)); }
/*
 * Writes the covariance of the uniform distribution q into Sigma: the matrix
 * is cleared and its diagonal set to (upper - lower)^2 / 12.
 */
void bi::cov(const UniformPdf<V1>& q, M1 Sigma) {
  /* pre-condition */
  BI_ASSERT(Sigma.size1() == q.size());
  BI_ASSERT(Sigma.size2() == q.size());

  temp_host_vector<real>::type diff(q.size());

  /* diff = (upper - lower)^2, element-wise */
  diff = q.upper();
  sub_elements(diff, q.lower(), diff);
  sq_elements(diff, diff);

  /* only the diagonal is non-zero */
  Sigma.clear();
  axpy(1.0/12.0, diff, diagonal(Sigma));
}
// Saves the current argument and function value (yLast, Flast), marks the
// Jacobian and function value as needing re-evaluation, and subtracts
// `correction` from the argument: yDaeDef -= correction.
void RosenbrockDaeDefinition::correctArgument(Vec& correction)
{
  yLast = yDaeDef;
  Flast = Fcurrent;

  updateJac=true;
  updateF=true;

#ifdef USE_BLAS
  axpy(-1.0,correction,yDaeDef);
#else
  yDaeDef-=correction;
#endif
}
// Exercises three arrays (stack, new-allocated, malloc-allocated) through
// setArray / printArray / axpy.
// NOTE(review): several steps look like deliberate memory errors for a
// debugging exercise (pointers one element before the heap arrays, heap
// pointers dropped without being released) -- confirm before "fixing" them.
int main()
{
  double arrayOnStack[10];
  stackArray = arrayOnStack;

  createNewArray();
  createMallocArray();

  setArray(stackArray);
  // NOTE(review): both calls pass a pointer one element before the array.
  setArray(newArray - 1);
  setArray(mallocArray - 1);

  printArray(newArray);
  printArray(mallocArray);

  axpy(newArray, stackArray);
  axpy(newArray, mallocArray);

  // delete stackArray;
  // NOTE(review): the pointers are cleared without any release visible here.
  stackArray = 0;
  newArray = 0;
  mallocArray = 0;

  return 0;
}
int main(int argc, char** argv) { size_t pow = read_arg(argc, argv, 1, 16); size_t n = 1 << pow; std::cout << "memcopy and daxpy test of size " << n << "\n"; double* x = malloc_host<double>(n, 1.5); double* y = malloc_host<double>(n, 3.0); // use dummy fields to avoid cache effects, which make results harder to // interpret use 1<<24 to ensure that cache is completely purged for all n double* x_ = malloc_host<double>(n, 1.5); double* y_ = malloc_host<double>(n, 3.0); // openmp version: auto start = get_time(); axpy(n, 2.0, x_, y_); auto time_axpy_omp = get_time() - start; // openacc version: start = get_time(); axpy_gpu(n, 2.0, x, y); auto time_axpy_gpu = get_time() - start; std::cout << "-------\ntimings\n-------\n"; std::cout << "axpy (openmp): " << time_axpy_omp << " s\n"; std::cout << "axpy (openacc): " << time_axpy_gpu << " s\n"; // check for errors auto errors = 0; #pragma omp parallel for reduction(+:errors) for (auto i = 0; i < n; ++i) { if (std::fabs(6.-y[i]) > 1e-15) { ++errors; } } if (errors > 0) { std::cout << "\n============ FAILED with " << errors << " errors\n"; } else { std::cout << "\n============ PASSED\n"; } free(x); free(y); return 0; }
/*
 * Conditions p1 (over x1) on an observed value x2 of the variables described
 * by p2, given the cross-covariance C, and writes the result into p3.
 */
void bi::condition(const ExpGaussianPdf<V1, M1>& p1, const ExpGaussianPdf<V2, M2>& p2, const M3 C, const V3 x2, ExpGaussianPdf<V4, M4>& p3) {
  /* pre-condition */
  BI_ASSERT(x2.size() == p2.size());
  BI_ASSERT(p3.size() == p1.size());
  BI_ASSERT(C.size1() == p1.size() && C.size2() == p2.size());

  typename sim_temp_vector<V1>::type z2(p2.size());
  typename sim_temp_matrix<M1>::type K(p1.size(), p2.size());

  /**
   * Compute gain matrix:
   *
   * \f[\mathcal{K} = C_{\mathbf{x}_1,\mathbf{x}_2}\Sigma_2^{-1}\,.\f]
   */
  symm(1.0, p2.prec(), C, 0.0, K, 'R', 'U');

  /**
   * Then result is given by \f$\mathcal{N}(\boldsymbol{\mu}',
   * \Sigma')\f$, where:
   *
   * \f[\boldsymbol{\mu}' = \boldsymbol{\mu}_1 + \mathcal{K}(\mathbf{x}_2 -
   * \boldsymbol{\mu}_2)\,,\f]
   */
  z2 = x2;
  /* transform x2 using the log-variables of p2 before subtracting its mean */
  log_vector(z2, p2.getLogs());
  axpy(-1.0, p2.mean(), z2);
  p3.mean() = p1.mean();
  gemv(1.0, K, z2, 1.0, p3.mean());

  /**
   * and:
   *
   * \f{eqnarray*}
   * \Sigma' &=& \Sigma_1 - \mathcal{K}C_{\mathbf{x}_1,\mathbf{x}_2}^T \\
   * &=& \Sigma_1 - C_{\mathbf{x}_1,\mathbf{x}_2}\Sigma_2^{-1}
   * C_{\mathbf{x}_1,\mathbf{x}_2}^T\,.\f}
   */
  /* K is reused as scratch here: it is overwritten with C and passed through
   * trsm with p2.std() before the rank-k update of the covariance */
  K = C;
  trsm(1.0, p2.std(), K, 'R', 'U');
  p3.cov() = p1.cov();
  syrk(-1.0, K, 1.0, p3.cov(), 'U');

  /* update log-variables and precalculations */
  p3.setLogs(p1.getLogs());
  p3.init();
}
// r = alpha * x + y for dense double-precision vectors.
//
// Returns clsparseNotInitialized if the library has not been set up,
// clsparseInvalidControlObject for a null control, clsparseSuccess when there
// is nothing to compute (empty vectors, or alpha == 0 where r simply becomes a
// copy of y), and otherwise the status of the axpy kernel.
clsparseStatus cldenseDaxpy(cldenseVector *r, const clsparseScalar *alpha, const cldenseVector *x, const cldenseVector *y, const clsparseControl control)
{
    if (!clsparseInitialized)
    {
        return clsparseNotInitialized;
    }

    // check opencl elements
    if (control == nullptr)
    {
        return clsparseInvalidControlObject;
    }

    clsparse::vector<cl_double> pR (control, r->values, r->num_values);
    clsparse::vector<cl_double> pAlpha(control, alpha->value, 1);
    clsparse::vector<cl_double> pX (control, x->values, x->num_values);
    clsparse::vector<cl_double> pY (control, y->values, y->num_values);

    assert(pR.size() == pY.size());
    assert(pR.size() == pX.size());

    const cl_ulong size = pR.size();
    if (size == 0)
    {
        return clsparseSuccess;
    }

    if (pAlpha[0] != 0.0)
    {
        return axpy(pR, pAlpha, pX, pY, control);
    }

    // alpha == 0: the result is y itself. Copy only when r and y are backed by
    // different buffers.
    auto bufferR = pR.data()();
    auto bufferY = pY.data()();
    if (bufferR != bufferY)
    {
        // deep copy
        pR = pY;
    }
    return clsparseSuccess;
}
void bi::distance(const M1 X, const real h, M2 D) { /* pre-conditions */ BI_ASSERT(D.size1() == D.size2()); BI_ASSERT(D.size1() == X.size1()); BI_ASSERT(!M2::on_device); typedef typename M1::value_type T1; FastGaussianKernel K(X.size2(), h); typename temp_host_vector<T1>::type d(X.size2()); int i, j; for (j = 0; j < D.size2(); ++j) { for (i = 0; i <= j; ++i) { d = row(X, i); axpy(-1.0, row(X, j), d); D(i, j) = K(dot(d)); } } }
/*
 * Two-colour relaxation sweep on the interior of u with a 4-point stencil,
 * parallelised with OpenMP.
 *
 *   u          on entry the right-hand side, on exit the iterate
 *   tolerance  stop once the 2-norm of the update is <= tolerance
 *   maxit      iteration limit
 *
 * Prints the iteration counter and the final update norm.
 */
void GS(Matrix u, double tolerance, int maxit)
{
  int it=0;
  Matrix b = cloneMatrix(u);
  Matrix e = cloneMatrix(u);
  /* v holds the most recently computed values read by the stencil.
   * NOTE(review): its initial content depends on what cloneMatrix does --
   * confirm it starts from the intended values. */
  Matrix v = cloneMatrix(u);
  int* sizes, *displ;
  /* Split the interior into 2 chunks per thread, one per colour */
  splitVector(u->rows-2, 2*max_threads(), &sizes, &displ);
  copyVector(b->as_vec, u->as_vec);
  fillVector(u->as_vec, 0.0);
  double max = tolerance+1;
  while (max > tolerance && ++it < maxit) {
    /* e keeps the previous iterate; u restarts from the right-hand side */
    copyVector(e->as_vec, u->as_vec);
    copyVector(u->as_vec, b->as_vec);
    for (int color=0;color<2;++color) {
      /* NOTE(review): the first index runs over cols while the split above is
       * over rows -- presumably fine for square matrices; confirm. */
      for (int i=1;i<u->cols-1;++i) {
#pragma omp parallel
        {
          /* Each thread handles its own chunk of the current colour */
          int cnt=displ[get_thread()*2+color]+1;
          for (int j=0;j<sizes[get_thread()*2+color];++j, ++cnt) {
            u->data[i][cnt] += v->data[i][cnt-1];
            u->data[i][cnt] += v->data[i][cnt+1];
            u->data[i][cnt] += v->data[i-1][cnt];
            u->data[i][cnt] += v->data[i+1][cnt];
            u->data[i][cnt] /= 4.0;
            v->data[i][cnt] = u->data[i][cnt];
          }
        }
      }
    }
    /* e becomes the update between iterates */
    axpy(e->as_vec, u->as_vec, -1.0);
    max = sqrt(innerproduct(e->as_vec, e->as_vec));
  }
  printf("number of iterations %i %f\n", it, max);
  freeMatrix(b);
  freeMatrix(e);
  freeMatrix(v);
  free(sizes);
  free(displ);
}
bool RosenbrockDaeDefinition::numericalJacVec(const Vec& v, Vec& Jv) { bool evalError=false; real delFac=1.0e-5; value(evalError);//sets Fcurrent if (evalError) return evalError; else { //cek del = -delFac*v; del = v; scal(-delFac,del); //end cek correctArgument(del);//sets Flast to Fcurrent -- //remember correction is -del = delFac*v value(evalError); while (evalError) { std::cerr<<"cutting back on del in numerical jac vec"<<std::endl; unCorrect();//sets Flast to Fcurrent delFac*=0.1; //cek del = -delFac*v; del = v; scal(-delFac,del); //end cek correctArgument(del); value(evalError);//sets Fcurrent } //cek Jv = (Fcurrent - Flast)/delFac; Jv = Fcurrent; axpy(-1.0,Flast,Jv); scal(delFac,Jv); //end cek unCorrect(); } return evalError; }