void generate_reference() { m_repo.init_field("ccol", -1.0); m_repo.init_field("dcol", -1.0); m_repo.init_field("datacol", -1.0); Real *ccol = m_repo.field_h("ccol"); Real *dcol = m_repo.field_h("dcol"); Real *datacol = m_repo.field_h("datacol"); u_stage_ = m_repo.field_h("u_stage"); wcon_ = m_repo.field_h("wcon"); u_pos_ = m_repo.field_h("u_pos"); utens_ = m_repo.field_h("utens"); utens_stage_ref_ = m_repo.field_h("utens_stage_ref"); v_stage_ = m_repo.field_h("v_stage"); v_pos_ = m_repo.field_h("v_pos"); vtens_ = m_repo.field_h("vtens"); vtens_stage_ref_ = m_repo.field_h("vtens_stage_ref"); w_stage_ = m_repo.field_h("w_stage"); w_pos_ = m_repo.field_h("w_pos"); wtens_ = m_repo.field_h("wtens"); wtens_stage_ref_ = m_repo.field_h("wtens_stage_ref"); // Generate U for (int i = m_halo.m_i; i < m_domain.m_i - m_halo.m_i; ++i) { for (int j = m_halo.m_j; j < m_domain.m_j - m_halo.m_j; ++j) { forward_sweep( i, j, 0,0, 8,8, 1, 0, ccol, dcol, wcon_, u_stage_, u_pos_, utens_, utens_stage_ref_, m_domain, m_strides); backward_sweep(i, j, 0,0,8,8,ccol, dcol, datacol, u_pos_, utens_stage_ref_, m_domain, m_strides); } } // Generate V for (int i = m_halo.m_i; i < m_domain.m_i - m_halo.m_i; ++i) { for (int j = m_halo.m_j; j < m_domain.m_j - m_halo.m_j; ++j) { forward_sweep( i, j, 0,0, 8,8, 1, 0, ccol, dcol, wcon_, v_stage_, v_pos_, vtens_, vtens_stage_ref_, m_domain, m_strides); backward_sweep(i, j, 0,0,8,8,ccol, dcol, datacol, v_pos_, vtens_stage_ref_, m_domain, m_strides); } } // Generate W for (int i = m_halo.m_i; i < m_domain.m_i - m_halo.m_i; ++i) { for (int j = m_halo.m_j; j < m_domain.m_j - m_halo.m_j; ++j) { forward_sweep( i, j, 0,0, 8,8, 1, 0, ccol, dcol, wcon_, w_stage_, w_pos_, wtens_, wtens_stage_ref_, m_domain, m_strides); backward_sweep(i, j, 0,0,8,8,ccol, dcol, datacol, w_pos_, wtens_stage_ref_, m_domain, m_strides); } } }
int main(int argc, char* argv[]) { if(argc != 2) { std::cerr << "Usage : " << argv[0] << "<0,1>" <<std::endl; std::cerr << "with : " << std::endl; std::cerr << "0 : quadratic loss" << std::endl; std::cerr << "1 : cross entropy loss" << std::endl; return -1; } bool quadratic_loss = (atoi(argv[1]) == 0); srand(time(NULL)); // We compare our computation of the gradient to // a finite difference approximation // The loss is also involved std::cout << "---------------------------------" << std::endl; std::cout << "Comparing the analytical gradient and numerical approximation " << std::endl; auto input = gaml::mlp::input<X>(INPUT_DIM, fillInput); auto l1 = gaml::mlp::layer(input, HIDDEN_LAYER_SIZE, gaml::mlp::mlp_sigmoid(), gaml::mlp::mlp_dsigmoid()); auto l2 = gaml::mlp::layer(l1, HIDDEN_LAYER_SIZE, gaml::mlp::mlp_identity(), gaml::mlp::mlp_didentity()); auto l3 = gaml::mlp::layer(l2, HIDDEN_LAYER_SIZE, gaml::mlp::mlp_tanh(), gaml::mlp::mlp_dtanh()); auto l4 = gaml::mlp::layer(l3, OUTPUT_DIM, gaml::mlp::mlp_sigmoid(), gaml::mlp::mlp_dsigmoid()); auto mlp = gaml::mlp::perceptron(l4, output_of); std::cout << "We use the following architecture : " << std::endl; std::cout << mlp << std::endl; std::cout << "which has a total of " << mlp.psize() << " parameters"<< std::endl; gaml::mlp::parameters_type params(mlp.psize()); gaml::mlp::parameters_type paramsph(mlp.psize()); gaml::mlp::values_type derivatives(mlp.psize()); gaml::mlp::values_type forward_sweep(mlp.size()); X x; auto loss_ce = gaml::mlp::loss::CrossEntropy(); auto loss_quadratic = gaml::mlp::loss::Quadratic(); auto f = [&mlp, ¶ms] (const typename decltype(mlp)::input_type& x) -> gaml::mlp::values_type { auto output = mlp(x, params); gaml::mlp::values_type voutput(mlp.output_size()); fillOutput(voutput.begin(), output); return voutput; }; auto df = [&mlp, &forward_sweep, ¶ms] (const typename decltype(mlp)::input_type& x, unsigned int parameter_dim) -> gaml::mlp::values_type { return mlp.deriv(x, params, 
forward_sweep, parameter_dim); }; unsigned int nbtrials = 100; unsigned int nbfails = 0; std::cout << "I will compare " << nbtrials << " times a numerical approximation and the analytical gradient we compute" << std::endl; for(unsigned int t = 0 ; t < nbtrials ; ++t) { randomize_data(params, -1.0, 1.0); randomize_data(x, -1.0, 1.0); // Compute the output at params auto output = mlp(x, params); gaml::mlp::values_type raw_output(OUTPUT_DIM); fillOutput(raw_output.begin(), output); gaml::mlp::values_type raw_outputph(OUTPUT_DIM); // For computing the loss, we need a target gaml::mlp::values_type raw_target(OUTPUT_DIM); randomize_data(raw_target); double norm_dh = 0.0; for(unsigned int i = 0 ; i < mlp.psize() ; ++i) { // Let us compute params + h*[0 0 0 0 0 0 1 0 0 0 0 0], the 1 at the ith position std::copy(params.begin(), params.end(), paramsph.begin()); double dh = (sqrt(DBL_EPSILON) * paramsph[i]); paramsph[i] += dh; norm_dh += dh*dh; // Compute the output at params + h auto outputph = mlp(x, paramsph); fillOutput(raw_outputph.begin(), outputph); // We now compute the approximation of the derivative if(quadratic_loss) derivatives[i] = (loss_quadratic(raw_target, raw_outputph) - loss_quadratic(raw_target, raw_output))/dh; else derivatives[i] = (loss_ce(raw_target, raw_outputph) - loss_ce(raw_target, raw_output))/dh; } // We now compute the analytical derivatives mlp(x, params); std::copy(mlp.begin(), mlp.end(), forward_sweep.begin()); gaml::mlp::values_type our_derivatives(mlp.psize()); for(unsigned int i = 0 ; i < mlp.psize() ; ++i) { if(quadratic_loss) our_derivatives[i] = loss_quadratic.deriv(x, raw_target, forward_sweep, f, df, i); else our_derivatives[i] = loss_ce.deriv(x, raw_target, forward_sweep, f, df, i); } // We finally compute the norm of the difference double error = 0.0; auto diter = derivatives.begin(); for(auto& ourdi : our_derivatives) { error = (ourdi - *diter) * (ourdi - *diter); diter++; } error = sqrt(error); std::cout << "Error between the 
analytical and numerical gradients " << error << " with a step size of " << sqrt(norm_dh) << " in norm" << std::endl; if(error > 1e-7) ++nbfails; /* std::cout << "numerical " << std::endl; for(auto & di : derivatives) std::cout << di << " "; std::cout << std::endl; std::cout << "our :" << std::endl; for(auto& di : our_derivatives) std::cout << di << " "; std::cout << std::endl; */ } std::cout << nbfails << " / " << nbtrials << " with an error higher than 1e-7" << std::endl; }
Vector ADFun<Base>::Forward(
	size_t                 p   ,
	const Vector&          x_p ,
	std::ostream&          s   )
{
	// Purpose: store x_p as the order-p Taylor coefficients of the
	// independent variables, run a forward sweep over the recorded
	// operations, and return the order-p Taylor coefficients of the
	// dependent variables. On return taylor_per_var_ is p + 1.
	// The stream s is handed to the sweep routines.

	// sizes of the domain (independent) and range (dependent) spaces
	const size_t n_ind = ind_taddr_.size();
	const size_t n_dep = dep_taddr_.size();

	// Vector must be a Simple Vector class with elements of type Base
	CheckSimpleVector<Base, Vector>();

	CPPAD_ASSERT_KNOWN(
		size_t(x_p.size()) == n_ind,
		"Second argument to Forward does not have length equal to\n"
		"the dimension of the domain for the corresponding ADFun."
	);
	CPPAD_ASSERT_KNOWN(
		p <= taylor_per_var_,
		"The number of taylor_ coefficient currently stored\n"
		"in this ADFun object is less than p."
	);

	// grow the taylor_ matrix when it has no column for order p
	if( taylor_col_dim_ <= p )
		capacity_taylor(p + 1);
	CPPAD_ASSERT_UNKNOWN( taylor_col_dim_ > p );

	// order-p coefficients of the independent variables come from x_p
	for(size_t j = 0; j < n_ind; j++)
	{	CPPAD_ASSERT_UNKNOWN( ind_taddr_[j] < total_num_var_ );

		// the taddr of the j-th independent variable is both its
		// operator address (checked here) ...
		CPPAD_ASSERT_UNKNOWN( play_.GetOp( ind_taddr_[j] ) == InvOp );

		// ... and its variable address (used as the row in taylor_)
		taylor_[ind_taddr_[j] * taylor_col_dim_ + p] = x_p[j];
	}

	// run the sweep; only the order zero sweep reports compare changes
	if( p != 0 )
	{	forward_sweep(s, false,
			p, n_ind, total_num_var_, &play_, taylor_col_dim_, taylor_.data()
		);
	}
	else
	{
# if CPPAD_USE_FORWARD0SWEEP
		compare_change_ = forward0sweep(s, true,
			n_ind, total_num_var_, &play_, taylor_col_dim_, taylor_.data()
		);
# else
		compare_change_ = forward_sweep(s, true,
			p, n_ind, total_num_var_, &play_, taylor_col_dim_, taylor_.data()
		);
# endif
	}

	// gather the order-p coefficients of the dependent variables
	Vector y_p(n_dep);
	for(size_t i = 0; i < n_dep; i++)
	{	CPPAD_ASSERT_UNKNOWN( dep_taddr_[i] < total_num_var_ );
		y_p[i] = taylor_[dep_taddr_[i] * taylor_col_dim_ + p];
	}

# ifndef NDEBUG
	// in debug builds a nan in the result is reported as a usage error
	if( hasnan(y_p) )
	{	if( p == 0 )
		{	CPPAD_ASSERT_KNOWN(false,
				"y = f.Forward(0, x): has a nan in y."
			);
		}
		else
		{	CPPAD_ASSERT_KNOWN(false,
				"y_p = f.Forward(p, x_p): has a nan in y_p for p > 0, "
				"but not for p = 0."
			);
		}
	}
# endif

	// orders 0 through p are now stored for every variable
	taylor_per_var_ = p + 1;

	return y_p;
}