/**
 * Element-wise sine of a matrix.
 * \param m matrix whose elements are transformed
 * \return a matrix with the same row and column bounds as m whose elements are sin(m(i,j))
 */
dmatrix sin(const dmatrix& m)
{
  ivector cmin(m.rowmin(),m.rowmax());
  ivector cmax(m.rowmin(),m.rowmax());
  int i;
  for (i=m.rowmin();i<=m.rowmax();i++)
  {
    cmin(i)=m(i).indexmin();
    cmax(i)=m(i).indexmax();
  }
  dmatrix tmp(m.rowmin(),m.rowmax(),cmin,cmax);
  for (i=m.rowmin();i<=m.rowmax();i++)
  {
    tmp(i)=sin(m(i));
  }
  return tmp;
}
/**
 * Element-wise (Hadamard) product of two matrices.
 * \param m first matrix
 * \param m2 second matrix with the same row range and row shapes as m
 * \return a matrix whose elements are m(i,j)*m2(i,j)
 */
dmatrix elem_prod(const dmatrix& m, const dmatrix& m2)
{
  ivector cmin(m.rowmin(),m.rowmax());
  ivector cmax(m.rowmin(),m.rowmax());
  int i;
  for (i=m.rowmin();i<=m.rowmax();i++)
  {
    cmin(i)=m(i).indexmin();
    cmax(i)=m(i).indexmax();
  }
  dmatrix tmp(m.rowmin(),m.rowmax(),cmin,cmax);
  for (i=m.rowmin();i<=m.rowmax();i++)
  {
    tmp(i)=elem_prod(m(i),m2(i));
  }
  return tmp;
}
/**
 * Return the sum of the diagonal of a square matrix mat.
 * \param mat is a square scalar matrix
 */
double trace(const dmatrix& mat)
{
  if (mat.colmin() != mat.rowmin() || mat.colmax() != mat.rowmax())
  {
    cerr << "Matrix not square in trace\n";
    ad_exit(1);
  }
  double sum = 0.0;
  for (int i = mat.colmin(); i <= mat.colmax(); ++i)
  {
    sum += mat.elem(i, i);
  }
  return sum;
}
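// Minimal usage sketch for trace() (not part of the original source; the
// ADMB-style index bounds and values below are made up):
//
//   dmatrix a(1, 2, 1, 2);
//   a(1, 1) = 1.0;  a(1, 2) = 5.0;
//   a(2, 1) = 7.0;  a(2, 2) = 3.0;
//   double t = trace(a);   // 1.0 + 3.0 = 4.0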
double clusteringValidity::getAverageToCentroidDiameter(const dmatrix& m1) const
{
  dvector a(m1.columns());
  int i,j;
  l2Distance<double> dist;
  double distance=0.0;
  for (i=0; i<m1.rows(); i++)
  {
    a.add(m1.getRow(i));
  }
  a.divide(m1.rows());
  for (j=0; j<m1.rows(); j++)
  {
    distance+=dist.apply(a,m1.getRow(j));
  }
  if (m1.rows()>0)
  {
    return (2*distance/(double)m1.rows());
  }
  else
  {
    return 2*distance;
  }
}
double clusteringValidity::getMaximumDistance(const dmatrix& m1,
                                              const dmatrix& m2) const
{
  int i,j;
  dmatrix distances(m1.rows(),m2.rows());
  l2Distance<double> dist;
  for (i=0; i<m1.rows(); i++)
  {
    for (j=0; j<m2.rows(); j++)
    {
      distances[i][j]=dist.apply(m1.getRow(i),m2.getRow(j));
    }
  }
  return distances.maximum();
}
double clusteringValidity::getAverageDistance(const dmatrix& m1,
                                              const dmatrix& m2) const
{
  double distance=0.0;
  int i,j;
  l2Distance<double> dist;
  for (i=0; i<m1.rows(); i++)
  {
    for (j=0; j<m2.rows(); j++)
    {
      distance+=dist.apply(m1.getRow(i),m2.getRow(j));
    }
  }
  distance=distance/((double)m1.rows()*(double)m2.rows());
  return distance;
}
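/*
  For reference (notation introduced here, not in the original source): the
  value returned above is the mean of all pairwise Euclidean distances
  between the two clusters,

    \f[ d_{\mathrm{avg}}(C_1,C_2)
        = \frac{1}{|C_1|\,|C_2|}\sum_{x\in C_1}\sum_{y\in C_2}\lVert x-y\rVert_2 . \f]
*/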
double clusteringValidity::getAverageDiameter(const dmatrix& m1) const
{
  double distance=0.0;
  int j,k;
  l2Distance<double> dist;
  for (j=0; j<m1.rows(); j++)
  {
    for (k=0; k<m1.rows(); k++)
    {
      distance+=dist.apply(m1.getRow(j), m1.getRow(k));
    }
  }
  if (m1.rows()>1)
  {
    return (distance/((double)m1.rows()*(double)(m1.rows()-1)));
  }
  else
  {
    return distance;
  }
}
/**
 * GHK (Geweke-Hajivassiliou-Keane) simulator: Monte Carlo estimate of the
 * probability that a mean-zero multivariate normal vector with covariance
 * Sigma lies inside the rectangle [lower, upper].
 * \param lower vector of lower integration limits
 * \param upper vector of upper integration limits
 * \param Sigma covariance matrix
 * \param eps matrix of uniform(0,1) draws, one row per simulation
 */
dvariable ghk(const dvar_vector& lower,const dvar_vector& upper,
  const dvar_matrix& Sigma, const dmatrix& eps)
{
  RETURN_ARRAYS_INCREMENT();
  int n=lower.indexmax();
  int m=eps.indexmax();
  dvariable ssum=0.0;
  dvar_matrix ch=choleski_decomp(Sigma);
  dvar_vector l(1,n);
  dvar_vector u(1,n);
  for (int k=1;k<=m;k++)
  {
    dvariable weight=1.0;
    l=lower;
    u=upper;
    for (int j=1;j<=n;j++)
    {
      l(j)/=ch(j,j);
      u(j)/=ch(j,j);
      dvariable Phiu=cumd_norm(u(j));
      dvariable Phil=cumd_norm(l(j));
      weight*=Phiu-Phil;
      dvariable eta=inv_cumd_norm((Phiu-Phil)*eps(k,j)+Phil+1.e-30);
      for (int i=j+1;i<=n;i++)
      {
        dvariable tmp=ch(i,j)*eta;
        l(i)-=tmp;
        u(i)-=tmp;
      }
    }
    ssum+=weight;
  }
  RETURN_ARRAYS_DECREMENT();
  return ssum/m;
}
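/*
  Sketch of what the simulator above returns (notation introduced here; Phi is
  the standard normal cdf, L the Cholesky factor of Sigma, eps the uniform
  draws). Each pass of the outer loop produces one GHK weight, so the result
  is approximately

    \f[ \widehat{P}(l \le X \le u)
        = \frac{1}{m}\sum_{k=1}^{m}\prod_{j=1}^{n}
          \left[\Phi\bigl(\tilde u_j^{(k)}\bigr)-\Phi\bigl(\tilde l_j^{(k)}\bigr)\right], \f]

  where \f$\tilde l_j^{(k)}\f$ and \f$\tilde u_j^{(k)}\f$ are the limits after
  scaling by \f$L_{jj}\f$ and conditioning on the draws for components 1..j-1.
*/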
/**
 * Symmetrize a square matrix.
 * \param m a square matrix
 * \return the symmetric part of m, i.e. (m + m')/2
 */
dmatrix symmetrize(const dmatrix& m)
{
  if (m.rowmin() != m.colmin() || m.rowmax() != m.colmax())
  {
    cerr << " Non square matrix passed to dmatrix symmetrize\n";
    ad_exit(1);
  }
  int rmin=m.rowmin();
  int rmax=m.rowmax();
  dmatrix s(rmin,rmax,rmin,rmax);
  for (int i=rmin; i<=rmax; i++)
  {
    s(i,i)=m(i,i);
    for (int j=rmin; j<i; j++)
    {
      s(i,j)=(m(i,j)+m(j,i))/2.;
      s(j,i)=s(i,j);
    }
  }
  return s;
}
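/*
  Compactly (standard notation, not from the original source): the function
  returns the symmetric part of its argument,

    \f[ S = \tfrac{1}{2}\bigl(M + M^{\mathsf T}\bigr). \f]
*/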
/**
 * Multinomial negative log-likelihood (up to constants) for an observed and a
 * predicted composition matrix, with effective sample size vn = mfexp(log_vn).
 * The matrix nu is filled with standardized (Pearson-type) residuals.
 */
dvariable mult_likelihood(const dmatrix &o, const dvar_matrix &p, dvar_matrix &nu,
                          const dvariable &log_vn)
{
  // ensure observed and predicted matrices are the same size
  if(o.colsize()!=p.colsize() || o.rowsize()!=p.rowsize())
  {
    cerr<<"Error in mult_likelihood, observed and predicted matrices"
          " are not the same size\n";
    ad_exit(1);
  }
  dvariable vn = mfexp(log_vn);
  dvariable ff = 0.0;
  int r1 = o.rowmin();
  int r2 = o.rowmax();
  int c1 = o.colmin();
  int c2 = o.colmax();

  for(int i = r1; i <= r2; i++ )
  {
    // scale observed proportions by the effective sample size
    dvar_vector sobs = vn * o(i)/sum(o(i));
    ff -= gammln(vn);
    for(int j = c1; j <= c2; j++ )
    {
      if( value(sobs(j)) > 0.0 )
        ff += gammln(sobs(j));
    }
    ff -= sobs * log(TINY + p(i));

    dvar_vector o1=o(i)/sum(o(i));
    dvar_vector p1=p(i)/sum(p(i));
    nu(i) = elem_div(o1-p1,sqrt(elem_prod(p1,1.-p1)/vn));
  }
  return ff;
}
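/*
  Hedged reading of the loop above (o_i and p_i are the renormalized observed
  and predicted proportions of row i, v = mfexp(log_vn), TINY the small
  constant used in the code). Each row contributes

    \f[ -\log\Gamma(v) + \sum_j \log\Gamma(v\,o_{ij})
        - \sum_j v\,o_{ij}\,\log\bigl(p_{ij}+\mathrm{TINY}\bigr), \f]

  and the residual matrix is filled with

    \f[ \nu_{ij} = \frac{o_{ij}-p_{ij}}{\sqrt{p_{ij}\,(1-p_{ij})/v}} . \f]
*/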
/**
   calculate Pseudo-Inverse using SVD(Singular Value Decomposition)
   by lapack library DGESVD (_a can be non-square matrix)
*/
int calcPseudoInverse(const dmatrix &_a, dmatrix &_a_pseu, double _sv_ratio)
{
    int i, j, k;
    char jobu  = 'A';
    char jobvt = 'A';
    int m = (int)_a.rows();
    int n = (int)_a.cols();
    int max_mn = max(m,n);
    int min_mn = min(m,n);

    dmatrix a(m,n);
    a = _a;
    int lda = m;
    double *s = new double[max_mn];
    int ldu = m;
    double *u = new double[ldu*m];
    int ldvt = n;
    double *vt = new double[ldvt*n];
    int lwork = max(3*min_mn+max_mn, 5*min_mn);   // for CLAPACK ver.2 & ver.3
    double *work = new double[lwork];
    int info;

    for(i = 0; i < max_mn; i++) s[i] = 0.0;

    dgesvd_(&jobu, &jobvt, &m, &n, &(a(0,0)), &lda, s, u, &ldu, vt, &ldvt,
            work, &lwork, &info);

    double smin, smax=0.0;
    for (j = 0; j < min_mn; j++) if (s[j] > smax) smax = s[j];
    smin = smax*_sv_ratio; // default _sv_ratio is 1.0e-3
    for (j = 0; j < min_mn; j++) if (s[j] < smin) s[j] = 0.0;

    //------------ calculate pseudo inverse   pinv(A) = V*S^(-1)*U^(T)
    // S^(-1)*U^(T)
    for (j = 0; j < m; j++){
        if (s[j]){
            for (i = 0; i < m; i++) u[j*m+i] /= s[j];
        } else {
            for (i = 0; i < m; i++) u[j*m+i] = 0.0;
        }
    }

    // V * (S^(-1)*U^(T))
    _a_pseu.resize(n,m);
    for(j = 0; j < n; j++){
        for(i = 0; i < m; i++){
            _a_pseu(j,i) = 0.0;
            for(k = 0; k < min_mn; k++){
                if(s[k]) _a_pseu(j,i) += vt[j*n+k] * u[k*m+i];
            }
        }
    }

    delete [] work;
    delete [] vt;
    delete [] s;
    delete [] u;

    return info;
}
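/*
  The truncated-SVD pseudo-inverse computed above can be written (standard
  notation, not from the original source) as

    \f[ A^{+} = V\,\Sigma^{+}U^{\mathsf T}, \qquad
        \Sigma^{+}_{ii} = \begin{cases}
          1/\sigma_i & \sigma_i \ge \texttt{\_sv\_ratio}\cdot\sigma_{\max},\\
          0          & \text{otherwise.}
        \end{cases} \f]
*/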
inline void orthogonalize(int N__, int n__, std::vector<wave_functions*> wfs__, int idx_bra__, int idx_ket__, dmatrix<T>& o__, wave_functions& tmp__) { PROFILE("sddk::wave_functions::orthogonalize"); auto pu = wfs__[0]->pu(); /* project out the old subspace: * |\tilda phi_new> = |phi_new> - |phi_old><phi_old|phi_new> */ if (N__ > 0) { inner(*wfs__[idx_bra__], 0, N__, *wfs__[idx_ket__], N__, n__, 0.0, o__, 0, 0); transform(pu, -1.0, wfs__, 0, N__, o__, 0, 0, 1.0, wfs__, N__, n__); } /* orthogonalize new n__ x n__ block */ inner(*wfs__[idx_bra__], N__, n__, *wfs__[idx_ket__], N__, n__, 0.0, o__, 0, 0); /* single MPI rank */ if (o__.blacs_grid().comm().size() == 1) { bool use_magma{false}; #if defined(__GPU) && defined(__MAGMA) if (pu == GPU) { use_magma = true; } #endif if (use_magma) { #ifdef __GPU /* Cholesky factorization */ if (int info = linalg<GPU>::potrf(n__, o__.template at<GPU>(), o__.ld())) { std::stringstream s; s << "error in GPU factorization, info = " << info; TERMINATE(s); } /* inversion of triangular matrix */ if (linalg<GPU>::trtri(n__, o__.template at<GPU>(), o__.ld())) { TERMINATE("error in inversion"); } #endif } else { /* CPU version */ //check_hermitian("OVLP", o__, n__); //o__.serialize("overlap.dat", n__); /* Cholesky factorization */ if (int info = linalg<CPU>::potrf(n__, &o__(0, 0), o__.ld())) { std::stringstream s; s << "error in factorization, info = " << info << std::endl << "number of existing states: " << N__ << std::endl << "number of new states: " << n__ << std::endl << "number of wave_functions: " << wfs__.size() << std::endl << "idx_bra: " << idx_bra__ << " " << "idx_ket:" << idx_ket__; TERMINATE(s); } /* inversion of triangular matrix */ if (linalg<CPU>::trtri(n__, &o__(0, 0), o__.ld())) { TERMINATE("error in inversion"); } if (pu == GPU) { #ifdef __GPU acc::copyin(o__.template at<GPU>(), o__.ld(), o__.template at<CPU>(), o__.ld(), n__, n__); #endif } } /* CPU version */ if (pu == CPU) { /* multiplication by triangular matrix */ for (auto& e: wfs__) { /* wave functions are complex, transformation matrix is complex */ if (std::is_same<T, double_complex>::value) { linalg<CPU>::trmm('R', 'U', 'N', e->pw_coeffs().num_rows_loc(), n__, double_complex(1, 0), reinterpret_cast<double_complex*>(o__.template at<CPU>()), o__.ld(), e->pw_coeffs().prime().at<CPU>(0, N__), e->pw_coeffs().prime().ld()); if (e->has_mt() && e->mt_coeffs().num_rows_loc()) { linalg<CPU>::trmm('R', 'U', 'N', e->mt_coeffs().num_rows_loc(), n__, double_complex(1, 0), reinterpret_cast<double_complex*>(o__.template at<CPU>()), o__.ld(), e->mt_coeffs().prime().at<CPU>(0, N__), e->mt_coeffs().prime().ld()); } } /* wave functions are real (psi(G) = psi^{*}(-G)), transformation matrix is real */ if (std::is_same<T, double>::value) { linalg<CPU>::trmm('R', 'U', 'N', 2 * e->pw_coeffs().num_rows_loc(), n__, 1.0, reinterpret_cast<double*>(o__.template at<CPU>()), o__.ld(), reinterpret_cast<double*>(e->pw_coeffs().prime().at<CPU>(0, N__)), 2 * e->pw_coeffs().prime().ld()); if (e->has_mt() && e->mt_coeffs().num_rows_loc()) { linalg<CPU>::trmm('R', 'U', 'N', 2 * e->mt_coeffs().num_rows_loc(), n__, 1.0, reinterpret_cast<double*>(o__.template at<CPU>()), o__.ld(), reinterpret_cast<double*>(e->mt_coeffs().prime().at<CPU>(0, N__)), 2 * e->mt_coeffs().prime().ld()); } } } } #ifdef __GPU if (pu == GPU) { /* multiplication by triangular matrix */ for (auto& e: wfs__) { if (std::is_same<T, double_complex>::value) { double_complex alpha(1, 0); linalg<GPU>::trmm('R', 'U', 'N', e->pw_coeffs().num_rows_loc(), n__, 
&alpha, reinterpret_cast<double_complex*>(o__.template at<GPU>()), o__.ld(), e->pw_coeffs().prime().at<GPU>(0, N__), e->pw_coeffs().prime().ld()); if (e->has_mt() && e->mt_coeffs().num_rows_loc()) { linalg<GPU>::trmm('R', 'U', 'N', e->mt_coeffs().num_rows_loc(), n__, &alpha, reinterpret_cast<double_complex*>(o__.template at<GPU>()), o__.ld(), e->mt_coeffs().prime().at<GPU>(0, N__), e->mt_coeffs().prime().ld()); } /* alpha should not go out of the scope, so wait */ acc::sync_stream(-1); } if (std::is_same<T, double>::value) { double alpha{1}; linalg<GPU>::trmm('R', 'U', 'N', 2 * e->pw_coeffs().num_rows_loc(), n__, &alpha, reinterpret_cast<double*>(o__.template at<GPU>()), o__.ld(), reinterpret_cast<double*>(e->pw_coeffs().prime().at<GPU>(0, N__)), 2 * e->pw_coeffs().prime().ld()); if (e->has_mt() && e->mt_coeffs().num_rows_loc()) { linalg<GPU>::trmm('R', 'U', 'N', 2 * e->mt_coeffs().num_rows_loc(), n__, &alpha, reinterpret_cast<double*>(o__.template at<GPU>()), o__.ld(), reinterpret_cast<double*>(e->mt_coeffs().prime().at<GPU>(0, N__)), 2 * e->mt_coeffs().prime().ld()); } acc::sync_stream(-1); } } acc::sync_stream(-1); } #endif } else { /* parallel transformation */ sddk::timer t1("sddk::wave_functions::orthogonalize|potrf"); if (int info = linalg<CPU>::potrf(n__, o__)) { std::stringstream s; s << "error in factorization, info = " << info; TERMINATE(s); } t1.stop(); sddk::timer t2("sddk::wave_functions::orthogonalize|trtri"); if (linalg<CPU>::trtri(n__, o__)) { TERMINATE("error in inversion"); } t2.stop(); /* o is upper triangular matrix */ for (int i = 0; i < n__; i++) { for (int j = i + 1; j < n__; j++) { o__.set(j, i, 0); } } /* phi is transformed into phi, so we can't use it as the output buffer; use tmp instead and then overwrite phi */ for (auto& e: wfs__) { transform(pu, *e, N__, n__, o__, 0, 0, tmp__, 0, n__); e->copy_from(tmp__, 0, n__, N__, pu); } } }
/**
   solve linear equation using SVD(Singular Value Decomposition)
   by lapack library DGESVD (_a can be non-square matrix)
*/
int solveLinearEquationSVD(const dmatrix &_a, const dvector &_b, dvector &_x, double _sv_ratio)
{
    const int m = _a.rows();
    const int n = _a.cols();
    assert( m == static_cast<int>(_b.size()) );
    _x.resize(n);

    int i, j;
    char jobu  = 'A';
    char jobvt = 'A';
    int max_mn = max(m,n);
    int min_mn = min(m,n);

    dmatrix a(m,n);
    a = _a;
    int lda = m;
    double *s = new double[max_mn];   // singular values
    int ldu = m;
    double *u = new double[ldu*m];
    int ldvt = n;
    double *vt = new double[ldvt*n];
    int lwork = max(3*min_mn+max_mn, 5*min_mn);   // for CLAPACK ver.2 & ver.3
    double *work = new double[lwork];
    int info;

    for(i = 0; i < max_mn; i++) s[i] = 0.0;

    dgesvd_(&jobu, &jobvt, &m, &n, &(a(0,0)), &lda, s, u, &ldu, vt, &ldvt,
            work, &lwork, &info);

    double tmp;
    double smin, smax=0.0;
    for (j = 0; j < min_mn; j++) if (s[j] > smax) smax = s[j];
    smin = smax*_sv_ratio; // 1.0e-3;
    for (j = 0; j < min_mn; j++) if (s[j] < smin) s[j] = 0.0;

    double *utb = new double[m];   // U^T*b
    for (j = 0; j < m; j++){
        tmp = 0;
        if (s[j]){
            for (i = 0; i < m; i++) tmp += u[j*m+i] * _b(i);
            tmp /= s[j];
        }
        utb[j] = tmp;
    }

    // v*utb
    for (j = 0; j < n; j++){
        tmp = 0;
        for (i = 0; i < n; i++){
            if(s[i]) tmp += utb[i] * vt[j*n+i];
        }
        _x(j) = tmp;
    }

    delete [] utb;
    delete [] work;
    delete [] vt;
    delete [] s;
    delete [] u;

    return info;
}
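// Minimal usage sketch for solveLinearEquationSVD() (not part of the original
// source; it assumes the Eigen-backed dmatrix/dvector typedefs used in this
// file, and the 3x2 system below is a made-up least-squares example):
//
//   dmatrix A(3, 2);
//   A(0,0) = 1.0;  A(0,1) = 0.0;
//   A(1,0) = 0.0;  A(1,1) = 1.0;
//   A(2,0) = 1.0;  A(2,1) = 1.0;
//   dvector b(3);
//   b(0) = 1.0;  b(1) = 2.0;  b(2) = 2.9;
//   dvector x;
//   int info = solveLinearEquationSVD(A, b, x, 1.0e-3);
//   // x now holds the least-squares solution of A x ~= b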
int indexmin() { return U.indexmin(); }
inline void orthogonalize(device_t pu__, int num_sc__, int N__, int n__, std::vector<Wave_functions*> wfs__, int idx_bra__, int idx_ket__, dmatrix<T>& o__, wave_functions& tmp__) { PROFILE("sddk::wave_functions::orthogonalize"); /* project out the old subspace: * |\tilda phi_new> = |phi_new> - |phi_old><phi_old|phi_new> */ if (N__ > 0) { inner(num_sc__, *wfs__[idx_bra__], 0, N__, *wfs__[idx_ket__], N__, n__, o__, 0, 0); transform(pu__, -1.0, wfs__, 0, N__, o__, 0, 0, 1.0, wfs__, N__, n__); } //if (true) { // inner(num_sc__, *wfs__[idx_bra__], N__, n__, *wfs__[idx_ket__], N__, n__, o__, 0, 0); // linalg<CPU>::geqrf(n__, n__, o__, 0, 0); // auto diag = o__.get_diag(n__); // if (o__.blacs_grid().comm().rank() == 0) { // printf("diagonal of R-factor\n"); // for (int i = 0; i < n__; i++) { // if (std::abs(diag[i]) < 1e-6) { // std::cout << "small norm: " << i << " " << diag[i] << std::endl; // } // } // } // //std::vector<double> eo(n__); // //dmatrix<T> evec(o__.num_rows(), o__.num_cols(), o__.blacs_grid(), o__.bs_row(), o__.bs_col()); // //Eigenproblem_elpa1 evs(o__.blacs_grid(), o__.bs_row()); // //evs.solve(n__, n__, o__.template at<CPU>(), o__.ld(), eo.data(), evec.template at<CPU>(), evec.ld(), // // o__.num_rows_local(), o__.num_cols_local()); // //if (o__.blacs_grid().comm().rank() == 0) { // // std::cout << "smallest ev of the new n x x block: " << eo[0] << std::endl; // //} //} /* orthogonalize new n__ x n__ block */ inner(num_sc__, *wfs__[idx_bra__], N__, n__, *wfs__[idx_ket__], N__, n__, o__, 0, 0); /* single MPI rank */ if (o__.blacs_grid().comm().size() == 1) { bool use_magma{false}; #if defined(__GPU) && defined(__MAGMA) if (pu__ == GPU) { use_magma = true; } #endif if (use_magma) { #ifdef __GPU /* Cholesky factorization */ if (int info = linalg<GPU>::potrf(n__, o__.template at<GPU>(), o__.ld())) { std::stringstream s; s << "error in GPU factorization, info = " << info; TERMINATE(s); } /* inversion of triangular matrix */ if (linalg<GPU>::trtri(n__, o__.template at<GPU>(), o__.ld())) { TERMINATE("error in inversion"); } #endif } else { /* CPU version */ //check_hermitian("OVLP", o__, n__); //o__.serialize("overlap.dat", n__); /* Cholesky factorization */ if (int info = linalg<CPU>::potrf(n__, &o__(0, 0), o__.ld())) { std::stringstream s; s << "error in factorization, info = " << info << std::endl << "number of existing states: " << N__ << std::endl << "number of new states: " << n__ << std::endl << "number of wave_functions: " << wfs__.size() << std::endl << "idx_bra: " << idx_bra__ << " " << "idx_ket:" << idx_ket__; TERMINATE(s); } /* inversion of triangular matrix */ if (linalg<CPU>::trtri(n__, &o__(0, 0), o__.ld())) { TERMINATE("error in inversion"); } if (pu__ == GPU) { #ifdef __GPU acc::copyin(o__.template at<GPU>(), o__.ld(), o__.template at<CPU>(), o__.ld(), n__, n__); #endif } } for (int isc = 0; isc < num_sc__; isc++) { /* CPU version */ if (pu__ == CPU) { /* multiplication by triangular matrix */ for (auto& e: wfs__) { /* alias for spin component of wave-functions */ auto& wfsc = e->component(isc); /* wave functions are complex, transformation matrix is complex */ if (std::is_same<T, double_complex>::value) { linalg<CPU>::trmm('R', 'U', 'N', wfsc.pw_coeffs().num_rows_loc(), n__, double_complex(1, 0), reinterpret_cast<double_complex*>(o__.template at<CPU>()), o__.ld(), wfsc.pw_coeffs().prime().at<CPU>(0, N__), e->component(isc).pw_coeffs().prime().ld()); if (wfsc.has_mt() && wfsc.mt_coeffs().num_rows_loc()) { linalg<CPU>::trmm('R', 'U', 'N', 
wfsc.mt_coeffs().num_rows_loc(), n__, double_complex(1, 0), reinterpret_cast<double_complex*>(o__.template at<CPU>()), o__.ld(), wfsc.mt_coeffs().prime().at<CPU>(0, N__), wfsc.mt_coeffs().prime().ld()); } } /* wave functions are real (psi(G) = psi^{*}(-G)), transformation matrix is real */ if (std::is_same<T, double>::value) { linalg<CPU>::trmm('R', 'U', 'N', 2 * wfsc.pw_coeffs().num_rows_loc(), n__, 1.0, reinterpret_cast<double*>(o__.template at<CPU>()), o__.ld(), reinterpret_cast<double*>(wfsc.pw_coeffs().prime().at<CPU>(0, N__)), 2 * wfsc.pw_coeffs().prime().ld()); if (wfsc.has_mt() && wfsc.mt_coeffs().num_rows_loc()) { linalg<CPU>::trmm('R', 'U', 'N', 2 * wfsc.mt_coeffs().num_rows_loc(), n__, 1.0, reinterpret_cast<double*>(o__.template at<CPU>()), o__.ld(), reinterpret_cast<double*>(wfsc.mt_coeffs().prime().at<CPU>(0, N__)), 2 * wfsc.mt_coeffs().prime().ld()); } } } } #ifdef __GPU if (pu__ == GPU) { /* multiplication by triangular matrix */ for (auto& e: wfs__) { auto& wfsc = e->component(isc); if (std::is_same<T, double_complex>::value) { double_complex alpha(1, 0); linalg<GPU>::trmm('R', 'U', 'N', wfsc.pw_coeffs().num_rows_loc(), n__, &alpha, reinterpret_cast<double_complex*>(o__.template at<GPU>()), o__.ld(), wfsc.pw_coeffs().prime().at<GPU>(0, N__), wfsc.pw_coeffs().prime().ld()); if (wfsc.has_mt() && wfsc.mt_coeffs().num_rows_loc()) { linalg<GPU>::trmm('R', 'U', 'N', wfsc.mt_coeffs().num_rows_loc(), n__, &alpha, reinterpret_cast<double_complex*>(o__.template at<GPU>()), o__.ld(), wfsc.mt_coeffs().prime().at<GPU>(0, N__), wfsc.mt_coeffs().prime().ld()); } /* alpha should not go out of the scope, so wait */ acc::sync_stream(-1); } if (std::is_same<T, double>::value) { double alpha{1}; linalg<GPU>::trmm('R', 'U', 'N', 2 * wfsc.pw_coeffs().num_rows_loc(), n__, &alpha, reinterpret_cast<double*>(o__.template at<GPU>()), o__.ld(), reinterpret_cast<double*>(wfsc.pw_coeffs().prime().at<GPU>(0, N__)), 2 * wfsc.pw_coeffs().prime().ld()); if (wfsc.has_mt() && wfsc.mt_coeffs().num_rows_loc()) { linalg<GPU>::trmm('R', 'U', 'N', 2 * wfsc.mt_coeffs().num_rows_loc(), n__, &alpha, reinterpret_cast<double*>(o__.template at<GPU>()), o__.ld(), reinterpret_cast<double*>(wfsc.mt_coeffs().prime().at<GPU>(0, N__)), 2 * wfsc.mt_coeffs().prime().ld()); } acc::sync_stream(-1); } } acc::sync_stream(-1); } #endif } } else { /* parallel transformation */ sddk::timer t1("sddk::wave_functions::orthogonalize|potrf"); if (int info = linalg<CPU>::potrf(n__, o__)) { std::stringstream s; s << "error in factorization, info = " << info; TERMINATE(s); } t1.stop(); sddk::timer t2("sddk::wave_functions::orthogonalize|trtri"); if (linalg<CPU>::trtri(n__, o__)) { TERMINATE("error in inversion"); } t2.stop(); /* o is upper triangular matrix */ for (int i = 0; i < n__; i++) { for (int j = i + 1; j < n__; j++) { o__.set(j, i, 0); } } /* phi is transformed into phi, so we can't use it as the output buffer; use tmp instead and then overwrite phi */ for (auto& e: wfs__) { for (int isc = 0; isc < num_sc__; isc++) { transform(pu__, e->component(isc), N__, n__, o__, 0, 0, tmp__, 0, n__); e->component(isc).copy_from(tmp__, 0, n__, N__, pu__); } } } }
int indexmax() { return U.indexmax(); }
/**
 * Matrix product of a variable matrix and a constant matrix.
 * \param m1 variable matrix
 * \param cm2 constant matrix whose row range matches the column range of m1
 * \return the dvar_matrix product m1 * cm2
 */
dvar_matrix operator*(const dvar_matrix& m1, const dmatrix& cm2)
{
  if (m1.colmin() != cm2.rowmin() || m1.colmax() != cm2.rowmax())
  {
    cerr << " Incompatible array bounds in "
      "dvar_matrix operator*(const dvar_matrix& x, const dmatrix& m)\n";
    ad_exit(21);
  }
  dmatrix cm1=value(m1);
  //dmatrix cm2=value(m2);
  dmatrix tmp(m1.rowmin(),m1.rowmax(), cm2.colmin(), cm2.colmax());
#ifdef OPT_LIB
  const size_t rowsize = (size_t)cm2.rowsize();
#else
  const int _rowsize = cm2.rowsize();
  assert(_rowsize > 0);
  const size_t rowsize = (size_t)_rowsize;
#endif
  try
  {
    double* temp_col = new double[rowsize];
    temp_col-=cm2.rowmin();
    for (int j=cm2.colmin(); j<=cm2.colmax(); j++)
    {
      for (int k=cm2.rowmin(); k<=cm2.rowmax(); k++)
      {
        temp_col[k] = cm2.elem(k,j);
      }
      for (int i=cm1.rowmin(); i<=cm1.rowmax(); i++)
      {
        double sum=0.0;
        dvector& temp_row = cm1(i);
        for (int k=cm1.colmin(); k<=cm1.colmax(); k++)
        {
          sum+=temp_row(k) * (temp_col[k]);
          // sum+=temp_row(k) * cm2(k,j);
        }
        tmp(i,j)=sum;
      }
    }
    temp_col+=cm2.rowmin();
    delete [] temp_col;
    temp_col = 0;
  }
  catch (std::bad_alloc& e)
  {
    cerr << "Error[" << __FILE__ << ':' << __LINE__
         << "]: Unable to allocate array.\n";
    //ad_exit(21);
    throw e;
  }
  dvar_matrix vtmp=nograd_assign(tmp);
  save_identifier_string("TEST1");
  //m1.save_dvar_matrix_value();
  m1.save_dvar_matrix_position();
  cm2.save_dmatrix_value();
  cm2.save_dmatrix_position();
  vtmp.save_dvar_matrix_position();
  save_identifier_string("TEST6");
  gradient_structure::GRAD_STACK1->set_gradient_stack(dmcm_prod);
  return vtmp;
}
double clusteringValidity::getAverageInterpointDistance(const dmatrix& m1,
                                                        const dmatrix& m2) const
{
  l2Distance<double> dist;
  int i;
  dvector a(m1.columns());
  dvector b(m2.columns());
  for (i=0; i<m1.rows(); i++)
  {
    a.add(m1.getRow(i));
  }
  a.divide(m1.rows());   // centroid 1
  for (i=0; i<m2.rows(); i++)
  {
    b.add(m2.getRow(i));
  }
  b.divide(m2.rows());   // centroid 2
  double distance=0.0;
  for (i=0; i<m1.rows(); i++)
  {
    distance+=dist.apply(m1.getRow(i),a);
  }
  for (i=0; i<m2.rows(); i++)
  {
    distance+=dist.apply(m2.getRow(i),b);
  }
  return (distance/(m1.rows()+m2.rows()));
}
bool MLP::trainSteepestSequential(const dmatrix& data, const ivector& internalIds) { const parameters& param = getParameters(); char buffer[256]; bool abort = false; scramble<int> scrambler; int i,j,k; double tmpError; ivector idx; idx.resize(data.rows(),0,false,false); for (i=0;i<idx.size();++i) { idx.at(i)=i; } if (param.momentum > 0) { // with momentum dvector grad,delta(weights.size(),0.0); for (i=0; !abort && (i<param.maxNumberOfEpochs); ++i) { scrambler.apply(idx); // present the pattern in a random sequence totalError = 0; for (j=0;j<idx.size();++j) { k=idx.at(j); calcGradient(data.getRow(k),internalIds.at(k),grad); computeActualError(internalIds.at(k),tmpError); totalError+=tmpError; delta.addScaled(param.learnrate,grad,param.momentum,delta); weights.add(delta); } // update progress info object if (validProgressObject()) { sprintf(buffer,"Error=%f",totalError/errorNorm); getProgressObject().step(buffer); abort = abort || (totalError/errorNorm <= param.stopError); abort = abort || getProgressObject().breakRequested(); } } } else { // without momentum ivector idx; idx.resize(data.rows(),0,false,false); dvector grad; int i,j,k; double tmpError; for (i=0;i<idx.size();++i) { idx.at(i)=i; } for (i=0; !abort && (i<param.maxNumberOfEpochs); ++i) { scrambler.apply(idx); // present the pattern in a random sequence totalError = 0; for (j=0;j<idx.size();++j) { k=idx.at(j); calcGradient(data.getRow(k),internalIds.at(k),grad); computeActualError(internalIds.at(k),tmpError); totalError+=tmpError; weights.addScaled(param.learnrate,grad); } // update progress info object if (validProgressObject()) { sprintf(buffer,"Error=%f",totalError/errorNorm); getProgressObject().step(buffer); abort = abort || (totalError/errorNorm <= param.stopError); abort = abort || getProgressObject().breakRequested(); } } } return true; }
void Body::calcCMJacobian(Link *base, dmatrix &J) { // prepare subm, submwc JointPathPtr jp; if (base){ jp = getJointPath(rootLink(), base); Link *skip = jp->joint(0); skip->subm = rootLink()->m; skip->submwc = rootLink()->m*rootLink()->wc; Link *l = rootLink()->child; if (l){ if (l != skip) { l->calcSubMassCM(); skip->subm += l->subm; skip->submwc += l->submwc; } l = l->sibling; while(l){ if (l != skip){ l->calcSubMassCM(); skip->subm += l->subm; skip->submwc += l->submwc; } l = l->sibling; } } // assuming there is no branch between base and root for (int i=1; i<jp->numJoints(); i++){ l = jp->joint(i); l->subm = l->parent->m + l->parent->subm; l->submwc = l->parent->m*l->parent->wc + l->parent->submwc; } J.resize(3, numJoints()); }else{ rootLink()->calcSubMassCM(); J.resize(3, numJoints()+6); } // compute Jacobian std::vector<int> sgn(numJoints(), 1); if (jp) { for (int i=0; i<jp->numJoints(); i++) sgn[jp->joint(i)->jointId] = -1; } for (int i=0; i<numJoints(); i++){ Link *j = joint(i); switch(j->jointType){ case Link::ROTATIONAL_JOINT: { Vector3 omega(sgn[j->jointId]*j->R*j->a); Vector3 arm((j->submwc-j->subm*j->p)/totalMass_); Vector3 dp(omega.cross(arm)); J.col(j->jointId) = dp; break; } default: std::cerr << "calcCMJacobian() : unsupported jointType(" << j->jointType << std::endl; } } if (!base){ int c = numJoints(); J(0, c ) = 1.0; J(0, c+1) = 0.0; J(0, c+2) = 0.0; J(1, c ) = 0.0; J(1, c+1) = 1.0; J(1, c+2) = 0.0; J(2, c ) = 0.0; J(2, c+1) = 0.0; J(2, c+2) = 1.0; Vector3 dp(rootLink()->submwc/totalMass_ - rootLink()->p); J(0, c+3) = 0.0; J(0, c+4) = dp(2); J(0, c+5) = -dp(1); J(1, c+3) = -dp(2); J(1, c+4) = 0.0; J(1, c+5) = dp(0); J(2, c+3) = dp(1); J(2, c+4) = -dp(0); J(2, c+5) = 0.0; } }
// implements the Fuzzy C Means algorithm bool fuzzyCMeans::train(const dmatrix& data) { bool ok=true; int t=0; // create the distance functor according to the paramter norm distanceFunctor<double>* distFunc = 0; switch (getParameters().norm) { case parameters::L1: distFunc = new l1Distance<double>; break; case parameters::L2: distFunc = new l2Distance<double>; break; default: break; } int nbOfClusters=getParameters().nbOfClusters; int nbOfPoints=data.rows(); if(nbOfClusters>nbOfPoints) { setStatusString("more Clusters than points"); ok = false; } double q=getParameters().fuzzifier; if (q<=1) { setStatusString("q has to be bigger than 1"); ok = false; } // select some points of the given data to initialise the centroids selectRandomPoints(data,nbOfClusters,centroids); // initialize variables centroids.resize(nbOfClusters,data.columns(),0.0); dmatrix memberships(nbOfPoints, nbOfClusters, 0.0); double terminationCriterion=0; double newDistance; dvector newCenter(data.columns()); dvector currentPoint(data.columns()); dmatrix newCentroids(nbOfClusters,data.columns(),0.0); double sumOfMemberships=0; double membership=0; double dist1; double dist2; int i,j,k,m; do { // calculate new memberships memberships.fill(0.0); // clear old memberships for (i=0; i<nbOfPoints; i++) { for (j=0; j<nbOfClusters; j++) { newDistance=0; dist1=distFunc->apply(data.getRow(i), centroids.getRow(j)); for (k=0; k<nbOfClusters; k++) { dist2=distFunc->apply(data.getRow(i), centroids.getRow(k)); // if distance is 0, normal calculation of membership is not possible. if (dist2!=0) { newDistance+=pow((dist1/dist2),(1/(q-1))); } } // if point and centroid are equal if (newDistance!=0) memberships.at(i,j)=1/newDistance; else { dvector row(memberships.columns(),0.0); memberships.setRow(i,row); memberships.at(i,j)=1; break; } } } t++; // counts the iterations // calculate new centroids based on modified memberships for (m=0; m<nbOfClusters; m++) { newCenter.fill(0.0); sumOfMemberships=0; for (i=0; i<nbOfPoints; i++) { currentPoint=data.getRow(i); membership=pow(memberships.at(i,m),q); sumOfMemberships+=membership; currentPoint.multiply(membership); newCenter.add(currentPoint); } newCenter.divide(sumOfMemberships); newCentroids.setRow(m,newCenter); } terminationCriterion=distFunc->apply(centroids,newCentroids); centroids=newCentroids; } // the termination criterions while ( (terminationCriterion>getParameters().epsilon) && (t<getParameters().maxIterations)); int nbClusters = nbOfClusters; //Put the id information into the result object //Each cluster has the id of its position in the matrix ivector tids(nbClusters); for (i=0; i<nbClusters; i++) { tids.at(i)=i; } outTemplate=outputTemplate(tids); return ok; }
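/*
  For reference, the update rules implemented in the loop above, written out
  (direct transcription of the code, which uses the un-squared distance d_ij
  between point x_i and centroid c_j; q is the fuzzifier, C the number of
  clusters, N the number of points):

    \f[ u_{ij} = \left[\sum_{k=1}^{C}\Bigl(\frac{d_{ij}}{d_{ik}}\Bigr)^{\frac{1}{q-1}}\right]^{-1},
        \qquad
        c_j = \frac{\sum_{i=1}^{N} u_{ij}^{\,q}\,x_i}{\sum_{i=1}^{N} u_{ij}^{\,q}} , \f]

  with the special case u_{ij}=1 (and 0 elsewhere in the row) when a point
  coincides with a centroid, exactly as handled in the code.
*/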
/**
   solve linear equation using LU decomposition
   by lapack library DGESVX (_a must be square matrix)
*/
int solveLinearEquationLU(const dmatrix &_a, const dvector &_b, dvector &_x)
{
    assert(_a.cols() == _a.rows() && _a.cols() == _b.size() );

    int n = (int)_a.cols();
    int nrhs = 1;
    int lda = n;
    std::vector<int> ipiv(n);
    int ldb = n;
    int info;

    // compute the solution
#ifndef USE_CLAPACK_INTERFACE
    char fact      = 'N';
    char transpose = 'N';
    double *af = new double[n*n];
    int ldaf = n;
    char equed = 'N';
    double *r = new double[n];
    double *c = new double[n];
    int ldx = n;
    double rcond;
    double *ferr = new double[nrhs];
    double *berr = new double[nrhs];
    double *work = new double[4*n];
    int *iwork = new int[n];

    _x.resize(n);   // memory allocation for the return vector
    dgesvx_(&fact, &transpose, &n, &nrhs, const_cast<double *>(&(_a(0,0))), &lda,
            af, &ldaf, &(ipiv[0]), &equed, r, c,
            const_cast<double *>(&(_b(0))), &ldb, &(_x(0)), &ldx, &rcond,
            ferr, berr, work, iwork, &info);

    delete [] iwork;
    delete [] work;
    delete [] berr;
    delete [] ferr;
    delete [] c;
    delete [] r;
    delete [] af;
#else
    _x = _b;
    info = clapack_dgesv(CblasColMajor, n, nrhs,
                         const_cast<double *>(&(_a(0,0))), lda, &(ipiv[0]),
                         &(_x(0)), ldb);
#endif
    return info;
}
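// Minimal usage sketch for solveLinearEquationLU() (not part of the original
// source; a made-up symmetric 2x2 system, so storage order does not matter):
//
//   dmatrix A(2, 2);
//   A(0,0) = 2.0;  A(0,1) = 1.0;
//   A(1,0) = 1.0;  A(1,1) = 3.0;
//   dvector b(2);
//   b(0) = 3.0;  b(1) = 5.0;
//   dvector x;
//   if (solveLinearEquationLU(A, b, x) == 0) {
//       // x is approximately (0.8, 1.4)
//   }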
bool normModHubertStat::apply(const std::vector<dmatrix>& clusteredData, double& index, const dmatrix& centroids) const { index =0.0; int nbClusters=clusteredData.size(); l2Distance<double> dist; int nbPoints=0; int i,j,k,l; // count the points that are in clusteredData for (i=0; i<nbClusters; i++) { nbPoints+=clusteredData[i].rows(); } // x is the distance matrix. It has in the i-th rows and j-th column // the distance between the i-th and j-th point // y is an other distance matrix. It has in the i-th row and j-th column // the distance of the centroids of the clusters they belong to. dmatrix x(nbPoints,nbPoints); dmatrix y(nbPoints,nbPoints); int row=0; int col=0; for (i=0; i<nbClusters; i++) { for (j=0; j<clusteredData[i].rows(); j++) { for (k=0; k<nbClusters; k++) { for (l=0; l<clusteredData[k].rows(); l++) { if (col>row) { y.at(row,col)=dist.apply(centroids.getRow(i), centroids.getRow(k)); x.at(row,col)=dist.apply(clusteredData[i].getRow(j), clusteredData[k].getRow(l)); } if (col<nbPoints-1) col++; else { col=0; row++; } } } } } double m=0.5*(nbPoints*(nbPoints-1)); double meanX=x.sumOfElements()/m; double meanY=y.sumOfElements()/m; double tmp1=meanX*meanX; double tmp2=meanY*meanY; double varianzX=0.0; double varianzY=0.0; for (i=0; i<nbPoints; i++) { for (j=i+1; j<nbPoints; j++) { varianzX+=x[i][j]*x[i][j]-tmp1; varianzY+=y[i][j]*y[i][j]-tmp2; } } varianzX=varianzX/m; varianzY=varianzY/m; varianzX=sqrt(varianzX); varianzY=sqrt(varianzY); double varianz=varianzX*varianzY; for (i=0; i<nbPoints; i++) { for (j=i+1; j<nbPoints; j++) { index+=(x[i][j]-meanX)*(y[i][j]-meanY); } } index=index/(m*varianz); return true; }
bool dunnIndex::apply(const std::vector<dmatrix>& clusteredData, double& index,
                      const dmatrix& centroids) const
{
  int nbClusters=clusteredData.size();
  double denominator=0.0;
  int i,j;
  l2Distance<double> dist;
  dvector diameters(nbClusters);
  parameters param;
  param=getParameters();

  // pointer to the member function which implements the diameter measure
  // selected by the parameters
  double (lti::clusteringValidity::*diamFunc)(const dmatrix&) const;

#ifdef _LTI_MSC_DOT_NET_2003
  // nasty bug in this version of the .NET compiler
#define QUALIFIER
#else
#define QUALIFIER &lti::dunnIndex::
#endif

  switch (param.diameterMeasure)
  {
    case parameters::Standard:
      diamFunc=QUALIFIER getStandardDiameter;
      break;
    case parameters::Average:
      diamFunc=QUALIFIER getAverageDiameter;
      break;
    case parameters::Centroid:
      diamFunc=QUALIFIER getAverageToCentroidDiameter;
      break;
    default:
      diamFunc=QUALIFIER getStandardDiameter;
      setStatusString("Unknown diameterMeasure in clusteringValidity\n");
      return false;
  }

  // compute all diameters of all clusters
  for (i=0; i<nbClusters; i++)
  {
    diameters[i]=(this->*diamFunc)(clusteredData[i]);
  }
  denominator=diameters.maximum();

  // pointer to the member function which calculates the distance between
  // clusters; a pointer to a function is used because the function will be
  // called many times later
  double (lti::clusteringValidity::*distFunc)
    (const dmatrix&,const dmatrix&) const;

  // set pointer to function which is set by the parameter distanceMeasure
  switch (param.distanceMeasure)
  {
    case parameters::Minimum:
      distFunc=QUALIFIER getMinimumDistance;
      break;
    case parameters::Maximum:
      distFunc=QUALIFIER getMaximumDistance;
      break;
    case parameters::Mean:
      distFunc=QUALIFIER getAverageDistance;
      break;
    case parameters::Centroids:
      distFunc=QUALIFIER getCentroidDistance;
      break;
    case parameters::Interpoint:
      distFunc=QUALIFIER getAverageInterpointDistance;
      break;
    default:
      distFunc=QUALIFIER getAverageDistance;
      setStatusString("Unknown distanceMeasure in clusteringValidity\n");
      return false;
  }

  // compute the distances of all clusters to each other
  int counter=0;
  dvector distanceVector(static_cast<int>(.5*(nbClusters*(nbClusters-1))));
  for (i=0; i<nbClusters; i++)
  {
    for (j=i+1; j<nbClusters; j++)
    {
      if (distFunc==QUALIFIER getCentroidDistance)
      {
        distanceVector[counter]=dist.apply(centroids.getRow(i),
                                           centroids.getRow(j));
      }
      else
      {
        distanceVector[counter]=(this->*distFunc)(clusteredData[i],
                                                  clusteredData[j]);
      }
      counter++;
    }
  }
  distanceVector.divide(denominator);
  index=distanceVector.minimum();
  return true;
}
bool JointPathEx::calcJacobianInverseNullspace(dmatrix &J, dmatrix &Jinv, dmatrix &Jnull) { const int n = numJoints(); hrp::dmatrix w = hrp::dmatrix::Identity(n,n); // // wmat/weight: weighting joint angle weight // // w_i = 1 + | dH/dt | if d|dH/dt| >= 0 // = 1 if d|dH/dt| < 0 // dH/dt = (t_max - t_min)^2 (2t - t_max - t_min) // / 4 (t_max - t)^2 (t - t_min)^2 // // T. F. Chang and R.-V. Dubey: "A weighted least-norm solution based // scheme for avoiding joint limits for redundant manipulators", in IEEE // Trans. On Robotics and Automation, 11((2):286-292, April 1995. // for ( int j = 0; j < n ; j++ ) { double jang = joints[j]->q; double jmax = joints[j]->ulimit; double jmin = joints[j]->llimit; double e = deg2rad(1); if ( eps_eq(jang, jmax,e) && eps_eq(jang, jmin,e) ) { } else if ( eps_eq(jang, jmax,e) ) { jang = jmax - e; } else if ( eps_eq(jang, jmin,e) ) { jang = jmin + e; } double r; if ( eps_eq(jang, jmax,e) && eps_eq(jang, jmin,e) ) { r = DBL_MAX; } else { r = fabs( (pow((jmax - jmin),2) * (( 2 * jang) - jmax - jmin)) / (4 * pow((jmax - jang),2) * pow((jang - jmin),2)) ); if (std::isnan(r)) r = 0; } // If use_inside_joint_weight_retrieval = true (true by default), use T. F. Chang and R.-V. Dubeby weight retrieval inward. // Otherwise, joint weight is always calculated from limit value to resolve https://github.com/fkanehiro/hrpsys-base/issues/516. if (( r - avoid_weight_gain[j] ) >= 0 ) { w(j, j) = optional_weight_vector[j] * ( 1.0 / ( 1.0 + r) ); } else { if (use_inside_joint_weight_retrieval) w(j, j) = optional_weight_vector[j] * 1.0; else w(j, j) = optional_weight_vector[j] * ( 1.0 / ( 1.0 + r) ); } avoid_weight_gain[j] = r; } if ( DEBUG ) { std::cerr << " cost :"; for(int j = 0; j < n; j++ ) { std::cerr << std::setw(8) << std::setiosflags(std::ios::fixed) << std::setprecision(4) << avoid_weight_gain[j]; } std::cerr << std::endl; std::cerr << " optw :"; for(int j = 0; j < n; j++ ) { std::cerr << std::setw(8) << std::setiosflags(std::ios::fixed) << std::setprecision(4) << optional_weight_vector[j]; } std::cerr << std::endl; std::cerr << " w :"; for(int j = 0; j < n; j++ ) { std::cerr << std::setw(8) << std::setiosflags(std::ios::fixed) << std::setprecision(4) << w(j, j); } std::cerr << std::endl; } double manipulability = sqrt((J*J.transpose()).determinant()); double k = 0; if ( manipulability < manipulability_limit ) { k = manipulability_gain * pow((1 - ( manipulability / manipulability_limit )), 2); } if ( DEBUG ) { std::cerr << " manipulability = " << manipulability << " < " << manipulability_limit << ", k = " << k << " -> " << sr_gain * k << std::endl; } calcSRInverse(J, Jinv, sr_gain * k, w); Jnull = ( hrp::dmatrix::Identity(n, n) - Jinv * J); return true; }
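/*
  The joint-limit weighting above follows the Chang/Dubey reference cited in
  the comment. In the notation of that comment (theta the joint angle with
  limits theta_min and theta_max; r_prev the value stored in
  avoid_weight_gain):

    \f[ r = \left|\frac{\partial H}{\partial\theta}\right|
          = \left|\frac{(\theta_{\max}-\theta_{\min})^{2}\,(2\theta-\theta_{\max}-\theta_{\min})}
                       {4\,(\theta_{\max}-\theta)^{2}\,(\theta-\theta_{\min})^{2}}\right|,
        \qquad
        w_{jj} = \begin{cases} \dfrac{1}{1+r} & r - r_{\mathrm{prev}} \ge 0,\\[1ex]
                               1              & \text{otherwise,} \end{cases} \f]

  with each entry additionally scaled by optional_weight_vector[j], and the
  "otherwise" branch reverting to 1/(1+r) when
  use_inside_joint_weight_retrieval is false, as coded above.
*/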
inline void Band::set_fv_h_o<GPU, electronic_structure_method_t::full_potential_lapwlo>(K_point* kp__, Periodic_function<double>* effective_potential__, dmatrix<double_complex>& h__, dmatrix<double_complex>& o__) const { runtime::Timer t("sirius::Band::set_fv_h_o"); runtime::Timer t2("sirius::Band::set_fv_h_o|alloc"); h__.zero(); h__.allocate(memory_t::device); h__.zero_on_device(); o__.zero(); o__.allocate(memory_t::device); o__.zero_on_device(); double_complex zone(1, 0); int num_atoms_in_block = 2 * omp_get_max_threads(); int nblk = unit_cell_.num_atoms() / num_atoms_in_block + std::min(1, unit_cell_.num_atoms() % num_atoms_in_block); DUMP("nblk: %i", nblk); int max_mt_aw = num_atoms_in_block * unit_cell_.max_mt_aw_basis_size(); DUMP("max_mt_aw: %i", max_mt_aw); mdarray<double_complex, 3> alm_row(kp__->num_gkvec_row(), max_mt_aw, 2, memory_t::host_pinned | memory_t::device); mdarray<double_complex, 3> alm_col(kp__->num_gkvec_col(), max_mt_aw, 2, memory_t::host_pinned | memory_t::device); mdarray<double_complex, 3> halm_col(kp__->num_gkvec_col(), max_mt_aw, 2, memory_t::host_pinned | memory_t::device); t2.stop(); runtime::Timer t1("sirius::Band::set_fv_h_o|zgemm"); for (int iblk = 0; iblk < nblk; iblk++) { int num_mt_aw = 0; std::vector<int> offsets(num_atoms_in_block); for (int ia = iblk * num_atoms_in_block; ia < std::min(unit_cell_.num_atoms(), (iblk + 1) * num_atoms_in_block); ia++) { int ialoc = ia - iblk * num_atoms_in_block; auto& atom = unit_cell_.atom(ia); auto& type = atom.type(); offsets[ialoc] = num_mt_aw; num_mt_aw += type.mt_aw_basis_size(); } int s = iblk % 2; #pragma omp parallel { int tid = omp_get_thread_num(); for (int ia = iblk * num_atoms_in_block; ia < std::min(unit_cell_.num_atoms(), (iblk + 1) * num_atoms_in_block); ia++) { if (ia % omp_get_num_threads() == tid) { int ialoc = ia - iblk * num_atoms_in_block; auto& atom = unit_cell_.atom(ia); auto& type = atom.type(); mdarray<double_complex, 2> alm_row_tmp(alm_row.at<CPU>(0, offsets[ialoc], s), alm_row.at<GPU>(0, offsets[ialoc], s), kp__->num_gkvec_row(), type.mt_aw_basis_size()); mdarray<double_complex, 2> alm_col_tmp(alm_col.at<CPU>(0, offsets[ialoc], s), alm_col.at<GPU>(0, offsets[ialoc], s), kp__->num_gkvec_col(), type.mt_aw_basis_size()); mdarray<double_complex, 2> halm_col_tmp(halm_col.at<CPU>(0, offsets[ialoc], s), halm_col.at<GPU>(0, offsets[ialoc], s), kp__->num_gkvec_col(), type.mt_aw_basis_size()); kp__->alm_coeffs_row()->generate(ia, alm_row_tmp); for (int xi = 0; xi < type.mt_aw_basis_size(); xi++) { for (int igk = 0; igk < kp__->num_gkvec_row(); igk++) { alm_row_tmp(igk, xi) = std::conj(alm_row_tmp(igk, xi)); } } alm_row_tmp.async_copy_to_device(tid); kp__->alm_coeffs_col()->generate(ia, alm_col_tmp); alm_col_tmp.async_copy_to_device(tid); apply_hmt_to_apw<spin_block_t::nm>(atom, kp__->num_gkvec_col(), alm_col_tmp, halm_col_tmp); halm_col_tmp.async_copy_to_device(tid); /* setup apw-lo and lo-apw blocks */ set_fv_h_o_apw_lo(kp__, type, atom, ia, alm_row_tmp, alm_col_tmp, h__, o__); } } acc::sync_stream(tid); } acc::sync_stream(omp_get_max_threads()); linalg<GPU>::gemm(0, 1, kp__->num_gkvec_row(), kp__->num_gkvec_col(), num_mt_aw, &zone, alm_row.at<GPU>(0, 0, s), alm_row.ld(), alm_col.at<GPU>(0, 0, s), alm_col.ld(), &zone, o__.at<GPU>(), o__.ld(), omp_get_max_threads()); linalg<GPU>::gemm(0, 1, kp__->num_gkvec_row(), kp__->num_gkvec_col(), num_mt_aw, &zone, alm_row.at<GPU>(0, 0, s), alm_row.ld(), halm_col.at<GPU>(0, 0, s), halm_col.ld(), &zone, h__.at<GPU>(), h__.ld(), omp_get_max_threads()); } 
acc::copyout(h__.at<CPU>(), h__.ld(), h__.at<GPU>(), h__.ld(), kp__->num_gkvec_row(), kp__->num_gkvec_col()); acc::copyout(o__.at<CPU>(), o__.ld(), o__.at<GPU>(), o__.ld(), kp__->num_gkvec_row(), kp__->num_gkvec_col()); double tval = t1.stop(); if (kp__->comm().rank() == 0) { DUMP("effective zgemm performance: %12.6f GFlops", 2 * 8e-9 * kp__->num_gkvec() * kp__->num_gkvec() * unit_cell_.mt_aw_basis_size() / tval); } /* add interstitial contributon */ set_fv_h_o_it(kp__, effective_potential__, h__, o__); /* setup lo-lo block */ set_fv_h_o_lo_lo(kp__, h__, o__); h__.deallocate_on_device(); o__.deallocate_on_device(); }
/**
   calculate the mass matrix using the unit vector method
   \todo replace the unit vector method here with a more efficient method that
   only requires O(n) computation time

   The motion equation (dv != dvo)
    |       |   | dv   |   |    |   | fext   |
    | out_M | * | dw   | + | b1 | = | tauext |
    |       |   | ddq  |   |    |   | u      |
*/
void Body::calcMassMatrix(dmatrix& out_M)
{
    // buffers for the unit vector method
    dmatrix b1;
    dvector ddqorg;
    dvector uorg;
    Vector3 dvoorg;
    Vector3 dworg;
    Vector3 root_w_x_v;
    Vector3 g(0, 0, 9.8);

    uint nJ = numJoints();
    int totaldof = nJ;
    if( !isStaticModel_ ) totaldof += 6;

    out_M.resize(totaldof,totaldof);
    b1.resize(totaldof, 1);

    // preserve and clear the joint accelerations
    ddqorg.resize(nJ);
    uorg.resize(nJ);
    for(uint i = 0; i < nJ; ++i){
        Link* ptr = joint(i);
        ddqorg[i] = ptr->ddq;
        uorg  [i] = ptr->u;
        ptr->ddq = 0.0;
    }

    // preserve and clear the root link acceleration
    dvoorg = rootLink_->dvo;
    dworg  = rootLink_->dw;
    root_w_x_v = rootLink_->w.cross(rootLink_->vo + rootLink_->w.cross(rootLink_->p));
    rootLink_->dvo = g - root_w_x_v;   // dv = g, dw = 0
    rootLink_->dw.setZero();

    setColumnOfMassMatrix(b1, 0);

    if( !isStaticModel_ ){
        for(int i=0; i < 3; ++i){
            rootLink_->dvo[i] += 1.0;
            setColumnOfMassMatrix(out_M, i);
            rootLink_->dvo[i] -= 1.0;
        }
        for(int i=0; i < 3; ++i){
            rootLink_->dw[i] = 1.0;
            Vector3 dw_x_p = rootLink_->dw.cross(rootLink_->p);   // spatial acceleration caused by ang. acc.
            rootLink_->dvo -= dw_x_p;
            setColumnOfMassMatrix(out_M, i + 3);
            rootLink_->dvo += dw_x_p;
            rootLink_->dw[i] = 0.0;
        }
    }

    for(uint i = 0; i < nJ; ++i){
        Link* ptr = joint(i);
        ptr->ddq = 1.0;
        int j = i + 6;
        setColumnOfMassMatrix(out_M, j);
        out_M(j, j) += ptr->Jm2;   // motor inertia
        ptr->ddq = 0.0;
    }

    // subtract the constant term
    for(size_t i = 0; i < (size_t)out_M.cols(); ++i){
        out_M.col(i) -= b1;
    }

    // recover state
    for(uint i = 0; i < nJ; ++i){
        Link* ptr = joint(i);
        ptr->ddq = ddqorg[i];
        ptr->u   = uorg  [i];
    }
    rootLink_->dvo = dvoorg;
    rootLink_->dw  = dworg;
}
inline void Band::set_fv_h_o<CPU, electronic_structure_method_t::full_potential_lapwlo>(K_point* kp__, Periodic_function<double>* effective_potential__, dmatrix<double_complex>& h__, dmatrix<double_complex>& o__) const { PROFILE_WITH_TIMER("sirius::Band::set_fv_h_o"); h__.zero(); o__.zero(); double_complex zone(1, 0); int num_atoms_in_block = 2 * omp_get_max_threads(); int nblk = unit_cell_.num_atoms() / num_atoms_in_block + std::min(1, unit_cell_.num_atoms() % num_atoms_in_block); DUMP("nblk: %i", nblk); int max_mt_aw = num_atoms_in_block * unit_cell_.max_mt_aw_basis_size(); DUMP("max_mt_aw: %i", max_mt_aw); mdarray<double_complex, 2> alm_row(kp__->num_gkvec_row(), max_mt_aw); mdarray<double_complex, 2> alm_col(kp__->num_gkvec_col(), max_mt_aw); mdarray<double_complex, 2> halm_col(kp__->num_gkvec_col(), max_mt_aw); runtime::Timer t1("sirius::Band::set_fv_h_o|zgemm"); for (int iblk = 0; iblk < nblk; iblk++) { int num_mt_aw = 0; std::vector<int> offsets(num_atoms_in_block); for (int ia = iblk * num_atoms_in_block; ia < std::min(unit_cell_.num_atoms(), (iblk + 1) * num_atoms_in_block); ia++) { auto& atom = unit_cell_.atom(ia); auto& type = atom.type(); offsets[ia - iblk * num_atoms_in_block] = num_mt_aw; num_mt_aw += type.mt_aw_basis_size(); } #ifdef __PRINT_OBJECT_CHECKSUM alm_row.zero(); alm_col.zero(); halm_col.zero(); #endif #pragma omp parallel { int tid = omp_get_thread_num(); for (int ia = iblk * num_atoms_in_block; ia < std::min(unit_cell_.num_atoms(), (iblk + 1) * num_atoms_in_block); ia++) { if (ia % omp_get_num_threads() == tid) { int ialoc = ia - iblk * num_atoms_in_block; auto& atom = unit_cell_.atom(ia); auto& type = atom.type(); mdarray<double_complex, 2> alm_row_tmp(alm_row.at<CPU>(0, offsets[ialoc]), kp__->num_gkvec_row(), type.mt_aw_basis_size()); mdarray<double_complex, 2> alm_col_tmp(alm_col.at<CPU>(0, offsets[ialoc]), kp__->num_gkvec_col(), type.mt_aw_basis_size()); mdarray<double_complex, 2> halm_col_tmp(halm_col.at<CPU>(0, offsets[ialoc]), kp__->num_gkvec_col(), type.mt_aw_basis_size()); kp__->alm_coeffs_row()->generate(ia, alm_row_tmp); for (int xi = 0; xi < type.mt_aw_basis_size(); xi++) { for (int igk = 0; igk < kp__->num_gkvec_row(); igk++) alm_row_tmp(igk, xi) = std::conj(alm_row_tmp(igk, xi)); } kp__->alm_coeffs_col()->generate(ia, alm_col_tmp); apply_hmt_to_apw<spin_block_t::nm>(atom, kp__->num_gkvec_col(), alm_col_tmp, halm_col_tmp); /* setup apw-lo and lo-apw blocks */ set_fv_h_o_apw_lo(kp__, type, atom, ia, alm_row_tmp, alm_col_tmp, h__, o__); } } } #ifdef __PRINT_OBJECT_CHECKSUM double_complex z1 = alm_row.checksum(); double_complex z2 = alm_col.checksum(); double_complex z3 = halm_col.checksum(); DUMP("checksum(alm_row): %18.10f %18.10f", std::real(z1), std::imag(z1)); DUMP("checksum(alm_col): %18.10f %18.10f", std::real(z2), std::imag(z2)); DUMP("checksum(halm_col): %18.10f %18.10f", std::real(z3), std::imag(z3)); #endif linalg<CPU>::gemm(0, 1, kp__->num_gkvec_row(), kp__->num_gkvec_col(), num_mt_aw, zone, alm_row.at<CPU>(), alm_row.ld(), alm_col.at<CPU>(), alm_col.ld(), zone, o__.at<CPU>(), o__.ld()); linalg<CPU>::gemm(0, 1, kp__->num_gkvec_row(), kp__->num_gkvec_col(), num_mt_aw, zone, alm_row.at<CPU>(), alm_row.ld(), halm_col.at<CPU>(), halm_col.ld(), zone, h__.at<CPU>(), h__.ld()); } double tval = t1.stop(); if (kp__->comm().rank() == 0) { DUMP("effective zgemm performance: %12.6f GFlops", 2 * 8e-9 * kp__->num_gkvec() * kp__->num_gkvec() * unit_cell_.mt_aw_basis_size() / tval); } /* add interstitial contributon */ set_fv_h_o_it(kp__, 
effective_potential__, h__, o__); /* setup lo-lo block */ set_fv_h_o_lo_lo(kp__, h__, o__); }
// Calls the same method of the superclass. bool svm::genericTrain(const dmatrix& input, const ivector& ids) { char buffer[80]; if (validProgressObject()) { getProgressObject().reset(); getProgressObject().setTitle("SVM: Training"); getProgressObject().setMaxSteps(nClasses); } bias.resize(nClasses,getParameters().bias,false,true); trainData=new dmatrix(input); alpha.resize(nClasses,input.rows(),0,false,true); makeTargets(ids); errorCache.resize(input.rows()); const parameters& param=getParameters(); C=param.C; tolerance=param.tolerance; epsilon=param.epsilon; bool abort=false; // train one SVM for each class for (int cid=0; cid<nClasses && !abort; cid++) { int numChanged=0; bool examineAll=true; currentTarget=&target->getRow(cid); currentClass=cid; currentAlpha=&alpha.getRow(cid); _lti_debug("Training class " << cid << "\n"); fillErrorCache(); while ((numChanged > 0 || examineAll) && !abort) { numChanged=0; if (examineAll) { // iterate over all alphas for (int i=0; i<trainData->rows(); i++) { if (examineExample(i)) { numChanged++; } } // next turn, look only at non-bound alphas examineAll=false; } else { // iterate over all non-0 and non-C alphas int *tmpAlpha=new int[alpha.getRow(cid).size()]; int j=0,i=0; for (i=0; i<alpha.getRow(cid).size(); i++) { if (alpha.getRow(cid).at(i) != 0.0 && alpha.getRow(cid).at(i) != C) { tmpAlpha[j++]=i; } } delete[] tmpAlpha; for (i=0; i<j; i++) { if (examineExample(i)) { numChanged++; } } // next turn, examine all if we did not succeed this time if (numChanged == 0) { examineAll=true; } } } // update progress info object if (validProgressObject()) { sprintf(buffer,"numChanged=%d, error=%f",numChanged,errorSum); getProgressObject().step(buffer); abort=abort || getProgressObject().breakRequested(); } // now limit the number of support vectors // does not work yet, so disable it if (0) { int supnum=0; ivector index(currentAlpha->size()); ivector newindex(currentAlpha->size()); dvector newkey(currentAlpha->size()); for (int i=0; i<currentAlpha->size(); i++) { if (currentAlpha->at(i) > 0) { supnum++; } index[i]=i; } if (supnum > param.nSupport && param.nSupport > 0) { lti::sort2<double> sorter; sorter.apply(*currentAlpha,index,newkey,newindex); int i; for (i=0; i<newkey.size() && lti::abs(newkey[i]) > std::numeric_limits<double>::epsilon(); i++) { } for (int j=i; j<currentAlpha->size()-param.nSupport; j++) { currentAlpha->at(newindex[j])=0; } _lti_debug("Final alpha: " << *currentAlpha << std::endl); } } } defineOutputTemplate(); _lti_debug("alpha:\n" << alpha << "\n"); // make sure that all lagrange multipliers are larger than // zero, otherwise we might get into trouble later alpha.apply(rectify); if (abort) { setStatusString("Training aborted by user!"); } return !abort; }
// TODO: comment your train method bool MLP::train(const dvector& theWeights, const dmatrix& data, const ivector& ids) { if (data.empty()) { setStatusString("Train data empty"); return false; } if (ids.size()!=data.rows()) { std::string str; str = "dimensionality of IDs vector and the number of rows "; str+= "of the input matrix must have the same size."; setStatusString(str.c_str()); return false; } // tracks the status of the training process. // if an error occurs set to false and use setStatusString() // however, training should continue, fixing the error as well as possible bool b=true; // vector with internal ids ivector newIds,idsLUT; newIds.resize(ids.size(),0,false,false); // map to get the internal Id to an external Id; std::map<int,int> extToInt; std::map<int,int>::iterator it; int i,k; for (i=0,k=0;i<ids.size();++i) { it = extToInt.find(ids.at(i)); if (it != extToInt.end()) { newIds.at(i) = (*it).second; } else { extToInt[ids.at(i)] = k; newIds.at(i) = k; ++k; } } idsLUT.resize(extToInt.size()); for (it=extToInt.begin();it!=extToInt.end();++it) { idsLUT.at((*it).second) = (*it).first; } // initialize the inputs and output units from the given data outputs = idsLUT.size(); inputs = data.columns(); const parameters& param = getParameters(); // display which kind of algorithm is to be used if (validProgressObject()) { getProgressObject().reset(); std::string str("MLP: Training using "); switch(param.trainingMode) { case parameters::ConjugateGradients: str += "conjugate gradients"; break; case parameters::SteepestDescent: str += "steepest descent"; break; default: str += "unnamed method"; } getProgressObject().setTitle(str); getProgressObject().setMaxSteps(param.maxNumberOfEpochs+1); } dvector grad; if (&theWeights != &weights) { weights.copy(theWeights); } if (!initWeights(true)) { // keep the weights setStatusString("Wrong weights!"); return false; }; computeErrorNorm(newIds); if (param.trainingMode == parameters::ConjugateGradients) { b = trainConjugateGradients(data,newIds); } else { if (param.batchMode) { // batch training mode: b = trainSteepestBatch(data,newIds); } else { // sequential training mode: b = trainSteepestSequential(data,newIds); } } if (validProgressObject()) { getProgressObject().step("Training ready."); } outputTemplate tmpOutTemp(idsLUT); setOutputTemplate(tmpOutTemp); // create the appropriate outputTemplate makeOutputTemplate(outputs,data,ids); return b; }