/** Computes the derivatives of the (negative log) marginal likelihood with
 * respect to the kernel, mean, likelihood and scale hyper-parameters under
 * the Laplace approximation, and returns them keyed by parameter.
 *
 * @param para_dict parameter dictionary; filled with the kernel/mean/model
 *        parameters (and the scale parameter is appended before returning)
 * @return map from each hyper-parameter to its derivative vector
 */
CMap<TParameter*, SGVector<float64_t> > CLaplacianInferenceMethod::
get_marginal_likelihood_derivatives(CMap<TParameter*, CSGObject*>& para_dict)
{
	check_members();

	if (update_parameter_hash())
		update_all();

	// Z starts out as a copy of the Cholesky factor L
	MatrixXd Z(m_L.num_rows, m_L.num_cols);
	Map<VectorXd> eigen_dlp(dlp.vector, dlp.vlen);

	for (index_t i = 0; i < m_L.num_rows; i++)
	{
		for (index_t j = 0; j < m_L.num_cols; j++)
			Z(i,j) = m_L(i,j);
	}

	// every row i of sW_temp holds sW[i], replicated across the columns
	MatrixXd sW_temp(sW.vlen, m_ktrtr.num_cols);
	VectorXd sum(1);
	sum[0] = 0;

	for (index_t i = 0; i < sW.vlen; i++)
	{
		for (index_t j = 0; j < m_ktrtr.num_cols; j++)
			sW_temp(i,j) = sW[i];
	}

	VectorXd g;

	Map<VectorXd> eigen_W(W.vector, W.vlen);
	Map<MatrixXd> eigen_temp_kernel(temp_kernel.matrix,
			temp_kernel.num_rows, temp_kernel.num_cols);

	if (eigen_W.minCoeff() < 0)
	{
		// W has negative entries: fall back to an explicit LU-based inverse
		Z = -Z;

		MatrixXd A = MatrixXd::Identity(m_ktrtr.num_rows, m_ktrtr.num_cols);

		// NOTE(review): this fills whole columns of temp_diagonal with W[s]
		// rather than building diag(W) — looks suspicious; confirm against
		// the reference GPML implementation before relying on this branch
		MatrixXd temp_diagonal(sW.vlen, sW.vlen);
		temp_diagonal.setZero(sW.vlen, sW.vlen);

		for (index_t s = 0; s < temp_diagonal.rows(); s++)
		{
			for (index_t r = 0; r < temp_diagonal.cols(); r++)
				temp_diagonal(r,s) = W[s];
		}

		A = A + eigen_temp_kernel*m_scale*m_scale*temp_diagonal;

		FullPivLU<MatrixXd> lu(A);

		MatrixXd temp_matrix =
				lu.inverse().cwiseProduct(eigen_temp_kernel*m_scale*m_scale);

		// FIX: temp_sum must be zero-initialized before the += accumulation
		// below; previously it was accumulated while uninitialized
		VectorXd temp_sum(temp_matrix.rows());
		temp_sum.setZero();

		for (index_t i = 0; i < temp_matrix.rows(); i++)
		{
			for (index_t j = 0; j < temp_matrix.cols(); j++)
				temp_sum[i] += temp_matrix(j,i);
		}

		g = temp_sum/2.0;
	}
	else
	{
		// usual (log-concave) case: solve with the Cholesky factor
		MatrixXd C = Z.transpose().colPivHouseholderQr().solve(
				sW_temp.cwiseProduct(eigen_temp_kernel*m_scale*m_scale));

		MatrixXd temp_diagonal(sW.vlen, sW.vlen);
		temp_diagonal.setZero(sW.vlen, sW.vlen);

		for (index_t s = 0; s < sW.vlen; s++)
			temp_diagonal(s,s) = sW[s];

		MatrixXd temp = Z.transpose();
		Z = Z.transpose().colPivHouseholderQr().solve(temp_diagonal);
		Z = temp.transpose().colPivHouseholderQr().solve(Z);

		for (index_t s = 0; s < Z.rows(); s++)
		{
			for (index_t r = 0; r < Z.cols(); r++)
				Z(s,r) *= sW[s];
		}

		// temp_sum[i] = sum_j C(j,i)^2 (squared column norms of C)
		VectorXd temp_sum(C.rows());
		temp_sum.setZero(C.rows());

		for (index_t i = 0; i < C.rows(); i++)
		{
			for (index_t j = 0; j < C.cols(); j++)
				temp_sum[i] += C(j,i)*C(j,i);
		}

		g = (eigen_temp_kernel.diagonal()*m_scale*m_scale-temp_sum)/2.0;
	}

	Map<VectorXd> eigen_d3lp(d3lp.vector, d3lp.vlen);
	VectorXd dfhat = g.cwiseProduct(eigen_d3lp);

	m_kernel->build_parameter_dictionary(para_dict);
	m_mean->build_parameter_dictionary(para_dict);
	m_model->build_parameter_dictionary(para_dict);

	// this map of parameter -> derivative vector is what we return
	CMap<TParameter*, SGVector<float64_t> > gradient(
			3+para_dict.get_num_elements(),
			3+para_dict.get_num_elements());

	for (index_t i = 0; i < para_dict.get_num_elements(); i++)
	{
		shogun::CMapNode<TParameter*, CSGObject*>* node =
				para_dict.get_node_ptr(i);
		TParameter* param = node->key;
		CSGObject* obj = node->data;

		// vector parameters contribute one derivative per element
		index_t length = 1;
		if ((param->m_datatype.m_ctype== CT_VECTOR ||
				param->m_datatype.m_ctype == CT_SGVECTOR) &&
				param->m_datatype.m_length_y != NULL)
			length = *(param->m_datatype.m_length_y);

		SGVector<float64_t> variables(length);
		bool deriv_found = false;

		Map<VectorXd> eigen_temp_alpha(temp_alpha.vector, temp_alpha.vlen);

		for (index_t h = 0; h < length; h++)
		{
			SGMatrix<float64_t> deriv;
			SGVector<float64_t> mean_derivatives;
			VectorXd mean_dev_temp;
			SGVector<float64_t> lik_first_deriv;
			SGVector<float64_t> lik_second_deriv;

			if (param->m_datatype.m_ctype == CT_VECTOR ||
					param->m_datatype.m_ctype == CT_SGVECTOR)
			{
				deriv = m_kernel->get_parameter_gradient(param, obj);
				lik_first_deriv = m_model->get_first_derivative(
						(CRegressionLabels*)m_labels, param, obj, function);
				lik_second_deriv = m_model->get_second_derivative(
						(CRegressionLabels*)m_labels, param, obj, function);
				mean_derivatives = m_mean->get_parameter_derivative(
						param, obj, m_feature_matrix, h);

				// FIX: mean_dev_temp was written without being sized first
				// (default-constructed VectorXd has size 0)
				mean_dev_temp.resize(mean_derivatives.vlen);
				for (index_t d = 0; d < mean_derivatives.vlen; d++)
					mean_dev_temp[d] = mean_derivatives[d];
			}
			else
			{
				mean_derivatives = m_mean->get_parameter_derivative(
						param, obj, m_feature_matrix);

				// FIX: mean_dev_temp was written without being sized first
				mean_dev_temp.resize(mean_derivatives.vlen);
				for (index_t d = 0; d < mean_derivatives.vlen; d++)
					mean_dev_temp[d] = mean_derivatives[d];

				deriv = m_kernel->get_parameter_gradient(param, obj);
				lik_first_deriv = m_model->get_first_derivative(
						(CRegressionLabels*)m_labels, param, obj, function);
				lik_second_deriv = m_model->get_second_derivative(
						(CRegressionLabels*)m_labels, param, obj, function);
			}

			if (deriv.num_cols*deriv.num_rows > 0)
			{
				// kernel hyper-parameter derivative
				// FIX: dK was allocated (num_cols, num_rows) while being
				// indexed (row, col); use (num_rows, num_cols)
				MatrixXd dK(deriv.num_rows, deriv.num_cols);

				for (index_t d = 0; d < deriv.num_rows; d++)
				{
					for (index_t s = 0; s < deriv.num_cols; s++)
						dK(d,s) = deriv(d,s);
				}

				sum[0] = (Z.cwiseProduct(dK)).sum()/2.0;
				sum = sum - eigen_temp_alpha.transpose()*dK*eigen_temp_alpha/2.0;

				VectorXd b = dK*eigen_dlp;
				sum = sum -
					dfhat.transpose()*(b-eigen_temp_kernel*(Z*b)*m_scale*m_scale);

				variables[h] = sum[0];
				deriv_found = true;
			}
			else if (mean_derivatives.vlen > 0)
			{
				// mean function hyper-parameter derivative
				sum = -eigen_temp_alpha.transpose()*mean_dev_temp;
				sum = sum - dfhat.transpose()*(mean_dev_temp-eigen_temp_kernel*
						(Z*mean_dev_temp)*m_scale*m_scale);
				variables[h] = sum[0];
				deriv_found = true;
			}
			else if (lik_first_deriv[0]+lik_second_deriv[0] != CMath::INFTY)
			{
				// likelihood hyper-parameter derivative
				Map<VectorXd> eigen_fd(lik_first_deriv.vector, lik_first_deriv.vlen);
				Map<VectorXd> eigen_sd(lik_second_deriv.vector, lik_second_deriv.vlen);

				sum[0] = -g.dot(eigen_sd);
				sum[0] = sum[0] - eigen_fd.sum();
				variables[h] = sum[0];
				deriv_found = true;
			}
		}

		if (deriv_found)
			gradient.add(param, variables);
	}

	// finally, the derivative with respect to the kernel scale: dK = 2*scale*K
	TParameter* param;
	index_t index = get_modsel_param_index("scale");
	param = m_model_selection_parameters->get_parameter(index);

	// FIX: allocate (num_rows, num_cols) to match the (row, col) indexing;
	// stray ';;' removed
	MatrixXd dK(m_ktrtr.num_rows, m_ktrtr.num_cols);

	for (index_t d = 0; d < m_ktrtr.num_rows; d++)
	{
		for (index_t s = 0; s < m_ktrtr.num_cols; s++)
			dK(d,s) = m_ktrtr(d,s)*m_scale*2.0;
	}

	Map<VectorXd> eigen_temp_alpha(temp_alpha.vector, temp_alpha.vlen);

	sum[0] = (Z.cwiseProduct(dK)).sum()/2.0;
	sum = sum - eigen_temp_alpha.transpose()*dK*eigen_temp_alpha/2.0;

	VectorXd b = dK*eigen_dlp;
	sum = sum - dfhat.transpose()*(b-eigen_temp_kernel*(Z*b)*m_scale*m_scale);

	SGVector<float64_t> scale(1);
	scale[0] = sum[0];

	gradient.add(param, scale);
	para_dict.add(param, this);

	return gradient;
}
/** Computes the derivatives of the (negative log) marginal likelihood with
 * respect to the kernel, mean, scale, inducing-noise-free likelihood sigma
 * hyper-parameters under the FITC approximation.
 *
 * @param para_dict parameter dictionary; filled with the kernel/mean
 *        parameters (scale and sigma parameters are appended before returning)
 * @return map from each hyper-parameter to its derivative vector
 */
CMap<TParameter*, SGVector<float64_t> > CFITCInferenceMethod::
get_marginal_likelihood_derivatives(CMap<TParameter*, CSGObject*>& para_dict)
{
	if (update_parameter_hash())
		update();

	// get the sigma variable from the Gaussian likelihood model
	CGaussianLikelihood* lik=CGaussianLikelihood::obtain_from_generic(m_model);
	float64_t sigma=lik->get_sigma();
	SG_UNREF(lik);

	Map<MatrixXd> eigen_ktru(m_ktru.matrix, m_ktru.num_rows, m_ktru.num_cols);
	MatrixXd W = eigen_ktru;

	Map<VectorXd> eigen_dg(m_dg.vector, m_dg.vlen);

	// scale column j of Ktru by 1/sqrt(dg[j])
	// FIX: loop bounds were swapped (j over rows, i over cols), which indexes
	// out of range whenever Ktru (latent x train) is not square
	for (index_t i = 0; i < eigen_ktru.rows(); i++)
	{
		for (index_t j = 0; j < eigen_ktru.cols(); j++)
			W(i,j) = eigen_ktru(i,j) / sqrt(eigen_dg[j]);
	}

	Map<MatrixXd> eigen_uu(m_kuu.matrix, m_kuu.num_rows, m_kuu.num_cols);

	// chol(Kuu + W*W' + ind_noise*I)
	LLT<MatrixXd> CholW(eigen_uu + W*W.transpose() +
			m_ind_noise*MatrixXd::Identity(eigen_uu.rows(), eigen_uu.cols()));
	W = CholW.matrixL();
	W = W.colPivHouseholderQr().solve(eigen_ktru);

	SGVector<float64_t> y=((CRegressionLabels*) m_labels)->get_labels();
	Map<VectorXd> eigen_y(y.vector, y.vlen);

	SGVector<float64_t> m=m_mean->get_mean_vector(m_feat);
	Map<VectorXd> eigen_m(m.vector, m.vlen);

	// al = ((y-m) - W'*(W*((y-m)./dg))) ./ dg
	VectorXd al=W*(eigen_y-eigen_m).cwiseQuotient(eigen_dg);
	al = W.transpose()*al;
	al=(eigen_y-eigen_m)-al;
	al = al.cwiseQuotient(eigen_dg);

	MatrixXd iKuu = eigen_uu.selfadjointView<Eigen::Upper>().llt()
			.solve(MatrixXd::Identity(eigen_uu.rows(), eigen_uu.cols()));

	MatrixXd B = iKuu*eigen_ktru;
	MatrixXd Wdg = W;

	// divide column j of Wdg by dg[j]
	// FIX: same swapped loop bounds as the Ktru normalization above
	for (index_t i = 0; i < eigen_ktru.rows(); i++)
	{
		for (index_t j = 0; j < eigen_ktru.cols(); j++)
			Wdg(i,j) = Wdg(i,j) / eigen_dg[j];
	}

	VectorXd w = B*al;

	VectorXd sum(1);
	sum[0] = 0;

	m_kernel->build_parameter_dictionary(para_dict);
	m_mean->build_parameter_dictionary(para_dict);

	//This will be the vector we return
	CMap<TParameter*, SGVector<float64_t> > gradient(
			3+para_dict.get_num_elements(),
			3+para_dict.get_num_elements());

	for (index_t i = 0; i < para_dict.get_num_elements(); i++)
	{
		shogun::CMapNode<TParameter*, CSGObject*>* node =
				para_dict.get_node_ptr(i);
		TParameter* param = node->key;
		CSGObject* obj = node->data;

		// vector parameters contribute one derivative per element
		index_t length = 1;
		if ((param->m_datatype.m_ctype== CT_VECTOR ||
				param->m_datatype.m_ctype == CT_SGVECTOR) &&
				param->m_datatype.m_length_y != NULL)
			length = *(param->m_datatype.m_length_y);

		SGVector<float64_t> variables(length);
		bool deriv_found = false;

		for (index_t g = 0; g < length; g++)
		{
			SGMatrix<float64_t> deriv;
			SGMatrix<float64_t> derivtru;
			SGMatrix<float64_t> derivuu;
			SGVector<float64_t> mean_derivatives;
			VectorXd mean_dev_temp;

			if (param->m_datatype.m_ctype == CT_VECTOR ||
					param->m_datatype.m_ctype == CT_SGVECTOR)
			{
				// kernel gradients on train/train, latent/train, latent/latent
				m_kernel->init(m_features, m_features);
				deriv = m_kernel->get_parameter_gradient(param, obj);

				m_kernel->init(m_latent_features, m_features);
				derivtru = m_kernel->get_parameter_gradient(param, obj);

				m_kernel->init(m_latent_features, m_latent_features);
				derivuu = m_kernel->get_parameter_gradient(param, obj);

				m_kernel->remove_lhs_and_rhs();

				mean_derivatives = m_mean->get_parameter_derivative(
						param, obj, m_feat, g);

				// FIX: mean_dev_temp was written without being sized first
				// (default-constructed VectorXd has size 0)
				mean_dev_temp.resize(mean_derivatives.vlen);
				for (index_t d = 0; d < mean_derivatives.vlen; d++)
					mean_dev_temp[d] = mean_derivatives[d];
			}
			else
			{
				mean_derivatives = m_mean->get_parameter_derivative(
						param, obj, m_feat);

				// FIX: mean_dev_temp was written without being sized first
				mean_dev_temp.resize(mean_derivatives.vlen);
				for (index_t d = 0; d < mean_derivatives.vlen; d++)
					mean_dev_temp[d] = mean_derivatives[d];

				m_kernel->init(m_features, m_features);
				deriv = m_kernel->get_parameter_gradient(param, obj);

				m_kernel->init(m_latent_features, m_features);
				derivtru = m_kernel->get_parameter_gradient(param, obj);

				m_kernel->init(m_latent_features, m_latent_features);
				derivuu = m_kernel->get_parameter_gradient(param, obj);

				m_kernel->remove_lhs_and_rhs();
			}

			sum[0] = 0;

			if (deriv.num_cols*deriv.num_rows > 0)
			{
				// FIX: these were allocated (num_cols, num_rows) while being
				// indexed (row, col); dKui is not square (latent x train), so
				// the old shape indexed out of range
				MatrixXd ddiagKi(deriv.num_rows, deriv.num_cols);
				MatrixXd dKuui(derivuu.num_rows, derivuu.num_cols);
				MatrixXd dKui(derivtru.num_rows, derivtru.num_cols);

				for (index_t d = 0; d < deriv.num_rows; d++)
				{
					for (index_t s = 0; s < deriv.num_cols; s++)
						ddiagKi(d,s) = deriv(d,s)*m_scale*m_scale;
				}

				for (index_t d = 0; d < derivuu.num_rows; d++)
				{
					for (index_t s = 0; s < derivuu.num_cols; s++)
						dKuui(d,s) = derivuu(d,s)*m_scale*m_scale;
				}

				for (index_t d = 0; d < derivtru.num_rows; d++)
				{
					for (index_t s = 0; s < derivtru.num_cols; s++)
						dKui(d,s) = derivtru(d,s)*m_scale*m_scale;
				}

				MatrixXd R = 2*dKui-dKuui*B;
				MatrixXd v = ddiagKi;
				MatrixXd temp = R.cwiseProduct(B);

				// v = ddiagKi - colsum(R.*B) on the diagonal
				for (index_t d = 0; d < ddiagKi.rows(); d++)
					v(d,d) = v(d,d) - temp.col(d).sum();

				sum = sum + ddiagKi.diagonal().transpose()*
						VectorXd::Ones(eigen_dg.rows()).cwiseQuotient(eigen_dg);
				sum = sum + w.transpose()*(dKuui*w-2*(dKui*al));
				sum = sum - al.transpose()*(v.diagonal().cwiseProduct(al));

				// column sums of Wdg.*Wdg (one entry per training point)
				// FIX: was sized/looped by rows; Wdg is latent x train
				MatrixXd Wdg_temp = Wdg.cwiseProduct(Wdg);
				VectorXd Wdg_sum(Wdg.cols());

				for (index_t d = 0; d < Wdg.cols(); d++)
					Wdg_sum[d] = Wdg_temp.col(d).sum();

				sum = sum - v.diagonal().transpose()*Wdg_sum;

				Wdg_temp = (R*Wdg.transpose()).cwiseProduct(B*Wdg.transpose());
				sum[0] = sum[0] - Wdg_temp.sum();

				sum /= 2.0;

				variables[g] = sum[0];
				deriv_found = true;
			}
			else if (mean_derivatives.vlen > 0)
			{
				// mean function hyper-parameter derivative
				// FIX: was 'mean_dev_temp*al', an invalid (n x 1)*(n x 1)
				// product; a dot product is intended here
				// NOTE(review): the reference GPML implementation uses
				// -dm'*al — confirm the sign convention expected by callers
				sum = mean_dev_temp.transpose()*al;
				variables[g] = sum[0];
				deriv_found = true;
			}
		}

		if (deriv_found)
			gradient.add(param, variables);
	}

	//Here we take the kernel scale derivative.
	{
		TParameter* param;
		index_t index = get_modsel_param_index("scale");
		param = m_model_selection_parameters->get_parameter(index);

		SGVector<float64_t> variables(1);

		SGMatrix<float64_t> deriv;
		SGMatrix<float64_t> derivtru;
		SGMatrix<float64_t> derivuu;

		m_kernel->init(m_features, m_features);
		deriv = m_kernel->get_kernel_matrix();

		m_kernel->init(m_latent_features, m_features);
		derivtru = m_kernel->get_kernel_matrix();

		m_kernel->init(m_latent_features, m_latent_features);
		derivuu = m_kernel->get_kernel_matrix();

		m_kernel->remove_lhs_and_rhs();

		// FIX: sum still held the last parameter's value; the sigma section
		// below resets it, this one previously did not
		sum[0] = 0;

		// FIX: allocate (num_rows, num_cols), as in the parameter loop above
		MatrixXd ddiagKi(deriv.num_rows, deriv.num_cols);
		MatrixXd dKuui(derivuu.num_rows, derivuu.num_cols);
		MatrixXd dKui(derivtru.num_rows, derivtru.num_cols);

		for (index_t d = 0; d < deriv.num_rows; d++)
		{
			for (index_t s = 0; s < deriv.num_cols; s++)
				ddiagKi(d,s) = deriv(d,s)*m_scale*2.0;
		}

		for (index_t d = 0; d < derivuu.num_rows; d++)
		{
			for (index_t s = 0; s < derivuu.num_cols; s++)
				dKuui(d,s) = derivuu(d,s)*m_scale*2.0;
		}

		for (index_t d = 0; d < derivtru.num_rows; d++)
		{
			for (index_t s = 0; s < derivtru.num_cols; s++)
				dKui(d,s) = derivtru(d,s)*m_scale*2.0;
		}

		MatrixXd R = 2*dKui-dKuui*B;
		MatrixXd v = ddiagKi;
		MatrixXd temp = R.cwiseProduct(B);

		for (index_t d = 0; d < ddiagKi.rows(); d++)
			v(d,d) = v(d,d) - temp.col(d).sum();

		sum = sum + ddiagKi.diagonal().transpose()*
				VectorXd::Ones(eigen_dg.rows()).cwiseQuotient(eigen_dg);
		sum = sum + w.transpose()*(dKuui*w-2*(dKui*al));
		sum = sum - al.transpose()*(v.diagonal().cwiseProduct(al));

		// FIX: column sums, sized by cols (see parameter loop above)
		MatrixXd Wdg_temp = Wdg.cwiseProduct(Wdg);
		VectorXd Wdg_sum(Wdg.cols());

		for (index_t d = 0; d < Wdg.cols(); d++)
			Wdg_sum[d] = Wdg_temp.col(d).sum();

		sum = sum - v.diagonal().transpose()*Wdg_sum;

		Wdg_temp = (R*Wdg.transpose()).cwiseProduct(B*Wdg.transpose());
		sum[0] = sum[0] - Wdg_temp.sum();

		sum /= 2.0;

		variables[0] = sum[0];

		gradient.add(param, variables);
		para_dict.add(param, this);
	}

	// derivative with respect to the likelihood sigma
	TParameter* param;
	index_t index;

	index = m_model->get_modsel_param_index("sigma");
	param = m_model->m_model_selection_parameters->get_parameter(index);

	sum[0] = 0;

	// column sums of W.*W, divided elementwise by dg.^2
	// FIX: was sized/looped by rows; W is latent x train and dg has one
	// entry per training point
	MatrixXd W_temp = W.cwiseProduct(W);
	VectorXd W_sum(W_temp.cols());

	for (index_t d = 0; d < W_temp.cols(); d++)
		W_sum[d] = W_temp.col(d).sum();

	W_sum = W_sum.cwiseQuotient(eigen_dg.cwiseProduct(eigen_dg));
	sum[0] = W_sum.sum();

	sum = sum + al.transpose()*al;
	sum[0] = VectorXd::Ones(eigen_dg.rows()).cwiseQuotient(eigen_dg).sum() - sum[0];
	sum = sum*sigma*sigma;

	float64_t dKuui = 2.0*m_ind_noise;

	MatrixXd R = -dKuui*B;
	MatrixXd temp = R.cwiseProduct(B);

	// FIX: column sums, sized by cols; v must match al's length below
	VectorXd v(temp.cols());

	for (index_t d = 0; d < temp.cols(); d++)
		v[d] = temp.col(d).sum();

	sum = sum + (w.transpose()*dKuui*w)/2.0;
	sum = sum - al.transpose()*(v.cwiseProduct(al))/2.0;

	// FIX: column sums, sized by cols
	MatrixXd Wdg_temp = Wdg.cwiseProduct(Wdg);
	VectorXd Wdg_sum(Wdg.cols());

	for (index_t d = 0; d < Wdg.cols(); d++)
		Wdg_sum[d] = Wdg_temp.col(d).sum();

	sum = sum - v.transpose()*Wdg_sum/2.0;

	Wdg_temp = (R*Wdg.transpose()).cwiseProduct(B*Wdg.transpose());
	sum[0] = sum[0] - Wdg_temp.sum()/2.0;

	SGVector<float64_t> vsigma(1);
	vsigma[0] = sum[0];

	gradient.add(param, vsigma);
	para_dict.add(param, m_model);

	return gradient;
}