Example #1
CMap<TParameter*, SGVector<float64_t> > CLaplacianInferenceMethod::
get_marginal_likelihood_derivatives(CMap<TParameter*,
		CSGObject*>& para_dict)
{
	check_members();

	if(update_parameter_hash())
		update_all();

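	// Copy the factor stored in m_L (computed in update_all) into the Eigen
	// working matrix Z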
	MatrixXd Z(m_L.num_rows, m_L.num_cols);

	Map<VectorXd> eigen_dlp(dlp.vector, dlp.vlen);

	for (index_t i = 0; i < m_L.num_rows; i++)
	{
		for (index_t j = 0; j < m_L.num_cols; j++)
			Z(i,j) = m_L(i,j);
	}

	// replicate sW across columns for the elementwise products below
	MatrixXd sW_temp(sW.vlen, m_ktrtr.num_cols);

	// one-element vector used as a scalar accumulator in the Eigen products
	VectorXd sum(1);
	sum[0] = 0;

	for (index_t i = 0; i < sW.vlen; i++)
	{
		for (index_t j = 0; j < m_ktrtr.num_cols; j++)
			sW_temp(i,j) = sW[i];
	}

	VectorXd g;

	Map<VectorXd> eigen_W(W.vector, W.vlen);

	Map<MatrixXd> eigen_temp_kernel(temp_kernel.matrix, 
        	temp_kernel.num_rows, temp_kernel.num_cols);

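	// If W has negative entries the likelihood is not log-concave; compute Z
	// and g through an explicit LU factorization instead of reusing the
	// Cholesky factor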
	if (eigen_W.minCoeff() < 0)
	{
		Z = -Z;

		MatrixXd A = MatrixXd::Identity(m_ktrtr.num_rows, m_ktrtr.num_cols);

		// build diag(W); only the diagonal entries should carry W
		MatrixXd temp_diagonal(sW.vlen, sW.vlen);
		temp_diagonal.setZero(sW.vlen, sW.vlen);

		for (index_t s = 0; s < temp_diagonal.rows(); s++)
			temp_diagonal(s,s) = W[s];

		A = A + eigen_temp_kernel*m_scale*m_scale*temp_diagonal;

		FullPivLU<MatrixXd> lu(A);

		MatrixXd temp_matrix =
				lu.inverse().cwiseProduct(eigen_temp_kernel*m_scale*m_scale);

		VectorXd temp_sum(temp_matrix.rows());
		temp_sum.setZero();

		for (index_t i = 0; i < temp_matrix.rows(); i++)
		{
			for (index_t j = 0; j < temp_matrix.cols(); j++)
				temp_sum[i] += temp_matrix(j,i);
		}

		g = temp_sum/2.0;
	}

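	// All entries of W are non-negative, so the Cholesky factor in m_L can be
	// used directly to form Z and g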
	else
	{
		MatrixXd C = Z.transpose().colPivHouseholderQr().solve(
				sW_temp.cwiseProduct(eigen_temp_kernel*m_scale*m_scale));

		MatrixXd temp_diagonal(sW.vlen, sW.vlen);
		temp_diagonal.setZero(sW.vlen, sW.vlen);

		for (index_t s = 0; s < sW.vlen; s++)
			temp_diagonal(s,s) = sW[s];

		MatrixXd temp = Z.transpose();

		Z = Z.transpose().colPivHouseholderQr().solve(temp_diagonal);

		Z = temp.transpose().colPivHouseholderQr().solve(Z);

		for (index_t s = 0; s < Z.rows(); s++)
		{
			for (index_t r = 0; r < Z.cols(); r++)
				Z(s,r) *= sW[s];
		}

		VectorXd temp_sum(C.rows());

		temp_sum.setZero(C.rows());

		for (index_t i = 0; i < C.rows(); i++)
		{
			for (index_t j = 0; j < C.cols(); j++)
				temp_sum[i] += C(j,i)*C(j,i);
		}

		g = (eigen_temp_kernel.diagonal()*m_scale*m_scale-temp_sum)/2.0;
	}

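	// dfhat = g .* d3lp accounts for the dependence of the posterior mode on
	// the parameters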
	Map<VectorXd> eigen_d3lp(d3lp.vector, d3lp.vlen);

	VectorXd dfhat = g.cwiseProduct(eigen_d3lp);

	m_kernel->build_parameter_dictionary(para_dict);
	m_mean->build_parameter_dictionary(para_dict);
	m_model->build_parameter_dictionary(para_dict);

	CMap<TParameter*, SGVector<float64_t> > gradient(
			3+para_dict.get_num_elements(),
			3+para_dict.get_num_elements());

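	// Loop over all registered kernel, mean and likelihood parameters and
	// compute the derivative for each one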
	for (index_t i = 0; i < para_dict.get_num_elements(); i++)
	{
		shogun::CMapNode<TParameter*, CSGObject*>* node =
				para_dict.get_node_ptr(i);

		TParameter* param = node->key;
		CSGObject* obj = node->data;

		index_t length = 1;

		if ((param->m_datatype.m_ctype== CT_VECTOR ||
				param->m_datatype.m_ctype == CT_SGVECTOR) &&
				param->m_datatype.m_length_y != NULL)
			length = *(param->m_datatype.m_length_y);

		SGVector<float64_t> variables(length);

		bool deriv_found = false;

		Map<VectorXd> eigen_temp_alpha(temp_alpha.vector,
			temp_alpha.vlen);

		for (index_t h = 0; h < length; h++)
		{

			SGMatrix<float64_t> deriv;
			SGVector<float64_t> mean_derivatives;
			VectorXd mean_dev_temp;
			SGVector<float64_t> lik_first_deriv;
			SGVector<float64_t> lik_second_deriv;

			if (param->m_datatype.m_ctype == CT_VECTOR ||
					param->m_datatype.m_ctype == CT_SGVECTOR)
			{
				deriv = m_kernel->get_parameter_gradient(param, obj);

				lik_first_deriv = m_model->get_first_derivative(
						(CRegressionLabels*)m_labels, param, obj, function);

				lik_second_deriv = m_model->get_second_derivative(
						(CRegressionLabels*)m_labels, param, obj, function);

				mean_derivatives = m_mean->get_parameter_derivative(
						param, obj, m_feature_matrix, h);

				// mean_dev_temp starts empty; size it before copying
				mean_dev_temp.resize(mean_derivatives.vlen);
				for (index_t d = 0; d < mean_derivatives.vlen; d++)
					mean_dev_temp[d] = mean_derivatives[d];
			}

			else
			{
				mean_derivatives = m_mean->get_parameter_derivative(
						param, obj, m_feature_matrix);

				// mean_dev_temp starts empty; size it before copying
				mean_dev_temp.resize(mean_derivatives.vlen);
				for (index_t d = 0; d < mean_derivatives.vlen; d++)
					mean_dev_temp[d] = mean_derivatives[d];

				deriv = m_kernel->get_parameter_gradient(param, obj);

				lik_first_deriv = m_model->get_first_derivative(
						(CRegressionLabels*)m_labels, param, obj, function);

				lik_second_deriv = m_model->get_second_derivative(
						(CRegressionLabels*)m_labels, param, obj, function);
			}

			if (deriv.num_cols*deriv.num_rows > 0)
			{
				MatrixXd dK(deriv.num_rows, deriv.num_cols);

				for (index_t d = 0; d < deriv.num_rows; d++)
				{
					for (index_t s = 0; s < deriv.num_cols; s++)
						dK(d,s) = deriv(d,s);
				}

				sum[0] = (Z.cwiseProduct(dK)).sum()/2.0;

				sum = sum - eigen_temp_alpha.transpose()*dK*eigen_temp_alpha/2.0;

				VectorXd b = dK*eigen_dlp;

				sum = sum -
						dfhat.transpose()*(b-eigen_temp_kernel*(Z*b)*m_scale*m_scale);

				variables[h] = sum[0];

				deriv_found = true;
			}

			else if (mean_derivatives.vlen > 0)
			{
				sum = -eigen_temp_alpha.transpose()*mean_dev_temp;
				sum = sum - dfhat.transpose()*(mean_dev_temp-eigen_temp_kernel*
						(Z*mean_dev_temp)*m_scale*m_scale);
				variables[h] = sum[0];
				deriv_found = true;
			}

			else if (lik_first_deriv[0]+lik_second_deriv[0] != CMath::INFTY)
			{
				Map<VectorXd> eigen_fd(lik_first_deriv.vector, lik_first_deriv.vlen);
				
				Map<VectorXd> eigen_sd(lik_second_deriv.vector, lik_second_deriv.vlen);

				sum[0] = -g.dot(eigen_sd);
				sum[0] = sum[0] - eigen_fd.sum();
				variables[h] = sum[0];
				deriv_found = true;
			}

		}

		if (deriv_found)
			gradient.add(param, variables);

	}

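	// Derivative with respect to the kernel scale: dK/dscale = 2 * scale * K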
	TParameter* param;
	index_t index = get_modsel_param_index("scale");
	param = m_model_selection_parameters->get_parameter(index);

	MatrixXd dK(m_ktrtr.num_rows, m_ktrtr.num_cols);

	for (index_t d = 0; d < m_ktrtr.num_rows; d++)
	{
		for (index_t s = 0; s < m_ktrtr.num_cols; s++)
			dK(d,s) = m_ktrtr(d,s)*m_scale*2.0;
	}

	Map<VectorXd> eigen_temp_alpha(temp_alpha.vector,
		temp_alpha.vlen);

	sum[0] = (Z.cwiseProduct(dK)).sum()/2.0;

	sum = sum - eigen_temp_alpha.transpose()*dK*eigen_temp_alpha/2.0;

	VectorXd b = dK*eigen_dlp;

	sum = sum - dfhat.transpose()*(b-eigen_temp_kernel*(Z*b)*m_scale*m_scale);

	SGVector<float64_t> scale(1);

	scale[0] = sum[0];

	gradient.add(param, scale);
	para_dict.add(param, this);

	return gradient;
}
Example #2
CMap<TParameter*, SGVector<float64_t> > CFITCInferenceMethod::
get_marginal_likelihood_derivatives(CMap<TParameter*, CSGObject*>& para_dict)
{
	if (update_parameter_hash())
		update();

	// get the sigma variable from the Gaussian likelihood model
	CGaussianLikelihood* lik=CGaussianLikelihood::obtain_from_generic(m_model);
	float64_t sigma=lik->get_sigma();
	SG_UNREF(lik);

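	// Divide each column of Ktru by sqrt(dg) to form the scaled matrix used
	// throughout the FITC derivatives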
	Map<MatrixXd> eigen_ktru(m_ktru.matrix, m_ktru.num_rows, m_ktru.num_cols);

	MatrixXd W = eigen_ktru;

	Map<VectorXd> eigen_dg(m_dg.vector, m_dg.vlen);

	for (index_t i = 0; i < eigen_ktru.rows(); i++)
	{
		for (index_t j = 0; j < eigen_ktru.cols(); j++)
			W(i,j) = eigen_ktru(i,j) / sqrt(eigen_dg[j]);
	}

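	// Cholesky factor of Kuu + W*W' + ind_noise*I, where W currently holds
	// the column-scaled Ktru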
	Map<MatrixXd> eigen_uu(m_kuu.matrix, m_kuu.num_rows, m_kuu.num_cols);
	LLT<MatrixXd> CholW(eigen_uu + W*W.transpose() +
			m_ind_noise*MatrixXd::Identity(eigen_uu.rows(), eigen_uu.cols()));
	W = CholW.matrixL();

	// W = inv(chol factor) * Ktru
	W = W.colPivHouseholderQr().solve(eigen_ktru);

	SGVector<float64_t> y=((CRegressionLabels*) m_labels)->get_labels();
	Map<VectorXd> eigen_y(y.vector, y.vlen);
	SGVector<float64_t> m=m_mean->get_mean_vector(m_feat);
	Map<VectorXd> eigen_m(m.vector, m.vlen);

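	// al = ((y - m) - W' * (W * ((y - m) ./ dg))) ./ dg, the FITC weight
	// vector built over the next four statements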
	VectorXd al=W*(eigen_y-eigen_m).cwiseQuotient(eigen_dg);

	al = W.transpose()*al;

	al=(eigen_y-eigen_m)-al;

	al = al.cwiseQuotient(eigen_dg);

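	// iKuu = inv(Kuu) via Cholesky; B = inv(Kuu) * Ktru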
	MatrixXd iKuu = eigen_uu.selfadjointView<Eigen::Upper>().llt()
					.solve(MatrixXd::Identity(eigen_uu.rows(), eigen_uu.cols()));

	MatrixXd B = iKuu*eigen_ktru;

	MatrixXd Wdg = W;

	// Wdg: columns of W additionally divided by dg
	for (index_t i = 0; i < Wdg.rows(); i++)
	{
		for (index_t j = 0; j < Wdg.cols(); j++)
			Wdg(i,j) = Wdg(i,j) / eigen_dg[j];
	}

	VectorXd w = B*al;

	VectorXd sum(1);
	sum[0] = 0;

	m_kernel->build_parameter_dictionary(para_dict);
	m_mean->build_parameter_dictionary(para_dict);

	//This will be the vector we return
	CMap<TParameter*, SGVector<float64_t> > gradient(
			3+para_dict.get_num_elements(),
			3+para_dict.get_num_elements());

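	// Loop over all registered kernel and mean parameters and accumulate the
	// derivative for each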
	for (index_t i = 0; i < para_dict.get_num_elements(); i++)
	{
		shogun::CMapNode<TParameter*, CSGObject*>* node =
				para_dict.get_node_ptr(i);

		TParameter* param = node->key;
		CSGObject* obj = node->data;

		index_t length = 1;

		if ((param->m_datatype.m_ctype== CT_VECTOR ||
				param->m_datatype.m_ctype == CT_SGVECTOR) &&
				param->m_datatype.m_length_y != NULL)
			length = *(param->m_datatype.m_length_y);

		SGVector<float64_t> variables(length);

		bool deriv_found = false;

		for (index_t g = 0; g < length; g++)
		{

			SGMatrix<float64_t> deriv;
			SGMatrix<float64_t> derivtru;
			SGMatrix<float64_t> derivuu;
			SGVector<float64_t> mean_derivatives;
			VectorXd mean_dev_temp;

			if (param->m_datatype.m_ctype == CT_VECTOR ||
					param->m_datatype.m_ctype == CT_SGVECTOR)
			{
				m_kernel->init(m_features, m_features);
				deriv = m_kernel->get_parameter_gradient(param, obj);

				m_kernel->init(m_latent_features, m_features);
				derivtru = m_kernel->get_parameter_gradient(param, obj);

				m_kernel->init(m_latent_features, m_latent_features);
				derivuu = m_kernel->get_parameter_gradient(param, obj);

				m_kernel->remove_lhs_and_rhs();

				mean_derivatives = m_mean->get_parameter_derivative(
						param, obj, m_feat, g);

				// mean_dev_temp starts empty; size it before copying
				mean_dev_temp.resize(mean_derivatives.vlen);
				for (index_t d = 0; d < mean_derivatives.vlen; d++)
					mean_dev_temp[d] = mean_derivatives[d];
			}

			else
			{
				mean_derivatives = m_mean->get_parameter_derivative(
						param, obj, m_feat);

				// mean_dev_temp starts empty; size it before copying
				mean_dev_temp.resize(mean_derivatives.vlen);
				for (index_t d = 0; d < mean_derivatives.vlen; d++)
					mean_dev_temp[d] = mean_derivatives[d];

				m_kernel->init(m_features, m_features);
				deriv = m_kernel->get_parameter_gradient(param, obj);

				m_kernel->init(m_latent_features, m_features);
				derivtru = m_kernel->get_parameter_gradient(param, obj);

				m_kernel->init(m_latent_features, m_latent_features);
				derivuu = m_kernel->get_parameter_gradient(param, obj);

				m_kernel->remove_lhs_and_rhs();
			}

			sum[0] = 0;


			if (deriv.num_cols*deriv.num_rows > 0)
			{
				MatrixXd ddiagKi(deriv.num_rows, deriv.num_cols);
				MatrixXd dKuui(derivuu.num_rows, derivuu.num_cols);
				MatrixXd dKui(derivtru.num_rows, derivtru.num_cols);

				for (index_t d = 0; d < deriv.num_rows; d++)
				{
					for (index_t s = 0; s < deriv.num_cols; s++)
						ddiagKi(d,s) = deriv(d,s)*m_scale*m_scale;
				}

				for (index_t d = 0; d < derivuu.num_rows; d++)
				{
					for (index_t s = 0; s < derivuu.num_cols; s++)
						dKuui(d,s) = derivuu(d,s)*m_scale*m_scale;
				}

				for (index_t d = 0; d < derivtru.num_rows; d++)
				{
					for (index_t s = 0; s < derivtru.num_cols; s++)
						dKui(d,s) = derivtru(d,s)*m_scale*m_scale;
				}

				MatrixXd R = 2*dKui-dKuui*B;
				MatrixXd v = ddiagKi;
				MatrixXd temp = R.cwiseProduct(B);

				for (index_t d = 0; d < ddiagKi.rows(); d++)
					v(d,d) = v(d,d) - temp.col(d).sum();

				sum = sum + ddiagKi.diagonal().transpose()*
						VectorXd::Ones(eigen_dg.rows()).cwiseQuotient(eigen_dg);

				sum = sum + w.transpose()*(dKuui*w-2*(dKui*al));

				sum = sum - al.transpose()*(v.diagonal().cwiseProduct(al));

				MatrixXd Wdg_temp = Wdg.cwiseProduct(Wdg);

				VectorXd Wdg_sum(Wdg.cols());

				for (index_t d = 0; d < Wdg.cols(); d++)
					Wdg_sum[d] = Wdg_temp.col(d).sum();

				sum = sum - v.diagonal().transpose()*Wdg_sum;

				Wdg_temp = (R*Wdg.transpose()).cwiseProduct(B*Wdg.transpose());

				sum[0] = sum[0] - Wdg_temp.sum();

				sum /= 2.0;

				variables[g] = sum[0];
				deriv_found = true;
			}

			else if (mean_derivatives.vlen > 0)
			{
				sum = mean_dev_temp.transpose()*al;
				variables[g] = sum[0];
				deriv_found = true;
			}


		}

		if (deriv_found)
			gradient.add(param, variables);

	}

	//Here we take the kernel scale derivative.
	{
		TParameter* param;
		index_t index = get_modsel_param_index("scale");
		param = m_model_selection_parameters->get_parameter(index);

		SGVector<float64_t> variables(1);

		SGMatrix<float64_t> deriv;
		SGMatrix<float64_t> derivtru;
		SGMatrix<float64_t> derivuu;

		m_kernel->init(m_features, m_features);
		deriv = m_kernel->get_kernel_matrix();

		m_kernel->init(m_latent_features, m_features);
		derivtru = m_kernel->get_kernel_matrix();

		m_kernel->init(m_latent_features, m_latent_features);
		derivuu = m_kernel->get_kernel_matrix();

		m_kernel->remove_lhs_and_rhs();

		MatrixXd ddiagKi(deriv.num_rows, deriv.num_cols);
		MatrixXd dKuui(derivuu.num_rows, derivuu.num_cols);
		MatrixXd dKui(derivtru.num_rows, derivtru.num_cols);

		for (index_t d = 0; d < deriv.num_rows; d++)
		{
			for (index_t s = 0; s < deriv.num_cols; s++)
				ddiagKi(d,s) = deriv(d,s)*m_scale*2.0;
		}

		for (index_t d = 0; d < derivuu.num_rows; d++)
		{
			for (index_t s = 0; s < derivuu.num_cols; s++)
				dKuui(d,s) = derivuu(d,s)*m_scale*2.0;
		}

		for (index_t d = 0; d < derivtru.num_rows; d++)
		{
			for (index_t s = 0; s < derivtru.num_cols; s++)
				dKui(d,s) = derivtru(d,s)*m_scale*2.0;
		}

		MatrixXd R = 2*dKui-dKuui*B;
		MatrixXd v = ddiagKi;
		MatrixXd temp = R.cwiseProduct(B);

		for (index_t d = 0; d < ddiagKi.rows(); d++)
			v(d,d) = v(d,d) - temp.col(d).sum();

		// reset the accumulator; sum still holds the last loop value otherwise
		sum[0] = 0;

		sum = sum + ddiagKi.diagonal().transpose()*
				VectorXd::Ones(eigen_dg.rows()).cwiseQuotient(eigen_dg);

		sum = sum + w.transpose()*(dKuui*w-2*(dKui*al));

		sum = sum - al.transpose()*(v.diagonal().cwiseProduct(al));

		MatrixXd Wdg_temp = Wdg.cwiseProduct(Wdg);

		VectorXd Wdg_sum(Wdg.cols());

		for (index_t d = 0; d < Wdg.cols(); d++)
			Wdg_sum[d] = Wdg_temp.col(d).sum();

		sum = sum - v.diagonal().transpose()*Wdg_sum;

		Wdg_temp = (R*Wdg.transpose()).cwiseProduct(B*Wdg.transpose());

		sum[0] = sum[0] - Wdg_temp.sum();

		sum /= 2.0;

		variables[0] = sum[0];

		gradient.add(param, variables);
		para_dict.add(param, this);

	}

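	// Derivative with respect to the Gaussian likelihood noise sigma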
	TParameter* param;
	index_t index;

	index = m_model->get_modsel_param_index("sigma");
	param = m_model->m_model_selection_parameters->get_parameter(index);

	sum[0] = 0;

	MatrixXd W_temp = W.cwiseProduct(W);
	VectorXd W_sum(W_temp.cols());

	for (index_t d = 0; d < W_temp.cols(); d++)
		W_sum[d] = W_temp.col(d).sum();

	W_sum = W_sum.cwiseQuotient(eigen_dg.cwiseProduct(eigen_dg));

	sum[0] = W_sum.sum();

	sum = sum + al.transpose()*al;

	sum[0] = VectorXd::Ones(eigen_dg.rows()).cwiseQuotient(eigen_dg).sum() - sum[0];

	sum = sum*sigma*sigma;
	float64_t dKuui = 2.0*m_ind_noise;

	MatrixXd R = -dKuui*B;

	MatrixXd temp = R.cwiseProduct(B);
	VectorXd v(temp.cols());

	for (index_t d = 0; d < temp.cols(); d++)
		v[d] = temp.col(d).sum();

	sum = sum + (w.transpose()*dKuui*w)/2.0;

	sum = sum - al.transpose()*(v.cwiseProduct(al))/2.0;

	MatrixXd Wdg_temp = Wdg.cwiseProduct(Wdg);
	VectorXd Wdg_sum(Wdg.cols());

	for (index_t d = 0; d < Wdg.cols(); d++)
		Wdg_sum[d] = Wdg_temp.col(d).sum();

	sum = sum - v.transpose()*Wdg_sum/2.0;


	Wdg_temp = (R*Wdg.transpose()).cwiseProduct(B*Wdg.transpose());

	sum[0] = sum[0] - Wdg_temp.sum()/2.0;

	SGVector<float64_t> vsigma(1);

	vsigma[0] = sum[0];
	gradient.add(param, vsigma);
	para_dict.add(param, m_model);

	return gradient;

}