Example #1
slep_result_t malsar_low_rank(
		CDotFeatures* features,
		double* y,
		double rho,
		const slep_options& options)
{
	int task;
	int n_feats = features->get_dim_feature_space();
	SG_SDEBUG("n feats = %d\n", n_feats);
	int n_vecs = features->get_num_vectors();
	SG_SDEBUG("n vecs = %d\n", n_vecs);
	int n_tasks = options.n_tasks;
	SG_SDEBUG("n tasks = %d\n", n_tasks);

	int iter = 0;

	// initialize each task's weight vector to zero and its bias to the log-odds of the task's class prior
	MatrixXd Ws = MatrixXd::Zero(n_feats, n_tasks);
	VectorXd Cs = VectorXd::Zero(n_tasks);
	for (task=0; task<n_tasks; task++)
	{
		int n_pos = 0;
		int n_neg = 0;
		for (int i=options.ind[task]; i<options.ind[task+1]; i++)
		{
			if (y[i] > 0)
				n_pos++;
			else
				n_neg++;
		}
		Cs[task] = CMath::log(double(n_pos)/n_neg);
	}

	MatrixXd Wz=Ws, Wzp=Ws, Wz_old=Ws, delta_Wzp=Ws, gWs=Ws;
	VectorXd Cz=Cs, Czp=Cs, Cz_old=Cs, delta_Czp=Cs, gCs=Cs;

	double t=1, t_old=0;
	double gamma=1, gamma_inc=2;
	double obj=0.0, obj_old=0.0;

	internal::set_is_malloc_allowed(false);
	bool done = false;
	while (!done && iter <= options.max_iter)
	{
		double alpha = double(t_old - 1)/t;
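		// alpha is the Nesterov/FISTA momentum weight built from the t-sequence updated at the end of the loop;
		// the search point below extrapolates beyond Wz in the direction Wz - Wz_old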

		// compute search point
		Ws = (1+alpha)*Wz - alpha*Wz_old;
		Cs = (1+alpha)*Cz - alpha*Cz_old;

		// zero gradient
		gWs.setZero();
		gCs.setZero();

		// compute gradient and objective at search point
		double Fs = 0;
		for (task=0; task<n_tasks; task++)
		{
			for (int i=options.ind[task]; i<options.ind[task+1]; i++)
			{
				double aa = -y[i]*(features->dense_dot(i, Ws.col(task).data(), n_feats)+Cs[task]);
				double bb = CMath::max(aa,0.0);

				// log-sum-exp trick: shifting by bb keeps both exponentials in [0,1], so the logistic loss cannot overflow
				Fs += (CMath::log(CMath::exp(-bb) + CMath::exp(aa-bb)) + bb)/n_vecs;
				double b = -y[i]*(1 - 1/(1+CMath::exp(aa)))/n_vecs;
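				// b is the derivative of the averaged logistic loss w.r.t. the margin w'x_i + c;
				// it weights both the intercept gradient and the feature contribution added below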

				gCs[task] += b;
				features->add_to_dense_vec(b, i, gWs.col(task).data(), n_feats);
			}
		}
		gWs.noalias() += 2*rho*Ws;
		
		// add regularizer
		Fs += rho*Ws.squaredNorm();

		double Fzp = 0.0;

		// line search, Armijo-Goldstein scheme
		while (true)
		{
			// trace-norm (nuclear-norm) proximal step: rebuild Ws - gWs/gamma from its singular values
			// above the threshold 2*rho/gamma (see the standalone sketch after this function)
			internal::set_is_malloc_allowed(true);
			Wzp.setZero();
			JacobiSVD<MatrixXd> svd(Ws - gWs/gamma,ComputeThinU | ComputeThinV);
			for (int i=0; i<svd.singularValues().size(); i++)
			{
				if (svd.singularValues()[i] > 2*rho/gamma)
					Wzp += svd.matrixU().col(i)*
					       svd.singularValues()[i]*
					       svd.matrixV().col(i).transpose();
			}
			internal::set_is_malloc_allowed(false);
			// plain gradient step on the intercepts (they are not penalized by the trace norm)
			Czp = Cs - gCs/gamma;
			
			// compute objective at line search point
			Fzp = 0.0;
			for (task=0; task<n_tasks; task++)
			{
				for (int i=options.ind[task]; i<options.ind[task+1]; i++)
				{
					double aa = -y[i]*(features->dense_dot(i, Wzp.col(task).data(), n_feats)+Czp[task]);
					double bb = CMath::max(aa,0.0);

					Fzp += (CMath::log(CMath::exp(-bb) + CMath::exp(aa-bb)) + bb)/n_vecs;
				}
			}
			Fzp += rho*Wzp.squaredNorm();

			// compute delta between line search point and search point
			delta_Wzp = Wzp - Ws;
			delta_Czp = Czp - Cs;

			// norms of delta
			double nrm_delta_Wzp = delta_Wzp.squaredNorm();
			double nrm_delta_Czp = delta_Czp.squaredNorm();

			double r_sum = (nrm_delta_Wzp + nrm_delta_Czp)/2;

			double Fzp_gamma = Fs + (delta_Wzp.transpose()*gWs).trace() +
				(delta_Czp.transpose()*gCs).trace() +
				(gamma/2)*nrm_delta_Wzp +
				(gamma/2)*nrm_delta_Czp;
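			// Fzp_gamma is the quadratic model of the smooth loss around the search point;
			// if the actual loss Fzp exceeds it, gamma (the inverse step size) is increased and the step is retried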

			// break if delta is getting too small
			if (r_sum <= 1e-20)
			{
				done = true;
				break;
			}

			// accept the step if the objective at the line search point does not exceed the quadratic model Fzp_gamma
			if (Fzp <= Fzp_gamma)
				break;
			else
				gamma *= gamma_inc;
		}

		Wz_old = Wz;
		Cz_old = Cz;
		Wz = Wzp;
		Cz = Czp;

		// compute objective value
		obj_old = obj;
		obj = Fzp;
		internal::set_is_malloc_allowed(true);
		// only the singular values are needed to evaluate the trace-norm term
		JacobiSVD<MatrixXd> svd(Wzp);
		obj += rho*svd.singularValues().sum();
		internal::set_is_malloc_allowed(false);

		// check if process should be terminated 
		switch (options.termination)
		{
			case 0:
				if (iter>=2)
				{
					if ( CMath::abs(obj-obj_old) <= options.tolerance )
						done = true;
				}
			break;
			case 1:
				if (iter>=2)
				{
					if ( CMath::abs(obj-obj_old) <= options.tolerance*CMath::abs(obj_old))
						done = true;
				}
			break;
			case 2:
				if (CMath::abs(obj) <= options.tolerance)
					done = true;
			break;
			case 3:
				if (iter>=options.max_iter)
					done = true;
			break;
		}

		iter++;
		t_old = t;
		t = 0.5 * (1 + CMath::sqrt(1.0 + 4*t*t));
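		// standard FISTA momentum sequence: t_{k+1} = (1 + sqrt(1 + 4*t_k^2))/2, the choice behind the accelerated O(1/k^2) rate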
	}
	SG_SDEBUG("%d iteration passed, objective = %f\n",iter,obj);

	SGMatrix<float64_t> tasks_w(n_feats, n_tasks);
	for (int i=0; i<n_feats; i++)
	{
		for (task=0; task<n_tasks; task++)
			tasks_w(i,task) = Wzp(i,task);
	}
	SGVector<float64_t> tasks_c(n_tasks);
	for (int i=0; i<n_tasks; i++) tasks_c[i] = Czp[i];
	return slep_result_t(tasks_w, tasks_c);
}
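The line search above rebuilds Wzp from the singular values of Ws - gWs/gamma that exceed 2*rho/gamma, keeping them unshrunk. The textbook proximal operator of the trace (nuclear) norm additionally subtracts the threshold from every kept singular value. A minimal, self-contained Eigen sketch of that operator follows; the helper name trace_norm_prox is ours and not part of the library.

#include <Eigen/Dense>

using namespace Eigen;

// Hypothetical helper: proximal operator of tau*||X||_* (trace/nuclear norm),
// i.e. soft-thresholding of the singular values of V at level tau.
MatrixXd trace_norm_prox(const MatrixXd& V, double tau)
{
	JacobiSVD<MatrixXd> svd(V, ComputeThinU | ComputeThinV);
	MatrixXd X = MatrixXd::Zero(V.rows(), V.cols());
	for (int i=0; i<svd.singularValues().size(); i++)
	{
		double shrunk = svd.singularValues()[i] - tau;
		if (shrunk > 0.0)
			X += shrunk*svd.matrixU().col(i)*svd.matrixV().col(i).transpose();
	}
	return X;
}

// in the line search above this would play the role of
//   Wzp = trace_norm_prox(Ws - gWs/gamma, 2*rho/gamma);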
slep_result_t slep_mc_plain_lr(
		CDotFeatures* features,
		CMulticlassLabels* labels,
		float64_t z,
		const slep_options& options)
{
	int i,j;
	// obtain problem parameters
	int n_feats   = features->get_dim_feature_space();
	int n_vecs    = features->get_num_vectors();
	int n_classes = labels->get_num_classes();

	// labels vector containing class indices in the range [0, n_classes)
	SGVector<float64_t> labels_vector = labels->get_labels();

	// initialize matrices and vectors to be used
	// weight vector
	MatrixXd w  = MatrixXd::Zero(n_feats, n_classes);
	// intercepts (biases)
	VectorXd c  = VectorXd::Zero(n_classes);

	if (options.last_result)
	{
		SGMatrix<float64_t> last_w = options.last_result->w;
		SGVector<float64_t> last_c = options.last_result->c;
		for (i=0; i<n_classes; i++)
		{
			c[i] = last_c[i];
			for (j=0; j<n_feats; j++)
				w(j,i) = last_w(j,i);
		}
	}
	// iterative process matrices and vectors
	MatrixXd wp = w, wwp = MatrixXd::Zero(n_feats, n_classes);
	VectorXd cp = c, ccp = VectorXd::Zero(n_classes);
	// search point weight vector
	MatrixXd search_w = MatrixXd::Zero(n_feats, n_classes);
	// search point intercepts
	VectorXd search_c = VectorXd::Zero(n_classes);
	// dot products
	MatrixXd Aw  = MatrixXd::Zero(n_vecs, n_classes);
	for (j=0; j<n_classes; j++)
		features->dense_dot_range(Aw.col(j).data(), 0, n_vecs, NULL, w.col(j).data(), n_feats, 0.0);
	MatrixXd As  = MatrixXd::Zero(n_vecs, n_classes);
	MatrixXd Awp = MatrixXd::Zero(n_vecs, n_classes);
	// gradients
	MatrixXd g   = MatrixXd::Zero(n_feats, n_classes);
	VectorXd gc  = VectorXd::Zero(n_classes);
	// projection
	MatrixXd v   = MatrixXd::Zero(n_feats, n_classes);

	// initial estimate of the gradient's Lipschitz constant, refined by the line search
	double L = 1.0/(n_vecs*n_classes);
	// coefficients for search point computation
	double alphap = 0, alpha = 1;

	// lambda regularization parameter
	double lambda = z;
	// objective values
	double objective = 0.0;
	double objective_p = 0.0;

	int iter = 0;
	bool done = false;
	CTime time;
	//internal::set_is_malloc_allowed(false);
	while ((!done) && (iter<options.max_iter) && (!CSignal::cancel_computations()))
	{
		double beta = (alphap-1)/alpha;
		// compute search points
		search_w = w + beta*wwp;
		search_c = c + beta*ccp;

		// update dot products with search point
		As = Aw + beta*(Aw-Awp);
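		// since search_w = w + beta*(w - wp), the cached products Aw = X*w and Awp = X*wp
		// give the dot products at the search point without touching the features again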

		// compute objective and gradient at search point
		double fun_s = 0;
		g.setZero();
		gc.setZero();
		// for each vector
		for (i=0; i<n_vecs; i++)
		{
			// class of current vector
			int vec_class = labels_vector[i];
			// for each class
			for (j=0; j<n_classes; j++)
			{
				// compute logistic loss
				double aa = ((vec_class == j) ? -1.0 : 1.0)*(As(i,j) + search_c(j));
				double bb = aa > 0.0 ? aa : 0.0;
				// log-sum-exp trick avoids overflow when exponentiating aa
				fun_s += CMath::log(CMath::exp(-bb) + CMath::exp(aa-bb)) + bb;
				double prob = 1.0/(1+CMath::exp(aa));
				double b = ((vec_class == j) ? -1.0 : 1.0)*(1-prob);///(n_vecs*n_classes);
				// update gradient of intercepts
				gc[j] += b;
				// update gradient of weight vectors
				features->add_to_dense_vec(b, i, g.col(j).data(), n_feats);
			}
		}
		//fun_s /= (n_vecs*n_classes);

		wp = w;
		Awp = Aw;
		cp = c;

		int inner_iter = 0;
		double fun_x = 0;

		// line search process
		while (inner_iter<5000)
		{
			// compute line search point
			v = search_w - g/L;
			c = search_c - gc/L;

			// proximal step: eppMatrix (from SLEP) applies the L1/Lq-penalized Euclidean projection to v and writes the result into w
			eppMatrix(w.data(),v.data(),n_feats,n_classes,lambda/L,options.q);

			v = w - search_w;

			// update dot products
			for (j=0; j<n_classes; j++)
				features->dense_dot_range(Aw.col(j).data(), 0, n_vecs, NULL, w.col(j).data(), n_feats, 0.0);

			// compute objective at the line search point
			fun_x = 0;
			for (i=0; i<n_vecs; i++)
			{
				int vec_class = labels_vector[i];
				for (j=0; j<n_classes; j++)
				{
					double aa = ((vec_class == j) ? -1.0 : 1.0)*(Aw(i,j) + c(j));
					double bb = aa > 0.0 ? aa : 0.0;
					fun_x += CMath::log(CMath::exp(-bb) + CMath::exp(aa-bb)) + bb;
				}
			}
			//fun_x /= (n_vecs*n_classes);

			// check for termination of line search
			double r_sum = (v.squaredNorm() + (c-search_c).squaredNorm())/2;
			double l_sum = fun_x - fun_s - v.cwiseProduct(g).sum() - (c-search_c).dot(gc);
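			// r_sum is half the squared distance between the line search point and the search point;
			// l_sum is the part of the loss change not explained by the linear model. The step is
			// accepted when l_sum <= L*r_sum, i.e. when L upper-bounds the local curvature;
			// otherwise L is at least doubled and the step is recomputed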

			// stop if the proximal step barely moves the iterate
			if (r_sum <= 1e-20)
			{
				SG_SINFO("Gradient step makes little improvement (%f)\n",r_sum)
				done = true;
				break;
			}

			if (l_sum <= r_sum*L)
				break;
			else
				L = CMath::max(2*L, l_sum/r_sum);

			inner_iter++;
		}

		// update alpha coefficients
		alphap = alpha;
		alpha = (1+CMath::sqrt(4*alpha*alpha+1))/2;

		// update wwp and ccp
		wwp = w - wp;
		ccp = c - cp;

		// update objectives
		objective_p = objective;
		objective = fun_x;

		// add the group-norm regularizer (sum of per-class column norms)
		double L1q_norm = 0.0;
		for (int m=0; m<n_classes; m++)
			L1q_norm += w.col(m).norm();
		objective += lambda*L1q_norm;

		//cout << "Objective = " << objective << endl;

		// check for termination of whole process
		if ((CMath::abs(objective - objective_p) < options.tolerance*CMath::abs(objective_p)) && (iter>2))
		{
			SG_SINFO("Objective changes less than tolerance\n")
			done = true;
		}

		iter++;
	}
	SG_SINFO("%d iterations passed, objective = %f\n",iter,objective)
	//internal::set_is_malloc_allowed(true);

	// output computed weight vectors and intercepts
	SGMatrix<float64_t> r_w(n_feats,n_classes);
	for (j=0; j<n_classes; j++)
	{
		for (i=0; i<n_feats; i++)
			r_w(i,j) = w(i,j);
	}
	//r_w.display_matrix();
	SGVector<float64_t> r_c(n_classes);
	for (j=0; j<n_classes; j++)
		r_c[j] = c[j];
	return slep_result_t(r_w, r_c);
}
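eppMatrix is SLEP's generic Euclidean projection for the L1/Lq penalty, and the objective above sums the Euclidean norm of each class column of w. For the common case q = 2 the operator reduces to shrinking each column towards zero, as in the standalone sketch below. The helper name group_l2_prox and the column-wise grouping are our assumptions for illustration, not the library's API.

#include <Eigen/Dense>

using namespace Eigen;

// Hypothetical helper: proximal operator of tau * sum_j ||X.col(j)||_2,
// the q = 2 instance of the L1/Lq penalty, applied column by column.
MatrixXd group_l2_prox(const MatrixXd& V, double tau)
{
	MatrixXd X = V;
	for (int j=0; j<V.cols(); j++)
	{
		double nrm = V.col(j).norm();
		// columns with norm <= tau are zeroed out, the rest are shrunk by tau
		X.col(j) *= (nrm > tau) ? (nrm - tau)/nrm : 0.0;
	}
	return X;
}

// in the spirit of the line search above (assuming q = 2):
//   w = group_l2_prox(search_w - g/L, lambda/L);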