// Low-rank multi-task logistic regression (MALSAR): accelerated gradient descent
// where the trace (nuclear) norm regularizer rho*||W||_* is handled by an SVD-based
// projection that keeps only singular components of the gradient step whose singular
// values exceed the threshold 2*rho/gamma.
slep_result_t malsar_low_rank(
        CDotFeatures* features,
        double* y,
        double rho,
        const slep_options& options)
{
    int task;
    int n_feats = features->get_dim_feature_space();
    SG_SDEBUG("n feats = %d\n", n_feats);
    int n_vecs = features->get_num_vectors();
    SG_SDEBUG("n vecs = %d\n", n_vecs);
    int n_tasks = options.n_tasks;
    SG_SDEBUG("n tasks = %d\n", n_tasks);

    int iter = 0;

    // initialize weight vector and bias for each task
    MatrixXd Ws = MatrixXd::Zero(n_feats, n_tasks);
    VectorXd Cs = VectorXd::Zero(n_tasks);
    for (task=0; task<n_tasks; task++)
    {
        int n_pos = 0;
        int n_neg = 0;
        for (int i=options.ind[task]; i<options.ind[task+1]; i++)
        {
            if (y[i] > 0)
                n_pos++;
            else
                n_neg++;
        }
        Cs[task] = CMath::log(double(n_pos)/n_neg);
    }

    MatrixXd Wz=Ws, Wzp=Ws, Wz_old=Ws, delta_Wzp=Ws, gWs=Ws;
    VectorXd Cz=Cs, Czp=Cs, Cz_old=Cs, delta_Czp=Cs, gCs=Cs;

    double t=1, t_old=0;
    double gamma=1, gamma_inc=2;
    double obj=0.0, obj_old=0.0;

    internal::set_is_malloc_allowed(false);
    bool done = false;
    while (!done && iter <= options.max_iter)
    {
        double alpha = double(t_old - 1)/t;

        // compute search point
        Ws = (1+alpha)*Wz - alpha*Wz_old;
        Cs = (1+alpha)*Cz - alpha*Cz_old;

        // zero gradient
        gWs.setZero();
        gCs.setZero();

        // compute gradient and objective at search point
        double Fs = 0;
        for (task=0; task<n_tasks; task++)
        {
            for (int i=options.ind[task]; i<options.ind[task+1]; i++)
            {
                double aa = -y[i]*(features->dense_dot(i, Ws.col(task).data(), n_feats)+Cs[task]);
                double bb = CMath::max(aa,0.0);

                // avoid underflow when computing exponential loss
                Fs += (CMath::log(CMath::exp(-bb) + CMath::exp(aa-bb)) + bb)/n_vecs;
                double b = -y[i]*(1 - 1/(1+CMath::exp(aa)))/n_vecs;

                gCs[task] += b;
                features->add_to_dense_vec(b, i, gWs.col(task).data(), n_feats);
            }
        }
        gWs.noalias() += 2*rho*Ws;

        // add regularizer
        Fs += rho*Ws.squaredNorm();

        double Fzp = 0.0;

        // line search, Armijo-Goldstein scheme
        while (true)
        {
            // compute trace projection of Ws - gWs/gamma with threshold 2*rho/gamma
            internal::set_is_malloc_allowed(true);
            Wzp.setZero();
            JacobiSVD<MatrixXd> svd(Ws - gWs/gamma, ComputeThinU | ComputeThinV);
            for (int i=0; i<svd.singularValues().size(); i++)
            {
                if (svd.singularValues()[i] > 2*rho/gamma)
                    Wzp += svd.matrixU().col(i)*
                           svd.singularValues()[i]*
                           svd.matrixV().col(i).transpose();
            }
            internal::set_is_malloc_allowed(false);

            // walk in direction of antigradient
            Czp = Cs - gCs/gamma;

            // compute objective at line search point (Wzp, Czp)
            Fzp = 0.0;
            for (task=0; task<n_tasks; task++)
            {
                for (int i=options.ind[task]; i<options.ind[task+1]; i++)
                {
                    double aa = -y[i]*(features->dense_dot(i, Wzp.col(task).data(), n_feats)+Czp[task]);
                    double bb = CMath::max(aa,0.0);

                    Fzp += (CMath::log(CMath::exp(-bb) + CMath::exp(aa-bb)) + bb)/n_vecs;
                }
            }
            Fzp += rho*Wzp.squaredNorm();

            // compute delta between line search point and search point
            delta_Wzp = Wzp - Ws;
            delta_Czp = Czp - Cs;

            // norms of delta
            double nrm_delta_Wzp = delta_Wzp.squaredNorm();
            double nrm_delta_Czp = delta_Czp.squaredNorm();

            double r_sum = (nrm_delta_Wzp + nrm_delta_Czp)/2;

            double Fzp_gamma = Fs + (delta_Wzp.transpose()*gWs).trace() +
                                    (delta_Czp.transpose()*gCs).trace() +
                                    (gamma/2)*nrm_delta_Wzp +
                                    (gamma/2)*nrm_delta_Czp;

            // break if delta is getting too small
            if (r_sum <= 1e-20)
            {
                done = true;
                break;
            }

            // break if objective at line search point is smaller than Fzp_gamma
            if (Fzp <= Fzp_gamma)
                break;
            else
                gamma *= gamma_inc;
        }

        Wz_old = Wz;
        Cz_old = Cz;
        Wz = Wzp;
        Cz = Czp;

        // compute objective value
        obj_old = obj;
        obj = Fzp;
        internal::set_is_malloc_allowed(true);
        JacobiSVD<MatrixXd> svd(Wzp, EigenvaluesOnly);
        obj += rho*svd.singularValues().sum();
        internal::set_is_malloc_allowed(false);

        // check if process should be terminated
        switch (options.termination)
        {
            case 0:
                if (iter>=2)
                {
                    if (CMath::abs(obj-obj_old) <= options.tolerance)
                        done = true;
                }
                break;
            case 1:
                if (iter>=2)
                {
                    if (CMath::abs(obj-obj_old) <= options.tolerance*CMath::abs(obj_old))
                        done = true;
                }
                break;
            case 2:
                if (CMath::abs(obj) <= options.tolerance)
                    done = true;
                break;
            case 3:
                if (iter>=options.max_iter)
                    done = true;
                break;
        }

        iter++;
        t_old = t;
        t = 0.5 * (1 + CMath::sqrt(1.0 + 4*t*t));
    }
    SG_SDEBUG("%d iterations passed, objective = %f\n",iter,obj);

    SGMatrix<float64_t> tasks_w(n_feats, n_tasks);
    for (int i=0; i<n_feats; i++)
    {
        for (task=0; task<n_tasks; task++)
            tasks_w(i,task) = Wzp(i,task);
    }
    SGVector<float64_t> tasks_c(n_tasks);
    for (int i=0; i<n_tasks; i++)
        tasks_c[i] = Czp[i];

    return slep_result_t(tasks_w, tasks_c);
};
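// Multiclass logistic regression with L1/Lq regularization (SLEP).
//
// The solver below follows an accelerated proximal gradient (FISTA-style) scheme:
// each iteration forms a search point from the two previous iterates, evaluates the
// logistic loss and its gradient there (using the log-sum-exp trick to avoid
// underflow), and takes a proximal step via eppMatrix(), which applies the
// Lq-regularized Euclidean projection. The step size 1/L is adapted by a line
// search against the quadratic upper bound of the loss.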
slep_result_t slep_mc_plain_lr(
        CDotFeatures* features,
        CMulticlassLabels* labels,
        float64_t z,
        const slep_options& options)
{
    int i,j;
    // obtain problem parameters
    int n_feats   = features->get_dim_feature_space();
    int n_vecs    = features->get_num_vectors();
    int n_classes = labels->get_num_classes();

    // labels vector containing class indices in range [0, n_classes)
    SGVector<float64_t> labels_vector = labels->get_labels();

    // initialize matrices and vectors to be used
    // weight vector
    MatrixXd w = MatrixXd::Zero(n_feats, n_classes);
    // intercepts (biases)
    VectorXd c = VectorXd::Zero(n_classes);

    // warm-start from the last result if available
    if (options.last_result)
    {
        SGMatrix<float64_t> last_w = options.last_result->w;
        SGVector<float64_t> last_c = options.last_result->c;
        for (i=0; i<n_classes; i++)
        {
            c[i] = last_c[i];
            for (j=0; j<n_feats; j++)
                w(j,i) = last_w(j,i);
        }
    }

    // iterative process matrices and vectors
    MatrixXd wp = w, wwp = MatrixXd::Zero(n_feats, n_classes);
    VectorXd cp = c, ccp = VectorXd::Zero(n_classes);
    // search point weight vector
    MatrixXd search_w = MatrixXd::Zero(n_feats, n_classes);
    // search point intercepts
    VectorXd search_c = VectorXd::Zero(n_classes);
    // dot products
    MatrixXd Aw = MatrixXd::Zero(n_vecs, n_classes);
    for (j=0; j<n_classes; j++)
        features->dense_dot_range(Aw.col(j).data(), 0, n_vecs, NULL, w.col(j).data(), n_feats, 0.0);
    MatrixXd As  = MatrixXd::Zero(n_vecs, n_classes);
    MatrixXd Awp = MatrixXd::Zero(n_vecs, n_classes);
    // gradients
    MatrixXd g  = MatrixXd::Zero(n_feats, n_classes);
    VectorXd gc = VectorXd::Zero(n_classes);
    // projection
    MatrixXd v = MatrixXd::Zero(n_feats, n_classes);

    // Lipschitz continuous gradient parameter for line search
    double L = 1.0/(n_vecs*n_classes);
    // coefficients for search point computation
    double alphap = 0, alpha = 1;

    // lambda regularization parameter
    double lambda = z;
    // objective values
    double objective = 0.0;
    double objective_p = 0.0;

    int iter = 0;
    bool done = false;
    CTime time;
    //internal::set_is_malloc_allowed(false);
    while ((!done) && (iter<options.max_iter) && (!CSignal::cancel_computations()))
    {
        double beta = (alphap-1)/alpha;
        // compute search points
        search_w = w + beta*wwp;
        search_c = c + beta*ccp;

        // update dot products with search point
        As = Aw + beta*(Aw-Awp);

        // compute objective and gradient at search point
        double fun_s = 0;
        g.setZero();
        gc.setZero();
        // for each vector
        for (i=0; i<n_vecs; i++)
        {
            // class of current vector
            int vec_class = labels_vector[i];
            // for each class
            for (j=0; j<n_classes; j++)
            {
                // compute logistic loss
                double aa = ((vec_class == j) ? -1.0 : 1.0)*(As(i,j) + search_c(j));
                double bb = aa > 0.0 ? aa : 0.0;
                // avoid underflow via log-sum-exp trick
                fun_s += CMath::log(CMath::exp(-bb) + CMath::exp(aa-bb)) + bb;
                double prob = 1.0/(1+CMath::exp(aa));
                double b = ((vec_class == j) ? -1.0 : 1.0)*(1-prob);///(n_vecs*n_classes);
                // update gradient of intercepts
                gc[j] += b;
                // update gradient of weight vectors
                features->add_to_dense_vec(b, i, g.col(j).data(), n_feats);
            }
        }
        //fun_s /= (n_vecs*n_classes);

        wp = w;
        Awp = Aw;
        cp = c;

        int inner_iter = 0;
        double fun_x = 0;

        // line search process
        while (inner_iter<5000)
        {
            // compute line search point
            v = search_w - g/L;
            c = search_c - gc/L;

            // compute Lq-regularized projection (proximal step) of the gradient step
            eppMatrix(w.data(),v.data(),n_feats,n_classes,lambda/L,options.q);
            // reuse v as the difference between the projected point and the search point
            v = w - search_w;

            // update dot products
            for (j=0; j<n_classes; j++)
                features->dense_dot_range(Aw.col(j).data(), 0, n_vecs, NULL, w.col(j).data(), n_feats, 0.0);

            // compute objective at line search point
            fun_x = 0;
            for (i=0; i<n_vecs; i++)
            {
                int vec_class = labels_vector[i];
                for (j=0; j<n_classes; j++)
                {
                    double aa = ((vec_class == j) ? -1.0 : 1.0)*(Aw(i,j) + c(j));
                    double bb = aa > 0.0 ? aa : 0.0;
                    fun_x += CMath::log(CMath::exp(-bb) + CMath::exp(aa-bb)) + bb;
                }
            }
            //fun_x /= (n_vecs*n_classes);

            // check for termination of line search
            double r_sum = (v.squaredNorm() + (c-search_c).squaredNorm())/2;
            double l_sum = fun_x - fun_s - v.cwiseProduct(g).sum() - (c-search_c).dot(gc);

            // stop if the proximal step is negligibly small
            if (r_sum <= 1e-20)
            {
                SG_SINFO("Gradient step makes little improvement (%f)\n",r_sum)
                done = true;
                break;
            }

            // accept the step if the quadratic upper bound holds, otherwise increase L
            if (l_sum <= r_sum*L)
                break;
            else
                L = CMath::max(2*L, l_sum/r_sum);

            inner_iter++;
        }

        // update alpha coefficients
        alphap = alpha;
        alpha = (1+CMath::sqrt(4*alpha*alpha+1))/2;

        // update wwp and ccp
        wwp = w - wp;
        ccp = c - cp;

        // update objectives
        objective_p = objective;
        objective = fun_x;

        // regularize objective with the mixed L1/L2 norm of w (sum of per-class column norms)
        double L1q_norm = 0.0;
        for (int m=0; m<n_classes; m++)
            L1q_norm += w.col(m).norm();
        objective += lambda*L1q_norm;

        //cout << "Objective = " << objective << endl;

        // check for termination of whole process
        if ((CMath::abs(objective - objective_p) < options.tolerance*CMath::abs(objective_p)) &&
            (iter>2))
        {
            SG_SINFO("Objective changes less than tolerance\n")
            done = true;
        }

        iter++;
    }
    SG_SINFO("%d iterations passed, objective = %f\n",iter,objective)
    //internal::set_is_malloc_allowed(true);

    // output computed weight vectors and intercepts
    SGMatrix<float64_t> r_w(n_feats,n_classes);
    for (j=0; j<n_classes; j++)
    {
        for (i=0; i<n_feats; i++)
            r_w(i,j) = w(i,j);
    }
    //r_w.display_matrix();
    SGVector<float64_t> r_c(n_classes);
    for (j=0; j<n_classes; j++)
        r_c[j] = c[j];

    return slep_result_t(r_w, r_c);
};
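/* Usage sketch (illustrative only, not part of the library): it assumes the
 * surrounding Shogun API (CDenseFeatures, CMulticlassLabels) and that
 * slep_options::default_options() is available from the SLEP headers; the
 * regularization strength 0.1 is an arbitrary example value.
 *
 *   SGMatrix<float64_t> X(n_feats, n_vecs);      // one example per column
 *   SGVector<float64_t> lab(n_vecs);             // class indices 0..n_classes-1
 *   CDenseFeatures<float64_t>* feats = new CDenseFeatures<float64_t>(X);
 *   CMulticlassLabels* mc_labels = new CMulticlassLabels(lab);
 *
 *   slep_options opts = slep_options::default_options();
 *   opts.q = 2.0;                                // Lq norm used by eppMatrix
 *   slep_result_t res = slep_mc_plain_lr(feats, mc_labels, 0.1, opts);
 *   // res.w is an n_feats x n_classes weight matrix, res.c the per-class intercepts
 */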