int QP::line_search() { double eps = 1e-4; result = vector<double>(lb.size(), 1); printf("obj:%.5f\n", compute_obj()); for (int i=0; i<num_var; i++) { if (lb[i]+eps>ub[i]) continue; double best = compute_obj(); double best_val = lb[i]; double from = lb[i]; double to = ub[i]; if (from<-10) from = -10; if (to>10) to = 10; double step = (to-from)/10; for (double val=from; val<to; val+=step) { result[i] = val; double res = compute_obj(); if (res<best) { best_val = val; best = res; } } result[i] = best_val; //printf("obj:%.5f\n", compute_obj()); } }
void Solver_ALM::Solve(double *w, double *obj, Function_SOFTMAX *func) { /* int *perm = Malloc(int, l); double *obj = Malloc(double, max_iter+1); double *grad = Malloc(double, w_size); double *probability = Malloc(double, nr_class); double *norm_term = Malloc(double, nr_class); */ int *perm = new int[l]; double *grad_w = new double[w_size]; //double *obj = new double[max_iter+1]; //double *grad = new double[nr_class]; //double *output = new double[nr_class]; //double *probability = new double[nr_class]; //double *norm_term = new double[nr_class]; //double sum = 0; //int id; //double *v = new double[nr_class]; // variables for sub-problem double *d = new double[nr_class]; double rho; double rho_max = 5; double rho_init = 1; //double rho = 1e-1; double eps_lambda = 1e-2; double eps_G = 1e-2 * 2; double factor = 0.9; //bool line_search = false; bool newton; double beta = 1.00; // double eta = 0.3; //double eta = 0.1; // iris double eta = 0.2; // glass double eta_g_upper = 1e-4; double eta_g_lower = 1e-6; double G_norm; double G_norm_old; double G_gap; double sigma = 1e-1; double sub_obj; double sub_obj_new; double log_alpha; double term1; double term2; double *d_temp = new double[nr_class]; double *sk = new double[nr_class]; double *yk = new double[nr_class]; double G_temp; double yk_sum = 0; double sk_sum = 0; double *G_log_dif = new double[l]; //double *eta_list = new double[l]; //for(int i = 0; i < l; ++i) //{ //eta_list[i] = eta; //} /* // gradient descent double rho = 1e-6; */ double *q = new double[nr_class]; double *G = new double[nr_class]; double *GG = new double[nr_class]; double timer = 0; double grad_norm; double primal; double dual; double accuracy; double *alpha_new = new double [alpha_size]; double *G_log = new double [alpha_size]; int i; int iter_class; int id; feature_node *xi; int yi; int iter = 0; int inner_iter; int newton_iter; int j; int temp; int index; double value; double start; int order; double norm_id; double lambda; double alpha_temp; double alpha_old; double eta_temp; double sum_d = 0; double lambda_gap = 0; //initialize alpha for(i = 0; i < alpha_size; i++) { alpha[i] = pi / (1-nr_class); } for(id = 0; id < l; id++) { yi = prob->y[id]; alpha[id*nr_class+yi] = pi; } //initialize w //pre-calculate norm_list for(i = 0; i < w_size; i++) w[i] = 0; for(id = 0; id < l; id++) { xi = prob->x[id]; yi = prob->y[id]; while(xi->index != -1) { index = xi->index - 1; value = xi->value; //calculate norm_list norm_list[id] += value * value; for(iter_class = 0; iter_class < nr_class; iter_class++) { w[index*nr_class+iter_class] += alpha[id*nr_class+iter_class] * value; } ++xi; } } //compute objective obj[0] = compute_obj(w); for(; iter < max_iter; iter++) //for(; iter < 1; iter++) { start = clock(); //rho *= 0.9999; beta = std::max(0.00001, beta * 0.75); // glass //if((iter+1)%10 == 0) //beta = std::max(0.0000001, beta * 0.1); //beta = std::max(0.001, beta * 0.85); iris //permutation for(i = 0; i < l; ++i) { perm[i] = i; } for(i = 0; i < l; ++i) { j = i + rand()%(l-i); //swap(perm[i], perm[j]); temp = perm[i]; perm[i] = perm[j]; perm[j] = temp; } //two-level dual block-coordinate descent //the outer level considers a block of variables corresponding an instance , i.e., alpha_i for(order = 0; order < l; ++order) { rho = rho_init; id = perm[order]; yi = prob->y[id]; norm_id = norm_list[id]; //solve the sub-problem //initialize d and lambda //compute q lambda = 0.0; for(i = 0; i < nr_class; ++i) { d[i] = 0; q[i] = 0; G[i] = 0; } newton = true; xi = prob->x[id]; while(xi->index != -1) { index = xi->index - 1; value = xi->value; for(i = 0; i < nr_class; ++i) { q[i] += w[index*nr_class+i] * value; } ++xi; } //inner loop for sub-problem for(inner_iter = 0; inner_iter < max_inner_iter; ++inner_iter, rho *= 1.1) { rho = std::min(rho, rho_max); //update d for(newton_iter = 0; newton_iter < max_newton_iter; ++newton_iter) { if(newton_iter == 1) { //newton = false; } if(newton_iter == 3) { newton = true; } //compute derivatives sum_d = 0; for(i = 0; i < nr_class; ++i) { sum_d += d[i]; } G_norm = 0; //sub_obj = 0; //term1 = 0; //term2 = 0; yk_sum = 0; for(i = 0; i < nr_class; ++i) { //compute first-order derivatives alpha_temp = alpha[id*nr_class+i]; G_temp = C*norm_id*d[i] + rho*sum_d; G_temp += C*q[i] + lambda; //if(inner_iter == 0 && newton_iter == 0) if (i == yi) { alpha_temp = 1 - alpha_temp - d[i]; } else { alpha_temp = -alpha_temp - d[i]; } //term1 += d[i] * d[i]; //term2 += (C*q[i]+lambda) * d[i]; log_alpha = log(alpha_temp); //sub_obj += alpha_temp * log_alpha; G_temp += 1 - log_alpha; //compute second-order derivatives //diagonal approximation G_norm += G_temp * G_temp; yk[i] = G_temp - G[i]; G[i] = G_temp; yk_sum += fabs(yk[i] * sk[i]); if(newton) { GG[i] = C * norm_list[id] + rho * 1 + 1 / alpha_temp; } } G_gap = fabs(G_norm-G_norm_old) / G_norm_old; if(G_gap <= eps_G) { break; } else { G_norm_old = G_norm; } //sub_obj += 0.5 * term1 * (C*norm_id*norm_id+rho) + term2; //update d //sub_obj_new = 0; //term1 = 0; //term2 = 0; for(i = 0; i < nr_class; ++i) { //d[i] -= rho * G[i]/GG[i]; //d[i] -= G[i]/GG[i]; //d[i] -= rho * G[i]; //double d_temp = d[i] - 1.0* G[i]/GG[i]; //beta = 0.5; // project //d_temp[i] = d[i] - eta * G[i]; if(newton) { d_temp[i] = d[i] - beta * eta * G[i] / GG[i]; } else { eta_temp = sk_sum / yk_sum; eta_temp = std::max(eta_temp, eta_g_lower); eta_temp = std::min(eta_temp, eta_g_upper); //eta_temp = eta_g; d_temp[i] = d[i] - beta * G[i] * eta_temp; } alpha_old = alpha[id*nr_class+i]; alpha_temp = alpha_old + d_temp[i]; //factor = newton_iter / (10.0+newton_iter); if (i == yi) { if (alpha_temp <= 0) { alpha_temp = alpha_old * (1-factor); } else if (alpha_temp >= 1) { alpha_temp = alpha_old*(1-factor) + factor; } //log_alpha = (1-alpha_temp) * log(1-alpha_temp); } else { if (alpha_temp >= 0) { alpha_temp = alpha_old * (1-factor); } else if (alpha_temp <= -1) { alpha_temp = alpha_old * (1-factor) - factor; } //log_alpha = (-alpha_temp) * log(-alpha_temp); } //alpha_new[i] = alpha_temp; d_temp[i] = alpha_temp - alpha_old; //term1 += d_temp[i] * d_temp[i]; //term2 += (C*q[i]+lambda) * d_temp[i]; //sub_obj_new += log_alpha; } //sub_obj_new += 0.5 * term1 * (C*norm_id*norm_id+rho) + term2; // line search sk_sum = 0; for(i = 0; i < nr_class; ++i) { sk[i] = d_temp[i] - d[i]; d[i] = d_temp[i]; //sk_sum += sk[i]*sk[i]/GG[i]; sk_sum += sk[i]*sk[i]; //d[i] = alpha_new[i] - alpha[id*nr_class+i]; } } if(debug_flag) std::cout << newton_iter << '\t' << G_gap << '\t' << lambda_gap << '\t' << clock()-start << std::endl; //update lambda sum_d = 0; for(i = 0; i < nr_class; ++i) { sum_d += d[i]; } lambda_gap = rho * sum_d; lambda += lambda_gap; if( fabs(lambda_gap) <= eps_lambda) break; } //end of inner loop if(debug_flag) std::cout << inner_iter << '\t' << G_gap << '\t' << lambda_gap << '\t' << clock()-start << std::endl; //update w xi = prob->x[id]; while(xi->index != -1) { index = xi->index - 1; value = xi->value; for(iter_class = 0; iter_class < nr_class; ++iter_class) { w[index*nr_class+iter_class] += d[iter_class] * value; } ++xi; } //update alpha for(iter_class = 0; iter_class < nr_class; ++iter_class) { alpha[id*nr_class+iter_class] += d[iter_class]; } } // one epoch of dual coordinate descent timer += clock() - start; primal = func->obj_primal(w); for(i = 0; i < alpha_size; ++i) { alpha_new[i] = -alpha[i]; } for(id = 0; id < l; ++id) { yi = prob->y[id]; alpha_new[id*nr_class+yi] = alpha[id*nr_class+yi] + 1; } dual = func->obj_dual(alpha_new, w); func->grad(w, grad_w, &grad_norm); accuracy = func->testing(w); std::cout << iter << '\t' << timer << '\t' << primal << '\t' << dual << '\t' << accuracy << '\t' << grad_norm << std::endl; //compute objective obj[iter+1] = compute_obj(w); /* if(iter>5 && (fabs(obj[iter+1]-obj[iter])/obj[iter] <= eps) ) { break; } */ } obj[iter+1] = -1; //delete [] norm_term; //delete [] probability; //delete [] grad; //delete [] obj; delete [] perm; delete [] grad_w; //delete [] output; delete [] G; delete [] GG; delete [] d; delete [] q; delete [] alpha_new; delete [] G_log; delete [] d_temp; delete [] G_log_dif; delete [] sk; delete [] yk; }