//[[Rcpp::export]] List SPMBgraphsqrt(Eigen::Map<Eigen::MatrixXd> data, NumericVector &lambda, int nlambda, int d, NumericVector &x, IntegerVector &col_cnz, IntegerVector &row_idx) { Eigen::ArrayXd Xb, r, grad, w1, Y, XX, gr; Eigen::ArrayXXd X; Eigen::MatrixXd tmp_icov; tmp_icov.resize(d, d); tmp_icov.setZero(); std::vector<Eigen::MatrixXd > tmp_icov_p; tmp_icov_p.clear(); for(int i = 0; i < nlambda; i++) tmp_icov_p.push_back(tmp_icov); int n = data.rows(); X = data; XX.resize(d); for (int j = 0; j < d; j++) XX[j] = (X.col(j)*X.col(j)).sum()/n; double prec = 1e-4; int max_iter = 1000; int num_relaxation_round = 3; int cnz = 0; for(int m=0;m<d;m++) { Xb.resize(n); Xb.setZero(); grad.resize(d); grad.setZero(); gr.resize(d); gr.setZero(); w1.resize(d); w1.setZero(); r.resize(n); r.setZero(); Y = X.col(m); Eigen::ArrayXd Xb_master(n); Eigen::ArrayXd w1_master(n); std::vector<int> actset_indcat(d, 0); std::vector<int> actset_indcat_master(d, 0); std::vector<int> actset_idx; std::vector<double> old_coef(d, 0); std::vector<double> grad(d, 0); std::vector<double> grad_master(d, 0); double a = 0, g = 0, L = 0, sum_r = 0, sum_r2 = 0; double tmp_change = 0, local_change = 0; r = Y - Xb; sum_r = r.sum(); sum_r2 = r.matrix().dot(r.matrix()); L = sqrt(sum_r2 / n); double dev_thr = fabs(L) * prec; //cout<<dev_thr<<endl; for(int i = 0; i < d; i++) { grad[i] = (r * X.col(i)).sum() / (n*L); } for(int i = 0; i < d; i++) gr[i] = abs(grad[i]); w1_master = w1; Xb_master = Xb; for (int i = 0; i < d; i++) grad_master[i] = gr[i]; std::vector<double> stage_lambdas(d, 0); for(int i=0;i<nlambda;i++) { double ilambda = lambda[i]; w1 = w1_master; Xb = Xb_master; for (int j = 0; j < d; j++) { gr[j] = grad_master[j]; actset_indcat[j] = actset_indcat_master[j]; } // init the active set double threshold; if (i > 0) threshold = 2 * lambda[i] - lambda[i - 1]; else threshold = 2 * lambda[i]; for (int j = 0; j < m; ++j) { stage_lambdas[j] = lambda[i]; if (gr[j] > threshold) actset_indcat[j] = 1; } for (int j = m+1; j < d; ++j) { stage_lambdas[j] = lambda[i]; if (gr[j] > threshold) actset_indcat[j] = 1; } stage_lambdas[m] = lambda[i]; r = Y - Xb; sum_r = r.sum(); sum_r2 = r.matrix().dot(r.matrix()); L = sqrt(sum_r2 / n); // loop level 0: multistage convex relaxation int loopcnt_level_0 = 0; int idx; double old_w1, updated_coord; while (loopcnt_level_0 < num_relaxation_round) { loopcnt_level_0++; // loop level 1: active set update int loopcnt_level_1 = 0; bool terminate_loop_level_1 = true; while (loopcnt_level_1 < max_iter) { loopcnt_level_1++; terminate_loop_level_1 = true; for (int j = 0; j < d; j++) old_coef[j] = w1[j]; // initialize actset_idx actset_idx.clear(); for (int j = 0; j < m; j++) if (actset_indcat[j]) { g = 0.0; a = 0.0; double tmp; sum_r2 = r.matrix().dot(r.matrix()); L = sqrt(sum_r2 / n); Eigen::ArrayXd wXX = (1 - r*r/sum_r2) * X.col(j) * X.col(j); g = (wXX * w1[j] + r * X.col(j)).sum()/(n*L); a = wXX.sum()/(n*L); tmp = w1[j]; w1[j] = thresholdl1(g, stage_lambdas[j]) / a; tmp = w1[j] - tmp; // Xb += delta*X[idx*n] Xb = Xb + tmp * X.col(j); sum_r = 0.0; sum_r2 = 0.0; // r -= delta*X r = r - tmp * X.col(j); sum_r = r.sum(); sum_r2 = r.matrix().dot(r.matrix()); L = sqrt(sum_r2 / n); updated_coord = w1[j]; if (fabs(updated_coord) > 0) actset_idx.push_back(j); } for (int j = m+1; j < d; j++) if (actset_indcat[j]) { g = 0.0; a = 0.0; double tmp; sum_r2 = r.matrix().dot(r.matrix()); L = sqrt(sum_r2 / n); Eigen::ArrayXd wXX = (1 - r*r/sum_r2) * X.col(j) * X.col(j); g = (wXX * w1[j] + r * X.col(j)).sum()/(n*L); a = wXX.sum()/(n*L); tmp = w1[j]; w1[j] = thresholdl1(g, stage_lambdas[j]) / a; tmp = w1[j] - tmp; // Xb += delta*X[idx*n] Xb = Xb + tmp * X.col(j); sum_r = 0.0; sum_r2 = 0.0; // r -= delta*X r = r - tmp * X.col(j); sum_r = r.sum(); sum_r2 = r.matrix().dot(r.matrix()); L = sqrt(sum_r2 / n); updated_coord = w1[j]; if (fabs(updated_coord) > 0) actset_idx.push_back(j); } // loop level 2: proximal newton on active set int loopcnt_level_2 = 0; bool terminate_loop_level_2 = true; while (loopcnt_level_2 < max_iter) { loopcnt_level_2++; terminate_loop_level_2 = true; for (int k = 0; k < actset_idx.size(); k++) { idx = actset_idx[k]; old_w1 = w1[idx]; g = 0.0; a = 0.0; double tmp; sum_r2 = r.matrix().dot(r.matrix()); L = sqrt(sum_r2 / n); Eigen::ArrayXd wXX = (1 - r*r/sum_r2) * X.col(idx) * X.col(idx); g = (wXX * w1[idx] + r * X.col(idx)).sum()/(n*L); a = wXX.sum()/(n*L); tmp = w1[idx]; w1[idx] = thresholdl1(g, stage_lambdas[idx]) / a; tmp = w1[idx] - tmp; // Xb += delta*X[idx*n] Xb = Xb + tmp * X.col(idx); sum_r = 0.0; sum_r2 = 0.0; // r -= delta*X r = r - tmp * X.col(idx); sum_r = r.sum(); sum_r2 = r.matrix().dot(r.matrix()); L = sqrt(sum_r2 / n); updated_coord = w1[idx]; tmp_change = old_w1 - w1[idx]; double a = (X.col(idx) * X.col(idx) * (1 - r * r/(L*L*n))).sum()/(n*L); local_change = a * tmp_change * tmp_change / (2 * L * n); if (local_change > dev_thr) terminate_loop_level_2 = false; } if (terminate_loop_level_2) break; } terminate_loop_level_1 = true; // check stopping criterion 1: fvalue change for (int k = 0; k < actset_idx.size(); ++k) { idx = actset_idx[k]; tmp_change = old_w1 - w1[idx]; double a = (X.col(idx) * X.col(idx) * (1 - r * r/(L*L*n))).sum()/(n*L); local_change = a * tmp_change * tmp_change / (2 * L * n); if (local_change > dev_thr) terminate_loop_level_1 = false; } r = Y - Xb; sum_r = r.sum(); sum_r2 = r.matrix().dot(r.matrix()); L = sqrt(sum_r2 / n); if (terminate_loop_level_1) break; // check stopping criterion 2: active set change bool new_active_idx = false; for (int k = 0; k < m; k++) if (actset_indcat[k] == 0) { grad[idx] = (r * X.col(idx)).sum() / (n*L); //cout<<grad[idx]; gr[k] = fabs(grad[k]); if (gr[k] > stage_lambdas[k]) { actset_indcat[k] = 1; new_active_idx = true; } } for (int k = m+1; k < d; k++) if (actset_indcat[k] == 0) { grad[idx] = (r * X.col(idx)).sum() / (n*L); //cout<<grad[idx] gr[k] = fabs(grad[k]); if (gr[k] > stage_lambdas[k]) { actset_indcat[k] = 1; new_active_idx = true; } } if(!new_active_idx) break; } if (loopcnt_level_0 == 1) { for (int j = 0; j < d; j++) { w1_master[j] = w1[j]; grad_master[j] = gr[j]; actset_indcat_master[j] = actset_indcat[j]; } for (int j = 0; j < n; j++) Xb_master[j] = Xb[j]; } } for(int j=0;j<actset_idx.size();j++) { int w_idx = actset_idx[j]; x[cnz] = w1[w_idx]; row_idx[cnz] = i*d+w_idx; cnz++; //cout<<cnz<<" "; } double tal = 0; Eigen::MatrixXd temp; temp.resize(n, 1); for(int j = 0; j < n; j++) { temp(j, 0) = 0; for(int k = 0; k < d; k++) temp(j, 0) += X.matrix()(j, k)*w1[k]; temp(j, 0) = Y[j] - temp(j, 0); } //temp = Y.matrix() - X.matrix().transpose()*w1.matrix(); for(int j = 0; j < n; j++) tal += temp(j, 0)*temp(j, 0); tal = sqrt(tal)/sqrt(n); tmp_icov = tmp_icov_p[i]; tmp_icov(m, m) = pow(tal, -2); for(int j = 0; j < m; j++) tmp_icov(j, m) = -tmp_icov(m, m)*w1[j]; for(int j = m+1; j < d; j++) tmp_icov(j, m) = -tmp_icov(m, m)*w1[j]; tmp_icov_p[i] = tmp_icov; } col_cnz[m+1]=cnz; } for(int i = 0; i < nlambda; i++) tmp_icov_p[i] = (tmp_icov_p[i].transpose()+tmp_icov_p[i])/2; return List::create( _["col_cnz"] = col_cnz, _["row_idx"] = row_idx, _["x"] = x, _["icov"] = tmp_icov_p ); }
int main() { //initialisation Eigen::ArrayXXd R = Eigen::ArrayXXd::Zero(FEATURE_SIZE, NO_OF_POSTAGS); //matrix R, dimension: Nf x Nw std::default_random_engine generator; std::normal_distribution<double> distr(0.0, 0.1); //initialise the matrix R with small, random, non-zero numbers for (int i = 0; i < R.rows(); i++) { for (int j = 0; j < R.cols(); j++) { double d = distr(generator); //avoid having 0's as the weight, try to get all the coefficients initialised with small numbers while (d == 0.0) { d = distr(generator); } R(i, j) = d; } } //initialise the matrix Q, of size feature_size x (no_of_postags * no_of_distance) Eigen::ArrayXXd Q = Eigen::ArrayXXd::Zero(FEATURE_SIZE, NO_OF_POSTAGS); for (int i = 0; i < Q.rows(); i++) { for (int j = 0; j < Q.cols(); j++) { double d = distr(generator); //avoid having 0's as the weight, try to get all the coefficients initialised with small numbers while (d == 0.0) { d = distr(generator); } Q(i, j) = d; } } //create an array of Ci (where each Ci is a matrix of Nf x Nf that governs the interaction between the ith feature and vn) std::vector<Eigen::ArrayXXd> Ci; for (int i = 0; i < 5; i++) { Eigen::ArrayXXd Ci_temp = Eigen::ArrayXXd::Zero(FEATURE_SIZE, FEATURE_SIZE); std::default_random_engine generator; std::normal_distribution<double> distr(0.0, 0.1); for (int j = 0; j < Ci_temp.rows(); j++) { for (int k = 0; k < Ci_temp.cols(); k++) { double d = distr(generator); //avoid having 0's as the weight, try to get all the coefficients initialised with small numbers while (d == 0.0) { d = distr(generator); } Ci_temp(j, k) = d; } } Ci.push_back(Ci_temp); } //initialise vi. We use a context of 5 (the postag of the head itself, the postag before the modifier, the postag after the modifier, the postag before the head, and the postag after the head) //initialise vn. Both vi and vn are column vectors with all 0's and a single 1, which indicates the POS-tag we desire to get //FOR NOW, we ignore the term br^transpose * R^transpose*vn, but find out what it is //initialise the bias term bv Eigen::ArrayXd bv; //print the array of biases into an external file std::ofstream final_postag_bias("final_postag_bias.txt"); if (!final_postag_bias.is_open()) { std::cout << "fail to create final postag bias file" << std::endl; } /*std::map<std::string, int> big_pos_tag_map = get_postag_bias(bv, "distance_count.txt", "fine_pos_tags.txt"); std::cout << "size of big pos tag map = " << big_pos_tag_map.size() << std::endl; for (std::map<std::string, int>::iterator it = big_pos_tag_map.begin(); it != big_pos_tag_map.end(); it++) { final_postag_bias << "Pos tag " << it->first << " has bias value " << bv(it->second) << std::endl; } */ /* create a pair between each "target pos tag" with a bias value (tested and correct)*/ //Eigen::ArrayXd word_bias_pair = Eigen::ArrayXd::Zero(NO_OF_POSTAGS_WITHOUT_DUMMY * NO_OF_DISTANCE); int i_3 = -1; std::ifstream fine_pos_tags("fine_pos_tags.txt"); std::string temp_string; /* while (getline(fine_pos_tags, temp_string)) { for (int i = -MAX_DISTANCE; i <= MAX_DISTANCE; i++) { if (i != 0) { if (i == -49 || i == -45 || i == -40 || i == -35 || i == -30 || i == -25 || i == -20 || i == -15 || i == -10 || i == -5 || i == -4 || i == -3 || i == -2 || i == -1 || i == 1 || i == 2 || i == 3 || i == 4 || i == 5 || i == 6 || i == 11 || i == 16 || i == 21 || i == 26 || i == 31 || i == 36|| i == 41 || i == 46) { i_3++; std::stringstream sstm; sstm << temp_string << i; if (big_pos_tag_map.find(sstm.str()) == big_pos_tag_map.end()) { std::cout << "ERROR" << std::endl; } else { int pos_tag_idx = big_pos_tag_map[sstm.str()]; //word_bias_pair(i_3) = bv(pos_tag_idx); } } } } } */ /* end */ //main training part std::map<std::string, int> mapping = postag_mapping("fine_pos_tags.txt"); fine_pos_tags.close(); //print out both the small and big mapping //std::ofstream big_pos_tag_mapping("big_pos_tag_map.txt"); std::ofstream small_pos_tag_mapping("small_pos_tag_map.txt"); /* for (std::map<std::string, int>::iterator it = big_pos_tag_map.begin(); it != big_pos_tag_map.end(); it++) { big_pos_tag_mapping << it->first << " " << it->second << std::endl; } */ for (std::map<std::string, int>::iterator it = mapping.begin(); it != mapping.end(); it++) { small_pos_tag_mapping << it->first << " " << it->second << std::endl; } small_pos_tag_mapping.close(); //create the pairs std::ifstream pairs ("real_pairs_medium.txt"); //initialise the gradient matrix std::vector<Eigen::MatrixXd> dCi; for (int i = 0; i < 5; i++) { Eigen::MatrixXd temp = Eigen::MatrixXd::Zero(FEATURE_SIZE, FEATURE_SIZE); dCi.push_back(temp); } Eigen::MatrixXd dR = Eigen::MatrixXd::Zero(FEATURE_SIZE, NO_OF_POSTAGS); Eigen::MatrixXd dQ = Eigen::MatrixXd::Zero(FEATURE_SIZE, NO_OF_POSTAGS); std::ofstream parameters ("parameters_each_epoch_small_trial_1_check_R.txt"); if (!parameters.is_open()) { std::cout << "Error: output file not opening" << std::endl; } //initialise all the temporary variables outside the loop to save time on memory allocation on the heap (expensive) //all these temporary variables are reset to zero using .setZero() method at the end of each loop Eigen::MatrixXd tempC = Eigen::MatrixXd::Zero(FEATURE_SIZE, FEATURE_SIZE); Eigen::MatrixXd temp_R_Model = Eigen::MatrixXd::Zero(FEATURE_SIZE, NO_OF_POSTAGS); Eigen::MatrixXd temp_R_Data = Eigen::MatrixXd::Zero(FEATURE_SIZE, NO_OF_POSTAGS); Eigen::MatrixXd temp_Q = Eigen::MatrixXd::Zero(FEATURE_SIZE, NO_OF_POSTAGS); Eigen::MatrixXd temp_Q_Model = Eigen::MatrixXd::Zero(FEATURE_SIZE, NO_OF_POSTAGS); Eigen::MatrixXd temp_Q_Data = Eigen::MatrixXd::Zero(FEATURE_SIZE, NO_OF_POSTAGS); Eigen::VectorXd temp_Qw = Eigen::VectorXd::Zero(FEATURE_SIZE); //Eigen::VectorXd dbias = Eigen::VectorXd::Zero(NO_OF_POSTAGS_WITHOUT_DUMMY * NO_OF_DISTANCE); //Eigen::VectorXd temp_dbias = Eigen::VectorXd::Zero(NO_OF_POSTAGS_WITHOUT_DUMMY * NO_OF_DISTANCE); Eigen::VectorXd tempCr = Eigen::VectorXd::Zero(FEATURE_SIZE); Eigen::ArrayXd temp_e = Eigen::ArrayXd::Zero(NO_OF_POSTAGS); /* Eigen::MatrixXd temp_matrix = Eigen::MatrixXd::Zero(NO_OF_POSTAGS, FEATURE_SIZE); Eigen::MatrixXd temp_matrix_2 = Eigen::MatrixXd::Zero(NO_OF_POSTAGS, FEATURE_SIZE); */ //calculate the normalising constant for the first time std::ofstream gradient_check_R("gradient_check_R_big.txt"); Eigen::MatrixXd curr_dR = Eigen::MatrixXd::Zero(FEATURE_SIZE, NO_OF_POSTAGS); //DELETE THIS LATER int no_of_iter = 0; for (int i = 0; i < NO_OF_EPOCH; i++) { if (!pairs.is_open()) { std::cout << "Fail to open input file" << std::endl; } else { std::vector<std::string> temp; std::string line; while (getline(pairs, line)) { if (line.size() > 1) { no_of_iter++; temp = split(line, ' '); //initialise an array with the elements as the indices of: postag of the modifier, postag of the head, postag of before modifier, postag of after modifier, postag of before head, postag of after head int arr_temp[6] = {mapping[temp[0]], mapping[temp[1]], mapping[temp[2]], mapping[temp[3]], mapping[temp[4]], mapping[temp[5]]}; if (no_of_iter % 10000 == 0) { std::cout << no_of_iter << std::endl; } //calculate the normalising constant double normalising_constant = 0.0; for (int iter2 = 0; iter2 < 5; iter2++) { tempCr += (Ci[iter2].matrix() * R.col(arr_temp[iter2+1]).matrix()); //prediction } for (int iter = 0; iter < Q.cols(); iter++) { double temp = exp((tempCr.transpose() * Q.col(iter).matrix()));//dot product temp_e(iter) = temp; normalising_constant += temp; } //calculate all the gradient changes for each of the Ci, from dCi[0] to dCi[4] //this operation is already efficient for (int j = 0; j < 5; j++) { for (int k = 0; k < Q.cols(); k++) { tempC += (temp_e(k) * (Q.col(k).matrix() * R.col(arr_temp[j+1]).transpose().matrix())); } //normalise tempC /= normalising_constant; dCi[j] += (Q.col(arr_temp[0]).matrix() * R.col(arr_temp[j+1]).transpose().matrix() - tempC); //reset the matrix tempC to prepare to calculate the next dCi[j] tempC.setZero(); } //calculate dQ VERIFY for (int j = 0; j < 5; j++) { temp_Qw += (Ci[j].matrix() * R.col(arr_temp[j+1]).matrix()); } for (int j = 0; j < Q.cols(); j++) { temp_Q_Model.col(j) += (temp_e(j) * temp_Qw); } temp_Q_Model /= normalising_constant; temp_Q_Data.col(arr_temp[0]) = temp_Qw; dQ += (temp_Q_Data - temp_Q_Model); temp_Qw.setZero(); temp_Q_Model.setZero(); temp_Q_Data.setZero(); //calculate dR //calculate dR with respect to data for (int j = 0; j < 5; j++) { temp_R_Data.col(arr_temp[j+1]) += Ci[j].transpose().matrix() * Q.col(arr_temp[0]).matrix(); } //calculate dR with respect to model for (int k = 0; k < Q.cols(); k++) { for (int j = 0; j < 5; j++) { temp_R_Model.col(arr_temp[j+1]) += (temp_e(k) * (Ci[j].transpose().matrix() * Q.col(k).matrix())); } } temp_R_Model /= normalising_constant; dR += (temp_R_Data - temp_R_Model); if (no_of_iter % 1000 == 0) { curr_dR = temp_R_Data - temp_R_Model; } temp_R_Data.setZero(); temp_R_Model.setZero(); //calculate dbias //temp_dbias(arr_temp[0]) = 1.0; //dbias += (temp_dbias - temp_e.matrix() / normalising_constant); //temp_dbias(arr_temp[0]) = 0.0; tempCr.setZero(); temp_e.setZero(); if (no_of_iter % 1000 == 0) { Eigen::VectorXd temp = Eigen::VectorXd::Zero(FEATURE_SIZE); gradient_check_R << "current data point = " << no_of_iter <<std::endl; for (int j = 0; j < R.rows(); j++) { for (int k = 0; k < R.cols(); k++) { Eigen::ArrayXXd R_copy = R; double result_1 = 0.0; double result_2 = 0.0; double temp_double = 0.0; gradient_check_R << curr_dR(j, k) << " "; R_copy(j, k) += EPSILON; for (int iter2 = 0; iter2 < 5; iter2++) { temp += (Ci[iter2].matrix() * R_copy.col(arr_temp[iter2+1]).matrix()); } result_1 += temp.transpose() * Q.col(arr_temp[0]).matrix(); for (int l = 0; l < Q.cols(); l++) { temp_double += exp(temp.transpose() * Q.col(l).matrix()); } result_1 -= log(temp_double); temp_double = 0.0; //calculate result_2 R_copy(j, k) -= (2 * EPSILON); temp.setZero(); for (int iter2 = 0; iter2 < 5; iter2++) { temp += (Ci[iter2].matrix() * R_copy.col(arr_temp[iter2 + 1]).matrix()); } result_2 += temp.transpose() * Q.col(arr_temp[0]).matrix(); for (int l = 0; l < Q.cols(); l++) { temp_double += exp(temp.transpose() * Q.col(l).matrix()); } result_2 -= log(temp_double); temp.setZero(); gradient_check_R << (result_1 - result_2) / (2 * EPSILON) << std::endl; } } curr_dR.setZero(); } //update the gradient if no_of_iter mod mini batch size == 0, reset the gradient matrix to 0 if (no_of_iter % BATCH_SIZE == 0) { double curr_learning_rate = LEARNING_RATE / (1 + no_of_iter * LEARNING_DECAY); for (int j = 0; j < 5; j++) { Ci[j] = Ci[j].matrix() + curr_learning_rate * dCi[j]; //reset the matrix dCi[j] dCi[j].setZero(); } //word_bias_pair = word_bias_pair + curr_learning_rate * dbias.array(); R = R.matrix() + curr_learning_rate * dR; Q = Q.matrix() + curr_learning_rate * dQ; dR.setZero(); dQ.setZero(); //dbias.setZero(); } } temp.clear(); } pairs.close(); } parameters << "Epoch number: " << i+1 << std::endl; for (int j = 0; j < 5; j++) { parameters << "Matrix C" << j+1 << std::endl; parameters << Ci[j] << std::endl; parameters << std::endl; parameters << "End of matrix C" << j+1 << std::endl; } parameters << "Matrix R" << std::endl; parameters << R << std::endl; parameters << "End of matrix R" << std::endl; parameters << "Matrix Q" << std::endl; parameters << Q << std::endl; parameters << "End of matrix Q" << std::endl; //parameters << "Bias vector" << std::endl; //parameters << word_bias_pair << std::endl; //parameters << "End of bias vector" << std::endl; pairs.open("real_pairs_medium.txt"); } pairs.close(); parameters.close(); return 0; }