Ejemplo n.º 1
0
//[[Rcpp::export]]
List SPMBgraphsqrt(Eigen::Map<Eigen::MatrixXd> data, NumericVector &lambda, int nlambda, int d, NumericVector &x, IntegerVector &col_cnz, IntegerVector &row_idx)
{

    Eigen::ArrayXd Xb, r, grad, w1, Y, XX, gr;
    Eigen::ArrayXXd X;
    Eigen::MatrixXd tmp_icov;
    tmp_icov.resize(d, d);
    tmp_icov.setZero();
    std::vector<Eigen::MatrixXd > tmp_icov_p;
    tmp_icov_p.clear();
    for(int i = 0; i < nlambda; i++)
      tmp_icov_p.push_back(tmp_icov);
    int n = data.rows();
    X = data;
    XX.resize(d);
    for (int j = 0; j < d; j++)
      XX[j] = (X.col(j)*X.col(j)).sum()/n;
    double prec = 1e-4;
    int max_iter = 1000;
    int num_relaxation_round = 3;
	  int cnz = 0;
    for(int m=0;m<d;m++)
    {
      Xb.resize(n);
      Xb.setZero();
      grad.resize(d);
      grad.setZero();
      gr.resize(d);
      gr.setZero();
      w1.resize(d);
      w1.setZero();
      r.resize(n);
      r.setZero();
      Y = X.col(m);

      Eigen::ArrayXd Xb_master(n);
      Eigen::ArrayXd w1_master(n);
      std::vector<int> actset_indcat(d, 0);
      std::vector<int> actset_indcat_master(d, 0);
      std::vector<int> actset_idx;
      std::vector<double> old_coef(d, 0);
      std::vector<double> grad(d, 0);
      std::vector<double> grad_master(d, 0);

      double a = 0, g = 0, L = 0, sum_r = 0, sum_r2 = 0;
      double tmp_change = 0, local_change = 0;

      r = Y - Xb;
      sum_r = r.sum();
      sum_r2 = r.matrix().dot(r.matrix());
      L = sqrt(sum_r2 / n);

      double dev_thr = fabs(L) * prec;

      //cout<<dev_thr<<endl;


      for(int i = 0; i < d; i++)
      {
        grad[i] = (r * X.col(i)).sum() / (n*L);
      }
      for(int i = 0; i < d; i++) gr[i] = abs(grad[i]);
      w1_master = w1;
      Xb_master = Xb;
      for (int i = 0; i < d; i++) grad_master[i] = gr[i];

      std::vector<double> stage_lambdas(d, 0);

      for(int i=0;i<nlambda;i++)
      {
        double ilambda = lambda[i];
        w1 = w1_master;
        Xb = Xb_master;

        for (int j = 0; j < d; j++)
        {
          gr[j] = grad_master[j];
          actset_indcat[j] = actset_indcat_master[j];
        }

        // init the active set
        double threshold;
        if (i > 0)
          threshold = 2 * lambda[i] - lambda[i - 1];
        else
          threshold = 2 * lambda[i];

        for (int j = 0; j < m; ++j)
        {
          stage_lambdas[j] = lambda[i];

          if (gr[j] > threshold) actset_indcat[j] = 1;
        }
        for (int j = m+1; j < d; ++j)
        {
          stage_lambdas[j] = lambda[i];

          if (gr[j] > threshold) actset_indcat[j] = 1;
        }
        stage_lambdas[m] = lambda[i];
        r = Y - Xb;
        sum_r = r.sum();
        sum_r2 = r.matrix().dot(r.matrix());
        L = sqrt(sum_r2 / n);
        // loop level 0: multistage convex relaxation
        int loopcnt_level_0 = 0;
        int idx;
        double old_w1, updated_coord;
        while (loopcnt_level_0 < num_relaxation_round)
        {
          loopcnt_level_0++;

          // loop level 1: active set update
          int loopcnt_level_1 = 0;
          bool terminate_loop_level_1 = true;
          while (loopcnt_level_1 < max_iter)
          {
            loopcnt_level_1++;
            terminate_loop_level_1 = true;

            for (int j = 0; j < d; j++) old_coef[j] = w1[j];

            // initialize actset_idx
            actset_idx.clear();
            for (int j = 0; j < m; j++)
              if (actset_indcat[j])
              {
                g = 0.0;
                a = 0.0;

                double tmp;

                sum_r2 = r.matrix().dot(r.matrix());
                L = sqrt(sum_r2 / n);

                Eigen::ArrayXd wXX  = (1 - r*r/sum_r2) * X.col(j) * X.col(j);
                g = (wXX * w1[j] + r * X.col(j)).sum()/(n*L);
                a = wXX.sum()/(n*L);

                tmp = w1[j];
                w1[j] = thresholdl1(g, stage_lambdas[j]) / a;

                tmp = w1[j] - tmp;
                // Xb += delta*X[idx*n]
                Xb = Xb + tmp * X.col(j);

                sum_r = 0.0;
                sum_r2 = 0.0;
                // r -= delta*X
                r = r - tmp * X.col(j);
                sum_r = r.sum();

                sum_r2 = r.matrix().dot(r.matrix());
                L = sqrt(sum_r2 / n);

                updated_coord = w1[j];

                if (fabs(updated_coord) > 0) actset_idx.push_back(j);
              }

            for (int j = m+1; j < d; j++)
              if (actset_indcat[j])
              {
                  g = 0.0;
                  a = 0.0;

                  double tmp;

                  sum_r2 = r.matrix().dot(r.matrix());
                  L = sqrt(sum_r2 / n);

                  Eigen::ArrayXd wXX  = (1 - r*r/sum_r2) * X.col(j) * X.col(j);
                  g = (wXX * w1[j] + r * X.col(j)).sum()/(n*L);
                  a = wXX.sum()/(n*L);

                  tmp = w1[j];
                  w1[j] = thresholdl1(g, stage_lambdas[j]) / a;

                  tmp = w1[j] - tmp;
                  // Xb += delta*X[idx*n]
                  Xb = Xb + tmp * X.col(j);

                  sum_r = 0.0;
                  sum_r2 = 0.0;
                  // r -= delta*X
                  r = r - tmp * X.col(j);
                  sum_r = r.sum();

                  sum_r2 = r.matrix().dot(r.matrix());
                  L = sqrt(sum_r2 / n);

                  updated_coord = w1[j];

                  if (fabs(updated_coord) > 0) actset_idx.push_back(j);
                }

              // loop level 2: proximal newton on active set
            int loopcnt_level_2 = 0;
            bool terminate_loop_level_2 = true;
            while (loopcnt_level_2 < max_iter)
            {
              loopcnt_level_2++;
              terminate_loop_level_2 = true;

              for (int k = 0; k < actset_idx.size(); k++)
              {
                  idx = actset_idx[k];

                  old_w1 = w1[idx];
                  g = 0.0;
                  a = 0.0;

                  double tmp;

                  sum_r2 = r.matrix().dot(r.matrix());
                  L = sqrt(sum_r2 / n);

                  Eigen::ArrayXd wXX  = (1 - r*r/sum_r2) * X.col(idx) * X.col(idx);
                  g = (wXX * w1[idx] + r * X.col(idx)).sum()/(n*L);
                  a = wXX.sum()/(n*L);

                  tmp = w1[idx];
                  w1[idx] = thresholdl1(g, stage_lambdas[idx]) / a;

                  tmp = w1[idx] - tmp;
                  // Xb += delta*X[idx*n]
                  Xb = Xb + tmp * X.col(idx);

                  sum_r = 0.0;
                  sum_r2 = 0.0;
                  // r -= delta*X
                  r = r - tmp * X.col(idx);
                  sum_r = r.sum();

                  sum_r2 = r.matrix().dot(r.matrix());
                  L = sqrt(sum_r2 / n);

                  updated_coord = w1[idx];
                  tmp_change = old_w1 - w1[idx];
                  double a =  (X.col(idx) * X.col(idx) * (1 - r * r/(L*L*n))).sum()/(n*L);
                  local_change = a * tmp_change * tmp_change / (2 * L * n);
                  if (local_change > dev_thr)
                    terminate_loop_level_2 = false;
                }
              if (terminate_loop_level_2)
                break;
            }

            terminate_loop_level_1 = true;
              // check stopping criterion 1: fvalue change
            for (int k = 0; k < actset_idx.size(); ++k)
            {
              idx = actset_idx[k];
              tmp_change = old_w1 - w1[idx];
              double a =  (X.col(idx) * X.col(idx) * (1 - r * r/(L*L*n))).sum()/(n*L);
              local_change = a * tmp_change * tmp_change / (2 * L * n);
              if (local_change > dev_thr)
                terminate_loop_level_1 = false;
            }

            r = Y - Xb;
            sum_r = r.sum();
            sum_r2 = r.matrix().dot(r.matrix());
            L = sqrt(sum_r2 / n);

            if (terminate_loop_level_1)
              break;


              // check stopping criterion 2: active set change
            bool new_active_idx = false;
            for (int k = 0; k < m; k++)
              if (actset_indcat[k] == 0)
              {
                grad[idx] = (r * X.col(idx)).sum() / (n*L);
                //cout<<grad[idx];
                gr[k] = fabs(grad[k]);
                if (gr[k] > stage_lambdas[k])
                {
                  actset_indcat[k] = 1;
                  new_active_idx = true;
                }
              }
            for (int k = m+1; k < d; k++)
              if (actset_indcat[k] == 0)
              {
                grad[idx] = (r * X.col(idx)).sum() / (n*L);
                //cout<<grad[idx]
                gr[k] = fabs(grad[k]);
                if (gr[k] > stage_lambdas[k])
                {
                  actset_indcat[k] = 1;
                  new_active_idx = true;
                }
              }
            if(!new_active_idx)
              break;
          }
          if (loopcnt_level_0 == 1)
          {
            for (int j = 0; j < d; j++)
            {
              w1_master[j] = w1[j];

              grad_master[j] = gr[j];
              actset_indcat_master[j] = actset_indcat[j];
            }

            for (int j = 0; j < n; j++) Xb_master[j] = Xb[j];
          }
        }
        for(int j=0;j<actset_idx.size();j++)
        {
          int w_idx = actset_idx[j];
          x[cnz] = w1[w_idx];
          row_idx[cnz] = i*d+w_idx;
          cnz++;
          //cout<<cnz<<"    ";
        }
        double tal = 0;
        Eigen::MatrixXd temp;
        temp.resize(n, 1);
        for(int j = 0; j < n; j++)
        {
          temp(j, 0) = 0;
          for(int k = 0; k < d; k++)
            temp(j, 0) += X.matrix()(j, k)*w1[k];
          temp(j, 0) = Y[j] - temp(j, 0);
        }
        //temp = Y.matrix() - X.matrix().transpose()*w1.matrix();
        for(int j = 0; j < n; j++)
          tal += temp(j, 0)*temp(j, 0);
        tal = sqrt(tal)/sqrt(n);

        tmp_icov = tmp_icov_p[i];
        tmp_icov(m, m) = pow(tal, -2);
        for(int j = 0; j < m; j++)
          tmp_icov(j, m) = -tmp_icov(m, m)*w1[j];
        for(int j = m+1; j < d; j++)
          tmp_icov(j, m) = -tmp_icov(m, m)*w1[j];
        tmp_icov_p[i] = tmp_icov;
      }
      col_cnz[m+1]=cnz;
    }
    for(int i = 0; i < nlambda; i++)
      tmp_icov_p[i] = (tmp_icov_p[i].transpose()+tmp_icov_p[i])/2;
	  return List::create(
	    _["col_cnz"] = col_cnz,
	    _["row_idx"] = row_idx,
	    _["x"] = x,
	    _["icov"] = tmp_icov_p
	  );
}
int main()
{
	//initialisation
	Eigen::ArrayXXd R = Eigen::ArrayXXd::Zero(FEATURE_SIZE, NO_OF_POSTAGS); //matrix R, dimension: Nf x Nw
	std::default_random_engine generator;
	std::normal_distribution<double> distr(0.0, 0.1);
	//initialise the matrix R with small, random, non-zero numbers
	for (int i = 0; i < R.rows(); i++) {
		for (int j = 0; j < R.cols(); j++) {
			double d = distr(generator);
			//avoid having 0's as the weight, try to get all the coefficients initialised with small numbers
			while (d == 0.0) {
				d = distr(generator);
			}
			R(i, j) = d;
		}
	}
		//initialise the matrix Q, of size feature_size x (no_of_postags * no_of_distance) 
	Eigen::ArrayXXd Q = Eigen::ArrayXXd::Zero(FEATURE_SIZE, NO_OF_POSTAGS);
	for (int i = 0; i < Q.rows(); i++) {
		for (int j = 0; j < Q.cols(); j++) {
			double d = distr(generator);
			//avoid having 0's as the weight, try to get all the coefficients initialised with small numbers
			while (d == 0.0) {
				d = distr(generator);
			}
			Q(i, j) = d;
		}
	}
	//create an array of Ci (where each Ci is a matrix of Nf x Nf that governs the interaction between the ith feature and vn)
	std::vector<Eigen::ArrayXXd> Ci;
	for (int i = 0; i < 5; i++) {
		Eigen::ArrayXXd Ci_temp = Eigen::ArrayXXd::Zero(FEATURE_SIZE, FEATURE_SIZE);
		std::default_random_engine generator;
		std::normal_distribution<double> distr(0.0, 0.1);
		for (int j = 0; j < Ci_temp.rows(); j++) {
			for (int k = 0; k < Ci_temp.cols(); k++) {
				double d = distr(generator);
				//avoid having 0's as the weight, try to get all the coefficients initialised with small numbers
				while (d == 0.0) {
					d = distr(generator);
				}
				Ci_temp(j, k) = d;
			}
		}
		Ci.push_back(Ci_temp);
	}
	//initialise vi. We use a context of 5 (the postag of the head itself, the postag before the modifier, the postag after the modifier, the postag before the head, and the postag after the head)
	//initialise vn. Both vi and vn are column vectors with all 0's and a single 1, which indicates the POS-tag we desire to get
	//FOR NOW, we ignore the term br^transpose * R^transpose*vn, but find out what it is
	//initialise the bias term bv
	Eigen::ArrayXd bv;
	//print the array of biases into an external file
	std::ofstream final_postag_bias("final_postag_bias.txt"); 
	if (!final_postag_bias.is_open()) {
		std::cout << "fail to create final postag bias file" << std::endl;
	} 
	/*std::map<std::string, int> big_pos_tag_map = get_postag_bias(bv, "distance_count.txt", "fine_pos_tags.txt");
	std::cout << "size of big pos tag map = " << big_pos_tag_map.size() << std::endl;
	for (std::map<std::string, int>::iterator it = big_pos_tag_map.begin(); it != big_pos_tag_map.end(); it++) {
		final_postag_bias << "Pos tag " << it->first << " has bias value " << bv(it->second) << std::endl;
	} */

	/* create a pair between each "target pos tag" with a bias value (tested and correct)*/
	//Eigen::ArrayXd word_bias_pair = Eigen::ArrayXd::Zero(NO_OF_POSTAGS_WITHOUT_DUMMY * NO_OF_DISTANCE);
	int i_3 = -1;
	std::ifstream fine_pos_tags("fine_pos_tags.txt");
	std::string temp_string;
	/*
	while (getline(fine_pos_tags, temp_string)) {
		for (int i = -MAX_DISTANCE; i <= MAX_DISTANCE; i++) {
			if (i != 0) {
				if (i == -49 || i == -45 || i == -40 || i == -35 || i == -30 || i == -25 || i == -20 || i == -15 || i == -10 || i == -5 || i == -4 || i == -3 || i == -2 || i == -1 || i == 1 || i == 2 || i == 3 || i == 4 || i == 5 || i == 6 || i == 11 || i == 16 || i == 21 || i == 26 || i == 31 || i == 36|| i == 41 || i == 46) {
					i_3++;
					std::stringstream sstm;
					sstm << temp_string << i;
					if (big_pos_tag_map.find(sstm.str()) == big_pos_tag_map.end()) {
						std::cout << "ERROR" << std::endl;
					} else {
						int pos_tag_idx = big_pos_tag_map[sstm.str()];
						//word_bias_pair(i_3) = bv(pos_tag_idx);
					}
				}
			}
		}
	} */
	/* end */
	//main training part
	std::map<std::string, int> mapping = postag_mapping("fine_pos_tags.txt");
	fine_pos_tags.close();
	//print out both the small and big mapping
	//std::ofstream big_pos_tag_mapping("big_pos_tag_map.txt");
	std::ofstream small_pos_tag_mapping("small_pos_tag_map.txt");
	/*
	for (std::map<std::string, int>::iterator it = big_pos_tag_map.begin(); it != big_pos_tag_map.end(); it++) {
		big_pos_tag_mapping << it->first << " " << it->second << std::endl;
	} */
	for (std::map<std::string, int>::iterator it = mapping.begin(); it != mapping.end(); it++) {
		small_pos_tag_mapping << it->first << " " << it->second << std::endl;
	}
	small_pos_tag_mapping.close();

	//create the pairs
	std::ifstream pairs ("real_pairs_medium.txt");
	//initialise the gradient matrix
	std::vector<Eigen::MatrixXd> dCi;
	for (int i = 0; i < 5; i++) {
		Eigen::MatrixXd temp = Eigen::MatrixXd::Zero(FEATURE_SIZE, FEATURE_SIZE);
		dCi.push_back(temp);
	}
	Eigen::MatrixXd dR = Eigen::MatrixXd::Zero(FEATURE_SIZE, NO_OF_POSTAGS);
	Eigen::MatrixXd dQ = Eigen::MatrixXd::Zero(FEATURE_SIZE, NO_OF_POSTAGS);
	std::ofstream parameters ("parameters_each_epoch_small_trial_1_check_R.txt");
	if (!parameters.is_open()) {
		std::cout << "Error: output file not opening" << std::endl;
	}
	
	//initialise all the temporary variables outside the loop to save time on memory allocation on the heap (expensive)
	//all these temporary variables are reset to zero using .setZero() method at the end of each loop
	Eigen::MatrixXd tempC = Eigen::MatrixXd::Zero(FEATURE_SIZE, FEATURE_SIZE);
	Eigen::MatrixXd temp_R_Model = Eigen::MatrixXd::Zero(FEATURE_SIZE, NO_OF_POSTAGS);
	Eigen::MatrixXd temp_R_Data = Eigen::MatrixXd::Zero(FEATURE_SIZE, NO_OF_POSTAGS);
	Eigen::MatrixXd temp_Q = Eigen::MatrixXd::Zero(FEATURE_SIZE, NO_OF_POSTAGS);
	Eigen::MatrixXd temp_Q_Model = Eigen::MatrixXd::Zero(FEATURE_SIZE, NO_OF_POSTAGS);
	Eigen::MatrixXd temp_Q_Data = Eigen::MatrixXd::Zero(FEATURE_SIZE, NO_OF_POSTAGS);
	Eigen::VectorXd temp_Qw = Eigen::VectorXd::Zero(FEATURE_SIZE); 
	//Eigen::VectorXd dbias = Eigen::VectorXd::Zero(NO_OF_POSTAGS_WITHOUT_DUMMY * NO_OF_DISTANCE);
	//Eigen::VectorXd temp_dbias = Eigen::VectorXd::Zero(NO_OF_POSTAGS_WITHOUT_DUMMY * NO_OF_DISTANCE);
	Eigen::VectorXd tempCr = Eigen::VectorXd::Zero(FEATURE_SIZE);
	Eigen::ArrayXd temp_e = Eigen::ArrayXd::Zero(NO_OF_POSTAGS);
	/*
	Eigen::MatrixXd temp_matrix = Eigen::MatrixXd::Zero(NO_OF_POSTAGS, FEATURE_SIZE);
	Eigen::MatrixXd temp_matrix_2 = Eigen::MatrixXd::Zero(NO_OF_POSTAGS, FEATURE_SIZE); */
	//calculate the normalising constant for the first time
	std::ofstream gradient_check_R("gradient_check_R_big.txt");
	Eigen::MatrixXd curr_dR = Eigen::MatrixXd::Zero(FEATURE_SIZE, NO_OF_POSTAGS); //DELETE THIS LATER
	int no_of_iter = 0;
	for (int i = 0; i < NO_OF_EPOCH; i++) { 
		if (!pairs.is_open()) {
			std::cout << "Fail to open input file" << std::endl;
		} else {
			std::vector<std::string> temp;
			std::string line;
			while (getline(pairs, line)) {
				if (line.size() > 1) {
					no_of_iter++;
					temp = split(line, ' ');
					//initialise an array with the elements as the indices of: postag of the modifier, postag of the head, postag of before modifier, postag of after modifier, postag of before head, postag of after head
					int arr_temp[6] = {mapping[temp[0]], mapping[temp[1]], mapping[temp[2]], mapping[temp[3]], mapping[temp[4]], mapping[temp[5]]};
					if (no_of_iter % 10000 == 0) {
						std::cout << no_of_iter << std::endl;
					}
					//calculate the normalising constant
					double normalising_constant = 0.0;
					for (int iter2 = 0; iter2 < 5; iter2++) {
							tempCr += (Ci[iter2].matrix() * R.col(arr_temp[iter2+1]).matrix()); //prediction
					}
					for (int iter = 0; iter < Q.cols(); iter++) {
						double temp = exp((tempCr.transpose() * Q.col(iter).matrix()));//dot product
						temp_e(iter) = temp; 
						normalising_constant += temp; 
					} 					
					//calculate all the gradient changes for each of the Ci, from dCi[0] to dCi[4]
					//this operation is already efficient
					for (int j = 0; j < 5; j++) {
						for (int k = 0; k < Q.cols(); k++) {
							tempC += (temp_e(k) * (Q.col(k).matrix() * R.col(arr_temp[j+1]).transpose().matrix()));
						}
						//normalise
						tempC /= normalising_constant;
						dCi[j] += (Q.col(arr_temp[0]).matrix() * R.col(arr_temp[j+1]).transpose().matrix() - tempC);
						//reset the matrix tempC to prepare to calculate the next dCi[j]
						tempC.setZero();
					}
					
					//calculate dQ VERIFY
					for (int j = 0; j < 5; j++) {
						temp_Qw += (Ci[j].matrix() * R.col(arr_temp[j+1]).matrix());
					}
					for (int j = 0; j < Q.cols(); j++) {
						temp_Q_Model.col(j) += (temp_e(j) * temp_Qw);
					}
					temp_Q_Model /= normalising_constant;
					temp_Q_Data.col(arr_temp[0]) = temp_Qw;
					dQ += (temp_Q_Data - temp_Q_Model);
					temp_Qw.setZero();
					temp_Q_Model.setZero();
					temp_Q_Data.setZero();

					//calculate dR
					//calculate dR with respect to data
					for (int j = 0; j < 5; j++) {
						temp_R_Data.col(arr_temp[j+1]) += Ci[j].transpose().matrix() * Q.col(arr_temp[0]).matrix();
					}
					//calculate dR with respect to model
					for (int k = 0; k < Q.cols(); k++) {
						for (int j = 0; j < 5; j++) {
							temp_R_Model.col(arr_temp[j+1]) += (temp_e(k) * (Ci[j].transpose().matrix() * Q.col(k).matrix()));
						}
					}
					temp_R_Model /= normalising_constant;
					dR += (temp_R_Data - temp_R_Model);
					if (no_of_iter  % 1000 == 0) {
						curr_dR = temp_R_Data - temp_R_Model;
					}
					temp_R_Data.setZero();
					temp_R_Model.setZero();

					//calculate dbias
					//temp_dbias(arr_temp[0]) = 1.0;
					//dbias += (temp_dbias - temp_e.matrix() / normalising_constant);
					//temp_dbias(arr_temp[0]) = 0.0;

					tempCr.setZero();
					temp_e.setZero();

					if (no_of_iter % 1000 == 0) {
							Eigen::VectorXd temp = Eigen::VectorXd::Zero(FEATURE_SIZE);
							gradient_check_R <<  "current data point = " << no_of_iter <<std::endl;
								for (int j = 0; j < R.rows(); j++) {
									for (int k = 0; k < R.cols(); k++) {
										Eigen::ArrayXXd R_copy = R;
										double result_1 = 0.0;
										double result_2 = 0.0;
										double temp_double = 0.0;
										gradient_check_R << curr_dR(j, k) << " ";
										R_copy(j, k) += EPSILON;
										for (int iter2 = 0; iter2 < 5; iter2++) {
												temp += (Ci[iter2].matrix() * R_copy.col(arr_temp[iter2+1]).matrix());
										}
										result_1 += temp.transpose() * Q.col(arr_temp[0]).matrix();
										for (int l = 0; l < Q.cols(); l++) {
											temp_double += exp(temp.transpose() * Q.col(l).matrix());
										}
										result_1 -= log(temp_double);
										temp_double = 0.0;
										//calculate result_2
										R_copy(j, k) -= (2 * EPSILON);
										temp.setZero();
										for (int iter2 = 0; iter2 < 5; iter2++) {
												temp += (Ci[iter2].matrix() * R_copy.col(arr_temp[iter2 + 1]).matrix());	
										}
										result_2 += temp.transpose() * Q.col(arr_temp[0]).matrix();
										for (int l = 0; l < Q.cols(); l++) {
											temp_double += exp(temp.transpose() * Q.col(l).matrix());
										}
										result_2 -= log(temp_double);
										temp.setZero();
										gradient_check_R << (result_1 - result_2) / (2 * EPSILON) << std::endl;

									}
							}
							curr_dR.setZero();
				}

					//update the gradient if no_of_iter mod mini batch size == 0, reset the gradient matrix to 0

					if (no_of_iter % BATCH_SIZE == 0) {
						double curr_learning_rate = LEARNING_RATE / (1 + no_of_iter * LEARNING_DECAY);
						for (int j = 0; j < 5; j++) {
							Ci[j] = Ci[j].matrix() + curr_learning_rate * dCi[j];
							//reset the matrix dCi[j]
							dCi[j].setZero();
						}
						//word_bias_pair = word_bias_pair + curr_learning_rate * dbias.array();
						R = R.matrix() + curr_learning_rate * dR;
						Q = Q.matrix() + curr_learning_rate * dQ;
						dR.setZero();
						dQ.setZero();
						//dbias.setZero();
					} 
				} 
				temp.clear();
			}
			pairs.close();
		}
		parameters << "Epoch number: " << i+1 << std::endl;
		for (int j = 0; j < 5; j++) {
			parameters << "Matrix C" << j+1 << std::endl; 
			parameters << Ci[j] << std::endl;
			parameters << std::endl;
			parameters << "End of matrix C" << j+1 << std::endl;
		}
		parameters << "Matrix R" << std::endl;
		parameters << R << std::endl;
		parameters << "End of matrix R" << std::endl;
		parameters << "Matrix Q" << std::endl;
		parameters << Q << std::endl;
		parameters << "End of matrix Q" << std::endl;
		//parameters << "Bias vector" << std::endl;
		//parameters << word_bias_pair << std::endl;
		//parameters << "End of bias vector" << std::endl;
		pairs.open("real_pairs_medium.txt");
	} 
	pairs.close();
	parameters.close();
	return 0;
}