void Labeler::train(const string& trainFile, const string& devFile, const string& testFile, 
    const string& modelFile, const string& optionFile, const string& wordEmbFile) {
  if (optionFile != "")
    m_options.load(optionFile);
  m_options.showOptions();
  vector<Instance> trainInsts, devInsts, testInsts;
  static vector<Instance> decodeInstResults;
  static Instance curDecodeInst;
  bool bCurIterBetter = false;

  m_pipe.readInstances(trainFile, trainInsts, m_options.maxInstance);
  if (devFile != "")
    m_pipe.readInstances(devFile, devInsts, m_options.maxInstance);
  if (testFile != "")
    m_pipe.readInstances(testFile, testInsts, m_options.maxInstance);

  //Ensure that each file in m_options.testFiles exists!
  vector<vector<Instance> > otherInsts(m_options.testFiles.size());
  for (int idx = 0; idx < m_options.testFiles.size(); idx++) {
    m_pipe.readInstances(m_options.testFiles[idx], otherInsts[idx], m_options.maxInstance);
  }

  //std::cout << "Training example number: " << trainInsts.size() << std::endl;
  //std::cout << "Dev example number: " << trainInsts.size() << std::endl;
  //std::cout << "Test example number: " << trainInsts.size() << std::endl;

  createAlphabet(trainInsts);

  if (!m_options.wordEmbFineTune) {
    addTestWordAlpha(devInsts);
    addTestWordAlpha(testInsts);
    for (int idx = 0; idx < otherInsts.size(); idx++) {
      addTestWordAlpha(otherInsts[idx]);
    }
    cout << "Remain words num: " << m_wordAlphabet.size() << endl;
  }

  NRMat<dtype> wordEmb;
  if (wordEmbFile != "") {
    readWordEmbeddings(wordEmbFile, wordEmb);
  } else {
    wordEmb.resize(m_wordAlphabet.size(), m_options.wordEmbSize);
    wordEmb.randu(1000);
  }

  NRVec<NRMat<dtype> > tagEmbs(m_tagAlphabets.size());
  for (int idx = 0; idx < tagEmbs.size(); idx++) {
    tagEmbs[idx].resize(m_tagAlphabets[idx].size(), m_options.tagEmbSize);
    tagEmbs[idx].randu(1002 + idx);
  }

  m_classifier.init(m_labelAlphabet.size(), m_featAlphabet.size());

  m_classifier.setDropValue(m_options.dropProb);

  vector<Example> trainExamples, devExamples, testExamples;
  initialExamples(trainInsts, trainExamples);
  initialExamples(devInsts, devExamples);
  initialExamples(testInsts, testExamples);

  vector<int> otherInstNums(otherInsts.size());
  vector<vector<Example> > otherExamples(otherInsts.size());
  for (int idx = 0; idx < otherInsts.size(); idx++) {
    initialExamples(otherInsts[idx], otherExamples[idx]);
    otherInstNums[idx] = otherExamples[idx].size();
  }

  dtype bestDIS = 0;

  int inputSize = trainExamples.size();

  int batchBlock = inputSize / m_options.batchSize;
  if (inputSize % m_options.batchSize != 0)
    batchBlock++;

  srand(0);
  std::vector<int> indexes;
  for (int i = 0; i < inputSize; ++i)
    indexes.push_back(i);

  static Metric eval, metric_dev, metric_test;
  static vector<Example> subExamples;
  int devNum = devExamples.size(), testNum = testExamples.size();
  for (int iter = 0; iter < m_options.maxIter; ++iter) {
    std::cout << "##### Iteration " << iter << std::endl;

    random_shuffle(indexes.begin(), indexes.end());
    eval.reset();
    for (int updateIter = 0; updateIter < batchBlock; updateIter++) {
      subExamples.clear();
      int start_pos = updateIter * m_options.batchSize;
      int end_pos = (updateIter + 1) * m_options.batchSize;
      if (end_pos > inputSize)
        end_pos = inputSize;

      for (int idy = start_pos; idy < end_pos; idy++) {
        subExamples.push_back(trainExamples[indexes[idy]]);
      }

      int curUpdateIter = iter * batchBlock + updateIter;
      dtype cost = m_classifier.process(subExamples, curUpdateIter);

      eval.overall_label_count += m_classifier._eval.overall_label_count;
      eval.correct_label_count += m_classifier._eval.correct_label_count;

      if ((curUpdateIter + 1) % m_options.verboseIter == 0) {
        //m_classifier.checkgrads(subExamples, curUpdateIter+1);
        std::cout << "current: " << updateIter + 1 << ", total block: " << batchBlock << std::endl;
        std::cout << "Cost = " << cost << ", Tag Correct(%) = " << eval.getAccuracy() << std::endl;
      }
      m_classifier.updateParams(m_options.regParameter, m_options.adaAlpha, m_options.adaEps);

    }

    if (devNum > 0) {
      bCurIterBetter = false;
      if (!m_options.outBest.empty())
        decodeInstResults.clear();
      metric_dev.reset();
      for (int idx = 0; idx < devExamples.size(); idx++) {
        vector<string> result_labels;
        predict(devExamples[idx].m_features, result_labels, devInsts[idx].words);

        if (m_options.seg)
          devInsts[idx].SegEvaluate(result_labels, metric_dev);
        else
          devInsts[idx].Evaluate(result_labels, metric_dev);

        if (!m_options.outBest.empty()) {
          curDecodeInst.copyValuesFrom(devInsts[idx]);
          curDecodeInst.assignLabel(result_labels);
          decodeInstResults.push_back(curDecodeInst);
        }
      }

      metric_dev.print();

      if (!m_options.outBest.empty() && metric_dev.getAccuracy() > bestDIS) {
        m_pipe.outputAllInstances(devFile + m_options.outBest, decodeInstResults);
        bCurIterBetter = true;
      }

      if (testNum > 0) {
        if (!m_options.outBest.empty())
          decodeInstResults.clear();
        metric_test.reset();
        for (int idx = 0; idx < testExamples.size(); idx++) {
          vector<string> result_labels;
          predict(testExamples[idx].m_features, result_labels, testInsts[idx].words);

          if (m_options.seg)
            testInsts[idx].SegEvaluate(result_labels, metric_test);
          else
            testInsts[idx].Evaluate(result_labels, metric_test);

          if (bCurIterBetter && !m_options.outBest.empty()) {
            curDecodeInst.copyValuesFrom(testInsts[idx]);
            curDecodeInst.assignLabel(result_labels);
            decodeInstResults.push_back(curDecodeInst);
          }
        }
        std::cout << "test:" << std::endl;
        metric_test.print();

        if (!m_options.outBest.empty() && bCurIterBetter) {
          m_pipe.outputAllInstances(testFile + m_options.outBest, decodeInstResults);
        }
      }

      for (int idx = 0; idx < otherExamples.size(); idx++) {
        std::cout << "processing " << m_options.testFiles[idx] << std::endl;
        if (!m_options.outBest.empty())
          decodeInstResults.clear();
        metric_test.reset();
        for (int idy = 0; idy < otherExamples[idx].size(); idy++) {
          vector<string> result_labels;
          predict(otherExamples[idx][idy].m_features, result_labels, otherInsts[idx][idy].words);

          if (m_options.seg)
            otherInsts[idx][idy].SegEvaluate(result_labels, metric_test);
          else
            otherInsts[idx][idy].Evaluate(result_labels, metric_test);

          if (bCurIterBetter && !m_options.outBest.empty()) {
            curDecodeInst.copyValuesFrom(otherInsts[idx][idy]);
            curDecodeInst.assignLabel(result_labels);
            decodeInstResults.push_back(curDecodeInst);
          }
        }
        std::cout << "test:" << std::endl;
        metric_test.print();

        if (!m_options.outBest.empty() && bCurIterBetter) {
          m_pipe.outputAllInstances(m_options.testFiles[idx] + m_options.outBest, decodeInstResults);
        }
      }

      if (m_options.saveIntermediate && metric_dev.getAccuracy() > bestDIS) {
        std::cout << "Exceeds best previous performance of " << bestDIS << ". Saving model file.." << std::endl;
        bestDIS = metric_dev.getAccuracy();
        writeModelFile(modelFile);
      }

    }
    // Clear gradients
  }
}
void Segmentor::train(const string& trainFile, const string& devFile, const string& testFile, const string& modelFile, const string& optionFile,
    const string& wordEmbFile, const string& charEmbFile, const string& bicharEmbFile) {
  if (optionFile != "")
    m_options.load(optionFile);

  m_options.showOptions();
  vector<Instance> trainInsts, devInsts, testInsts;
  m_pipe.readInstances(trainFile, trainInsts, m_classifier.MAX_SENTENCE_SIZE - 2, m_options.maxInstance);
  if (devFile != "")
    m_pipe.readInstances(devFile, devInsts, m_classifier.MAX_SENTENCE_SIZE - 2, m_options.maxInstance);
  if (testFile != "")
    m_pipe.readInstances(testFile, testInsts, m_classifier.MAX_SENTENCE_SIZE - 2, m_options.maxInstance);

  vector<vector<Instance> > otherInsts(m_options.testFiles.size());
  for (int idx = 0; idx < m_options.testFiles.size(); idx++) {
    m_pipe.readInstances(m_options.testFiles[idx], otherInsts[idx], m_classifier.MAX_SENTENCE_SIZE - 2, m_options.maxInstance);
  }

  createAlphabet(trainInsts);

  addTestWordAlpha(devInsts);
  addTestWordAlpha(testInsts);
    
  NRMat<dtype> wordEmb, allwordEmb;
  if (wordEmbFile != "") {  	
  	allWordAlphaEmb(wordEmbFile, allwordEmb);
  } else {
  	std::cout << "must not be here, allword must be pretrained." << std::endl;
  }
  wordEmb.resize(m_classifier.fe._wordAlphabet.size(), m_options.wordEmbSize);
  wordEmb.randu(1000);

  cout << "word emb dim is " << wordEmb.ncols() << std::endl;

  NRMat<dtype> charEmb;
  if (charEmbFile != "") {
  	readEmbeddings(m_classifier.fe._charAlphabet, charEmbFile, charEmb);
  } else {
  	charEmb.resize(m_classifier.fe._charAlphabet.size(), m_options.charEmbSize);
  	charEmb.randu(2000);
  }

  cout << "char emb dim is " << charEmb.ncols() << std::endl;

  NRMat<dtype> bicharEmb;
  if (bicharEmbFile != "") {
  	readEmbeddings(m_classifier.fe._bicharAlphabet, bicharEmbFile, bicharEmb);
  } else {
  	bicharEmb.resize(m_classifier.fe._bicharAlphabet.size(), m_options.bicharEmbSize);
  	bicharEmb.randu(2000);
  }

  cout << "bichar emb dim is " << bicharEmb.ncols() << std::endl;

  NRMat<dtype> actionEmb;
  actionEmb.resize(m_classifier.fe._actionAlphabet.size(), m_options.actionEmbSize);
  actionEmb.randu(3000);

  cout << "action emb dim is " << actionEmb.ncols() << std::endl;

  NRMat<dtype> lengthEmb;
  lengthEmb.resize(6, m_options.lengthEmbSize);
  lengthEmb.randu(3000);

  cout << "length emb dim is " << actionEmb.ncols() << std::endl;


  m_classifier.init(wordEmb, allwordEmb, lengthEmb, m_options.wordNgram, m_options.wordHiddenSize, m_options.wordRNNHiddenSize,
			charEmb, bicharEmb, m_options.charcontext, m_options.charHiddenSize, m_options.charRNNHiddenSize,
			actionEmb, m_options.actionNgram, m_options.actionHiddenSize, m_options.actionRNNHiddenSize,
			m_options.sepHiddenSize, m_options.appHiddenSize, m_options.delta);

  m_classifier.setDropValue(m_options.dropProb);

  m_classifier.setOOVFreq(m_options.wordCutOff);
  m_classifier.setOOVRatio(m_options.oovRatio);
  m_classifier.setWordFreq(m_word_stat);

  vector<vector<CAction> > trainInstGoldactions;
  getGoldActions(trainInsts, trainInstGoldactions);
  double bestFmeasure = 0;

  int inputSize = trainInsts.size();

  std::vector<int> indexes;
  for (int i = 0; i < inputSize; ++i)
    indexes.push_back(i);

  static Metric eval, metric_dev, metric_test;

  int maxIter = m_options.maxIter * (inputSize / m_options.batchSize + 1);
  int oneIterMaxRound = (inputSize + m_options.batchSize -1) / m_options.batchSize;
  std::cout << "maxIter = " << maxIter << std::endl;
  int devNum = devInsts.size(), testNum = testInsts.size();

  static vector<vector<string> > decodeInstResults;
  static vector<string> curDecodeInst;
  static bool bCurIterBetter;
  static vector<vector<string> > subInstances;
  static vector<vector<CAction> > subInstGoldActions;

  for (int iter = 0; iter < maxIter; ++iter) {
    std::cout << "##### Iteration " << iter << std::endl;
    srand(iter);
    random_shuffle(indexes.begin(), indexes.end());
    std::cout << "random: " << indexes[0] << ", " << indexes[indexes.size() - 1] << std::endl;    
    bool bEvaluate = false;
    if(m_options.batchSize == 1){
    	eval.reset();
	    bEvaluate = true;
	    for (int idy = 0; idy < inputSize; idy++) {
	      subInstances.clear();
	      subInstGoldActions.clear();
	      subInstances.push_back(trainInsts[indexes[idy]].chars);
	      subInstGoldActions.push_back(trainInstGoldactions[indexes[idy]]);
	      	
	      double cost = m_classifier.train(subInstances, subInstGoldActions);
	
	      eval.overall_label_count += m_classifier._eval.overall_label_count;
	      eval.correct_label_count += m_classifier._eval.correct_label_count;
	
	      if ((idy + 1) % (m_options.verboseIter*10) == 0) {
	        std::cout << "current: " << idy + 1 << ", Cost = " << cost << ", Correct(%) = " << eval.getAccuracy() << std::endl;
	      }
	      m_classifier.updateParams(m_options.regParameter, m_options.adaAlpha, m_options.adaEps, m_options.clip);
	    }
	    std::cout << "current: " << iter + 1  << ", Correct(%) = " << eval.getAccuracy() << std::endl;
    }
    else{
    	if(iter == 0)eval.reset();
  		subInstances.clear();
      subInstGoldActions.clear();
    	for (int idy = 0; idy < m_options.batchSize; idy++) {
	      subInstances.push_back(trainInsts[indexes[idy]].chars);
	      subInstGoldActions.push_back(trainInstGoldactions[indexes[idy]]);    		
    	}
      double cost = m_classifier.train(subInstances, subInstGoldActions);

      eval.overall_label_count += m_classifier._eval.overall_label_count;
      eval.correct_label_count += m_classifier._eval.correct_label_count;
      
      if ((iter + 1) % (m_options.verboseIter) == 0) {
      	std::cout << "current: " << iter + 1 << ", Cost = " << cost << ", Correct(%) = " << eval.getAccuracy() << std::endl;
      	eval.reset();
      	bEvaluate = true;
      }
      
      m_classifier.updateParams(m_options.regParameter, m_options.adaAlpha, m_options.adaEps, m_options.clip);
    }
    
    if (bEvaluate && devNum > 0) {
      bCurIterBetter = false;
      if (!m_options.outBest.empty())
        decodeInstResults.clear();
      metric_dev.reset();
      for (int idx = 0; idx < devInsts.size(); idx++) {
        predict(devInsts[idx], curDecodeInst);
        devInsts[idx].evaluate(curDecodeInst, metric_dev);
        if (!m_options.outBest.empty()) {
          decodeInstResults.push_back(curDecodeInst);
        }
      }
      std::cout << "dev:" << std::endl;
      metric_dev.print();

      if (!m_options.outBest.empty() && metric_dev.getAccuracy() > bestFmeasure) {
        m_pipe.outputAllInstances(devFile + m_options.outBest, decodeInstResults);
        bCurIterBetter = true;
      }

      if (testNum > 0) {
        if (!m_options.outBest.empty())
          decodeInstResults.clear();
        metric_test.reset();
        for (int idx = 0; idx < testInsts.size(); idx++) {
          predict(testInsts[idx], curDecodeInst);
          testInsts[idx].evaluate(curDecodeInst, metric_test);
          if (bCurIterBetter && !m_options.outBest.empty()) {
            decodeInstResults.push_back(curDecodeInst);
          }
        }
        std::cout << "test:" << std::endl;
        metric_test.print();

        if (!m_options.outBest.empty() && bCurIterBetter) {
          m_pipe.outputAllInstances(testFile + m_options.outBest, decodeInstResults);
        }
      }

      for (int idx = 0; idx < otherInsts.size(); idx++) {
        std::cout << "processing " << m_options.testFiles[idx] << std::endl;
        if (!m_options.outBest.empty())
          decodeInstResults.clear();
        metric_test.reset();
        for (int idy = 0; idy < otherInsts[idx].size(); idy++) {
          predict(otherInsts[idx][idy], curDecodeInst);
          otherInsts[idx][idy].evaluate(curDecodeInst, metric_test);
          if (bCurIterBetter && !m_options.outBest.empty()) {
            decodeInstResults.push_back(curDecodeInst);
          }
        }
        std::cout << "test:" << std::endl;
        metric_test.print();

        if (!m_options.outBest.empty() && bCurIterBetter) {
          m_pipe.outputAllInstances(m_options.testFiles[idx] + m_options.outBest, decodeInstResults);
        }
      }


      if (m_options.saveIntermediate && metric_dev.getAccuracy() > bestFmeasure) {
        std::cout << "Exceeds best previous DIS of " << bestFmeasure << ". Saving model file.." << std::endl;
        bestFmeasure = metric_dev.getAccuracy();
        writeModelFile(modelFile);
      }
    }
  }
}
void Segmentor::train(const string& trainFile, const string& devFile, const string& testFile, const string& modelFile, const string& optionFile,
    const string& wordEmbFile) {
  if (optionFile != "")
    m_options.load(optionFile);

  m_options.showOptions();
  vector<Instance> trainInsts, devInsts, testInsts;
  m_pipe.readInstances(trainFile, trainInsts, m_classifier.MAX_SENTENCE_SIZE, m_options.maxInstance);
  if (devFile != "")
    m_pipe.readInstances(devFile, devInsts, m_classifier.MAX_SENTENCE_SIZE, m_options.maxInstance);
  if (testFile != "")
    m_pipe.readInstances(testFile, testInsts, m_classifier.MAX_SENTENCE_SIZE, m_options.maxInstance);

  vector<vector<Instance> > otherInsts(m_options.testFiles.size());
  for (int idx = 0; idx < m_options.testFiles.size(); idx++) {
    m_pipe.readInstances(m_options.testFiles[idx], otherInsts[idx], m_classifier.MAX_SENTENCE_SIZE, m_options.maxInstance);
  }

  createAlphabet(trainInsts);

  addTestWordAlpha(devInsts);
  addTestWordAlpha(testInsts);
  for (int idx = 0; idx < otherInsts.size(); idx++) {
    addTestWordAlpha(otherInsts[idx]);
  }


  m_classifier.init();
  m_classifier.setDropValue(m_options.dropProb);

  vector<vector<CAction> > trainInstGoldactions;
  getGoldActions(trainInsts, trainInstGoldactions);
  double bestFmeasure = 0;

  int inputSize = trainInsts.size();

  std::vector<int> indexes;
  for (int i = 0; i < inputSize; ++i)
    indexes.push_back(i);

  static Metric eval, metric_dev, metric_test;

  int maxIter = m_options.maxIter * (inputSize / m_options.batchSize + 1);
  int oneIterMaxRound = (inputSize + m_options.batchSize -1) / m_options.batchSize;
  std::cout << "maxIter = " << maxIter << std::endl;
  int devNum = devInsts.size(), testNum = testInsts.size();

  static vector<vector<string> > decodeInstResults;
  static vector<string> curDecodeInst;
  static bool bCurIterBetter;
  static vector<vector<string> > subInstances;
  static vector<vector<CAction> > subInstGoldActions;

  //m_classifier.setAlphaIncreasing(true);
  for (int iter = 0; iter < maxIter; ++iter) {
    std::cout << "##### Iteration " << iter << std::endl;
    srand(iter);
    //random_shuffle(indexes.begin(), indexes.end());
    std::cout << "random: " << indexes[0] << ", " << indexes[indexes.size() - 1] << std::endl;
    eval.reset();
    for (int updateIter = 0; updateIter < oneIterMaxRound; updateIter++) {
      int start_pos = updateIter * m_options.batchSize;
      int end_pos = (updateIter + 1) * m_options.batchSize;
      if (end_pos > inputSize)
      end_pos = inputSize;
      subInstances.clear();
      subInstGoldActions.clear();
      for (int idy = start_pos; idy < end_pos; idy++) {
        subInstances.push_back(trainInsts[indexes[idy]].chars);
        subInstGoldActions.push_back(trainInstGoldactions[indexes[idy]]);
      }

      double cost = m_classifier.train(subInstances, subInstGoldActions);

      eval.overall_label_count += m_classifier._eval.overall_label_count;
      eval.correct_label_count += m_classifier._eval.correct_label_count;

      //if ((updateIter + 1) % (m_options.verboseIter*10) == 0) {
        //std::cout << "current: " << updateIter + 1 << ", Cost = " << cost << ", Correct(%) = " << eval.getAccuracy() << std::endl;
      //}
      m_classifier.updateParams(m_options.regParameter, m_options.adaAlpha, m_options.adaEps);
    }

    std::cout << "current: " << iter + 1  << ", Correct(%) = " << eval.getAccuracy() << std::endl;

    if (devNum > 0) {
      bCurIterBetter = false;
      if (!m_options.outBest.empty())
        decodeInstResults.clear();
      metric_dev.reset();
      for (int idx = 0; idx < devInsts.size(); idx++) {
        predict(devInsts[idx], curDecodeInst);
        devInsts[idx].evaluate(curDecodeInst, metric_dev);
        if (!m_options.outBest.empty()) {
          decodeInstResults.push_back(curDecodeInst);
        }
      }
      std::cout << "dev:" << std::endl;
      metric_dev.print();

      if (!m_options.outBest.empty() && metric_dev.getAccuracy() > bestFmeasure) {
        m_pipe.outputAllInstances(devFile + m_options.outBest, decodeInstResults);
        bCurIterBetter = true;
      }

      if (testNum > 0) {
        if (!m_options.outBest.empty())
          decodeInstResults.clear();
        metric_test.reset();
        for (int idx = 0; idx < testInsts.size(); idx++) {
          predict(testInsts[idx], curDecodeInst);
          testInsts[idx].evaluate(curDecodeInst, metric_test);
          if (bCurIterBetter && !m_options.outBest.empty()) {
            decodeInstResults.push_back(curDecodeInst);
          }
        }
        std::cout << "test:" << std::endl;
        metric_test.print();

        if (!m_options.outBest.empty() && bCurIterBetter) {
          m_pipe.outputAllInstances(testFile + m_options.outBest, decodeInstResults);
        }
      }

      for (int idx = 0; idx < otherInsts.size(); idx++) {
        std::cout << "processing " << m_options.testFiles[idx] << std::endl;
        if (!m_options.outBest.empty())
          decodeInstResults.clear();
        metric_test.reset();
        for (int idy = 0; idy < otherInsts[idx].size(); idy++) {
          predict(otherInsts[idx][idy], curDecodeInst);
          otherInsts[idx][idy].evaluate(curDecodeInst, metric_test);
          if (bCurIterBetter && !m_options.outBest.empty()) {
            decodeInstResults.push_back(curDecodeInst);
          }
        }
        std::cout << "test:" << std::endl;
        metric_test.print();

        if (!m_options.outBest.empty() && bCurIterBetter) {
          m_pipe.outputAllInstances(m_options.testFiles[idx] + m_options.outBest, decodeInstResults);
        }
      }


      if (m_options.saveIntermediate && metric_dev.getAccuracy() > bestFmeasure) {
        std::cout << "Exceeds best previous DIS of " << bestFmeasure << ". Saving model file.." << std::endl;
        bestFmeasure = metric_dev.getAccuracy();
        writeModelFile(modelFile);
      }
    }
  }
}
void Labeler::train(const string& trainFile, const string& devFile, const string& testFile, const string& modelFile, const string& optionFile,
    const string& wordEmbFile, const string& charEmbFile) {
  if (optionFile != "")
    m_options.load(optionFile);

  m_options.showOptions();

  m_linearfeat = 0;

  vector<Instance> trainInsts, devInsts, testInsts;
  static vector<Instance> decodeInstResults;
  static Instance curDecodeInst;
  bool bCurIterBetter = false;

  m_pipe.readInstances(trainFile, trainInsts, m_options.maxInstance);
  if (devFile != "")
    m_pipe.readInstances(devFile, devInsts, m_options.maxInstance);
  if (testFile != "")
    m_pipe.readInstances(testFile, testInsts, m_options.maxInstance);

  //Ensure that each file in m_options.testFiles exists!
  vector<vector<Instance> > otherInsts(m_options.testFiles.size());
  for (int idx = 0; idx < m_options.testFiles.size(); idx++) {
    m_pipe.readInstances(m_options.testFiles[idx], otherInsts[idx], m_options.maxInstance);
  }

  //std::cout << "Training example number: " << trainInsts.size() << std::endl;
  //std::cout << "Dev example number: " << trainInsts.size() << std::endl;
  //std::cout << "Test example number: " << trainInsts.size() << std::endl;

  createAlphabet(trainInsts);

  if (!m_options.wordEmbFineTune) {
    addTestWordAlpha(devInsts);
    addTestWordAlpha(testInsts);
    for (int idx = 0; idx < otherInsts.size(); idx++) {
      addTestWordAlpha(otherInsts[idx]);
    }
    cout << "Remain words num: " << m_textWordAlphabet.size() << endl;
  }

  if (!m_options.charEmbFineTune) {
    addTestCharAlpha(devInsts);
    addTestCharAlpha(testInsts);
    for (int idx = 0; idx < otherInsts.size(); idx++) {
      addTestCharAlpha(otherInsts[idx]);
    }
    cout << "Remain char num: " << m_charAlphabet.size() << endl;
  }

  NRMat<double> wordEmb;
  if (wordEmbFile != "") {
    readWordEmbeddings(wordEmbFile, wordEmb);
  } else {
    wordEmb.resize(m_textWordAlphabet.size(), m_options.wordEmbSize);
    wordEmb.randu(1000);
  }

  NRMat<double> charEmb;
  if (charEmbFile != "") {
    readWordEmbeddings(charEmbFile, charEmb);
  } else {
    charEmb.resize(m_charAlphabet.size(), m_options.charEmbSize);
    charEmb.randu(1001);
  }

  m_classifier.init(wordEmb, m_options.wordcontext, charEmb, m_options.charcontext, m_headWordAlphabet.size(), m_options.wordHiddenSize, m_options.charHiddenSize, m_options.hiddenSize);
  m_classifier.resetRemove(m_options.removePool, m_options.removeCharPool);
  m_classifier.setDropValue(m_options.dropProb);
  m_classifier.setWordEmbFinetune(m_options.wordEmbFineTune, m_options.charEmbFineTune);

  vector<Example> trainExamples, devExamples, testExamples;
  initialExamples(trainInsts, trainExamples);
  initialExamples(devInsts, devExamples);
  initialExamples(testInsts, testExamples);

  vector<int> otherInstNums(otherInsts.size());
  vector<vector<Example> > otherExamples(otherInsts.size());
  for (int idx = 0; idx < otherInsts.size(); idx++) {
    initialExamples(otherInsts[idx], otherExamples[idx]);
    otherInstNums[idx] = otherExamples[idx].size();
  }

  double bestDIS = 0;

  int inputSize = trainExamples.size();

  srand(0);
  std::vector<int> indexes;
  for (int i = 0; i < inputSize; ++i)
    indexes.push_back(i);

  static Metric eval, metric_dev, metric_test;
  static vector<Example> subExamples;
  int devNum = devExamples.size(), testNum = testExamples.size();

  int maxIter = m_options.maxIter;
  if (m_options.batchSize > 1)
    maxIter = m_options.maxIter * (inputSize / m_options.batchSize + 1);

  double cost = 0.0;
  std::cout << "maxIter = " << maxIter << std::endl;
  for (int iter = 0; iter < m_options.maxIter; ++iter) {
    std::cout << "##### Iteration " << iter << std::endl;
    eval.reset();
    if (m_options.batchSize == 1) {
      random_shuffle(indexes.begin(), indexes.end());
      for (int updateIter = 0; updateIter < inputSize; updateIter++) {
        subExamples.clear();
        int start_pos = updateIter;
        int end_pos = (updateIter + 1);
        if (end_pos > inputSize)
          end_pos = inputSize;

        for (int idy = start_pos; idy < end_pos; idy++) {
          subExamples.push_back(trainExamples[indexes[idy]]);
        }

        int curUpdateIter = iter * inputSize + updateIter;
        cost = m_classifier.process(subExamples, curUpdateIter);

        eval.overall_label_count += m_classifier._eval.overall_label_count;
        eval.correct_label_count += m_classifier._eval.correct_label_count;

        if ((curUpdateIter + 1) % m_options.verboseIter == 0) {
          //m_classifier.checkgrads(subExamples, curUpdateIter+1);
          std::cout << "current: " << updateIter + 1 << ", total instances: " << inputSize << std::endl;
          std::cout << "Cost = " << cost << ", SA Correct(%) = " << eval.getAccuracy() << std::endl;
        }
        m_classifier.updateParams(m_options.regParameter, m_options.adaAlpha, m_options.adaEps);
      }
    } else {
      cost = 0.0;
      for (int updateIter = 0; updateIter < m_options.verboseIter; updateIter++) {
        random_shuffle(indexes.begin(), indexes.end());
        subExamples.clear();
        for (int idy = 0; idy < m_options.batchSize; idy++) {
          subExamples.push_back(trainExamples[indexes[idy]]);
        }
        int curUpdateIter = iter * m_options.verboseIter + updateIter;
        cost += m_classifier.process(subExamples, curUpdateIter);
        //m_classifier.checkgrads(subExamples, curUpdateIter);
        eval.overall_label_count += m_classifier._eval.overall_label_count;
        eval.correct_label_count += m_classifier._eval.correct_label_count;

        m_classifier.updateParams(m_options.regParameter, m_options.adaAlpha, m_options.adaEps);
      }
      std::cout << "current iter: " << iter + 1 << ", total iter: " << maxIter << std::endl;
      std::cout << "Cost = " << cost << ", SA Correct(%) = " << eval.getAccuracy() << std::endl;
    }

    if (devNum > 0) {
      bCurIterBetter = false;
      if (!m_options.outBest.empty())
        decodeInstResults.clear();
      metric_dev.reset();
      for (int idx = 0; idx < devExamples.size(); idx++) {
        string result_label;
        double confidence = predict(devExamples[idx].m_features, result_label);

        devInsts[idx].Evaluate(result_label, metric_dev);

        if (!m_options.outBest.empty()) {
          curDecodeInst.copyValuesFrom(devInsts[idx]);
          curDecodeInst.assignLabel(result_label, confidence);
          decodeInstResults.push_back(curDecodeInst);
        }
      }
      metric_dev.print();

      if ((!m_options.outBest.empty() && metric_dev.getAccuracy() > bestDIS)) {
        m_pipe.outputAllInstances(devFile + m_options.outBest, decodeInstResults);
        bCurIterBetter = true;
      }

      if (testNum > 0) {
        if (!m_options.outBest.empty())
          decodeInstResults.clear();
        metric_test.reset();
        for (int idx = 0; idx < testExamples.size(); idx++) {
          string result_label;
          double confidence = predict(testExamples[idx].m_features, result_label);
          testInsts[idx].Evaluate(result_label, metric_test);

          if (bCurIterBetter && !m_options.outBest.empty()) {
            curDecodeInst.copyValuesFrom(testInsts[idx]);
            curDecodeInst.assignLabel(result_label, confidence);
            decodeInstResults.push_back(curDecodeInst);
          }
        }
        std::cout << "test:" << std::endl;
        metric_test.print();

        if ((!m_options.outBest.empty() && bCurIterBetter)) {
          m_pipe.outputAllInstances(testFile + m_options.outBest, decodeInstResults);
        }
      }

      for (int idx = 0; idx < otherExamples.size(); idx++) {
        std::cout << "processing " << m_options.testFiles[idx] << std::endl;
        if (!m_options.outBest.empty())
          decodeInstResults.clear();
        metric_test.reset();
        for (int idy = 0; idy < otherExamples[idx].size(); idy++) {
          string result_label;
          double confidence = predict(otherExamples[idx][idy].m_features, result_label);

          otherInsts[idx][idy].Evaluate(result_label, metric_test);

          if (bCurIterBetter && !m_options.outBest.empty()) {
            curDecodeInst.copyValuesFrom(otherInsts[idx][idy]);
            curDecodeInst.assignLabel(result_label, confidence);
            decodeInstResults.push_back(curDecodeInst);
          }
        }
        std::cout << "test:" << std::endl;
        metric_test.print();

        if ((!m_options.outBest.empty() && bCurIterBetter)) {
          m_pipe.outputAllInstances(m_options.testFiles[idx] + m_options.outBest, decodeInstResults);
        }
      }

      if ((m_options.saveIntermediate && metric_dev.getAccuracy() > bestDIS)) {
        if (metric_dev.getAccuracy() > bestDIS) {
          std::cout << "Exceeds best previous performance of " << bestDIS << ". Saving model file.." << std::endl;
          bestDIS = metric_dev.getAccuracy();
        }
        writeModelFile(modelFile);
      }

    }
    // Clear gradients
  }

  if (devNum > 0) {
    bCurIterBetter = false;
    if (!m_options.outBest.empty())
      decodeInstResults.clear();
    metric_dev.reset();
    for (int idx = 0; idx < devExamples.size(); idx++) {
      string result_label;
      double confidence = predict(devExamples[idx].m_features, result_label);

      devInsts[idx].Evaluate(result_label, metric_dev);

      if (!m_options.outBest.empty()) {
        curDecodeInst.copyValuesFrom(devInsts[idx]);
        curDecodeInst.assignLabel(result_label, confidence);
        decodeInstResults.push_back(curDecodeInst);
      }
    }
    metric_dev.print();

    if ((!m_options.outBest.empty() && metric_dev.getAccuracy() > bestDIS)) {
      m_pipe.outputAllInstances(devFile + m_options.outBest, decodeInstResults);
      bCurIterBetter = true;
    }

    if (testNum > 0) {
      if (!m_options.outBest.empty())
        decodeInstResults.clear();
      metric_test.reset();
      for (int idx = 0; idx < testExamples.size(); idx++) {
        string result_label;
        double confidence = predict(testExamples[idx].m_features, result_label);
        testInsts[idx].Evaluate(result_label, metric_test);

        if (bCurIterBetter && !m_options.outBest.empty()) {
          curDecodeInst.copyValuesFrom(testInsts[idx]);
          curDecodeInst.assignLabel(result_label, confidence);
          decodeInstResults.push_back(curDecodeInst);
        }
      }
      std::cout << "test:" << std::endl;
      metric_test.print();

      if ((!m_options.outBest.empty() && bCurIterBetter)) {
        m_pipe.outputAllInstances(testFile + m_options.outBest, decodeInstResults);
      }
    }

    for (int idx = 0; idx < otherExamples.size(); idx++) {
      std::cout << "processing " << m_options.testFiles[idx] << std::endl;
      if (!m_options.outBest.empty())
        decodeInstResults.clear();
      metric_test.reset();
      for (int idy = 0; idy < otherExamples[idx].size(); idy++) {
        string result_label;
        double confidence = predict(otherExamples[idx][idy].m_features, result_label);

        otherInsts[idx][idy].Evaluate(result_label, metric_test);

        if (bCurIterBetter && !m_options.outBest.empty()) {
          curDecodeInst.copyValuesFrom(otherInsts[idx][idy]);
          curDecodeInst.assignLabel(result_label, confidence);
          decodeInstResults.push_back(curDecodeInst);
        }
      }
      std::cout << "test:" << std::endl;
      metric_test.print();

      if ((!m_options.outBest.empty() && bCurIterBetter)) {
        m_pipe.outputAllInstances(m_options.testFiles[idx] + m_options.outBest, decodeInstResults);
      }
    }

    if ((m_options.saveIntermediate && metric_dev.getAccuracy() > bestDIS)) {
      if (metric_dev.getAccuracy() > bestDIS) {
        std::cout << "Exceeds best previous performance of " << bestDIS << ". Saving model file.." << std::endl;
        bestDIS = metric_dev.getAccuracy();
      }
      writeModelFile(modelFile);
    }

  } else {
    writeModelFile(modelFile);
  }
}