void Segmentor::train(const string& trainFile, const string& devFile, const string& testFile, const string& modelFile, const string& optionFile, const string& lexiconFile) { if (optionFile != "") m_options.load(optionFile); m_options.showOptions(); vector<Instance> trainInsts, devInsts, testInsts; m_pipe.readInstances(trainFile, trainInsts, m_classifier.MAX_SENTENCE_SIZE - 2, m_options.maxInstance); if (devFile != "") m_pipe.readInstances(devFile, devInsts, m_classifier.MAX_SENTENCE_SIZE - 2, m_options.maxInstance); if (testFile != "") m_pipe.readInstances(testFile, testInsts, m_classifier.MAX_SENTENCE_SIZE - 2, m_options.maxInstance); vector<vector<Instance> > otherInsts(m_options.testFiles.size()); for (int idx = 0; idx < m_options.testFiles.size(); idx++) { m_pipe.readInstances(m_options.testFiles[idx], otherInsts[idx], m_classifier.MAX_SENTENCE_SIZE - 2, m_options.maxInstance); } createAlphabet(trainInsts); m_classifier.init(m_options.delta); m_classifier.setDropValue(m_options.dropProb); vector<vector<CAction> > trainInstGoldactions; getGoldActions(trainInsts, trainInstGoldactions); double bestPostagFmeasure = 0; int inputSize = trainInsts.size(); std::vector<int> indexes; for (int i = 0; i < inputSize; ++i) indexes.push_back(i); static Metric eval; static Metric segMetric_dev, segMetric_test; static Metric postagMetric_dev, postagMetric_test; int maxIter = m_options.maxIter * (inputSize / m_options.batchSize + 1); int oneIterMaxRound = (inputSize + m_options.batchSize - 1) / m_options.batchSize; std::cout << "maxIter = " << maxIter << std::endl; int devNum = devInsts.size(), testNum = testInsts.size(); static vector<CResult> decodeInstResults; static CResult curDecodeInst; static bool bCurIterBetter; static vector<Instance > subInstances; static vector<vector<CAction> > subInstGoldActions; for (int iter = 0; iter < maxIter; ++iter) { std::cout << "##### Iteration " << iter << std::endl; srand(iter); random_shuffle(indexes.begin(), indexes.end()); std::cout << "random: " << indexes[0] << ", " << indexes[indexes.size() - 1] << std::endl; bool bEvaluate = false; if (m_options.batchSize == 1) { eval.reset(); bEvaluate = true; for (int idy = 0; idy < inputSize; idy++) { subInstances.clear(); subInstGoldActions.clear(); subInstances.push_back(trainInsts[indexes[idy]]); subInstGoldActions.push_back(trainInstGoldactions[indexes[idy]]); double cost = m_classifier.train(subInstances, subInstGoldActions); eval.overall_label_count += m_classifier._eval.overall_label_count; eval.correct_label_count += m_classifier._eval.correct_label_count; if ((idy + 1) % (m_options.verboseIter * 10) == 0) { std::cout << "current: " << idy + 1 << ", Cost = " << cost << ", Correct(%) = " << eval.getAccuracy() << std::endl; } m_classifier.updateParams(m_options.regParameter, m_options.adaAlpha, m_options.adaEps); } std::cout << "current: " << iter + 1 << ", Correct(%) = " << eval.getAccuracy() << std::endl; } else { if (iter == 0) eval.reset(); subInstances.clear(); subInstGoldActions.clear(); for (int idy = 0; idy < m_options.batchSize; idy++) { subInstances.push_back(trainInsts[indexes[idy]]); subInstGoldActions.push_back(trainInstGoldactions[indexes[idy]]); } double cost = m_classifier.train(subInstances, subInstGoldActions); eval.overall_label_count += m_classifier._eval.overall_label_count; eval.correct_label_count += m_classifier._eval.correct_label_count; if ((iter + 1) % (m_options.verboseIter) == 0) { std::cout << "current: " << iter + 1 << ", Cost = " << cost << ", Correct(%) = " << eval.getAccuracy() << std::endl; eval.reset(); bEvaluate = true; } m_classifier.updateParams(m_options.regParameter, m_options.adaAlpha, m_options.adaEps); } if (bEvaluate && devNum > 0) { bCurIterBetter = false; if (!m_options.outBest.empty()) decodeInstResults.clear(); segMetric_dev.reset(); postagMetric_dev.reset(); for (int idx = 0; idx < devInsts.size(); idx++) { predict(devInsts[idx], curDecodeInst); devInsts[idx].evaluate(curDecodeInst, segMetric_dev, postagMetric_dev); if (!m_options.outBest.empty()) { decodeInstResults.push_back(curDecodeInst); } } std::cout << "dev:" << std::endl << "Seg: "; segMetric_dev.print(); std::cout << "Postag: "; postagMetric_dev.print(); if (!m_options.outBest.empty() && postagMetric_dev.getAccuracy() > bestPostagFmeasure) { m_pipe.outputAllInstances(devFile + m_options.outBest, decodeInstResults); bCurIterBetter = true; } if (testNum > 0) { if (!m_options.outBest.empty()) decodeInstResults.clear(); segMetric_test.reset(); postagMetric_test.reset(); for (int idx = 0; idx < testInsts.size(); idx++) { predict(testInsts[idx], curDecodeInst); testInsts[idx].evaluate(curDecodeInst, segMetric_test, postagMetric_test); if (bCurIterBetter && !m_options.outBest.empty()) { decodeInstResults.push_back(curDecodeInst); } } std::cout << "test:" << std::endl << "Seg: "; segMetric_test.print(); std::cout << "Postag: "; postagMetric_test.print(); if (!m_options.outBest.empty() && bCurIterBetter) { m_pipe.outputAllInstances(testFile + m_options.outBest, decodeInstResults); } } for (int idx = 0; idx < otherInsts.size(); idx++) { std::cout << "processing " << m_options.testFiles[idx] << std::endl; if (!m_options.outBest.empty()) decodeInstResults.clear(); segMetric_test.reset(); postagMetric_test.reset(); for (int idy = 0; idy < otherInsts[idx].size(); idy++) { predict(otherInsts[idx][idy], curDecodeInst); otherInsts[idx][idy].evaluate(curDecodeInst, segMetric_test, postagMetric_test); if (bCurIterBetter && !m_options.outBest.empty()) { decodeInstResults.push_back(curDecodeInst); } } std::cout << "test:" << std::endl << "Seg: "; segMetric_test.print(); std::cout << "Postag: "; postagMetric_test.print(); if (!m_options.outBest.empty() && bCurIterBetter) { m_pipe.outputAllInstances(m_options.testFiles[idx] + m_options.outBest, decodeInstResults); } } if (m_options.saveIntermediate && postagMetric_dev.getAccuracy() > bestPostagFmeasure) { std::cout << "Exceeds best previous DIS of " << bestPostagFmeasure << ". Saving model file.." << std::endl; bestPostagFmeasure = postagMetric_dev.getAccuracy(); writeModelFile(modelFile); } } } }
void Segmentor::train(const string& trainFile, const string& devFile, const string& testFile, const string& modelFile, const string& optionFile, const string& wordEmbFile, const string& charEmbFile, const string& bicharEmbFile) { if (optionFile != "") m_options.load(optionFile); m_options.showOptions(); vector<Instance> trainInsts, devInsts, testInsts; m_pipe.readInstances(trainFile, trainInsts, m_classifier.MAX_SENTENCE_SIZE - 2, m_options.maxInstance); if (devFile != "") m_pipe.readInstances(devFile, devInsts, m_classifier.MAX_SENTENCE_SIZE - 2, m_options.maxInstance); if (testFile != "") m_pipe.readInstances(testFile, testInsts, m_classifier.MAX_SENTENCE_SIZE - 2, m_options.maxInstance); vector<vector<Instance> > otherInsts(m_options.testFiles.size()); for (int idx = 0; idx < m_options.testFiles.size(); idx++) { m_pipe.readInstances(m_options.testFiles[idx], otherInsts[idx], m_classifier.MAX_SENTENCE_SIZE - 2, m_options.maxInstance); } createAlphabet(trainInsts); addTestWordAlpha(devInsts); addTestWordAlpha(testInsts); NRMat<dtype> wordEmb, allwordEmb; if (wordEmbFile != "") { allWordAlphaEmb(wordEmbFile, allwordEmb); } else { std::cout << "must not be here, allword must be pretrained." << std::endl; } wordEmb.resize(m_classifier.fe._wordAlphabet.size(), m_options.wordEmbSize); wordEmb.randu(1000); cout << "word emb dim is " << wordEmb.ncols() << std::endl; NRMat<dtype> charEmb; if (charEmbFile != "") { readEmbeddings(m_classifier.fe._charAlphabet, charEmbFile, charEmb); } else { charEmb.resize(m_classifier.fe._charAlphabet.size(), m_options.charEmbSize); charEmb.randu(2000); } cout << "char emb dim is " << charEmb.ncols() << std::endl; NRMat<dtype> bicharEmb; if (bicharEmbFile != "") { readEmbeddings(m_classifier.fe._bicharAlphabet, bicharEmbFile, bicharEmb); } else { bicharEmb.resize(m_classifier.fe._bicharAlphabet.size(), m_options.bicharEmbSize); bicharEmb.randu(2000); } cout << "bichar emb dim is " << bicharEmb.ncols() << std::endl; NRMat<dtype> actionEmb; actionEmb.resize(m_classifier.fe._actionAlphabet.size(), m_options.actionEmbSize); actionEmb.randu(3000); cout << "action emb dim is " << actionEmb.ncols() << std::endl; NRMat<dtype> lengthEmb; lengthEmb.resize(6, m_options.lengthEmbSize); lengthEmb.randu(3000); cout << "length emb dim is " << actionEmb.ncols() << std::endl; m_classifier.init(wordEmb, allwordEmb, lengthEmb, m_options.wordNgram, m_options.wordHiddenSize, m_options.wordRNNHiddenSize, charEmb, bicharEmb, m_options.charcontext, m_options.charHiddenSize, m_options.charRNNHiddenSize, actionEmb, m_options.actionNgram, m_options.actionHiddenSize, m_options.actionRNNHiddenSize, m_options.sepHiddenSize, m_options.appHiddenSize, m_options.delta); m_classifier.setDropValue(m_options.dropProb); m_classifier.setOOVFreq(m_options.wordCutOff); m_classifier.setOOVRatio(m_options.oovRatio); m_classifier.setWordFreq(m_word_stat); vector<vector<CAction> > trainInstGoldactions; getGoldActions(trainInsts, trainInstGoldactions); double bestFmeasure = 0; int inputSize = trainInsts.size(); std::vector<int> indexes; for (int i = 0; i < inputSize; ++i) indexes.push_back(i); static Metric eval, metric_dev, metric_test; int maxIter = m_options.maxIter * (inputSize / m_options.batchSize + 1); int oneIterMaxRound = (inputSize + m_options.batchSize -1) / m_options.batchSize; std::cout << "maxIter = " << maxIter << std::endl; int devNum = devInsts.size(), testNum = testInsts.size(); static vector<vector<string> > decodeInstResults; static vector<string> curDecodeInst; static bool bCurIterBetter; static vector<vector<string> > subInstances; static vector<vector<CAction> > subInstGoldActions; for (int iter = 0; iter < maxIter; ++iter) { std::cout << "##### Iteration " << iter << std::endl; srand(iter); random_shuffle(indexes.begin(), indexes.end()); std::cout << "random: " << indexes[0] << ", " << indexes[indexes.size() - 1] << std::endl; bool bEvaluate = false; if(m_options.batchSize == 1){ eval.reset(); bEvaluate = true; for (int idy = 0; idy < inputSize; idy++) { subInstances.clear(); subInstGoldActions.clear(); subInstances.push_back(trainInsts[indexes[idy]].chars); subInstGoldActions.push_back(trainInstGoldactions[indexes[idy]]); double cost = m_classifier.train(subInstances, subInstGoldActions); eval.overall_label_count += m_classifier._eval.overall_label_count; eval.correct_label_count += m_classifier._eval.correct_label_count; if ((idy + 1) % (m_options.verboseIter*10) == 0) { std::cout << "current: " << idy + 1 << ", Cost = " << cost << ", Correct(%) = " << eval.getAccuracy() << std::endl; } m_classifier.updateParams(m_options.regParameter, m_options.adaAlpha, m_options.adaEps, m_options.clip); } std::cout << "current: " << iter + 1 << ", Correct(%) = " << eval.getAccuracy() << std::endl; } else{ if(iter == 0)eval.reset(); subInstances.clear(); subInstGoldActions.clear(); for (int idy = 0; idy < m_options.batchSize; idy++) { subInstances.push_back(trainInsts[indexes[idy]].chars); subInstGoldActions.push_back(trainInstGoldactions[indexes[idy]]); } double cost = m_classifier.train(subInstances, subInstGoldActions); eval.overall_label_count += m_classifier._eval.overall_label_count; eval.correct_label_count += m_classifier._eval.correct_label_count; if ((iter + 1) % (m_options.verboseIter) == 0) { std::cout << "current: " << iter + 1 << ", Cost = " << cost << ", Correct(%) = " << eval.getAccuracy() << std::endl; eval.reset(); bEvaluate = true; } m_classifier.updateParams(m_options.regParameter, m_options.adaAlpha, m_options.adaEps, m_options.clip); } if (bEvaluate && devNum > 0) { bCurIterBetter = false; if (!m_options.outBest.empty()) decodeInstResults.clear(); metric_dev.reset(); for (int idx = 0; idx < devInsts.size(); idx++) { predict(devInsts[idx], curDecodeInst); devInsts[idx].evaluate(curDecodeInst, metric_dev); if (!m_options.outBest.empty()) { decodeInstResults.push_back(curDecodeInst); } } std::cout << "dev:" << std::endl; metric_dev.print(); if (!m_options.outBest.empty() && metric_dev.getAccuracy() > bestFmeasure) { m_pipe.outputAllInstances(devFile + m_options.outBest, decodeInstResults); bCurIterBetter = true; } if (testNum > 0) { if (!m_options.outBest.empty()) decodeInstResults.clear(); metric_test.reset(); for (int idx = 0; idx < testInsts.size(); idx++) { predict(testInsts[idx], curDecodeInst); testInsts[idx].evaluate(curDecodeInst, metric_test); if (bCurIterBetter && !m_options.outBest.empty()) { decodeInstResults.push_back(curDecodeInst); } } std::cout << "test:" << std::endl; metric_test.print(); if (!m_options.outBest.empty() && bCurIterBetter) { m_pipe.outputAllInstances(testFile + m_options.outBest, decodeInstResults); } } for (int idx = 0; idx < otherInsts.size(); idx++) { std::cout << "processing " << m_options.testFiles[idx] << std::endl; if (!m_options.outBest.empty()) decodeInstResults.clear(); metric_test.reset(); for (int idy = 0; idy < otherInsts[idx].size(); idy++) { predict(otherInsts[idx][idy], curDecodeInst); otherInsts[idx][idy].evaluate(curDecodeInst, metric_test); if (bCurIterBetter && !m_options.outBest.empty()) { decodeInstResults.push_back(curDecodeInst); } } std::cout << "test:" << std::endl; metric_test.print(); if (!m_options.outBest.empty() && bCurIterBetter) { m_pipe.outputAllInstances(m_options.testFiles[idx] + m_options.outBest, decodeInstResults); } } if (m_options.saveIntermediate && metric_dev.getAccuracy() > bestFmeasure) { std::cout << "Exceeds best previous DIS of " << bestFmeasure << ". Saving model file.." << std::endl; bestFmeasure = metric_dev.getAccuracy(); writeModelFile(modelFile); } } } }