// all linear features are extracted from positive examples int Segmentor::createAlphabet(const vector<Instance>& vecInsts) { cout << "Creating Alphabet..." << endl; int numInstance = vecInsts.size(); hash_map<string, int> action_stat; hash_map<string, int> feat_stat; hash_map<string, int> postag_stat; assert(numInstance > 0); static Metric segEval, posEval; static CStateItem state[m_classifier.MAX_SENTENCE_SIZE]; static Feature feat; static CResult output; static CAction answer; static int actionNum; m_classifier.initAlphabet(); segEval.reset(); posEval.reset(); int maxFreqChar = -1; int maxFreqWord = -1; for (numInstance = 0; numInstance < vecInsts.size(); numInstance++) { const Instance &instance = vecInsts[numInstance]; for (int idx = 0; idx < instance.postagsize(); idx++) { postag_stat[instance.postags[idx]]; m_classifier.fe._tagConstraints.addWordPOSPair(instance.words[idx], instance.postags[idx]); } } m_classifier.addToPostagAlphabet(postag_stat); for (numInstance = 0; numInstance < vecInsts.size(); numInstance++) { const Instance &instance = vecInsts[numInstance]; actionNum = 0; state[actionNum].initSentence(&instance.chars, &instance.candidateLabels); state[actionNum].clear(); while (!state[actionNum].IsTerminated()) { state[actionNum].getGoldAction(instance, m_classifier.fe._postagAlphabet, answer); action_stat[answer.str()]++; m_classifier.extractFeature(state + actionNum, answer, feat); for (int idx = 0; idx < feat._strSparseFeat.size(); idx++) { feat_stat[feat._strSparseFeat[idx]]++; } state[actionNum].move(state + actionNum + 1, answer, m_classifier.fe._postagAlphabet); actionNum++; } if (actionNum - 1 != instance.charsize()) { std::cout << "action number is not correct, please check" << std::endl; } state[actionNum].getSegPosResults(output); instance.evaluate(output, segEval, posEval); if (!segEval.bIdentical() || !posEval.bIdentical()) { std::cout << "error state conversion!" << std::endl; std::cout << "output instance:" << std::endl; for (int tmpK = 0; tmpK < instance.words.size(); tmpK++) { std::cout << instance.words[tmpK] << "_" << instance.postags[tmpK] << " "; } std::cout << std::endl; std::cout << "predicated instance:" << std::endl; for (int tmpK = 0; tmpK < output.size(); tmpK++) { std::cout << output.words[tmpK] << "_" << output.postags[tmpK] << " "; } std::cout << std::endl; exit(0); } if ((numInstance + 1) % m_options.verboseIter == 0) { cout << numInstance + 1 << " "; if ((numInstance + 1) % (40 * m_options.verboseIter) == 0) cout << std::endl; cout.flush(); } if (m_options.maxInstance > 0 && numInstance == m_options.maxInstance) break; } m_classifier.addToActionAlphabet(action_stat); m_classifier.addToFeatureAlphabet(feat_stat, m_options.featCutOff); cout << numInstance << " " << endl; cout << "Action num: " << m_classifier.fe._actionAlphabet.size() << endl; cout << "Pos num: " << m_classifier.fe._postagAlphabet.size() << endl; cout << "Total feat num: " << feat_stat.size() << endl; cout << "Remain feat num: " << m_classifier.fe._featAlphabet.size() << endl; //m_classifier.setFeatureCollectionState(false); return 0; }
// all linear features are extracted from positive examples int Segmentor::createAlphabet(const vector<Instance>& vecInsts) { cout << "Creating Alphabet..." << endl; int numInstance = vecInsts.size(); hash_map<string, int> word_stat; hash_map<string, int> char_stat; hash_map<string, int> bichar_stat; hash_map<string, int> action_stat; hash_map<string, int> feat_stat; assert(numInstance > 0); static Metric eval; static CStateItem state[m_classifier.MAX_SENTENCE_SIZE]; static Feature feat; static vector<string> output; static CAction answer; static int actionNum; m_classifier.initAlphabet(); eval.reset(); for (numInstance = 0; numInstance < vecInsts.size(); numInstance++) { const Instance &instance = vecInsts[numInstance]; for (int idx = 0; idx < instance.wordsize(); idx++) { word_stat[normalize_to_lowerwithdigit(instance.words[idx])]++; } for (int idx = 0; idx < instance.charsize(); idx++) { char_stat[instance.chars[idx]]++; } for (int idx = 0; idx < instance.charsize() - 1; idx++) { bichar_stat[instance.chars[idx] + instance.chars[idx + 1]]++; } bichar_stat[instance.chars[instance.charsize() - 1] + m_classifier.fe.nullkey]++; bichar_stat[m_classifier.fe.nullkey + instance.chars[0]]++; actionNum = 0; state[actionNum].initSentence(&instance.chars); state[actionNum].clear(); while (!state[actionNum].IsTerminated()) { state[actionNum].getGoldAction(instance.words, answer); action_stat[answer.str()]++; m_classifier.extractFeature(state+actionNum, answer, feat); for (int idx = 0; idx < feat._strSparseFeat.size(); idx++) { feat_stat[feat._strSparseFeat[idx]]++; } state[actionNum].move(state+actionNum+1, answer); actionNum++; } if(actionNum-1 != instance.charsize()) { std::cout << "action number is not correct, please check" << std::endl; } state[actionNum].getSegResults(output); instance.evaluate(output, eval); if (!eval.bIdentical()) { std::cout << "error state conversion!" << std::endl; exit(0); } if ((numInstance + 1) % m_options.verboseIter == 0) { cout << numInstance + 1 << " "; if ((numInstance + 1) % (40 * m_options.verboseIter) == 0) cout << std::endl; cout.flush(); } if (m_options.maxInstance > 0 && numInstance == m_options.maxInstance) break; } m_classifier.addToActionAlphabet(action_stat); m_classifier.addToWordAlphabet(word_stat, m_options.wordEmbFineTune ? m_options.wordCutOff : 0); m_classifier.addToCharAlphabet(char_stat, m_options.charEmbFineTune ? m_options.charCutOff : 0); m_classifier.addToBiCharAlphabet(bichar_stat, m_options.tagEmbFineTune ? m_options.tagCutOff : 0); m_classifier.addToFeatureAlphabet(feat_stat, m_options.featCutOff); cout << numInstance << " " << endl; cout << "Action num: " << m_classifier.fe._actionAlphabet.size() << endl; cout << "Total word num: " << word_stat.size() << endl; cout << "Total char num: " << char_stat.size() << endl; cout << "Total bichar num: " << bichar_stat.size() << endl; cout << "Total feat num: " << feat_stat.size() << endl; cout << "Remain word num: " << m_classifier.fe._wordAlphabet.size() << endl; cout << "Remain char num: " << m_classifier.fe._charAlphabet.size() << endl; cout << "Remain bichar num: " << m_classifier.fe._bicharAlphabet.size() << endl; cout << "Remain feat num: " << m_classifier.fe._featAlphabet.size() << endl; //m_classifier.setFeatureCollectionState(false); return 0; }