// Builds the classifier's POS-tag, action and sparse-feature alphabets from the
// gold-standard instances. All linear features are extracted from positive
// (gold) examples only.
//
// Pass 1 registers every POS tag of every instance and records the word/tag
// pairs in the tag-constraint table. Pass 2 replays the gold action sequence of
// each instance through the state machine, counting the actions taken and the
// sparse features fired, and checks that the final state reproduces the gold
// segmentation and tagging.
//
// vecInsts: training instances; expected to be non-empty (asserted).
// Returns 0 on completion. The process is terminated with a failure status when
// the replayed state does not match the gold instance or when a sentence needs
// more states than the state buffer holds.
//
// The scratch objects below are function-local statics, so this function is not
// re-entrant.
int Segmentor::createAlphabet(const vector<Instance>& vecInsts) {
	cout << "Creating Alphabet..." << endl;

	const int totalInstances = static_cast<int>(vecInsts.size());
	assert(totalInstances > 0);

	hash_map<string, int> action_stat;
	hash_map<string, int> feat_stat;
	hash_map<string, int> postag_stat;

	static Metric segEval, posEval;
	static CStateItem state[m_classifier.MAX_SENTENCE_SIZE];
	static Feature feat;
	static CResult output;
	static CAction answer;

	// Number of slots in the state buffer; used to guard writes to state[actionNum + 1].
	const int maxStates = static_cast<int>(m_classifier.MAX_SENTENCE_SIZE);

	m_classifier.initAlphabet();
	segEval.reset();
	posEval.reset();

	// Pass 1: collect the POS tag inventory and the word/tag constraints.
	for (int instIdx = 0; instIdx < totalInstances; instIdx++) {
		const Instance &instance = vecInsts[instIdx];
		for (int idx = 0; idx < instance.postagsize(); idx++) {
			// Only registers the tag as a key; the count is not incremented.
			// NOTE(review): confirm addToPostagAlphabet does not rely on frequencies.
			postag_stat[instance.postags[idx]];
			m_classifier.fe._tagConstraints.addWordPOSPair(instance.words[idx], instance.postags[idx]);
		}
	}

	m_classifier.addToPostagAlphabet(postag_stat);

	// Pass 2: replay gold actions and count actions / sparse features.
	int processed = 0;
	for (int instIdx = 0; instIdx < totalInstances; instIdx++) {
		const Instance &instance = vecInsts[instIdx];
		int actionNum = 0;
		state[actionNum].initSentence(&instance.chars, &instance.candidateLabels);
		state[actionNum].clear();

		while (!state[actionNum].IsTerminated()) {
			// move() below writes into state[actionNum + 1]; refuse to run past the buffer.
			if (actionNum + 1 >= maxStates) {
				std::cout << "sentence is too long for the state buffer, please check" << std::endl;
				exit(EXIT_FAILURE);
			}

			state[actionNum].getGoldAction(instance, m_classifier.fe._postagAlphabet, answer);
			action_stat[answer.str()]++;

			m_classifier.extractFeature(state + actionNum, answer, feat);
			const int featCount = static_cast<int>(feat._strSparseFeat.size());
			for (int idx = 0; idx < featCount; idx++) {
				feat_stat[feat._strSparseFeat[idx]]++;
			}
			state[actionNum].move(state + actionNum + 1, answer, m_classifier.fe._postagAlphabet);
			actionNum++;
		}

		if (actionNum - 1 != instance.charsize()) {
			std::cout << "action number is not correct, please check" << std::endl;
		}
		state[actionNum].getSegPosResults(output);

		instance.evaluate(output, segEval, posEval);

		if (!segEval.bIdentical() || !posEval.bIdentical()) {
			std::cout << "error state conversion!" << std::endl;
			std::cout << "output instance:" << std::endl;
			const int goldWords = static_cast<int>(instance.words.size());
			for (int tmpK = 0; tmpK < goldWords; tmpK++) {
				std::cout << instance.words[tmpK] << "_" << instance.postags[tmpK] << " ";
			}
			std::cout << std::endl;

			std::cout << "predicated instance:" << std::endl;
			const int predictedWords = static_cast<int>(output.size());
			for (int tmpK = 0; tmpK < predictedWords; tmpK++) {
				std::cout << output.words[tmpK] << "_" << output.postags[tmpK] << " ";
			}
			std::cout << std::endl;

			// This is a fatal inconsistency, so report failure to the caller's shell.
			exit(EXIT_FAILURE);
		}

		processed = instIdx + 1;

		// Progress report; skipped when verboseIter is not positive to avoid a modulo by zero.
		if (m_options.verboseIter > 0 && processed % m_options.verboseIter == 0) {
			cout << processed << " ";
			if (processed % (40 * m_options.verboseIter) == 0)
				cout << std::endl;
			cout.flush();
		}

		// Stop once exactly maxInstance instances have been replayed.
		if (m_options.maxInstance > 0 && processed >= m_options.maxInstance)
			break;
	}

	m_classifier.addToActionAlphabet(action_stat);
	m_classifier.addToFeatureAlphabet(feat_stat, m_options.featCutOff);

	cout << processed << " " << endl;
	cout << "Action num: " << m_classifier.fe._actionAlphabet.size() << endl;
	cout << "Pos num: " << m_classifier.fe._postagAlphabet.size() << endl;
	cout << "Total feat num: " << feat_stat.size() << endl;

	cout << "Remain feat num: " << m_classifier.fe._featAlphabet.size() << endl;

	//m_classifier.setFeatureCollectionState(false);

	return 0;
}
// Builds the classifier's word, character, character-bigram, action and
// sparse-feature alphabets from the gold-standard instances. All linear
// features are extracted from positive (gold) examples only.
//
// For each instance the function counts its normalized words, its characters
// and its character bigrams (including the two sentence-boundary bigrams built
// with the null key), then replays the gold action sequence through the state
// machine, counting the actions taken and the sparse features fired, and checks
// that the final state reproduces the gold segmentation.
//
// vecInsts: training instances; expected to be non-empty (asserted).
// Returns 0 on completion. The process is terminated with a failure status when
// the replayed state does not match the gold instance or when a sentence needs
// more states than the state buffer holds.
//
// The scratch objects below are function-local statics, so this function is not
// re-entrant.
int Segmentor::createAlphabet(const vector<Instance>& vecInsts) {
  cout << "Creating Alphabet..." << endl;

  const int totalInstances = static_cast<int>(vecInsts.size());
  assert(totalInstances > 0);

  hash_map<string, int> word_stat;
  hash_map<string, int> char_stat;
  hash_map<string, int> bichar_stat;
  hash_map<string, int> action_stat;
  hash_map<string, int> feat_stat;

  static Metric eval;
  static CStateItem state[m_classifier.MAX_SENTENCE_SIZE];
  static Feature feat;
  static vector<string> output;
  static CAction answer;

  // Number of slots in the state buffer; used to guard writes to state[actionNum + 1].
  const int maxStates = static_cast<int>(m_classifier.MAX_SENTENCE_SIZE);

  m_classifier.initAlphabet();
  eval.reset();

  int processed = 0;
  for (int instIdx = 0; instIdx < totalInstances; instIdx++) {
    const Instance &instance = vecInsts[instIdx];

    for (int idx = 0; idx < instance.wordsize(); idx++) {
      word_stat[normalize_to_lowerwithdigit(instance.words[idx])]++;
    }

    const int charCount = instance.charsize();
    for (int idx = 0; idx < charCount; idx++) {
      char_stat[instance.chars[idx]]++;
    }
    for (int idx = 0; idx + 1 < charCount; idx++) {
      bichar_stat[instance.chars[idx] + instance.chars[idx + 1]]++;
    }
    // Sentence-boundary bigrams; an instance without characters has none, and
    // indexing chars[0] / chars[charCount - 1] would be out of bounds.
    if (charCount > 0) {
      bichar_stat[instance.chars[charCount - 1] + m_classifier.fe.nullkey]++;
      bichar_stat[m_classifier.fe.nullkey + instance.chars[0]]++;
    }

    int actionNum = 0;
    state[actionNum].initSentence(&instance.chars);
    state[actionNum].clear();

    while (!state[actionNum].IsTerminated()) {
      // move() below writes into state[actionNum + 1]; refuse to run past the buffer.
      if (actionNum + 1 >= maxStates) {
        std::cout << "sentence is too long for the state buffer, please check" << std::endl;
        exit(EXIT_FAILURE);
      }

      state[actionNum].getGoldAction(instance.words, answer);
      action_stat[answer.str()]++;

      m_classifier.extractFeature(state + actionNum, answer, feat);
      const int featCount = static_cast<int>(feat._strSparseFeat.size());
      for (int idx = 0; idx < featCount; idx++) {
        feat_stat[feat._strSparseFeat[idx]]++;
      }
      state[actionNum].move(state + actionNum + 1, answer);
      actionNum++;
    }

    if (actionNum - 1 != instance.charsize()) {
      std::cout << "action number is not correct, please check" << std::endl;
    }
    state[actionNum].getSegResults(output);

    instance.evaluate(output, eval);

    if (!eval.bIdentical()) {
      std::cout << "error state conversion!" << std::endl;
      // This is a fatal inconsistency, so report failure to the caller's shell.
      exit(EXIT_FAILURE);
    }

    processed = instIdx + 1;

    // Progress report; skipped when verboseIter is not positive to avoid a modulo by zero.
    if (m_options.verboseIter > 0 && processed % m_options.verboseIter == 0) {
      cout << processed << " ";
      if (processed % (40 * m_options.verboseIter) == 0)
        cout << std::endl;
      cout.flush();
    }

    // Stop once exactly maxInstance instances have been processed.
    if (m_options.maxInstance > 0 && processed >= m_options.maxInstance)
      break;
  }

  m_classifier.addToActionAlphabet(action_stat);
  // A cutoff of 0 is passed when the corresponding embeddings are not fine-tuned.
  m_classifier.addToWordAlphabet(word_stat, m_options.wordEmbFineTune ? m_options.wordCutOff : 0);
  m_classifier.addToCharAlphabet(char_stat, m_options.charEmbFineTune ? m_options.charCutOff : 0);
  // NOTE(review): the bichar cutoff is driven by the *tag* options
  // (tagEmbFineTune / tagCutOff); confirm whether dedicated bichar options were intended.
  m_classifier.addToBiCharAlphabet(bichar_stat, m_options.tagEmbFineTune ? m_options.tagCutOff : 0);
  m_classifier.addToFeatureAlphabet(feat_stat, m_options.featCutOff);

  cout << processed << " " << endl;
  cout << "Action num: " << m_classifier.fe._actionAlphabet.size() << endl;
  cout << "Total word num: " << word_stat.size() << endl;
  cout << "Total char num: " << char_stat.size() << endl;
  cout << "Total bichar num: " << bichar_stat.size() << endl;
  cout << "Total feat num: " << feat_stat.size() << endl;

  cout << "Remain word num: " << m_classifier.fe._wordAlphabet.size() << endl;
  cout << "Remain char num: " << m_classifier.fe._charAlphabet.size() << endl;
  cout << "Remain bichar num: " << m_classifier.fe._bicharAlphabet.size() << endl;
  cout << "Remain feat num: " << m_classifier.fe._featAlphabet.size() << endl;

  //m_classifier.setFeatureCollectionState(false);

  return 0;
}