Exemplo n.º 1
0
void split_to_pair(const string& str, vector< pair<string, string> >& vecPair)
{
	assert(vecPair.empty());
	vector<string> vec;
	split_bychar(str, vec);
	convert_to_pair(vec, vecPair);
}
Exemplo n.º 2
0
bool DepSRL::IsPosPattern(
        int intBegin,
        int intEnd,
        const vector<string>& vecPos,
        const string& strPattern) const
{
    vector<string> vecItem;
    split_bychar(strPattern, vecItem, C_PATTERN_SEP);

    for (int index = intBegin; index < intEnd; index++)
    {
        if ( find(vecItem.begin(), vecItem.end(), vecPos.at(index)) == vecItem.end() )
        {
            return 0;
        }
    }

    return 1;
}
void Labeler::readWordEmbeddings(const string& inFile, NRMat<dtype>& wordEmb) {
  static ifstream inf;
  if (inf.is_open()) {
    inf.close();
    inf.clear();
  }
  inf.open(inFile.c_str());

  static string strLine, curWord;
  static int wordId;

  //find the first line, decide the wordDim;
  while (1) {
    if (!my_getline(inf, strLine)) {
      break;
    }
    if (!strLine.empty())
      break;
  }

  int unknownId = m_wordAlphabet.from_string(unknownkey);

  static vector<string> vecInfo;
  split_bychar(strLine, vecInfo, ' ');
  int wordDim = vecInfo.size() - 1;

  std::cout << "word embedding dim is " << wordDim << std::endl;
  m_options.wordEmbSize = wordDim;

  wordEmb.resize(m_wordAlphabet.size(), wordDim);
  wordEmb = 0.0;
  curWord = normalize_to_lowerwithdigit(vecInfo[0]);
  wordId = m_wordAlphabet.from_string(curWord);
  hash_set<int> indexers;
  dtype sum[wordDim];
  int count = 0;
  bool bHasUnknown = false;
  if (wordId >= 0) {
    count++;
    if (unknownId == wordId)
      bHasUnknown = true;
    indexers.insert(wordId);
    for (int idx = 0; idx < wordDim; idx++) {
      dtype curValue = atof(vecInfo[idx + 1].c_str());
      sum[idx] = curValue;
      wordEmb[wordId][idx] = curValue;
    }

  } else {
    for (int idx = 0; idx < wordDim; idx++) {
      sum[idx] = 0.0;
    }
  }

  while (1) {
    if (!my_getline(inf, strLine)) {
      break;
    }
    if (strLine.empty())
      continue;
    split_bychar(strLine, vecInfo, ' ');
    if (vecInfo.size() != wordDim + 1) {
      std::cout << "error embedding file" << std::endl;
    }
    curWord = normalize_to_lowerwithdigit(vecInfo[0]);
    wordId = m_wordAlphabet.from_string(curWord);
    if (wordId >= 0) {
      count++;
      if (unknownId == wordId)
        bHasUnknown = true;
      indexers.insert(wordId);

      for (int idx = 0; idx < wordDim; idx++) {
        dtype curValue = atof(vecInfo[idx + 1].c_str());
        sum[idx] += curValue;
        wordEmb[wordId][idx] += curValue;
      }
    }

  }

  if (!bHasUnknown) {
    for (int idx = 0; idx < wordDim; idx++) {
      wordEmb[unknownId][idx] = sum[idx] / count;
    }
    count++;
    std::cout << unknownkey << " not found, using averaged value to initialize." << std::endl;
  }

  int oovWords = 0;
  int totalWords = 0;
  for (int id = 0; id < m_wordAlphabet.size(); id++) {
    if (indexers.find(id) == indexers.end()) {
      oovWords++;
      for (int idx = 0; idx < wordDim; idx++) {
        wordEmb[id][idx] = wordEmb[unknownId][idx];
      }
    }
    totalWords++;
  }

  std::cout << "OOV num is " << oovWords << ", total num is " << m_wordAlphabet.size() << ", embedding oov ratio is " << oovWords * 1.0 / m_wordAlphabet.size()
      << std::endl;

}
int Segmentor::allWordAlphaEmb(const string& inFile, NRMat<dtype>& emb) {
  cout << "All word  alphabet and emb creating..." << endl;

  hash_map<string, int> word_stat;
  
  static ifstream inf;
  if (inf.is_open()) {
    inf.close();
    inf.clear();
  }
  inf.open(inFile.c_str());

  static string strLine, curWord;
  static int wordId;
  static vector<string> vecInfo;
  vector<string> allLines;

  int wordDim = 0;
  while (1) {
    if (!my_getline(inf, strLine)) {
      break;
    }
    if (!strLine.empty()){
      split_bychar(strLine, vecInfo, ' '); 
      if(wordDim == 0){
      	wordDim = vecInfo.size() - 1;
      	std::cout << "allword embedding dim is " << wordDim << std::endl;
      }
      curWord = normalize_to_lowerwithdigit(vecInfo[0]);
      word_stat[curWord]++;
      allLines.push_back(strLine);
    }
  }

  m_classifier.addToAllWordAlphabet(word_stat);
  cout << "Remain all word num: " << m_classifier.fe._allwordAlphabet.size() << endl;
  
  emb.resize(m_classifier.fe._allwordAlphabet.size(), wordDim);
  emb = 0.0;
  
  int unknownId = m_classifier.fe._allwordAlphabet.from_string(m_classifier.fe.unknownkey);
  dtype sum[wordDim];
  int count = 0;
  bool bHasUnknown = false;
  for (int idx = 0; idx < wordDim; idx++) {
    sum[idx] = 0.0;
  }
  
  for(int idx = 0; idx < allLines.size(); idx++){
    split_bychar(allLines[idx], vecInfo, ' ');
    if (vecInfo.size() != wordDim + 1) {
      std::cout << "error embedding file" << std::endl;
    }
    curWord = normalize_to_lowerwithdigit(vecInfo[0]);
    wordId = m_classifier.fe._allwordAlphabet.from_string(curWord);
    if (wordId >= 0) {
      count++;
      if (unknownId == wordId)
        bHasUnknown = true;

      for (int idx = 0; idx < wordDim; idx++) {
        dtype curValue = atof(vecInfo[idx + 1].c_str());
        sum[idx] += curValue;
        emb[wordId][idx] += curValue;
      }
    }
    else{
    	std::cout << "read all word embedding strange...." << std::endl;
    }	
  	
  }  
  
  if (!bHasUnknown) {
    for (int idx = 0; idx < wordDim; idx++) {
      emb[unknownId][idx] = sum[idx] / count;
    }
    count++;
    std::cout << unknownkey << " not found, using averaged value to initialize." << std::endl;
  }

  return 0;
}