void split_to_pair(const string& str, vector< pair<string, string> >& vecPair) { assert(vecPair.empty()); vector<string> vec; split_bychar(str, vec); convert_to_pair(vec, vecPair); }
bool DepSRL::IsPosPattern( int intBegin, int intEnd, const vector<string>& vecPos, const string& strPattern) const { vector<string> vecItem; split_bychar(strPattern, vecItem, C_PATTERN_SEP); for (int index = intBegin; index < intEnd; index++) { if ( find(vecItem.begin(), vecItem.end(), vecPos.at(index)) == vecItem.end() ) { return 0; } } return 1; }
void Labeler::readWordEmbeddings(const string& inFile, NRMat<dtype>& wordEmb) { static ifstream inf; if (inf.is_open()) { inf.close(); inf.clear(); } inf.open(inFile.c_str()); static string strLine, curWord; static int wordId; //find the first line, decide the wordDim; while (1) { if (!my_getline(inf, strLine)) { break; } if (!strLine.empty()) break; } int unknownId = m_wordAlphabet.from_string(unknownkey); static vector<string> vecInfo; split_bychar(strLine, vecInfo, ' '); int wordDim = vecInfo.size() - 1; std::cout << "word embedding dim is " << wordDim << std::endl; m_options.wordEmbSize = wordDim; wordEmb.resize(m_wordAlphabet.size(), wordDim); wordEmb = 0.0; curWord = normalize_to_lowerwithdigit(vecInfo[0]); wordId = m_wordAlphabet.from_string(curWord); hash_set<int> indexers; dtype sum[wordDim]; int count = 0; bool bHasUnknown = false; if (wordId >= 0) { count++; if (unknownId == wordId) bHasUnknown = true; indexers.insert(wordId); for (int idx = 0; idx < wordDim; idx++) { dtype curValue = atof(vecInfo[idx + 1].c_str()); sum[idx] = curValue; wordEmb[wordId][idx] = curValue; } } else { for (int idx = 0; idx < wordDim; idx++) { sum[idx] = 0.0; } } while (1) { if (!my_getline(inf, strLine)) { break; } if (strLine.empty()) continue; split_bychar(strLine, vecInfo, ' '); if (vecInfo.size() != wordDim + 1) { std::cout << "error embedding file" << std::endl; } curWord = normalize_to_lowerwithdigit(vecInfo[0]); wordId = m_wordAlphabet.from_string(curWord); if (wordId >= 0) { count++; if (unknownId == wordId) bHasUnknown = true; indexers.insert(wordId); for (int idx = 0; idx < wordDim; idx++) { dtype curValue = atof(vecInfo[idx + 1].c_str()); sum[idx] += curValue; wordEmb[wordId][idx] += curValue; } } } if (!bHasUnknown) { for (int idx = 0; idx < wordDim; idx++) { wordEmb[unknownId][idx] = sum[idx] / count; } count++; std::cout << unknownkey << " not found, using averaged value to initialize." << std::endl; } int oovWords = 0; int totalWords = 0; for (int id = 0; id < m_wordAlphabet.size(); id++) { if (indexers.find(id) == indexers.end()) { oovWords++; for (int idx = 0; idx < wordDim; idx++) { wordEmb[id][idx] = wordEmb[unknownId][idx]; } } totalWords++; } std::cout << "OOV num is " << oovWords << ", total num is " << m_wordAlphabet.size() << ", embedding oov ratio is " << oovWords * 1.0 / m_wordAlphabet.size() << std::endl; }
int Segmentor::allWordAlphaEmb(const string& inFile, NRMat<dtype>& emb) { cout << "All word alphabet and emb creating..." << endl; hash_map<string, int> word_stat; static ifstream inf; if (inf.is_open()) { inf.close(); inf.clear(); } inf.open(inFile.c_str()); static string strLine, curWord; static int wordId; static vector<string> vecInfo; vector<string> allLines; int wordDim = 0; while (1) { if (!my_getline(inf, strLine)) { break; } if (!strLine.empty()){ split_bychar(strLine, vecInfo, ' '); if(wordDim == 0){ wordDim = vecInfo.size() - 1; std::cout << "allword embedding dim is " << wordDim << std::endl; } curWord = normalize_to_lowerwithdigit(vecInfo[0]); word_stat[curWord]++; allLines.push_back(strLine); } } m_classifier.addToAllWordAlphabet(word_stat); cout << "Remain all word num: " << m_classifier.fe._allwordAlphabet.size() << endl; emb.resize(m_classifier.fe._allwordAlphabet.size(), wordDim); emb = 0.0; int unknownId = m_classifier.fe._allwordAlphabet.from_string(m_classifier.fe.unknownkey); dtype sum[wordDim]; int count = 0; bool bHasUnknown = false; for (int idx = 0; idx < wordDim; idx++) { sum[idx] = 0.0; } for(int idx = 0; idx < allLines.size(); idx++){ split_bychar(allLines[idx], vecInfo, ' '); if (vecInfo.size() != wordDim + 1) { std::cout << "error embedding file" << std::endl; } curWord = normalize_to_lowerwithdigit(vecInfo[0]); wordId = m_classifier.fe._allwordAlphabet.from_string(curWord); if (wordId >= 0) { count++; if (unknownId == wordId) bHasUnknown = true; for (int idx = 0; idx < wordDim; idx++) { dtype curValue = atof(vecInfo[idx + 1].c_str()); sum[idx] += curValue; emb[wordId][idx] += curValue; } } else{ std::cout << "read all word embedding strange...." << std::endl; } } if (!bHasUnknown) { for (int idx = 0; idx < wordDim; idx++) { emb[unknownId][idx] = sum[idx] / count; } count++; std::cout << unknownkey << " not found, using averaged value to initialize." << std::endl; } return 0; }