std::string SymForceAligner::alignSentenceStr(std::string e, std::string f) { Alignment a = alignSentence(e, f); std::stringstream ss; for(Alignment::iterator it = a.begin(); it != a.end(); it++) { if(it != a.begin()) ss << " "; ss << it->first << "-" << it->second; } return ss.str(); }
/**
 * Splits a sentence into tokens.
 *
 * Pipeline, in order:
 *   1. replaceSpecialSymbols() is applied to the input sentence.
 *   2. alignSentence() is applied to the result.
 *   3. applyMultipleTokensRule() is applied to the result.
 *   4. The text is split on space, no-break space (U+00A0) and tab.
 *   5. For every piece, removePunctuation() receives a pointer to a copy of
 *      the piece and a pointer to an (initially empty) punctuation list;
 *      a Token is then built from that copy and that list.
 *   6. applyFirstSymbolPunctuationRule() is applied to the whole token list.
 *
 * NOTE(review): the exact semantics of the helper rules are defined outside
 * this function; presumably removePunctuation() strips punctuation from the
 * piece in place and collects it into the list - confirm against its
 * definition.
 *
 * @param sentence the sentence to tokenize
 * @return the tokens, in the order their pieces appear after splitting
 */
vector<Token> TokenizerRus::Tokenize(const wstring& sentence) const {
    wstring replaced = replaceSpecialSymbols(sentence);
    wstring alignedSentence = alignSentence(replaced);
    wstring correctedSentence = applyMultipleTokensRule(alignedSentence);
    vector<wstring> splitted =
        Tools::Split(correctedSentence, L" ", L"\u00A0", L"\t");

    vector<Token> tokens;
    tokens.reserve(splitted.size());
    for (size_t splittedIndex = 0; splittedIndex < splitted.size(); ++splittedIndex) {
        // Work on a copy: removePunctuation() takes the piece by pointer.
        wstring current = splitted[splittedIndex];
        vector<wstring> punctuation;
        // The address-of expression here had been corrupted into "¤t"
        // ("&curren" decoded as an HTML entity); it must be &current.
        removePunctuation(&current, &punctuation);
        tokens.push_back(Token(current, punctuation));
    }

    applyFirstSymbolPunctuationRule(&tokens);
    return tokens;
}