Ejemplo n.º 1
0
std::string SymForceAligner::alignSentenceStr(std::string e, std::string f) {
    Alignment a = alignSentence(e, f);

    std::stringstream ss;
    for(Alignment::iterator it = a.begin(); it != a.end(); it++) {
        if(it != a.begin())
            ss << " ";
        ss << it->first << "-" << it->second;
    }
    return ss.str();
}
Ejemplo n.º 2
0
/// Splits a sentence into tokens.
///
/// The sentence is passed, in order, through replaceSpecialSymbols(),
/// alignSentence() and applyMultipleTokensRule(). The result is handed to
/// Tools::Split() together with a space, a no-break space (U+00A0) and a tab.
/// Every resulting piece is run through removePunctuation(), which receives
/// the piece and an initially empty punctuation list by pointer, and a Token
/// is built from the two. Finally applyFirstSymbolPunctuationRule() is run
/// over the complete token list.
///
/// @param sentence the text to tokenize
/// @return the tokens, in the order the pieces were produced by Tools::Split()
vector<Token> TokenizerRus::Tokenize(
        const wstring& sentence) const
{
    // Pre-processing pipeline; each stage consumes the previous stage's output.
    wstring withoutSpecials = replaceSpecialSymbols(sentence);
    wstring aligned = alignSentence(withoutSpecials);
    wstring normalized = applyMultipleTokensRule(aligned);

    vector<wstring> pieces = Tools::Split(normalized, L" ", L"\u00A0", L"\t");

    vector<Token> result;
    for (vector<wstring>::const_iterator piece = pieces.begin();
        piece != pieces.end(); ++piece)
    {
        // Work on a copy: removePunctuation() gets a pointer and may modify it.
        wstring word = *piece;
        vector<wstring> marks;
        removePunctuation(&word, &marks);
        result.push_back(Token(word, marks));
    }

    applyFirstSymbolPunctuationRule(&result);
    return result;
}