std::vector<uint64_t> getVocabIDs(const StringPiece &textin) { //Tokenize std::vector<uint64_t> output; util::TokenIter<util::SingleCharacter> itWord(textin, util::SingleCharacter(' ')); while (itWord) { StringPiece word = *itWord; uint64_t id = 0; util::TokenIter<util::SingleCharacter> itFactor(word, util::SingleCharacter('|')); while (itFactor) { StringPiece factor = *itFactor; //cerr << "factor=" << factor << endl; id += getHash(factor); itFactor++; } output.push_back(id); itWord++; } return output; }
/**
 * Register every factor found in textin with the given vocabulary store.
 *
 * The text is split into words on ' ' and each word into factors on '|'.
 * For each factor, sourceVocab.Insert() is called with getHash(factor) as
 * the first argument and an owning string copy of the factor as the second.
 *
 * @param sourceVocab  vocabulary store that receives the (hash, string) pairs.
 * @param textin       text to tokenize; words are delimited by ' ', factors by '|'.
 */
void add_to_map(StoreVocab<uint64_t> &sourceVocab, const StringPiece &textin)
{
  typedef util::TokenIter<util::SingleCharacter> Splitter;

  // Outer pass: one iteration per space-delimited word.
  for (Splitter wordIt(textin, util::SingleCharacter(' ')); wordIt; ++wordIt) {
    StringPiece token = *wordIt;

    // Inner pass: insert each '|'-delimited factor under its hash.
    for (Splitter factorIt(token, util::SingleCharacter('|')); factorIt; ++factorIt) {
      StringPiece piece = *factorIt;
      sourceVocab.Insert(getHash(piece), piece.as_string());
    }
  }
}