void ParallelCorpus::PrintSentence(
    const Sentence& sentence, const Vocab& vocab, std::ostream& out) const {
  if (sentence.size() > 0) {
    out << vocab.GetWord(sentence.at(0));
  }
  for (int i = 1; i < sentence.size(); ++i) {
    out << " " << vocab.GetWord(sentence.at(i));
  }
}
void PackedTrie::Print(const Vocab& source_vocab, const Vocab& target_vocab,
                       std::ostream& out) const {
  for (int s = 0; s < source_count_; ++s) {
    for (int i = offsets_[s]; i < offsets_[s + 1]; ++i) {
      out << source_vocab.GetWord(s) << "\t"
          << target_vocab.GetWord(target_words_[i]) << "\t"
          << exp(data_[i]) << std::endl;
    }
  }
}
// Get P(W2 | W1) -- bigram
double getBigramProb(const char *w1, const char *w2, Vocab &voc, Ngram &lm) {
  VocabIndex wid1 = voc.getIndex(w1);
  VocabIndex wid2 = voc.getIndex(w2);

  if (wid1 == Vocab_None)  // OOV history word: back off to <unk>
    wid1 = voc.getIndex(Vocab_Unknown);
  if (wid2 == Vocab_None) {  // OOV predicted word: return a fixed log-prob floor
    wid2 = voc.getIndex(Vocab_Unknown);
    return -20;
  }

  VocabIndex context[] = { wid1, Vocab_None };
  return lm.wordProb(wid2, context);
}
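// A minimal usage sketch (not from the original source): loading a bigram ARPA
// model with SRILM and querying getBigramProb. The include paths and the model
// file name "bigram.lm" are assumptions for illustration only; the loading
// pattern mirrors the decoder further below.
#include <iostream>
#include "Vocab.h"
#include "Ngram.h"
#include "File.h"

int main() {
  Vocab vocab;
  Ngram lm(vocab, 2);             // order-2 model, as in the snippets above
  File lmFile("bigram.lm", "r");  // placeholder path
  lm.read(lmFile);
  lmFile.close();

  // Log probability of "world" given "hello", or the -20 floor for OOV targets.
  std::cout << getBigramProb("hello", "world", vocab, lm) << std::endl;
  return 0;
}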
void Word::CreateFromString(const std::string &inString, Vocab &vocab) {
  if (inString.substr(0, 1) == "[" &&
      inString.substr(inString.size() - 1, 1) == "]") {
    // non-terminal: strip the surrounding brackets before adding to the vocab
    m_isNonTerminal = true;
    string str = inString.substr(1, inString.size() - 2);
    m_vocabId = vocab.AddVocabId(str);
  } else {
    m_isNonTerminal = false;
    m_vocabId = vocab.AddVocabId(inString);
  }
}
void CorpusReader::CreateVocabMap(const Vocab& corpus_vocab,
                                  const vector< vector<string> >& filter_vocab,
                                  vector<IntIntMap>* lookup) {
  assert(corpus_vocab.has_language());
  int lang = corpus_vocab.language();
  if (lang >= (int)lookup->size())
    lookup->resize(lang + 1);
  if (filter_vocab[lang].size() > 0) {
    cout << "Adding vocab for language " << lang << " ("
         << corpus_vocab.terms().size() << ")" << endl;
    CreateFilteredMap(corpus_vocab, filter_vocab[lang], &(*lookup)[lang]);
  } else {
    cout << "Skipping language " << lang << endl;
  }
}
void CorpusReader::CreateUnfilteredMap(const Vocab& proto_voc,
                                       StringIntMap* lookup,
                                       IntIntMap* mapping) {
  for (int ii = 0; ii < proto_voc.terms_size(); ++ii) {
    const lib_corpora_proto::Vocab_Entry& word = proto_voc.terms(ii);
    string term = word.original();
    if (lookup->find(term) == lookup->end()) {
      int new_id = lookup->size();
      (*lookup)[term] = new_id;
      // cout << "Adding " << term << " with id " << new_id << endl;
    }
    (*mapping)[word.id()] = (*lookup)[term];
    // cout << "---------------" << endl;
  }
}
void Word::ConvertToMoses(
    const std::vector<Moses::FactorType> &outputFactorsVec,
    const Vocab &vocab,
    Moses::Word &overwrite) const {
  Moses::FactorCollection &factorColl = Moses::FactorCollection::Instance();
  overwrite = Moses::Word(m_isNonTerminal);

  // TODO: this conversion should have been done at load time.
  util::TokenIter<util::SingleCharacter> tok(vocab.GetString(m_vocabId), '|');

  for (std::vector<Moses::FactorType>::const_iterator t = outputFactorsVec.begin();
       t != outputFactorsVec.end(); ++t, ++tok) {
    UTIL_THROW_IF(!tok, util::Exception,
                  "Too few factors in \"" << vocab.GetString(m_vocabId)
                  << "\"; was expecting " << outputFactorsVec.size());
    overwrite.SetFactor(*t, factorColl.AddFactor(*tok));
  }

  UTIL_THROW_IF(tok, util::Exception,
                "Too many factors in \"" << vocab.GetString(m_vocabId)
                << "\"; was expecting " << outputFactorsVec.size());
}
void SparseHieroReorderingFeature::LoadVocabulary(const std::string& filename, Vocab& vocab) {
  if (filename.empty()) return;
  ifstream in(filename.c_str());
  UTIL_THROW_IF(!in, util::Exception, "Unable to open vocab file: " << filename);
  string line;
  while (getline(in, line)) {
    vocab.insert(FactorCollection::Instance().AddFactor(line));
  }
  in.close();
}
void CorpusReader::CreateFilteredMap(const Vocab& corpus_voc,
                                     const vector<string>& filter_voc,
                                     IntIntMap* id_lookup) {
  // RHS will be the new vocab
  map<string, int> new_id;
  for (int ii = 0; ii < (int)filter_voc.size(); ++ii) {
    new_id[filter_voc[ii]] = ii;
  }
  // LHS will be the old vocab
  for (int ii = 0; ii < corpus_voc.terms_size(); ++ii) {
    const lib_corpora_proto::Vocab_Entry& word = corpus_voc.terms(ii);
    string term = word.original();
    if (new_id.find(term) != new_id.end()) {
      (*id_lookup)[word.id()] = new_id[term];
      // cout << word.id() << "->" << new_id[term] << "(term)" << endl;
    }
  }
}
void extractBinaryfromStream(const char * inputStream, Vocab & textHash,
                             vector< tuple<int *, int> > & src_batch,
                             vector< tuple<int *, int> > & tgt_batch,
                             int isFilter, int debugLines) {
  ifstream infile;
  infile.open(inputStream, ifstream::in);
  string line;
  int lineIdx = 0;
  while (getline(infile, line)) {
    // Each line is "source<TAB>target".
    stringstream linestream(line);
    string src, tgt;
    getline(linestream, src, '\t');
    getline(linestream, tgt, '\t');

    int src_token_num = 0;
    int tgt_token_num = 0;
    char** src_tokens = BasicUtil::TokenizeString(src, src_token_num, MAX_TOKEN_NUM, MAX_TOKEN_LEN);
    char** tgt_tokens = BasicUtil::TokenizeString(tgt, tgt_token_num, MAX_TOKEN_NUM, MAX_TOKEN_LEN);

    int * src_fea = new int[MAX_TOKEN_LEN * MAX_TOKEN_NUM];
    int * src_seg = new int[MAX_TOKEN_NUM];
    int * tgt_fea = new int[MAX_TOKEN_LEN * MAX_TOKEN_NUM];
    int * tgt_seg = new int[MAX_TOKEN_NUM];

    int src_seg_num = textHash.FeatureExtract((const char **)src_tokens, src_token_num, src_seg, src_fea);
    int tgt_seg_num = textHash.FeatureExtract((const char **)tgt_tokens, tgt_token_num, tgt_seg, tgt_fea);

    // The last segment offset is the total number of extracted features.
    int src_feature_num = 0;
    int tgt_feature_num = 0;
    if (src_seg_num >= 1) src_feature_num = src_seg[src_seg_num - 1];
    if (tgt_seg_num >= 1) tgt_feature_num = tgt_seg[tgt_seg_num - 1];

    // The segment arrays are no longer needed; only the feature arrays are kept.
    delete [] src_seg;
    delete [] tgt_seg;

    if (isFilter == 1 && (src_feature_num <= 0 || tgt_feature_num <= 0)) {
      // Drop empty pairs and release their feature buffers.
      delete [] src_fea;
      delete [] tgt_fea;
      continue;
    }

    src_batch.push_back(tuple<int*, int>(src_fea, src_feature_num));
    tgt_batch.push_back(tuple<int*, int>(tgt_fea, tgt_feature_num));
    lineIdx += 1;
    if (lineIdx == debugLines)
      break;
  }
}
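// Note (sketch, not part of the original code): the int arrays stored in each
// batch tuple are allocated with new[] above and ownership stays with the
// caller, so they eventually need an explicit delete[]. A hypothetical cleanup
// helper could look like this.
#include <tuple>
#include <vector>

void freeBatch(std::vector< std::tuple<int*, int> >& batch) {
  for (auto& sample : batch) {
    delete [] std::get<0>(sample);  // the feature array produced by FeatureExtract
  }
  batch.clear();
}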
float getH(MM_t* estmap, int s1, int s2) {
  //assert(estmap->size() == s1 * s2);
  cerr << "F_0 = " << vocab.size() << endl;
  cerr << "# of estimators = (actual) " << estmap->size()
       << " vs (defined) " << s1 * s2 << endl;
  float ests[s1 * s2];
  // iterate through basic estimators
  int idx(0);
  piterate(estmap, itr) {  // should be unordered!
    ests[idx++] = X(itr->second);  // store X() for each sample in the map
    //cerr << ests[idx - 1] << endl;
  }
void Word::CreateFromString(const std::string &inString, Vocab &vocab) {
  if (inString.substr(0, 1) == "[" &&
      inString.substr(inString.size() - 1, 1) == "]") {
    // non-terminal
    m_isNonTerminal = true;
  } else {
    m_isNonTerminal = false;
  }
  m_factors.resize(1);
  m_factors[0] = vocab.AddVocabId(inString);
}
treebank_minibatch_dataset convert_trees_to_indexed_minibatches(
    const Vocab& word_vocab,
    const std::vector<AnnotatedParseTree::shared_tree>& trees,
    int minibatch_size) {
  treebank_minibatch_dataset dataset;

  auto to_index_pair = [&word_vocab](
      std::pair<std::vector<std::string>, uint>&& pair, bool&& is_root) {
    return std::tuple<std::vector<uint>, uint, bool>(
        word_vocab.encode(pair.first),
        pair.second,
        is_root);
  };

  if (dataset.size() == 0)
    dataset.emplace_back(0);

  for (auto& tree : trees) {
    // create a new minibatch when the current one is full
    if (dataset[dataset.size() - 1].size() == minibatch_size) {
      dataset.emplace_back(0);
      dataset.back().reserve(minibatch_size);
    }
    // add the root
    dataset[dataset.size() - 1].emplace_back(
        to_index_pair(tree->to_labeled_pair(), true));
    // add the children
    for (auto& child : tree->general_children) {
      if (dataset[dataset.size() - 1].size() == minibatch_size) {
        dataset.emplace_back(0);
        dataset.back().reserve(minibatch_size);
      }
      dataset[dataset.size() - 1].emplace_back(
          to_index_pair(child->to_labeled_pair(), false));
    }
  }
  return dataset;
}
void add_example(const Vocab& vocab,
                 const vector<string>& example_orig,
                 size_t& example_idx) {
  int len = std::min(example_orig.size(), (size_t)FLAGS_max_sentence_length);
  vector<string> example(example_orig.begin(), example_orig.begin() + len);
  auto description_length = example.size();

  this->data.w(0, example_idx) = vocab.word2index.at(START);
  auto encoded = vocab.encode(example, true);
  this->mask.w(0, example_idx) = 0.0;
  for (size_t j = 0; j < encoded.size(); j++) {
    this->data.w(j + 1, example_idx) = encoded[j];
    this->mask.w(j + 1, example_idx) = (R)1.0;
  }
  this->code_lengths[example_idx] = description_length + 1;
  this->total_codes += description_length + 1;
}
void ReferenceSet::AddLine(size_t sentenceId, const StringPiece& line, Vocab& vocab) {
  //cerr << line << endl;
  NgramCounter ngramCounts;
  list<WordVec> openNgrams;
  size_t length = 0;

  // tokenize & count
  for (util::TokenIter<util::SingleCharacter, true> j(line, util::SingleCharacter(' ')); j; ++j) {
    const Vocab::Entry* nextTok = &(vocab.FindOrAdd(*j));
    ++length;
    openNgrams.push_front(WordVec());
    for (list<WordVec>::iterator k = openNgrams.begin(); k != openNgrams.end(); ++k) {
      k->push_back(nextTok);
      ++ngramCounts[*k];
    }
    if (openNgrams.size() >= kBleuNgramOrder)
      openNgrams.pop_back();
  }

  // merge into the overall ngram map
  for (NgramCounter::const_iterator ni = ngramCounts.begin();
       ni != ngramCounts.end(); ++ni) {
    size_t count = ni->second;
    //cerr << *ni << " " << count << endl;
    if (ngramCounts_.size() <= sentenceId)
      ngramCounts_.resize(sentenceId + 1);
    NgramMap::iterator totalsIter = ngramCounts_[sentenceId].find(ni->first);
    if (totalsIter == ngramCounts_[sentenceId].end()) {
      ngramCounts_[sentenceId][ni->first] = pair<size_t, size_t>(count, count);
    } else {
      ngramCounts_[sentenceId][ni->first].first =
          max(count, ngramCounts_[sentenceId][ni->first].first);  // clip
      ngramCounts_[sentenceId][ni->first].second += count;        // no clip
    }
  }

  // length
  if (lengths_.size() <= sentenceId)
    lengths_.resize(sentenceId + 1);
  // TODO - length strategy - this is MIN
  if (!lengths_[sentenceId]) {
    lengths_[sentenceId] = length;
  } else {
    lengths_[sentenceId] = min(length, lengths_[sentenceId]);
  }
  //cerr << endl;
}
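// Standalone illustration (hypothetical types, not the classes above) of the
// merge rule used in AddLine: for every n-gram, keep the maximum count over
// references (the value later used for clipping) and the running total.
#include <algorithm>
#include <cstddef>
#include <map>
#include <string>
#include <utility>

using NgramCounts = std::map<std::string, std::size_t>;
using MergedCounts = std::map<std::string, std::pair<std::size_t, std::size_t> >;  // <clipped, total>

void MergeReference(const NgramCounts& ref, MergedCounts* merged) {
  for (const auto& kv : ref) {
    auto it = merged->find(kv.first);
    if (it == merged->end()) {
      (*merged)[kv.first] = std::make_pair(kv.second, kv.second);
    } else {
      it->second.first = std::max(it->second.first, kv.second);  // clip: max over references
      it->second.second += kv.second;                            // total: sum over references
    }
  }
}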
// Returns a vector of LiveGuessResults
// warning: words is mutated temporarily
std::auto_ptr< std::vector<LiveGuessResult> > forwardish(
    std::vector<const char *> & words,  // the current words, can be empty
    const double currentProb,           // log prob
    const int size,                     // how many to grab
    const int depthLeft,
    const NgramLM & _lm,
    const int _order,
    const Vocab & vocab) {
  // Index contains the last ngram word
  //Logger::Log(0, "Forwardish [%d] [%d]\n", depthLeft, index);
  VocabIndex vwords[ _order ];
  //int n = (words.size() < (_order - 1))?words.size():_order;
  //for (int i = words.size() - _order - 1; i < words.size(); i++) {
  //  if ( i >= 0) {
  //    Logger::Log(0,"Word: %d %s\n",i,words[i]);
  //  }
  //}

  // vwords[0] to _order - 1 are filled in;
  // if it's small, EndOfSentence starts it.
  for (int i = 1; i < _order; i++) {
    int j = words.size() - _order + i;
    if (j < 0) {
      vwords[i - 1] = Vocab::Invalid;  // probably should be end of sentence
    } else {
      vwords[i - 1] = vocab.Find( words[ j ] );
    }
  }

  vector<VocabProb> heap(0);
  mkHeap(heap);
  const ProbVector & probabilities = _lm.probs( _order );  // _order - 2 );
  const CountVector & counts = _lm.counts( _order );
  int count = 0;
  //Logger::Log(0, "Find probabilities %d\n",vocab.size());
  for (int j = 0; j < vocab.size(); j++) {
    VocabIndex vWordI = j;  //vocab[j];
    vwords[ _order - 1 ] = j;
    NgramIndex newIndex = _lm.model()._Find( vwords, _order );
    if (newIndex == -1) {  // not legit :(
      continue;
    }
    Prob probRaw = probabilities[ newIndex ];
    if (probRaw == 0.0) {
      continue;
    }
    Prob prob = -1 * log( probRaw );  // biggest is smallest
    //Prob prob = (probRaw == 0.0)?10000:(-1 * log( probRaw )); //biggest is smallest
    //Prob probRaw = (counts[newIndex]==0)?1.0:counts[newIndex]/vocab.size()
    //Prob prob = -1 * log(probRaw);
    //Prob prob = -1 * counts[newIndex];
    //Logger::Log(0, "Prob %e\n",prob);
    const VocabProb v( prob, j, newIndex );
    if ( count < size ) {
      heap.push_back( v );
      count++;
      if (count == size) {
        mkHeap( heap );
      }
      // this is irritating, basically it means the highest rank stuff
      // will be in the list and we only kick out the lowest ranked stuff
      // (which will be the GREATEST of what is already there)
    } else if ( heap.front().prob > prob ) {  // this is dumb
      // remove the least element
      popHeap( heap );
      pushHeap( heap, v );
      // should we update?
    }
  }
  sortHeap( heap );

  std::vector<LiveGuessResult> * resVector = new std::vector<LiveGuessResult>();
  for ( int j = 0; j < heap.size(); j++) {
    VocabProb v = heap[ j ];
    Prob prob = v.prob;
    prob += currentProb;
    const char * word = vocab[ v.index ];
    vector<const char *> ourWords(words);
    ourWords.push_back( word );  // add
    char * str = joinVectorOfCStrings( ourWords );  // Remember to deallocate later :(
    /*char * str = new char[strlen(word)+1];
      CopyString(str, word); */
    resVector->push_back( LiveGuessResult( prob, str ) );
  }

  if ( depthLeft > 0 ) {
    // Let's recurse!
    for ( int j = 0; j < heap.size(); j++) {
      VocabProb v = heap[ j ];
      Prob prob = v.prob;
      prob += currentProb;
      words.push_back( vocab[ v.index ] );
      std::auto_ptr< std::vector<LiveGuessResult> > r =
          forwardish( words, prob, size, depthLeft - 1, _lm, _order, vocab );
      words.pop_back();  // and restore
      for (int i = 0; i < r->size(); i++) {
        resVector->push_back( (*r)[i] );
      }
    }
  }

  std::auto_ptr< std::vector<LiveGuessResult> > returnValues( resVector );
  return returnValues;
}
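// The heap bookkeeping in forwardish is a fixed-size top-k selection: keep the
// k best candidates and evict the current worst. A generic sketch of the same
// pattern with std::priority_queue (illustration only, not the project's API):
#include <functional>
#include <queue>
#include <utility>
#include <vector>

// Returns the indices of the k highest scores, paired with their scores.
std::vector< std::pair<double, int> > topK(const std::vector<double>& scores, int k) {
  // Min-heap on score: the worst of the current top-k sits at the top.
  std::priority_queue< std::pair<double, int>,
                       std::vector< std::pair<double, int> >,
                       std::greater< std::pair<double, int> > > heap;
  for (int i = 0; i < (int)scores.size(); ++i) {
    if ((int)heap.size() < k) {
      heap.push(std::make_pair(scores[i], i));
    } else if (scores[i] > heap.top().first) {
      heap.pop();                               // evict the current worst
      heap.push(std::make_pair(scores[i], i));
    }
  }
  std::vector< std::pair<double, int> > result;
  while (!heap.empty()) {
    result.push_back(heap.top());
    heap.pop();
  }
  return result;  // ascending by score; reverse for best-first order
}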
void Word::DebugPrint(ostream &out, const Vocab &vocab) const {
  const string &str = vocab.GetString(m_vocabId);
  out << str;
}
int main(int argc, char* argv[]) {
  Vocab vocab;
  Ngram lm(vocab, 2);
  vector<string> splitLine;
  map<string, set<string> > mapping;
  map<string, set<string> >::iterator map_iter;
  vector<string> BestLine;
  vector<string>::iterator Best_iter;
  FILE * TextFile;
  FILE * MapFile;
  char ch;
  char tmpstr[BUFSIZE];

  for (int i = 0; i < argc; i++) {
    if (string("-text") == argv[i]) {
      TextFile = fopen(argv[i + 1], "r");
    }
    if (string("-map") == argv[i]) {
      MapFile = fopen(argv[i + 1], "r");
    }
    if (string("-lm") == argv[i]) {
      File lmFile(argv[i + 1], "r");
      lm.read(lmFile);
      lmFile.close();
    }
  }

  // read MapFile into map<string, set<string> > mapping
  while (fgets(tmpstr, sizeof(tmpstr), MapFile)) {
    char *tok = strtok(tmpstr, "\n");
    string Key, StringTok;
    set<string> ZhuYin;
    while (tok != NULL) {
      StringTok = string(tok);
      Key = StringTok.substr(0, 2);  // the first ZhuYin or ChuIn is the key
      int pos;
      string tmpLine = StringTok.substr(3);
      while ((pos = tmpLine.find(" ")) != -1) {
        tmpLine.erase(pos, 1);
      }
      assert(tmpLine.size() % 2 == 0);
      for (int i = 0; i < tmpLine.size(); i += 2) {
        string buf = tmpLine.substr(i, 2);
        ZhuYin.insert(buf);
      }
      mapping[Key] = ZhuYin;
      tok = strtok(NULL, "\n");
      ZhuYin.clear();
    }
  }

  // read TextFile into vector<string> splitLine
  int line = 0;
  while (fgets(tmpstr, sizeof(tmpstr), TextFile)) {
    line++;
    char *tok = strtok(tmpstr, "\n");  // split into one line
    string tmpLine = string(tok);
    while (tok != NULL) {
      int pos;
      while ((pos = tmpLine.find(" ")) != -1) {
        tmpLine.erase(pos, 1);
      }
      assert(tmpLine.size() % 2 == 0);
      for (int i = 0; i < tmpLine.size(); i += 2) {
        string buf = tmpLine.substr(i, 2);
        splitLine.push_back(buf);  // push one word to splitLine
      }
      tok = strtok(NULL, "\n");
    }
    splitLine.push_back("\n");
  }

  int count = 1;
  // Viterbi-style decoding (greedy bigram selection): for each character, pick
  // the candidate with the best bigram score given the previously chosen word.
  string PreString = "<s>";
  for (int i = 0; i < splitLine.size(); i++) {
    set<string> TmpSet;
    if (i == 0) {
      //cout << count << endl;
      BestLine.push_back("<s>");
      BestLine.push_back(" ");
      PreString = "<s>";
    }
    if (splitLine[i] == "\n") {
      count++;
      //cout << endl;
      //cout << count << endl;
      BestLine.push_back("</s>");
      BestLine.push_back("\n");
      BestLine.push_back("<s>");
      BestLine.push_back(" ");
      PreString = "<s>";
    } else {
      //cout << splitLine[i];  // print every line without space
      map_iter = mapping.find(splitLine.at(i));  // find splitLine[i] in mapping
      //cout << (map_iter->first).c_str() << endl;
      set<string> EveryPossibleZhuYin = map_iter->second;  // all possible ZhuYin for this ChuIn
      set<string>::iterator iBegin = EveryPossibleZhuYin.begin();
      for ( ; iBegin != EveryPossibleZhuYin.end(); ++iBegin) {
        //cout << *iBegin->c_str() << endl;
      }
      //sleep(1);
      string TempString;
      double maxProb = -1000.0;
      VocabIndex wid;
      string best;
      for (set<string>::iterator it = EveryPossibleZhuYin.begin();
           it != EveryPossibleZhuYin.end(); ++it) {
        TempString = *it;
        //cout << TempString.c_str() << endl;
        //sleep(1);
        VocabIndex context[] = { vocab.getIndex(PreString.c_str()), Vocab_None };
        wid = vocab.getIndex(TempString.c_str());
        if (wid == Vocab_None) {
          //printf("Not in bigram.lm\n");
        } else {
          double pro = lm.wordProb(wid, context);
          if (pro > maxProb) {
            best = TempString;
            maxProb = pro;
          }
        }
      }
      PreString = best;  // carry the chosen word as the next bigram context
      BestLine.push_back(PreString);
      BestLine.push_back(" ");
    }
  }

  for (vector<string>::iterator i = BestLine.begin(); i != (BestLine.end() - 2); ++i) {
    cout << *i;
  }
}
void ModelTrain() {
  Vocab vocab;
  vocab.LoadVocab("l3g.txt");
  cout << "vocab Size " << vocab.VocabSize << endl;

  vector< tuple<int *, int> > src_batch, tgt_batch;
  extractBinaryfromStream("data//train_data_40k.tsv", vocab, src_batch, tgt_batch, 1, 0);
  int sampleSize = src_batch.size();
  cout << "train sample size " << sampleSize << endl;

  int iteration = 30;
  int miniBatchSize = 1024;
  int featureDim = vocab.VocabSize;
  int batchNum = sampleSize / miniBatchSize;
  int nTrial = 4;
  vector<int> shuff(sampleSize);

  RunnerBehavior rb;
  rb.RunMode = RUNMODE_TRAIN;
  rb.Device = DEVICE_GPU;
  cout << "init cuda computation ...." << endl;
  rb.ComputeLib = new CudaOperationManager(true, true);
  cout << "init cuda computation done" << endl;

  int hiddenDim1 = 128;
  int hiddenDim2 = 128;

  SparseIndexMatrixStat srcMiniBatchInfo;
  srcMiniBatchInfo.MAX_ROW_SIZE = miniBatchSize;
  srcMiniBatchInfo.MAX_COL_SIZE = featureDim;
  srcMiniBatchInfo.TOTAL_BATCH_NUM = batchNum;
  srcMiniBatchInfo.TOTAL_SAMPLE_NUM = sampleSize;
  srcMiniBatchInfo.MAX_ELEMENT_SIZE = miniBatchSize * 256;

  SparseIndexMatrixStat tgtMiniBatchInfo;
  tgtMiniBatchInfo.MAX_ROW_SIZE = miniBatchSize;
  tgtMiniBatchInfo.MAX_COL_SIZE = featureDim;
  tgtMiniBatchInfo.TOTAL_BATCH_NUM = batchNum;
  tgtMiniBatchInfo.TOTAL_SAMPLE_NUM = sampleSize;
  tgtMiniBatchInfo.MAX_ELEMENT_SIZE = miniBatchSize * 256;

  DenseMatrixStat OutputLayer1Info;
  OutputLayer1Info.MAX_ROW_SIZE = miniBatchSize;
  OutputLayer1Info.MAX_COL_SIZE = hiddenDim1;
  OutputLayer1Info.TOTAL_BATCH_NUM = batchNum;
  OutputLayer1Info.TOTAL_SAMPLE_NUM = sampleSize;

  DenseMatrixStat OutputLayer2Info;
  OutputLayer2Info.MAX_ROW_SIZE = miniBatchSize;
  OutputLayer2Info.MAX_COL_SIZE = hiddenDim2;
  OutputLayer2Info.TOTAL_BATCH_NUM = batchNum;
  OutputLayer2Info.TOTAL_SAMPLE_NUM = sampleSize;

  FullyConnectedLayer srcLayer1(featureDim, hiddenDim1, &rb);
  FullyConnectedLayer srcLayer2(hiddenDim1, hiddenDim2, &rb);
  FullyConnectedLayer tgtLayer1(featureDim, hiddenDim1, &rb);
  FullyConnectedLayer tgtLayer2(hiddenDim1, hiddenDim2, &rb);

  DenseMatrixStat OutputSimInfo;
  OutputSimInfo.MAX_ROW_SIZE = miniBatchSize;
  OutputSimInfo.MAX_COL_SIZE = 1 + nTrial;
  OutputSimInfo.TOTAL_BATCH_NUM = batchNum;
  OutputSimInfo.TOTAL_SAMPLE_NUM = sampleSize;

  SparseIndexMatrix srcBatch(&srcMiniBatchInfo, rb.Device);
  HiddenDenseMatrix srcLayer1Data(&OutputLayer1Info, rb.Device);
  HiddenDenseMatrix srcLayer2Data(&OutputLayer2Info, rb.Device);

  SparseIndexMatrix tgtBatch(&tgtMiniBatchInfo, rb.Device);
  HiddenDenseMatrix tgtLayer1Data(&OutputLayer1Info, rb.Device);
  HiddenDenseMatrix tgtLayer2Data(&OutputLayer2Info, rb.Device);

  BiMatchData biMatchData(miniBatchSize, nTrial, rb.Device);
  SimilarityRunner similarityRunner(10, &rb);
  HiddenDenseMatrix simOutput(&OutputSimInfo, rb.Device);
  HiddenDenseMatrix probOutput(&OutputSimInfo, rb.Device);
  probOutput.Deriv->Data->Zero();

  //iteration = 1;
  cout << "start training iteration" << endl;
  double train_time = 0;
  double io_time = 0;
  struct timeval train_start, train_end;
  struct timeval io_start, io_end;
  gettimeofday(&train_start, 0);

  for (int iter = 0; iter < iteration; iter++) {
    for (int i = 0; i < sampleSize; i++)
      shuff[i] = i;
    int shuffIdx = 0;
    float avgLoss = 0;
    for (int b = 0; b < batchNum; b++) {
      // Fill the next minibatch with randomly shuffled samples.
      gettimeofday(&io_start, 0);
      srcBatch.Refresh();
      tgtBatch.Refresh();
      while (shuffIdx < sampleSize - 1 &&
             srcBatch.RowSize < miniBatchSize && tgtBatch.RowSize < miniBatchSize) {
        int p = shuffIdx + rand() % (sampleSize - shuffIdx);
        int smpIdx = shuff[p];
        shuff[p] = shuff[shuffIdx];
        shuff[shuffIdx] = smpIdx;
        shuffIdx += 1;
        srcBatch.PushSample(get<0>(src_batch[smpIdx]), get<1>(src_batch[smpIdx]));
        tgtBatch.PushSample(get<0>(tgt_batch[smpIdx]), get<1>(tgt_batch[smpIdx]));
      }
      gettimeofday(&io_end, 0);
      io_time += io_end.tv_sec - io_start.tv_sec;

      //cout<<"src batch row "<< srcBatch.RowSize<<endl;
      //cout<<"src element size " <<srcBatch.ElementSize<<endl;
      //cout<<"tgt batch row "<< tgtBatch.RowSize<<endl;
      //cout<<"tgt element size " <<tgtBatch.ElementSize<<endl;
      //srcLayer1.Weight->SyncToHost(0, 100);
      //tgtLayer1.Weight->SyncToHost(0, 100);
      //for(int i=0;i<100;i++)
      //{
      //  cout<<"smpIdx "<< src.Weight->HostMem[i]<<endl;
      //}
      //cout<<"src weight "<<srcLayer1.Weight->HostMem[0]<<endl;
      //cout<<"tgt weight "<<tgtLayer1.Weight->HostMem[0]<<endl;
      //for(int i = 0; i< srcBatch.ElementSize; i++)
      //{
      //  srcBatch.SampleIdx
      //}
      //if( cudaSuccess != cudaGetLastError())
      //  cout <<"error 1"<<endl;

      // Forward pass through both towers.
      srcLayer1.Forward(&srcBatch, srcLayer1Data.Output);
      //if( cudaSuccess != cudaGetLastError())
      //  cout <<"fdsfasdf"<<endl;
      //srcLayer1Data.Output->Data->SyncToHost(0,100);
      //cout<<"src 1 output"<<srcLayer1Data.Output->Data->HostMem[0]<<endl;
      srcLayer2.Forward(srcLayer1Data.Output, srcLayer2Data.Output);
      tgtLayer1.Forward(&tgtBatch, tgtLayer1Data.Output);
      tgtLayer2.Forward(tgtLayer1Data.Output, tgtLayer2Data.Output);

      biMatchData.GenerateMatch(srcBatch.RowSize);
      //srcLayer2Data.Output->Data->SyncToHost(0, srcLayer2Data.Stat->MAX_COL_SIZE * srcBatch.RowSize);
      //tgtLayer2Data.Output->Data->SyncToHost(0, tgtLayer2Data.Stat->MAX_COL_SIZE * tgtBatch.RowSize);
      //cout<<"src output"<<srcLayer2Data.Output->Data->HostMem[0]<<endl;
      //cout<<"tgt output"<<tgtLayer2Data.Output->Data->HostMem[0]<<endl;
      similarityRunner.Forward(srcLayer2Data.Output, tgtLayer2Data.Output, &biMatchData, simOutput.Output);
      //simOutput.Output->Data->SyncToHost(0, srcBatch.RowSize * 5);
      //for(int i=0;i<srcBatch.RowSize;i++)
      //{
      //  cout<<"sim"<< simOutput.Output->Data->HostMem[i]<<endl;
      //  break;
      //}
      //break;
      rb.ComputeLib->SoftmaxForward(simOutput.Output->Data, probOutput.Output->Data, srcBatch.RowSize, simOutput.Stat->MAX_COL_SIZE);

      /// log softmax backward: probOutput.Deriv->Data --> biMatchData.MatchInfo
      rb.ComputeLib->VecAdd(probOutput.Output->Data, -1, biMatchData.MatchInfo, 1, simOutput.Deriv->Data, 0, biMatchData.MatchSize);
      //rb.ComputeLib->SoftmaxBackward(probOutput.Output->Data, probOutput.Deriv->Data, simOutput.Deriv->Data, srcBatch.RowSize, probOutput.Stat->MAX_COL_SIZE);

      /// output loss.
      float loss = 0;
      //simOutput.Output->Data->QuickWatch();
      //simOutput.Deriv->Data->QuickWatch();
      probOutput.Output->Data->SyncToHost(0, srcBatch.RowSize * probOutput.Stat->MAX_COL_SIZE);  // ->QuickWatch();
      //probOutput.Deriv->Data->QuickWatch();
      for (int i = 0; i < srcBatch.RowSize; i++) {
        //cout<< probOutput.Output->Data->HostMem[i * probOutput.Stat->MAX_COL_SIZE]<<endl;
        loss += logf(probOutput.Output->Data->HostMem[i * probOutput.Stat->MAX_COL_SIZE] + LARGEEPS);
      }
      loss = loss / srcBatch.RowSize;
      avgLoss = b * 1.0f / (b + 1) * avgLoss + 1.0f / (b + 1) * loss;
      if ((b + 1) % 10 == 0)
        cout << "mini batch : " << b + 1 << "\t avg loss :" << avgLoss << endl;
      //cout<<"current loss "<<loss<<endl;

      // Backward pass.
      similarityRunner.Backward(simOutput.Deriv, srcLayer2Data.Deriv, tgtLayer2Data.Deriv);
      tgtLayer2.Backward(tgtLayer2Data.Deriv, tgtLayer2Data.Output, tgtLayer1Data.Deriv);
      tgtLayer1.Backward(tgtLayer1Data.Deriv, tgtLayer1Data.Output);
      srcLayer2.Backward(srcLayer2Data.Deriv, srcLayer2Data.Output, srcLayer1Data.Deriv);
      srcLayer1.Backward(srcLayer1Data.Deriv, srcLayer1Data.Output);

      /// update.
      tgtLayer2.Update(tgtLayer2Data.Deriv, tgtLayer1Data.Output);
      tgtLayer1.Update(tgtLayer1Data.Deriv, &tgtBatch);
      srcLayer2.Update(srcLayer2Data.Deriv, srcLayer1Data.Output);
      srcLayer1.Update(srcLayer1Data.Deriv, &srcBatch);
    }
    cout << "iteration : " << iter + 1 << "\t avg loss :" << avgLoss << endl;
  }

  gettimeofday(&train_end, 0);
  train_time = (train_end.tv_sec - train_start.tv_sec);
  cout << "train overall time elapsed (sec):" << train_time << endl;
  cout << "io time elapsed (sec):" << io_time << endl;
  cout << "gpu time elapsed (sec):" << train_time - io_time << endl;

  ofstream modelWriter;
  modelWriter.open("model//dssm.v2.model", ofstream::binary);
  srcLayer1.Serialize(modelWriter);
  srcLayer2.Serialize(modelWriter);
  tgtLayer1.Serialize(modelWriter);
  tgtLayer2.Serialize(modelWriter);
  modelWriter.close();
}
void ModelPredict() {
  Vocab vocab;
  vocab.LoadVocab("l3g.txt");
  cout << "vocab Size " << vocab.VocabSize << endl;

  vector< tuple<int *, int> > src_batch, tgt_batch;
  extractBinaryfromStream("data//test_data_clean.tsv", vocab, src_batch, tgt_batch, 0, 0);
  int sampleSize = src_batch.size();
  cout << "test sample size " << sampleSize << endl;

  int miniBatchSize = 1024;
  int featureDim = vocab.VocabSize;
  int batchNum = (sampleSize - 1) / miniBatchSize + 1;

  RunnerBehavior rb;
  rb.RunMode = RUNMODE_PREDICT;
  rb.Device = DEVICE_GPU;
  rb.ComputeLib = new CudaOperationManager(true, true);

  int hiddenDim1 = 128;
  int hiddenDim2 = 128;

  SparseIndexMatrixStat srcMiniBatchInfo;
  srcMiniBatchInfo.MAX_ROW_SIZE = miniBatchSize;
  srcMiniBatchInfo.MAX_COL_SIZE = featureDim;
  srcMiniBatchInfo.TOTAL_BATCH_NUM = batchNum;
  srcMiniBatchInfo.TOTAL_SAMPLE_NUM = sampleSize;
  srcMiniBatchInfo.MAX_ELEMENT_SIZE = miniBatchSize * 256;

  SparseIndexMatrixStat tgtMiniBatchInfo;
  tgtMiniBatchInfo.MAX_ROW_SIZE = miniBatchSize;
  tgtMiniBatchInfo.MAX_COL_SIZE = featureDim;
  tgtMiniBatchInfo.TOTAL_BATCH_NUM = batchNum;
  tgtMiniBatchInfo.TOTAL_SAMPLE_NUM = sampleSize;
  tgtMiniBatchInfo.MAX_ELEMENT_SIZE = miniBatchSize * 256;

  DenseMatrixStat OutputLayer1Info;
  OutputLayer1Info.MAX_ROW_SIZE = miniBatchSize;
  OutputLayer1Info.MAX_COL_SIZE = hiddenDim1;
  OutputLayer1Info.TOTAL_BATCH_NUM = batchNum;
  OutputLayer1Info.TOTAL_SAMPLE_NUM = sampleSize;

  DenseMatrixStat OutputLayer2Info;
  OutputLayer2Info.MAX_ROW_SIZE = miniBatchSize;
  OutputLayer2Info.MAX_COL_SIZE = hiddenDim2;
  OutputLayer2Info.TOTAL_BATCH_NUM = batchNum;
  OutputLayer2Info.TOTAL_SAMPLE_NUM = sampleSize;

  // Load the four serialized layers in the same order they were written.
  ifstream modelReader;
  modelReader.open("model//dssm.v2.model", ifstream::binary);
  FullyConnectedLayer srcLayer1(modelReader, &rb);
  FullyConnectedLayer srcLayer2(modelReader, &rb);
  FullyConnectedLayer tgtLayer1(modelReader, &rb);
  FullyConnectedLayer tgtLayer2(modelReader, &rb);
  modelReader.close();

  DenseMatrixStat OutputSimInfo;
  OutputSimInfo.MAX_ROW_SIZE = miniBatchSize;
  OutputSimInfo.MAX_COL_SIZE = 1;
  OutputSimInfo.TOTAL_BATCH_NUM = batchNum;
  OutputSimInfo.TOTAL_SAMPLE_NUM = sampleSize;

  SparseIndexMatrix srcBatch(&srcMiniBatchInfo, rb.Device);
  HiddenDenseMatrix srcLayer1Data(&OutputLayer1Info, rb.Device);
  HiddenDenseMatrix srcLayer2Data(&OutputLayer2Info, rb.Device);

  SparseIndexMatrix tgtBatch(&tgtMiniBatchInfo, rb.Device);
  HiddenDenseMatrix tgtLayer1Data(&OutputLayer1Info, rb.Device);
  HiddenDenseMatrix tgtLayer2Data(&OutputLayer2Info, rb.Device);

  BiMatchData biMatchData(miniBatchSize, 0, rb.Device);
  SimilarityRunner similarityRunner(10, &rb);
  HiddenDenseMatrix simOutput(&OutputSimInfo, rb.Device);
  HiddenDenseMatrix probOutput(&OutputSimInfo, rb.Device);

  ofstream outfile;
  outfile.open("data//test_data.v2.result", ofstream::out);
  int smpIdx = 0;
  for (int b = 0; b < batchNum; b++) {
    srcBatch.Refresh();
    tgtBatch.Refresh();
    while (smpIdx < sampleSize &&
           srcBatch.RowSize < miniBatchSize && tgtBatch.RowSize < miniBatchSize) {
      srcBatch.PushSample(get<0>(src_batch[smpIdx]), get<1>(src_batch[smpIdx]));
      tgtBatch.PushSample(get<0>(tgt_batch[smpIdx]), get<1>(tgt_batch[smpIdx]));
      smpIdx++;
    }

    srcLayer1.Forward(&srcBatch, srcLayer1Data.Output);
    srcLayer2.Forward(srcLayer1Data.Output, srcLayer2Data.Output);
    tgtLayer1.Forward(&tgtBatch, tgtLayer1Data.Output);
    tgtLayer2.Forward(tgtLayer1Data.Output, tgtLayer2Data.Output);

    biMatchData.GenerateMatch(srcBatch.RowSize);
    similarityRunner.Forward(srcLayer2Data.Output, tgtLayer2Data.Output, &biMatchData, simOutput.Output);
    simOutput.Output->Data->QuickWatch();
    //probOutput.Deriv->Data->QuickWatch();
    for (int i = 0; i < srcBatch.RowSize; i++)
      outfile << simOutput.Output->Data->HostMem[i] << endl;
    //cout<<srcBatch.RowSize<<"\t"<<smpIdx<<endl;
    if ((b + 1) % 10 == 0)
      cout << "mini batch : " << b + 1 << " sample number " << smpIdx << endl;
  }
  outfile.close();
}
void LinkableValueNode::set_children_vocab(const Vocab &newvocab) {
  children_vocab.assign(newvocab.begin(), newvocab.end());
}
void PackedTrie::InitializeFromCorpus(const vector<const ParallelCorpus*>& pcs,
                                      const Vocab& total_source_vocab,
                                      const Vocab& total_target_vocab) {
  is_dependent_ = false;
  source_count_ = total_source_vocab.size();
  target_count_ = total_target_vocab.size();

  // For each source word, keep track of how many possible target words it can
  // generate. The null word can generate anything.
  vector<unordered_set<int> > targets_per_source;
  targets_per_source.resize(source_count_);
  // Each source word can generate the OOV word, which always has id 0.
  for (int s = 0; s < source_count_; ++s) {
    targets_per_source[s].insert(0);
  }
  for (int i = 0; i < pcs.size(); ++i) {
    for (int j = 0; j < pcs.at(i)->size(); ++j) {
      // Put all of the source and target words in the document pair into sets.
      const DocumentPair& doc_pair = pcs.at(i)->GetDocPair(j);
      unordered_set<int> source_words, target_words;
      for (int t = 0; t < doc_pair.second.size(); ++t) {
        const Sentence& sentence = doc_pair.second[t];
        for (int w = 0; w < sentence.size(); ++w) {
          target_words.insert(sentence[w]);
        }
      }
      for (int s = 0; s < doc_pair.first.size(); ++s) {
        const Sentence& sentence = doc_pair.first[s];
        for (int w = 0; w < sentence.size(); ++w) {
          source_words.insert(sentence[w]);
        }
      }
      unordered_set<int>::const_iterator s_it, t_it;
      for (s_it = source_words.begin(); s_it != source_words.end(); ++s_it) {
        for (t_it = target_words.begin(); t_it != target_words.end(); ++t_it) {
          targets_per_source[*s_it].insert(*t_it);
        }
      }
    }
  }
  assert(targets_per_source.size() > 0);

  offsets_ = new int[source_count_ + 1];
  // Null word
  offsets_[0] = 0;
  total_size_ = target_count_;
  for (int i = 1; i < targets_per_source.size(); ++i) {
    offsets_[i] = total_size_;
    total_size_ += targets_per_source[i].size();
  }
  offsets_[source_count_] = total_size_;
  target_words_ = new int[total_size_];
  data_ = new double[total_size_];

  // Add the entries
  // Null word entries
  for (int i = 0; i < target_count_; ++i) {
    target_words_[i] = i;
    data_[i] = log(1.0 / target_count_);
  }
  for (int s = 1; s < targets_per_source.size(); ++s) {
    int index = offsets_[s];
    unordered_set<int>::const_iterator it = targets_per_source[s].begin();
    for ( ; it != targets_per_source[s].end(); ++it) {
      target_words_[index] = *it;
      data_[index] = log(1.0 / targets_per_source[s].size());
      ++index;
    }
    qsort(target_words_ + offsets_[s], offsets_[s + 1] - offsets_[s],
          sizeof(int), PackedTrie::int_cmp);
  }

  // Sanity check: target ids within each source range must be strictly increasing.
  for (int s = 0; s < source_count_; ++s) {
    int last = -1;
    //cout << "Source word: " << total_source_vocab.GetWord(s) << endl;
    for (int i = offsets_[s]; i < offsets_[s + 1]; ++i) {
      //cout << total_target_vocab.GetWord(target_words_[i]) << ":"
      //     << target_words_[i] << ":" << data_[i] << " ";
      if (last >= target_words_[i]) {
        cout << "Error on source word " << s << endl;
      }
      last = target_words_[i];
    }
    //cout << endl;
  }
}
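// Lookup sketch for the packed layout built above (a hypothetical free
// function, not a method of the original class): offsets[s] .. offsets[s + 1]
// delimits a run of target ids that InitializeFromCorpus sorted with qsort, so
// a (source, target) pair can be found by binary search; data holds the
// parallel log probabilities.
#include <algorithm>
#include <limits>

double LookupLogProb(const int* offsets, const int* target_words,
                     const double* data, int source, int target) {
  const int* begin = target_words + offsets[source];
  const int* end = target_words + offsets[source + 1];
  const int* pos = std::lower_bound(begin, end, target);
  if (pos != end && *pos == target) {
    return data[pos - target_words];
  }
  return -std::numeric_limits<double>::infinity();  // pair was never packed
}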