void ParallelCorpus::PrintSentence(
    const Sentence& sentence, const Vocab& vocab, std::ostream& out) const {
  if (sentence.size() > 0) {
    out << vocab.GetWord(sentence.at(0));
  }
  for (int i = 1; i < sentence.size(); ++i) {
    out << " " << vocab.GetWord(sentence.at(i));
  }
}
void PackedTrie::Print(const Vocab& source_vocab, const Vocab& target_vocab,
    std::ostream& out) const {
  for (int s = 0; s < source_count_; ++s) {
    for (int i = offsets_[s]; i < offsets_[s + 1]; ++i) {
      out << source_vocab.GetWord(s) << "\t"
          << target_vocab.GetWord(target_words_[i]) << "\t"
          << exp(data_[i]) << std::endl;
    }
  }
}
// Get P(W2 | W1) -- bigram log probability
double getBigramProb(const char *w1, const char *w2, Vocab &voc, Ngram &lm){
	VocabIndex wid1 = voc.getIndex(w1);
	VocabIndex wid2 = voc.getIndex(w2);
	if(wid1 == Vocab_None)  // OOV history word: back off to <unk>
		wid1 = voc.getIndex(Vocab_Unknown);
	if(wid2 == Vocab_None)  // OOV predicted word: return a flat log-prob penalty
		return -20;
	VocabIndex context[] = { wid1, Vocab_None };
	return lm.wordProb( wid2, context);
}
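A minimal usage sketch (not from the original source): it assumes the SRILM Vocab/Ngram/File classes used in the main() example further down, and a hypothetical ARPA model file bigram.lm.

#include "Vocab.h"
#include "Ngram.h"
#include "File.h"

int main() {
	Vocab voc;
	Ngram lm(voc, 2);               // bigram model, as in the decoding example below
	File lmFile("bigram.lm", "r");  // hypothetical ARPA LM file
	lm.read(lmFile);
	lmFile.close();
	// SRILM wordProb returns a log10 probability
	double logProb = getBigramProb("in", "the", voc, lm);
	return 0;
}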
Example #4
void Word::CreateFromString(const std::string &inString, Vocab &vocab)
{
  if (inString.substr(0, 1) == "[" && inString.substr(inString.size() - 1, 1) == "]") {
    // non-term
    m_isNonTerminal = true;
    string str = inString.substr(1, inString.size() - 2);
    m_vocabId = vocab.AddVocabId(str);
  } else {
    m_isNonTerminal = false;
    m_vocabId = vocab.AddVocabId(inString);
  }

}
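A small usage sketch (an assumption, not from the original source): it presumes Word and Vocab are default-constructible in this Moses-style API, and just illustrates how the bracket test routes tokens.

Vocab vocab;
Word terminal, nonTerm;
terminal.CreateFromString("house", vocab);  // terminal: m_isNonTerminal = false
nonTerm.CreateFromString("[X]", vocab);     // non-terminal: brackets stripped, stored as "X"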
Example #5
void CorpusReader::CreateVocabMap(const Vocab& corpus_vocab,
                                  const vector< vector<string> >& filter_vocab,
                                  vector<IntIntMap>* lookup) {
  assert(corpus_vocab.has_language());
  int lang = corpus_vocab.language();
  if (lang >= (int)lookup->size()) lookup->resize(lang + 1);
  if (filter_vocab[lang].size() > 0) {
    cout << "Adding vocab for language " << lang << "(" <<
      corpus_vocab.terms().size() << ")" << endl;
    CreateFilteredMap(corpus_vocab, filter_vocab[lang], &(*lookup)[lang]);
  } else {
    cout << "Skipping language " << lang << endl;
  }
}
Example #6
void CorpusReader::CreateUnfilteredMap(const Vocab& proto_voc,
                                       StringIntMap* lookup,
                                       IntIntMap* mapping) {
  for (int ii = 0; ii < proto_voc.terms_size(); ++ii) {
    const lib_corpora_proto::Vocab_Entry& word = proto_voc.terms(ii);
    string term = word.original();
    if (lookup->find(term) == lookup->end()) {
      int new_id = lookup->size();
      (*lookup)[term] = new_id;
      // cout << "Adding " << term << " with id " << new_id << endl;
    }
    (*mapping)[word.id()] = (*lookup)[term];
    // cout << "---------------" << endl;
  }
}
Example #7
void Word::ConvertToMoses(
    const std::vector<Moses::FactorType> &outputFactorsVec, 
    const Vocab &vocab,
    Moses::Word &overwrite) const {
  Moses::FactorCollection &factorColl = Moses::FactorCollection::Instance();
  overwrite = Moses::Word(m_isNonTerminal);

  // TODO: this conversion should have been done at load time.  
  util::TokenIter<util::SingleCharacter> tok(vocab.GetString(m_vocabId), '|');

  for (std::vector<Moses::FactorType>::const_iterator t = outputFactorsVec.begin(); t != outputFactorsVec.end(); ++t, ++tok) {
    UTIL_THROW_IF(!tok, util::Exception, "Too few factors in \"" << vocab.GetString(m_vocabId) << "\"; was expecting " << outputFactorsVec.size());
    overwrite.SetFactor(*t, factorColl.AddFactor(*tok));
  }
  UTIL_THROW_IF(tok, util::Exception, "Too many factors in \"" << vocab.GetString(m_vocabId) << "\"; was expecting " << outputFactorsVec.size());
}
void SparseHieroReorderingFeature::LoadVocabulary(const std::string& filename, Vocab& vocab)
{
  if (filename.empty()) return;
  ifstream in(filename.c_str());
  UTIL_THROW_IF(!in, util::Exception, "Unable to open vocab file: " << filename);
  string line;
  while(getline(in,line)) {
    vocab.insert(FactorCollection::Instance().AddFactor(line)); 
  }
  in.close();
}
Example #9
void CorpusReader::CreateFilteredMap(const Vocab& corpus_voc,
                                     const vector<string>& filter_voc,
                                     IntIntMap* id_lookup) {
  map<string, int> new_id;

  // RHS will be new vocab
  for (int ii = 0; ii < (int)filter_voc.size(); ++ii) {
    new_id[filter_voc[ii]] = ii;
  }

  // LHS will be old vocab
  for (int ii = 0; ii < corpus_voc.terms_size(); ++ii) {
    const lib_corpora_proto::Vocab_Entry& word = corpus_voc.terms(ii);
    string term = word.original();
    if (new_id.find(term) != new_id.end()) {
      (*id_lookup)[word.id()] = new_id[term];
      // cout << word.id() << "->" << new_id[term] << "(term)" << endl;
    }
  }
}
Example #10
void extractBinaryfromStream(const char * inputStream, Vocab & textHash,
		vector < tuple <int *, int > > & src_batch, vector < tuple <int *, int > > & tgt_batch, int isFilter, int debugLines)
{
	ifstream infile;
	infile.open(inputStream, ifstream::in);
	string line;
	int lineIdx = 0;
	while (getline(infile, line))
	{
		stringstream linestream(line);
		string src, tgt;
		getline(linestream, src, '\t');
		getline(linestream, tgt, '\t');

		int src_token_num = 0;
		int tgt_token_num = 0;
		char** src_tokens = BasicUtil::TokenizeString(src, src_token_num, MAX_TOKEN_NUM, MAX_TOKEN_LEN);
		char** tgt_tokens = BasicUtil::TokenizeString(tgt, tgt_token_num, MAX_TOKEN_NUM, MAX_TOKEN_LEN);

		int * src_fea = new int[MAX_TOKEN_LEN * MAX_TOKEN_NUM];
		int * src_seg = new int[MAX_TOKEN_NUM];

		int * tgt_fea = new int[MAX_TOKEN_LEN * MAX_TOKEN_NUM];
		int * tgt_seg = new int[MAX_TOKEN_NUM];

		int src_seg_num = textHash.FeatureExtract((const char **)src_tokens, src_token_num, src_seg, src_fea);
		int tgt_seg_num = textHash.FeatureExtract((const char **)tgt_tokens, tgt_token_num, tgt_seg, tgt_fea);
		
		int src_feature_num = 0; //src_seg[src_seg_num - 1];
		int tgt_feature_num = 0; //tgt_seg[tgt_seg_num - 1];
		
		if(src_seg_num >= 1)
		{
		    src_feature_num = src_seg[src_seg_num - 1];
		}
		
		if(tgt_seg_num >= 1)
		{
		    tgt_feature_num = tgt_seg[tgt_seg_num - 1];
		}
		
		// The segment buffers are only needed during feature extraction; release them here.
		delete[] src_seg;
		delete[] tgt_seg;

		if(isFilter == 1 && (src_feature_num <= 0 || tgt_feature_num <= 0))
		{
			// Skip empty pairs and free the feature buffers that would otherwise leak.
			delete[] src_fea;
			delete[] tgt_fea;
			continue;
		}

		src_batch.push_back(tuple<int*, int>(src_fea, src_feature_num));
		tgt_batch.push_back(tuple<int*, int>(tgt_fea, tgt_feature_num));

		lineIdx += 1;
		if(lineIdx == debugLines) break;
	}
}
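A usage sketch mirroring the call in ModelTrain further down (file paths taken from that example). Note that each tuple pushed into src_batch/tgt_batch carries a heap-allocated feature array; the caller owns it and should delete[] it when finished.

Vocab vocab;
vocab.LoadVocab("l3g.txt");
vector < tuple <int *, int > > src_batch, tgt_batch;
// isFilter = 1 drops pairs with no extracted features; debugLines = 0 reads the whole file
extractBinaryfromStream("data//train_data_40k.tsv", vocab, src_batch, tgt_batch, 1, 0);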
Example #11
float getH(MM_t* estmap, int s1, int s2){
  //assert(estmap->size() == s1 * s2);
  cerr << "F_0 = " << vocab.size() << endl;
  cerr << "# of estimators = (actual) " << estmap->size() << 
    " vs (defined) " << s1 * s2 << endl;
  float ests[s1*s2];
  //iterate through basic estimators
  int idx(0);
  piterate(estmap, itr) { // should be unordered!
    ests[idx++] = X(itr->second); //store X() for each sample in the map
    //cerr << ests[idx -1] << endl;
  }
Example #12
void Word::CreateFromString(const std::string &inString, Vocab &vocab)
{
	if (inString.substr(0, 1) == "[" && inString.substr(inString.size() - 1, 1) == "]")
	{ // non-term
		m_isNonTerminal = true;
	}
	else
	{
		m_isNonTerminal = false;
	}

	m_factors.resize(1);
	m_factors[0] = vocab.AddVocabId(inString);	
}
Example #13
    treebank_minibatch_dataset convert_trees_to_indexed_minibatches(
        const Vocab& word_vocab,
        const std::vector<AnnotatedParseTree::shared_tree>& trees,
        int minibatch_size) {
        treebank_minibatch_dataset dataset;

        auto to_index_pair = [&word_vocab](std::pair<std::vector<std::string>, uint>&& pair, bool&& is_root) {
            return std::tuple<std::vector<uint>, uint, bool>(
                word_vocab.encode(pair.first),
                pair.second,
                is_root);
        };

        if (dataset.size() == 0)
            dataset.emplace_back(0);

        for (auto& tree : trees) {

            // create new minibatch
            if (dataset[dataset.size()-1].size() == minibatch_size) {
                dataset.emplace_back(0);
                dataset.back().reserve(minibatch_size);
            }

            // add root
            dataset[dataset.size()-1].emplace_back(
                to_index_pair(
                    tree->to_labeled_pair(),
                    true
                )
            );

            // add children:
            for (auto& child : tree->general_children) {
                if (dataset[dataset.size()-1].size() == minibatch_size) {
                    dataset.emplace_back(0);
                    dataset.back().reserve(minibatch_size);
                }
                dataset[dataset.size()-1].emplace_back(
                    to_index_pair(
                        child->to_labeled_pair(),
                        false
                    )
                );
            }
        }
        return dataset;
    }
    void add_example(
            const Vocab& vocab,
            const vector<string>& example_orig,
            size_t& example_idx) {
        int len = std::min(example_orig.size(), (size_t)FLAGS_max_sentence_length);
        vector<string> example(example_orig.begin(), example_orig.begin() + len);

        auto description_length = example.size();
        this->data.w(0, example_idx) = vocab.word2index.at(START);
        auto encoded = vocab.encode(example, true);
        this->mask.w(0, example_idx) = 0.0;
        for (size_t j = 0; j < encoded.size(); j++) {
            this->data.w(j + 1, example_idx) = encoded[j];
            this->mask.w(j + 1, example_idx) = (R)1.0;
        }
        this->code_lengths[example_idx] = description_length + 1;
        this->total_codes += description_length + 1;
    }
Example #15
void ReferenceSet::AddLine(size_t sentenceId, const StringPiece& line, Vocab& vocab)
{
    //cerr << line << endl;
    NgramCounter ngramCounts;
    list<WordVec> openNgrams;
    size_t length = 0;
    //tokenize & count
    for (util::TokenIter<util::SingleCharacter, true> j(line, util::SingleCharacter(' ')); j; ++j) {
        const Vocab::Entry* nextTok = &(vocab.FindOrAdd(*j));
        ++length;
        openNgrams.push_front(WordVec());
        for (list<WordVec>::iterator k = openNgrams.begin(); k != openNgrams.end();  ++k) {
            k->push_back(nextTok);
            ++ngramCounts[*k];
        }
        if (openNgrams.size() >=  kBleuNgramOrder) openNgrams.pop_back();
    }

    //merge into overall ngram map
    for (NgramCounter::const_iterator ni = ngramCounts.begin();
            ni != ngramCounts.end(); ++ni) {
        size_t count = ni->second;
        //cerr << *ni << " " << count <<  endl;
        if (ngramCounts_.size() <= sentenceId) ngramCounts_.resize(sentenceId+1);
        NgramMap::iterator totalsIter = ngramCounts_[sentenceId].find(ni->first);
        if (totalsIter == ngramCounts_[sentenceId].end()) {
            ngramCounts_[sentenceId][ni->first] = pair<size_t,size_t>(count,count);
        } else {
            ngramCounts_[sentenceId][ni->first].first = max(count, ngramCounts_[sentenceId][ni->first].first); //clip
            ngramCounts_[sentenceId][ni->first].second += count; //no clip
        }
    }
    //length
    if (lengths_.size() <= sentenceId) lengths_.resize(sentenceId+1);
    //TODO - length strategy - this is MIN
    if (!lengths_[sentenceId]) {
        lengths_[sentenceId] = length;
    } else {
        lengths_[sentenceId] = min(length,lengths_[sentenceId]);
    }
    //cerr << endl;

}
// Returns a vector of LiveGuessResults
// warning: words is mutated temporarily
std::auto_ptr< std::vector<LiveGuessResult> > 
forwardish(std::vector<const char *> & words, // the current words can be empty
           const double currentProb, // log prob
           const int size, // how many to grab
           const int depthLeft,
           const NgramLM & _lm, 
           const int _order,  
           const Vocab & vocab ) {
  
  // Index contains the last ngram word 



  //Logger::Log(0, "Forwardish [%d] [%d]\n", depthLeft, index);

  VocabIndex vwords[ _order ];
  //int n = (words.size() < (_order - 1))?words.size():_order;

  //for (int i = words.size() - _order - 1; i < words.size(); i++) {
  //  if ( i >= 0) {
  //    Logger::Log(0,"Word: %d %s\n",i,words[i]);
  //  }
  //}

  // Fill vwords[0 .. _order-2] with the history words; when the history is
  // shorter than the context, pad with Vocab::Invalid.
  for (int i = 1; i < _order; i++) {
    int j = words.size() - _order + i;
    if (j < 0) {
      vwords[i - 1] = Vocab::Invalid; // probably should be end of sentence
    } else {
      vwords[i - 1] = vocab.Find( words[ j ] );
    }
  }


  vector<VocabProb> heap(0);

  mkHeap(heap);

  const ProbVector & probabilities = _lm.probs(  _order ) ;// _order - 2  );
  const CountVector & counts = _lm.counts( _order );
  
  int count = 0;
  //Logger::Log(0, "Find probabilities %d\n",vocab.size());

  for (int j = 0; j < vocab.size(); j++) {
    VocabIndex vWordI = j;//vocab[j];
    vwords[ _order - 1 ] = j;
    NgramIndex newIndex = _lm.model()._Find( vwords, _order );
    
    if (newIndex == -1) { // not legit :(
      continue;
    }
    Prob probRaw = probabilities[ newIndex ];
    if (probRaw == 0.0) {
      continue;
    }
    Prob prob = -1 * log( probRaw ); // negative log: the most probable word has the smallest value

    //Prob prob = (probRaw == 0.0)?10000:(-1 * log( probRaw )); //biggest is smallest
    //Prob probRaw = (counts[newIndex]==0)?1.0:counts[newIndex]/vocab.size()
    //Prob prob = -1 * log(probRaw);
    //Prob prob = -1 * counts[newIndex];
    //Logger::Log(0, "Prob %e\n",prob);

    const VocabProb v( prob,j, newIndex);
    if ( count < size ) {
      heap.push_back( v );
      count++;
      if (count == size) {
        mkHeap( heap );
      }
      // Once the heap holds `size` elements, it keeps the best candidates seen
      // so far; the root is the worst of them, so new candidates only displace
      // the current worst entry.
    } else if ( heap.front().prob > prob ) {
      // Replace the current worst element with the better candidate.
      popHeap( heap );
      pushHeap( heap, v );
      // should we update?
    }
  }
  sortHeap( heap );

  std::vector<LiveGuessResult> * resVector = new std::vector<LiveGuessResult>();
  
  for( int j = 0; j < heap.size(); j++) {
    VocabProb v = heap[ j ];
    Prob prob = v.prob;
    prob += currentProb;
    const char * word = vocab[ v.index ];
    vector<const char *> ourWords(words);
    ourWords.push_back( word ); // add 
    char * str = joinVectorOfCStrings( ourWords ); // Remember to deallocate later :(
    
    /*char * str = new char[strlen(word)+1];
    CopyString(str, word); */
    resVector->push_back( LiveGuessResult( prob , str  )); 
  }
  
  if ( depthLeft > 0 ) {
    // Recurse on each of the top candidates.
    for( int j = 0; j < heap.size(); j++) {
      VocabProb v = heap[ j ];
      Prob prob = v.prob;
      prob += currentProb;
      words.push_back( vocab[ v.index ] );
      std::auto_ptr< std::vector<LiveGuessResult> > r = 
        forwardish( words, 
                    prob,
                    size,
                    depthLeft - 1,
                    _lm, 
                    _order,  
                    vocab );
      words.pop_back(); // and restore
      for (int i = 0; i < r->size(); i++) {
        resVector->push_back( (*r)[i] );
      }
    }
  }


  std::auto_ptr< std::vector<LiveGuessResult> > returnValues( resVector );
  return returnValues;
}
Example #17
void Word::DebugPrint(ostream &out, const Vocab &vocab) const
{
  const string &str = vocab.GetString(m_vocabId);
  out << str;
}
Example #18
int main(int argc, char* argv[])
{
	Vocab vocab;
	Ngram lm(vocab, 2); 
	vector<string> splitLine;
	map<string, set<string> > mapping;
	map<string, set<string> >::iterator map_iter;
	vector<string> BestLine;
	vector<string>::iterator Best_iter;
	FILE * TextFile;
	FILE * MapFile;

	char ch;
	char tmpstr[BUFSIZE];

	for(int i=0 ; i<argc ; i++)
	{
		if(string("-text")==argv[i])
		{
			TextFile = fopen(argv[i+1],"r");
		}
		if(string("-map")==argv[i])
		{
			MapFile = fopen(argv[i+1], "r");
		}
		if(string("-lm")==argv[i])
		{
			File lmFile(argv[i+1],"r"); 
			lm.read(lmFile);
			lmFile.close();
		}
	}
	//read MapFile into map<string, set<string> > mapping
	while(fgets(tmpstr,BUFSIZE,MapFile))
	{
		char *tok=strtok(tmpstr,"\n");
		string Key,StringTok;
		set<string> ZhuYin;
		while(tok!=NULL)
		{
			StringTok=string(tok);
			Key = StringTok.substr(0,2);//the first ZhuYin (phonetic) symbol is the map key
			int pos;
			string tmpLine=StringTok.substr(3);
			while((pos = tmpLine.find(" "))!=-1)
			{
				tmpLine.erase(pos,1);
			}
			assert(tmpLine.size()%2==0);
			for(int i=0 ; i<tmpLine.size() ; i+=2)
			{
				string buf = tmpLine.substr(i, 2);
				ZhuYin.insert(buf);
			}
			mapping[Key]=ZhuYin;
			tok = strtok(NULL,"\n");
			ZhuYin.clear();
		}
	}
	//read TextFile into vector<string> splitLine
	int line =0;
	while(fgets(tmpstr,BUFSIZE,TextFile))
	{
		line++;
		char *tok=strtok(tmpstr,"\n");//split into one line
		string tmpLine = string(tok);    	
		while(tok!=NULL)
		{	
			int pos;
			while((pos = tmpLine.find(" "))!=-1)
			{
				tmpLine.erase(pos,1);
			}
			assert(tmpLine.size()%2==0);
			for(int i=0 ; i<tmpLine.size() ; i+=2)
			{
				string buf = tmpLine.substr(i, 2);
				splitLine.push_back(buf);//push one word to splitLine 
			}
			tok = strtok(NULL,"\n");
		}
		splitLine.push_back("\n");	
	}
	int count = 1;
	//Decode: greedily pick the candidate with the highest bigram probability given the previous choice
	for(int i=0;i<splitLine.size();i++)
	{
		set<string> TmpSet;
		if(i==0)
		{
			//cout << count << endl;
			BestLine.push_back("<s>");
			BestLine.push_back(" ");
		}
		if(splitLine[i]=="\n")
		{
			count++;
			//cout << endl;
			//cout << count << endl;
			BestLine.push_back("</s>");
			BestLine.push_back("\n"); 
			BestLine.push_back("<s>");
			BestLine.push_back(" ");
		}
		else
		{
			//cout <<  splitLine[i];//print every line without space
			map_iter = mapping.find(splitLine.at(i));//find the splitline[i] in mapping to map_iter
			//cout << (map_iter -> first).c_str() << endl;
			set<string> EveryPossibleZhuYin = map_iter -> second;
			//all possible ZhuYin in ChuIn
			set<string>::iterator iBegin=EveryPossibleZhuYin.begin();
			string PreString;

			for(iBegin;iBegin!=EveryPossibleZhuYin.end();++iBegin)
			{
				//cout << *iBegin->c_str() <<endl;
			}
			//sleep(1);
			string TempString;
			double maxProb = -1000.0;
			VocabIndex wid;
			string best;
			for(set<string>::iterator i=EveryPossibleZhuYin.begin();i!=EveryPossibleZhuYin.end();++i)
			{
				TempString = *i;
				//cout << TempString.c_str() << endl;

				//sleep(1);
				VocabIndex context[]={vocab.getIndex(PreString.c_str()),Vocab_None};
				wid=vocab.getIndex(TempString.c_str());
				if(wid == Vocab_None) 
				{
					//printf("Not in bigram.lm\n");
				}
				else
				{
					double pro=lm.wordProb(wid,context);

					if(pro>maxProb)
					{	
						best = TempString;
						maxProb=pro;
					}
				}
			}
			PreString=best;
			BestLine.push_back(PreString);
			BestLine.push_back(" ");

		}
	}
	for(vector<string>::iterator i=BestLine.begin();i!=(BestLine.end()-2);++i)
	{
		cout << *i ;
	}

}
Example #19
void ModelTrain()
{
	Vocab vocab;
	vocab.LoadVocab("l3g.txt");
	cout << "vocab Size " << vocab.VocabSize << endl;
	vector < tuple <int *, int > > src_batch, tgt_batch;
	extractBinaryfromStream("data//train_data_40k.tsv", vocab, src_batch, tgt_batch, 1, 0);

	int sampleSize = src_batch.size();
	cout << "train sample size" << sampleSize << endl;

	int iteration = 30;
	int miniBatchSize = 1024;
	int featureDim = vocab.VocabSize;
	int batchNum = sampleSize / miniBatchSize;
	int nTrial = 4;

	vector <int> shuff(sampleSize);

	RunnerBehavior rb;
	rb.RunMode = RUNMODE_TRAIN;
	rb.Device = DEVICE_GPU;
	cout<<"init cuda computation ...."<<endl;
	rb.ComputeLib = new CudaOperationManager(true, true);
	
	cout<<"init cuda computation done"<<endl;
	
	int hiddenDim1 = 128;
	int hiddenDim2 = 128;

	SparseIndexMatrixStat srcMiniBatchInfo;
	srcMiniBatchInfo.MAX_ROW_SIZE = miniBatchSize;
	srcMiniBatchInfo.MAX_COL_SIZE = featureDim;
	srcMiniBatchInfo.TOTAL_BATCH_NUM = batchNum;
	srcMiniBatchInfo.TOTAL_SAMPLE_NUM = sampleSize;
	srcMiniBatchInfo.MAX_ELEMENT_SIZE = miniBatchSize * 256;

	SparseIndexMatrixStat tgtMiniBatchInfo;
	tgtMiniBatchInfo.MAX_ROW_SIZE = miniBatchSize;
	tgtMiniBatchInfo.MAX_COL_SIZE = featureDim;
	tgtMiniBatchInfo.TOTAL_BATCH_NUM = batchNum;
	tgtMiniBatchInfo.TOTAL_SAMPLE_NUM = sampleSize;
	tgtMiniBatchInfo.MAX_ELEMENT_SIZE = miniBatchSize * 256;

	DenseMatrixStat OutputLayer1Info;
	OutputLayer1Info.MAX_ROW_SIZE = miniBatchSize;
	OutputLayer1Info.MAX_COL_SIZE = hiddenDim1;
	OutputLayer1Info.TOTAL_BATCH_NUM = batchNum;
	OutputLayer1Info.TOTAL_SAMPLE_NUM = sampleSize;


	DenseMatrixStat OutputLayer2Info;
	OutputLayer2Info.MAX_ROW_SIZE = miniBatchSize;
	OutputLayer2Info.MAX_COL_SIZE = hiddenDim2;
	OutputLayer2Info.TOTAL_BATCH_NUM = batchNum;
	OutputLayer2Info.TOTAL_SAMPLE_NUM = sampleSize;


	FullyConnectedLayer srcLayer1(featureDim, hiddenDim1, &rb);
	FullyConnectedLayer srcLayer2(hiddenDim1, hiddenDim2, &rb);

	FullyConnectedLayer tgtLayer1(featureDim, hiddenDim1, &rb);
	FullyConnectedLayer tgtLayer2(hiddenDim1, hiddenDim2, &rb);

	DenseMatrixStat OutputSimInfo;
	OutputSimInfo.MAX_ROW_SIZE = miniBatchSize;
	OutputSimInfo.MAX_COL_SIZE = 1 + nTrial;
	OutputSimInfo.TOTAL_BATCH_NUM = batchNum;
	OutputSimInfo.TOTAL_SAMPLE_NUM = sampleSize;

	SparseIndexMatrix srcBatch(&srcMiniBatchInfo, rb.Device);	
	HiddenDenseMatrix srcLayer1Data(&OutputLayer1Info, rb.Device);
	HiddenDenseMatrix srcLayer2Data(&OutputLayer2Info, rb.Device);

	SparseIndexMatrix tgtBatch(&tgtMiniBatchInfo, rb.Device);
	HiddenDenseMatrix tgtLayer1Data(&OutputLayer1Info, rb.Device);
	HiddenDenseMatrix tgtLayer2Data(&OutputLayer2Info, rb.Device);

	BiMatchData biMatchData(miniBatchSize, nTrial, rb.Device);

	SimilarityRunner similarityRunner(10, &rb);
	HiddenDenseMatrix simOutput(&OutputSimInfo, rb.Device);
	HiddenDenseMatrix probOutput(&OutputSimInfo, rb.Device);

	probOutput.Deriv->Data->Zero();
	
	//iteration = 1;
	cout<<"start training iteration"<<endl;
	
	double train_time = 0;
	double io_time = 0;
		
	struct timeval train_start, train_end;
	struct timeval io_start, io_end;
	
	gettimeofday(&train_start, 0);
	
	for (int iter = 0; iter<iteration; iter++)
	{
		for (int i = 0; i<sampleSize; i++) shuff[i] = i;

		int shuffIdx = 0;

		float avgLoss = 0;
		for (int b = 0; b<batchNum; b++)
		{
			gettimeofday(&io_start, 0);

			srcBatch.Refresh();
			tgtBatch.Refresh();

			while (shuffIdx < sampleSize - 1 && srcBatch.RowSize < miniBatchSize && tgtBatch.RowSize < miniBatchSize)
			{
				int p = shuffIdx + rand() % (sampleSize - shuffIdx);
				int smpIdx = shuff[p];
				shuff[p] = shuff[shuffIdx];
				shuff[shuffIdx] = smpIdx;
				shuffIdx += 1;

				srcBatch.PushSample(get<0>(src_batch[smpIdx]), get<1>(src_batch[smpIdx]));
				tgtBatch.PushSample(get<0>(tgt_batch[smpIdx]), get<1>(tgt_batch[smpIdx]));
			}
			
			gettimeofday(&io_end, 0);
			
			io_time += io_end.tv_sec - io_start.tv_sec;
			
			
			//cout<<"src batch row "<< srcBatch.RowSize<<endl;
			//cout<<"src element size " <<srcBatch.ElementSize<<endl; 
			//cout<<"tgt batch row "<< tgtBatch.RowSize<<endl;
			//cout<<"tgt element size " <<tgtBatch.ElementSize<<endl; 
			
			//srcLayer1.Weight->SyncToHost(0, 100);
			//tgtLayer1.Weight->SyncToHost(0, 100);
			
			//for(int i=0;i<100;i++)
			//{
			 //   cout<<"smpIdx "<< src.Weight->HostMem[i]<<endl;
			//}
			
			
			
			//cout<<"src weight "<<srcLayer1.Weight->HostMem[0]<<endl;
			//cout<<"tgt weight "<<tgtLayer1.Weight->HostMem[0]<<endl;
			
			//for(int i = 0; i< srcBatch.ElementSize; i++)
			//{
			//	srcBatch.SampleIdx
			//}
			//if( cudaSuccess != cudaGetLastError())
			//	cout <<"error 1"<<endl;
			
			
			srcLayer1.Forward(&srcBatch, srcLayer1Data.Output);
			//if( cudaSuccess != cudaGetLastError())
			//	cout <<"fdsfasdf"<<endl;
			//srcLayer1Data.Output->Data->SyncToHost(0,100);
			//cout<<"src 1 output"<<srcLayer1Data.Output->Data->HostMem[0]<<endl;
			
			srcLayer2.Forward(srcLayer1Data.Output, srcLayer2Data.Output);

			tgtLayer1.Forward(&tgtBatch, tgtLayer1Data.Output);
			tgtLayer2.Forward(tgtLayer1Data.Output, tgtLayer2Data.Output);
			
			biMatchData.GenerateMatch(srcBatch.RowSize);
			
			//srcLayer2Data.Output->Data->SyncToHost(0, srcLayer2Data.Stat->MAX_COL_SIZE * srcBatch.RowSize);
			//tgtLayer2Data.Output->Data->SyncToHost(0, tgtLayer2Data.Stat->MAX_COL_SIZE * tgtBatch.RowSize);
			
			//cout<<"src output"<<srcLayer2Data.Output->Data->HostMem[0]<<endl;
			//cout<<"tgt output"<<tgtLayer2Data.Output->Data->HostMem[0]<<endl;
			
			similarityRunner.Forward(srcLayer2Data.Output, tgtLayer2Data.Output, &biMatchData, simOutput.Output);

			//simOutput.Output->Data->SyncToHost(0, srcBatch.RowSize * 5);
			//for(int i=0;i<srcBatch.RowSize;i++)
			//{
			//	cout<<"sim"<< simOutput.Output->Data->HostMem[i]<<endl;
			//	break;
			//}
			//break;				
			rb.ComputeLib->SoftmaxForward(simOutput.Output->Data, probOutput.Output->Data, srcBatch.RowSize, simOutput.Stat->MAX_COL_SIZE);
			/// log softmax backward.  probOutput.Deriv->Data  --> biMatchData.MatchInfo
			rb.ComputeLib->VecAdd(probOutput.Output->Data, -1, biMatchData.MatchInfo, 1, simOutput.Deriv->Data, 0, biMatchData.MatchSize);

			//rb.ComputeLib->SoftmaxBackward(probOutput.Output->Data, probOutput.Deriv->Data, simOutput.Deriv->Data, srcBatch.RowSize, probOutput.Stat->MAX_COL_SIZE);
			/// output Loss.
			float loss = 0;
			//simOutput.Output->Data->QuickWatch();
			//simOutput.Deriv->Data->QuickWatch();
			probOutput.Output->Data->SyncToHost(0, srcBatch.RowSize * probOutput.Stat->MAX_COL_SIZE); //  ->QuickWatch();
			//probOutput.Deriv->Data->QuickWatch();
			for(int i=0;i< srcBatch.RowSize; i++)
			{
				//cout<< probOutput.Output->Data->HostMem[i * probOutput.Stat->MAX_COL_SIZE]<<endl;

				loss += logf(probOutput.Output->Data->HostMem[i * probOutput.Stat->MAX_COL_SIZE] + LARGEEPS);
			}
			loss = loss / srcBatch.RowSize;
			avgLoss = b * 1.0f / (b + 1) * avgLoss + 1.0f / (b + 1) * loss;

			if((b+1) % 10 == 0) cout<<"mini batch : "<<b+1<<"\t avg loss :"<<avgLoss<<endl;
			//cout<<"current loss "<<loss<<endl;
			similarityRunner.Backward(simOutput.Deriv, srcLayer2Data.Deriv, tgtLayer2Data.Deriv);


			tgtLayer2.Backward(tgtLayer2Data.Deriv, tgtLayer2Data.Output, tgtLayer1Data.Deriv);
			tgtLayer1.Backward(tgtLayer1Data.Deriv, tgtLayer1Data.Output);

			srcLayer2.Backward(srcLayer2Data.Deriv, srcLayer2Data.Output, srcLayer1Data.Deriv);
			srcLayer1.Backward(srcLayer1Data.Deriv, srcLayer1Data.Output);

			/// update.
			tgtLayer2.Update(tgtLayer2Data.Deriv, tgtLayer1Data.Output);
			tgtLayer1.Update(tgtLayer1Data.Deriv, &tgtBatch);

			srcLayer2.Update(srcLayer2Data.Deriv, srcLayer1Data.Output);
			srcLayer1.Update(srcLayer1Data.Deriv, &srcBatch);
		}
		cout<<"iteration : "<<iter + 1<<"\t avg loss :"<<avgLoss<<endl;

	}
	
	gettimeofday(&train_end, 0);
	
	train_time = (train_end.tv_sec - train_start.tv_sec);
			
	cout<<"train overall time elipsed (sec):"<<train_time<<endl;
	cout<<"io time elipsed (sec):"<<io_time<<endl;
	cout<<"gpu time elipsed (sec):"<<train_time - io_time<<endl;
	ofstream modelWriter;
	modelWriter.open("model//dssm.v2.model", ofstream::binary);
	srcLayer1.Serialize(modelWriter);
	srcLayer2.Serialize(modelWriter);
	tgtLayer1.Serialize(modelWriter);
	tgtLayer2.Serialize(modelWriter);
	modelWriter.close();
}
Example #20
void ModelPredict()
{
	Vocab vocab;
	vocab.LoadVocab("l3g.txt");
	cout << "vocab Size " << vocab.VocabSize << endl;
	vector < tuple <int *, int > > src_batch, tgt_batch;
	extractBinaryfromStream("data//test_data_clean.tsv", vocab, src_batch, tgt_batch, 0, 0);

	int sampleSize = src_batch.size();
	cout << "test sample size" << sampleSize << endl;

	int miniBatchSize = 1024;
	int featureDim = vocab.VocabSize;
	int batchNum = (sampleSize - 1) / miniBatchSize + 1;

	RunnerBehavior rb;
	rb.RunMode = RUNMODE_PREDICT;
	rb.Device = DEVICE_GPU;

	rb.ComputeLib = new CudaOperationManager(true, true);
	int hiddenDim1 = 128;
	int hiddenDim2 = 128;

	SparseIndexMatrixStat srcMiniBatchInfo;
	srcMiniBatchInfo.MAX_ROW_SIZE = miniBatchSize;
	srcMiniBatchInfo.MAX_COL_SIZE = featureDim;
	srcMiniBatchInfo.TOTAL_BATCH_NUM = batchNum;
	srcMiniBatchInfo.TOTAL_SAMPLE_NUM = sampleSize;
	srcMiniBatchInfo.MAX_ELEMENT_SIZE = miniBatchSize * 256;

	SparseIndexMatrixStat tgtMiniBatchInfo;
	tgtMiniBatchInfo.MAX_ROW_SIZE = miniBatchSize;
	tgtMiniBatchInfo.MAX_COL_SIZE = featureDim;
	tgtMiniBatchInfo.TOTAL_BATCH_NUM = batchNum;
	tgtMiniBatchInfo.TOTAL_SAMPLE_NUM = sampleSize;
	tgtMiniBatchInfo.MAX_ELEMENT_SIZE = miniBatchSize * 256;

	DenseMatrixStat OutputLayer1Info;
	OutputLayer1Info.MAX_ROW_SIZE = miniBatchSize;
	OutputLayer1Info.MAX_COL_SIZE = hiddenDim1;
	OutputLayer1Info.TOTAL_BATCH_NUM = batchNum;
	OutputLayer1Info.TOTAL_SAMPLE_NUM = sampleSize;

	DenseMatrixStat OutputLayer2Info;
	OutputLayer2Info.MAX_ROW_SIZE = miniBatchSize;
	OutputLayer2Info.MAX_COL_SIZE = hiddenDim2;
	OutputLayer2Info.TOTAL_BATCH_NUM = batchNum;
	OutputLayer2Info.TOTAL_SAMPLE_NUM = sampleSize;

	ifstream modelReader;
	modelReader.open("model//dssm.v2.model", ifstream::binary);
	FullyConnectedLayer srcLayer1(modelReader, &rb);
	FullyConnectedLayer srcLayer2(modelReader, &rb);
	FullyConnectedLayer tgtLayer1(modelReader, &rb);
	FullyConnectedLayer tgtLayer2(modelReader, &rb);
	modelReader.close();

	DenseMatrixStat OutputSimInfo;
	OutputSimInfo.MAX_ROW_SIZE = miniBatchSize;
	OutputSimInfo.MAX_COL_SIZE = 1;
	OutputSimInfo.TOTAL_BATCH_NUM = batchNum;
	OutputSimInfo.TOTAL_SAMPLE_NUM = sampleSize;

	SparseIndexMatrix srcBatch(&srcMiniBatchInfo, rb.Device);	
	HiddenDenseMatrix srcLayer1Data(&OutputLayer1Info, rb.Device);
	HiddenDenseMatrix srcLayer2Data(&OutputLayer2Info, rb.Device);

	SparseIndexMatrix tgtBatch(&tgtMiniBatchInfo, rb.Device);
	HiddenDenseMatrix tgtLayer1Data(&OutputLayer1Info, rb.Device);
	HiddenDenseMatrix tgtLayer2Data(&OutputLayer2Info, rb.Device);

	BiMatchData biMatchData(miniBatchSize, 0, rb.Device);

	SimilarityRunner similarityRunner(10, &rb);
	HiddenDenseMatrix simOutput(&OutputSimInfo, rb.Device);
	HiddenDenseMatrix probOutput(&OutputSimInfo, rb.Device);

	ofstream outfile;
	outfile.open("data//test_data.v2.result", ofstream::out);

	int smpIdx = 0;

	for (int b = 0; b<batchNum; b++)
	{
		srcBatch.Refresh();
		tgtBatch.Refresh();

		while (smpIdx < sampleSize && srcBatch.RowSize < miniBatchSize && tgtBatch.RowSize < miniBatchSize)
		{
			srcBatch.PushSample(get<0>(src_batch[smpIdx]), get<1>(src_batch[smpIdx]));
			tgtBatch.PushSample(get<0>(tgt_batch[smpIdx]), get<1>(tgt_batch[smpIdx]));
			smpIdx++;
		}

		srcLayer1.Forward(&srcBatch, srcLayer1Data.Output);
		srcLayer2.Forward(srcLayer1Data.Output, srcLayer2Data.Output);

		tgtLayer1.Forward(&tgtBatch, tgtLayer1Data.Output);
		tgtLayer2.Forward(tgtLayer1Data.Output, tgtLayer2Data.Output);

		biMatchData.GenerateMatch(srcBatch.RowSize);

		similarityRunner.Forward(srcLayer2Data.Output, tgtLayer2Data.Output, &biMatchData, simOutput.Output);

		simOutput.Output->Data->QuickWatch();

		//probOutput.Deriv->Data->QuickWatch();
		for(int i=0;i< srcBatch.RowSize; i++)
			outfile<<simOutput.Output->Data->HostMem[i]<<endl;
		//cout<<srcBatch.RowSize<<"\t"<<smpIdx<<endl;
		
		if((b+1) % 10 == 0) cout<<"mini batch : "<<b+1<<" sample number "<<smpIdx<<endl;
	}
	outfile.close();
}
Example #21
void
LinkableValueNode::set_children_vocab(const Vocab &newvocab)
{
	children_vocab.assign(newvocab.begin(),newvocab.end());
}
void PackedTrie::InitializeFromCorpus(const vector<const ParallelCorpus*>& pcs,
                                      const Vocab& total_source_vocab,
                                      const Vocab& total_target_vocab) {
  is_dependent_ = false;
  source_count_ = total_source_vocab.size();
  target_count_ = total_target_vocab.size();

  // For each source word, keep track of how many possible target words it can
  // generate. The null word can generate anything.
  vector<unordered_set<int> > targets_per_source;
  targets_per_source.resize(source_count_);
  // Each source word can generate the OOV word, which always has id 0.
  for (int s = 0; s < source_count_; ++s) {
    targets_per_source[s].insert(0);
  }
  for (int i = 0; i < pcs.size(); ++i) {
    for (int j = 0; j < pcs.at(i)->size(); ++j) {
      // Put all of the source and target words in the document pair into sets.
      const DocumentPair& doc_pair = pcs.at(i)->GetDocPair(j);
      
      unordered_set<int> source_words, target_words;
      for (int t = 0; t < doc_pair.second.size(); ++t) {
        const Sentence& sentence = doc_pair.second[t];
        for (int w = 0; w < sentence.size(); ++w) {
          target_words.insert(sentence[w]);
        }
      }
      for (int s = 0; s < doc_pair.first.size(); ++s) {
        const Sentence& sentence = doc_pair.first[s];
        for (int w = 0; w < sentence.size(); ++w) {
          source_words.insert(sentence[w]);
        }
      }
      unordered_set<int>::const_iterator s_it, t_it;
      for (s_it = source_words.begin(); s_it != source_words.end(); ++s_it) {
        for (t_it = target_words.begin(); t_it != target_words.end(); ++t_it) {
          targets_per_source[*s_it].insert(*t_it);
        }
      }
    }
  }
  assert(targets_per_source.size() > 0);

  offsets_ = new int[source_count_ + 1];

  // Null word
  offsets_[0] = 0;
  total_size_ = target_count_;
  for (int i = 1; i < targets_per_source.size(); ++i) {
    offsets_[i] = total_size_;
    total_size_ += targets_per_source[i].size();
  }
  offsets_[source_count_] = total_size_;

  target_words_ = new int[total_size_];
  data_ = new double[total_size_];

  // Add the entries
  // Null word entries
  for (int i = 0; i < target_count_; ++i) {
    target_words_[i] = i;
    data_[i] = log(1.0 / target_count_);
  }
  for (int s = 1; s < targets_per_source.size(); ++s) {
    int index = offsets_[s];
    unordered_set<int>::const_iterator it = targets_per_source[s].begin();
    for ( ; it != targets_per_source[s].end(); ++it) {
      target_words_[index] = *it;
      data_[index] = log(1.0 / targets_per_source[s].size());
      ++index;
    }
    qsort(target_words_ + offsets_[s],
          offsets_[s+1] - offsets_[s],
          sizeof(int),
          PackedTrie::int_cmp);
  }
  for (int s = 0; s < source_count_; ++s) {
    int last = -1;
    //cout << "Source word: " << total_source_vocab.GetWord(s) << endl;
    for (int i = offsets_[s]; i < offsets_[s+1]; ++i) {
      //cout << total_target_vocab.GetWord(target_words_[i]) << ":"
      //     << target_words_[i] << ":" << data_[i] << " ";
      if (last >= target_words_[i]) {
        cout << "Error on source word " << s << endl;
      }
      last = target_words_[i];
    }
    //cout << endl;
  }
}