void Data::sampleRankedPairs( const std::string &rankedpairfile ) { cout << "Sampling ranked pairs." << endl; ofstream *outFile = new ofstream(); outFile->open( rankedpairfile.c_str() ); ostream *out = outFile; const unsigned int n_samplings = 5000; const unsigned int n_samples = 50; const float min_diff = 0.05; // loop over all sentences for(unsigned int S=0; S<featdata->size(); S++) { unsigned int n_translations = featdata->get(S).size(); // sample a fixed number of times vector< SampledPair* > samples; vector< float > scores; for(unsigned int i=0; i<n_samplings; i++) { unsigned int translation1 = rand() % n_translations; float bleu1 = sentenceLevelBleuPlusOne(scoredata->get(S,translation1)); unsigned int translation2 = rand() % n_translations; float bleu2 = sentenceLevelBleuPlusOne(scoredata->get(S,translation2)); if (abs(bleu1-bleu2) < min_diff) continue; samples.push_back( new SampledPair( translation1, translation2, bleu1-bleu2) ); scores.push_back( 1.0 - abs(bleu1-bleu2) ); } //cerr << "sampled " << samples.size() << " pairs\n"; float min_diff = -1.0; if (samples.size() > n_samples) { nth_element(scores.begin(), scores.begin()+(n_samples-1), scores.end()); min_diff = 0.99999-scores[n_samples-1]; //cerr << "min_diff = " << min_diff << endl; } unsigned int collected = 0; for(unsigned int i=0; i<samples.size() && collected < n_samples; i++) { if (samples[i]->getDiff() >= min_diff) { collected++; *out << "1"; outputSample( *out, featdata->get(S,samples[i]->getTranslation1()), featdata->get(S,samples[i]->getTranslation2()) ); *out << endl; *out << "0"; outputSample( *out, featdata->get(S,samples[i]->getTranslation2()), featdata->get(S,samples[i]->getTranslation1()) ); *out << endl; } delete samples[i]; } //cerr << "collected " << collected << endl; } out->flush(); outFile->close(); }
vector<float> BleuScorer::ScoreNbestList(const string& scoreFile, const string& featureFile) { vector<string> scoreFiles; vector<string> featureFiles; scoreFiles.push_back(scoreFile); featureFiles.push_back(featureFile); vector<FeatureDataIterator> featureDataIters; vector<ScoreDataIterator> scoreDataIters; for (size_t i = 0; i < featureFiles.size(); ++i) { featureDataIters.push_back(FeatureDataIterator(featureFiles[i])); scoreDataIters.push_back(ScoreDataIterator(scoreFiles[i])); } vector<pair<size_t,size_t> > hypotheses; if (featureDataIters[0] == FeatureDataIterator::end()) { cerr << "Error: at the end of feature data iterator" << endl; exit(1); } for (size_t i = 0; i < featureFiles.size(); ++i) { if (featureDataIters[i] == FeatureDataIterator::end()) { cerr << "Error: Feature file " << i << " ended prematurely" << endl; exit(1); } if (scoreDataIters[i] == ScoreDataIterator::end()) { cerr << "Error: Score file " << i << " ended prematurely" << endl; exit(1); } if (featureDataIters[i]->size() != scoreDataIters[i]->size()) { cerr << "Error: features and scores have different size" << endl; exit(1); } for (size_t j = 0; j < featureDataIters[i]->size(); ++j) { hypotheses.push_back(pair<size_t,size_t>(i,j)); } } // score the nbest list vector<float> bleuScores; for (size_t i=0; i < hypotheses.size(); ++i) { pair<size_t,size_t> translation = hypotheses[i]; float bleu = sentenceLevelBleuPlusOne(scoreDataIters[translation.first]->operator[](translation.second)); bleuScores.push_back(bleu); } return bleuScores; }