Ejemplo n.º 1
0
// Samples pairs of translations for every sentence and writes them to
// `rankedpairfile` as labelled examples.
//
// For each sentence, n_samplings random pairs of translations are drawn with
// rand(). A pair becomes a candidate only if the sentence-level BLEU+1 scores
// of its two translations differ by at least min_diff. At most n_samples
// candidates are written per sentence; when there are more candidates than
// that, only those whose SampledPair::getDiff() reaches (within 1e-5) the
// n_samples-th largest absolute score difference are eligible.
//
// Every selected pair yields two lines: "1" followed by
// outputSample(translation1, translation2), and "0" followed by
// outputSample(translation2, translation1).
//
// Sentences without any translation are skipped. If the output file cannot be
// opened, an error is reported on stderr and nothing is sampled.
//
// @param rankedpairfile  path of the file to write the pairs to.
void Data::sampleRankedPairs( const std::string &rankedpairfile ) {
	cout << "Sampling ranked pairs." << endl;

	// Stack-allocated stream: it is closed and released on every exit path,
	// unlike the heap-allocated stream it replaces, which was never deleted.
	ofstream outFile( rankedpairfile.c_str() );
	if (!outFile) {
		std::cerr << "Error: cannot open ranked pair file '" << rankedpairfile
		          << "' for writing" << std::endl;
		return;
	}
	ostream &out = outFile;

	const unsigned int n_samplings = 5000;  // random draws per sentence
	const unsigned int n_samples = 50;      // maximum pairs written per sentence
	const float min_diff = 0.05;            // minimum score gap for a candidate

	// loop over all sentences
	for(unsigned int S=0; S<featdata->size(); S++) {
		unsigned int n_translations = featdata->get(S).size();
		// rand() % 0 is undefined behaviour; nothing can be sampled anyway.
		if (n_translations == 0)
			continue;

		// sample a fixed number of times
		vector< SampledPair* > samples;
		vector< float > scores;
		for(unsigned int i=0; i<n_samplings; i++) {
			unsigned int translation1 = rand() % n_translations;
			float bleu1 = sentenceLevelBleuPlusOne(scoredata->get(S,translation1));

			unsigned int translation2 = rand() % n_translations;
			float bleu2 = sentenceLevelBleuPlusOne(scoredata->get(S,translation2));

			// Absolute value computed by hand: an unqualified abs() may bind to
			// the integer overload, which would truncate the difference to 0
			// and reject every pair.
			const float diff = bleu1 - bleu2;
			const float abs_diff = (diff < 0) ? -diff : diff;
			if (abs_diff < min_diff)
				continue;

			samples.push_back( new SampledPair( translation1, translation2, diff ) );
			scores.push_back( 1.0 - abs_diff );
		}
		//cerr << "sampled " << samples.size() << " pairs\n";

		// With more candidates than needed, keep only the largest gaps:
		// scores hold 1-|diff|, so the n_samples-th smallest score marks the
		// n_samples-th largest gap. The 0.99999 leaves a small tolerance so
		// the boundary pair itself passes the >= test below.
		float selection_threshold = -1.0;
		if (samples.size() > n_samples) {
			nth_element(scores.begin(), scores.begin()+(n_samples-1), scores.end());
			selection_threshold = 0.99999-scores[n_samples-1];
			//cerr << "selection_threshold = " << selection_threshold << endl;
		}

		// Visit every candidate so that each one is deleted, including those
		// left over once n_samples pairs have been written.
		unsigned int collected = 0;
		for(unsigned int i=0; i<samples.size(); i++) {
			if (collected < n_samples && samples[i]->getDiff() >= selection_threshold) {
				collected++;

				out << "1";
				outputSample( out, featdata->get(S,samples[i]->getTranslation1()),
				                   featdata->get(S,samples[i]->getTranslation2()) );
				out << '\n';
				out << "0";
				outputSample( out, featdata->get(S,samples[i]->getTranslation2()),
				                   featdata->get(S,samples[i]->getTranslation1()) );
				out << '\n';
			}
			delete samples[i];
		}
		//cerr << "collected " << collected << endl;
	}
	out.flush();
	outFile.close();
}
Ejemplo n.º 2
0
// Scores every hypothesis of one n-best list with sentence-level BLEU+1.
//
// A feature iterator and a score iterator are opened on the given files and
// are never advanced, so only the entry each iterator refers to right after
// construction is scored (presumably the first sentence's n-best list --
// confirm against the iterator classes).
//
// The process is terminated with exit(1), after a message on stderr, when
// either iterator is already at its end or when the feature and score entries
// do not contain the same number of hypotheses.
//
// @param scoreFile    path of the score data file.
// @param featureFile  path of the feature data file.
// @return one BLEU+1 value per hypothesis, in n-best order.
vector<float> BleuScorer::ScoreNbestList(const string& scoreFile, const string& featureFile) {
    // (file index, hypothesis index within that file's current entry)
    typedef pair<size_t,size_t> HypothesisRef;

    // The code below is written for a list of files, although this entry
    // point only ever supplies a single score/feature pair.
    vector<string> scoreFiles(1, scoreFile);
    vector<string> featureFiles(1, featureFile);

    // Open one iterator of each kind per input file.
    vector<FeatureDataIterator> featureDataIters;
    vector<ScoreDataIterator> scoreDataIters;
    for (size_t fileIdx = 0; fileIdx != featureFiles.size(); ++fileIdx) {
        featureDataIters.push_back(FeatureDataIterator(featureFiles[fileIdx]));
        scoreDataIters.push_back(ScoreDataIterator(scoreFiles[fileIdx]));
    }

    if (featureDataIters[0] == FeatureDataIterator::end()) {
        cerr << "Error: at the end of feature data iterator" << endl;
        exit(1);
    }

    // Validate each file's current entry and list all of its hypotheses.
    vector<HypothesisRef> hypotheses;
    for (size_t fileIdx = 0; fileIdx != featureFiles.size(); ++fileIdx) {
        if (featureDataIters[fileIdx] == FeatureDataIterator::end()) {
            cerr << "Error: Feature file " << fileIdx << " ended prematurely" << endl;
            exit(1);
        }
        if (scoreDataIters[fileIdx] == ScoreDataIterator::end()) {
            cerr << "Error: Score file " << fileIdx << " ended prematurely" << endl;
            exit(1);
        }
        if (featureDataIters[fileIdx]->size() != scoreDataIters[fileIdx]->size()) {
            cerr << "Error: features and scores have different size" << endl;
            exit(1);
        }

        const size_t hypothesisCount = featureDataIters[fileIdx]->size();
        for (size_t hypIdx = 0; hypIdx < hypothesisCount; ++hypIdx) {
            hypotheses.push_back(HypothesisRef(fileIdx, hypIdx));
        }
    }

    // Score the n-best list only after every input has passed validation.
    vector<float> bleuScores;
    for (vector<HypothesisRef>::const_iterator hyp = hypotheses.begin();
         hyp != hypotheses.end(); ++hyp) {
        bleuScores.push_back(
            sentenceLevelBleuPlusOne(scoreDataIters[hyp->first]->operator[](hyp->second)));
    }
    return bleuScores;
}