void PerScorer::prepareStats(size_t sid, const string& text, ScoreStats& entry) { if (sid >= m_ref_lengths.size()) { stringstream msg; msg << "Sentence id (" << sid << ") not found in reference set"; throw runtime_error(msg.str()); } string sentence = this->preprocessSentence(text); // Calculate correct, output_length and ref_length for // the line and store it in entry vector<int> testtokens; TokenizeAndEncode(sentence, testtokens); multiset<int> testtokens_all(testtokens.begin(),testtokens.end()); set<int> testtokens_unique(testtokens.begin(),testtokens.end()); int correct = 0; for (set<int>::iterator i = testtokens_unique.begin(); i != testtokens_unique.end(); ++i) { int token = *i; correct += min(m_ref_tokens[sid].count(token), testtokens_all.count(token)); } ostringstream stats; stats << correct << " " << testtokens.size() << " " << m_ref_lengths[sid] << " " ; string stats_str = stats.str(); entry.set(stats_str); }
void InterpolatedScorer::prepareStats(size_t sid, const string& text, ScoreStats& entry) { stringstream buff; string align = text; string sentence = ""; size_t alignmentData = text.find("|||"); //Get sentence and alignment parts if(alignmentData != string::npos) { getNextPound(align,sentence, "|||"); } int i = 0; for (ScopedVector<Scorer>::iterator itsc = m_scorers.begin(); itsc != m_scorers.end(); ++itsc) { ScoreStats tempEntry; if ((*itsc)->useAlignment()) { (*itsc)->prepareStats(sid, text, tempEntry); } else { (*itsc)->prepareStats(sid, sentence, tempEntry); } if (i > 0) buff << " "; buff << tempEntry; i++; } //cout << " Scores for interpolated: " << buff << endl; string str = buff.str(); entry.set(str); }
void BleuScorer::prepareStats(size_t sid, const string& text, ScoreStats& entry) { if (sid >= m_references.size()) { stringstream msg; msg << "Sentence id (" << sid << ") not found in reference set"; throw runtime_error(msg.str()); } NgramCounts testcounts; // stats for this line vector<ScoreStatsType> stats(kBleuNgramOrder * 2); string sentence = preprocessSentence(text); const size_t length = CountNgrams(sentence, testcounts, kBleuNgramOrder, true); const int reference_len = CalcReferenceLength(sid, length); stats.push_back(reference_len); //precision on each ngram type for (NgramCounts::const_iterator testcounts_it = testcounts.begin(); testcounts_it != testcounts.end(); ++testcounts_it) { const NgramCounts::Value guess = testcounts_it->second; const size_t len = testcounts_it->first.size(); NgramCounts::Value correct = 0; NgramCounts::Value v = 0; if (m_references[sid]->get_counts()->Lookup(testcounts_it->first, &v)) { correct = min(v, guess); } stats[len * 2 - 2] += correct; stats[len * 2 - 1] += guess; } entry.set(stats); }
// Preprocess the candidate text, let prepareStatsVector() fill the
// statistics for sentence sid and store them in entry.
//
// sid   - index of the sentence in the reference set
// text  - candidate translation
// entry - receives the statistics vector
void CderScorer::prepareStats(size_t sid, const string& text, ScoreStats& entry)
{
  string preprocessed = this->preprocessSentence(text);

  vector<ScoreStatsType> collected;
  prepareStatsVector(sid, preprocessed, collected);

  entry.set(collected);
}
void TerScorer::prepareStats ( size_t sid, const string& text, ScoreStats& entry ) { terAlignment result; result.numEdits = 0.0 ; result.numWords = 0.0 ; result.averageWords = 0.0; for ( int incRefs = 0; incRefs < ( int ) m_multi_references.size(); incRefs++ ) { if ( sid >= m_multi_references.at(incRefs).size() ) { stringstream msg; msg << "Sentence id (" << sid << ") not found in reference set"; throw runtime_error ( msg.str() ); } vector<int> testtokens; vector<int> reftokens; reftokens = m_multi_references.at ( incRefs ).at ( sid ); double averageLength=0.0; for ( int incRefsBis = 0; incRefsBis < ( int ) m_multi_references.size(); incRefsBis++ ) { if ( sid >= m_multi_references.at(incRefsBis).size() ) { stringstream msg; msg << "Sentence id (" << sid << ") not found in reference set"; throw runtime_error ( msg.str() ); } averageLength+=(double)m_multi_references.at ( incRefsBis ).at ( sid ).size(); } averageLength=averageLength/( double ) m_multi_references.size(); TokenizeAndEncode(text, testtokens); terCalc * evaluation=new terCalc(); evaluation->setDebugMode ( false ); terAlignment tmp_result = evaluation->TER ( reftokens, testtokens ); tmp_result.averageWords=averageLength; if ( ( result.numEdits == 0.0 ) && ( result.averageWords == 0.0 ) ) { result = tmp_result; } else if ( result.scoreAv() > tmp_result.scoreAv() ) { result = tmp_result; } delete evaluation; } ostringstream stats; // multiplication by 100 in order to keep the average precision // in the TER calculation. stats << result.numEdits*100.0 << " " << result.averageWords*100.0 << " " << result.scoreAv()*100.0 << " " ; string stats_str = stats.str(); entry.set ( stats_str ); }
void HwcmScorer::prepareStats(size_t sid, const string& text, ScoreStats& entry) { if (sid >= m_ref_trees.size()) { stringstream msg; msg << "Sentence id (" << sid << ") not found in reference set"; throw runtime_error(msg.str()); } string sentence = this->preprocessSentence(text); // if sentence has '|||', assume that tree is in second position (n-best-list); // otherwise, assume it is in first position (calling 'evaluate' with tree as reference) util::TokenIter<util::MultiCharacter> it(sentence, util::MultiCharacter("|||")); ++it; if (it) { sentence = it->as_string(); } TreePointer tree (boost::make_shared<InternalTree>(sentence)); vector<map<string, int> > hwc_test (kHwcmOrder); vector<string> history(kHwcmOrder); extractHeadWordChain(tree, history, hwc_test); ostringstream stats; for (size_t i = 0; i < kHwcmOrder; i++) { int correct = 0; int test_total = 0; for (map<string, int>::const_iterator it = hwc_test[i].begin(); it != hwc_test[i].end(); it++) { test_total += it->second; map<string, int>::const_iterator it2 = m_ref_hwc[sid][i].find(it->first); if (it2 != m_ref_hwc[sid][i].end()) { correct += std::min(it->second, it2->second); } } stats << correct << " " << test_total << " " << m_ref_lengths[sid][i] << " " ; } string stats_str = stats.str(); entry.set(stats_str); }
void SerScorer::prepareStats(size_t sid, const string& text, ScoreStats& entry) { if (sid >= m_ref_sentences[0].size()) { stringstream msg; msg << "Sentence id (" << sid << ") not found in reference set"; throw runtime_error(msg.str()); } string sentence = this->preprocessSentence(text); int n_ser = 1; // if we don't find a reference of the same length, the error is 1 // Check whether the guessed text is equal to the reference // for this line and store it in entry vector<int> testtokens; TokenizeAndEncode(sentence, testtokens); for (size_t rid = 0; rid < m_ref_sentences.size(); ++rid) { const sent_t& ref = m_ref_sentences[rid][sid]; // we can only have a perfect match if the sentence length is equal if (testtokens.size() == ref.size()) { int errors = 0; for (size_t tid = 0; tid < testtokens.size(); tid++) { // token mismatch: error 1 w.r.t. this reference; move to next ref. if (ref[tid] != testtokens[tid]) { errors = 1; break; } } if (errors == 0) { n_ser = 0; break; } } } ostringstream stats; stats << n_ser << " " << 1; // sentence error (0 or 1), number of sentences (1) string stats_str = stats.str(); entry.set(stats_str); }
void SemposScorer::prepareStats(size_t sid, const string& text, ScoreStats& entry) { vector<ScoreStatsType> stats; const string& sentence = preprocessSentence(text); str_sentence_t splitCandSentence; splitSentence(sentence, splitCandSentence); sentence_t encodedCandSentence; encodeSentence(splitCandSentence, encodedCandSentence); if (m_ref_sentences.size() == 1) { stats = m_ovr->prepareStats(encodedCandSentence, m_ref_sentences[0][sid]); } else { float max = -1.0f; for (size_t rid = 0; rid < m_ref_sentences.size(); ++rid) { const vector<ScoreStatsType>& tmp = m_ovr->prepareStats(encodedCandSentence, m_ref_sentences[rid][sid]); if (m_ovr->calculateScore(tmp) > max) { stats = tmp; } } } entry.set(stats); }
void PermutationScorer::prepareStats(size_t sid, const string& text, ScoreStats& entry) { //cout << "*******prepareStats" ; //cout << text << endl; //cout << sid << endl; //cout << "Reference0align:" << endl; //m_referencePerms[0][sid].dump(); string sentence = ""; string align = text; size_t alignmentData = text.find("|||"); //Get sentence and alignment parts if(alignmentData != string::npos) { getNextPound(align,sentence, "|||"); } else { align = text; } int translationLength = getNumberWords(sentence); //A vector of Permutations for each sentence vector< vector<Permutation> > nBestPerms; float distanceValue; //need to create permutations for each nbest line string standardFormat = Permutation::convertMosesToStandard(align); Permutation perm(standardFormat, m_sourceLengths[sid],translationLength); //perm.dump(); if (m_refChoiceStrategy == REFERENCE_CHOICE_AVERAGE) { float total = 0; for (size_t i = 0; i < m_referencePerms.size(); ++i) { float dist = perm.distance(m_referencePerms[i][sid], m_distanceMetric); total += dist; //cout << "Ref number: " << i << " distance: " << dist << endl; } float mean = (float)total/m_referencePerms.size(); //cout << "MultRef strategy AVERAGE: total " << total << " mean " << mean << " number " << m_referencePerms.size() << endl; distanceValue = mean; } else if (m_refChoiceStrategy == REFERENCE_CHOICE_CLOSEST) { float max_val = 0; for (size_t i = 0; i < m_referencePerms.size(); ++i) { //look for the closest reference float value = perm.distance(m_referencePerms[i][sid], m_distanceMetric); //cout << "Ref number: " << i << " distance: " << value << endl; if (value > max_val) { max_val = value; } } distanceValue = max_val; //cout << "MultRef strategy CLOSEST: max_val " << distanceValue << endl; } else { throw runtime_error("Unsupported reflength strategy"); } //SCOREROUT eg: 0.04546 ostringstream tempStream; tempStream.precision(SCORE_PRECISION); tempStream << distanceValue; string str = tempStream.str(); entry.set(str); //cout << 
tempStream.str(); }
void PermutationScorer::prepareStats(size_t sid, const string& text, ScoreStats& entry) { //bool debug= (verboselevel()>3); // TODO: fix verboselevel() bool debug=false; if (debug) { cout << "*******prepareStats" ; cout << text << endl; cout << sid << endl; cout << "Reference0align:" << endl; m_referencePerms[0][sid].dump(); } string sentence = ""; string align = text; size_t alignmentData = text.find("|||"); //Get sentence and alignment parts if(alignmentData != string::npos) { getNextPound(align,sentence, "|||"); } else { align = text; } int translationLength = getNumberWords(sentence); //A vector of Permutations for each sentence vector< vector<Permutation> > nBestPerms; float distanceValue; //need to create permutations for each nbest line //here we check if the alignments extracted from the nbest are phrase-based or word-based, in which case no conversion is needed bool isWordAlignment=true; string alignCopy = align; string align1; getNextPound(alignCopy,align1," "); if (align1.length() > 0) { size_t phraseDelimeter = align1.find("="); if(phraseDelimeter!= string::npos) isWordAlignment=false; } string standardFormat = align; if(!isWordAlignment) standardFormat= Permutation::convertMosesToStandard(align); if (debug) { cerr << "Nbest alignment: " << align << endl; cerr << "-->std alignment: " << standardFormat << endl; } Permutation perm(standardFormat, m_sourceLengths[sid],translationLength); //perm.dump(); if (m_refChoiceStrategy == REFERENCE_CHOICE_AVERAGE) { float total = 0; for (size_t i = 0; i < m_referencePerms.size(); ++i) { float dist = perm.distance(m_referencePerms[i][sid], m_distanceMetric); total += dist; //cout << "Ref number: " << i << " distance: " << dist << endl; } float mean = (float)total/m_referencePerms.size(); //cout << "MultRef strategy AVERAGE: total " << total << " mean " << mean << " number " << m_referencePerms.size() << endl; distanceValue = mean; } else if (m_refChoiceStrategy == REFERENCE_CHOICE_CLOSEST) { float max_val = 0; for 
(size_t i = 0; i < m_referencePerms.size(); ++i) { //look for the closest reference float value = perm.distance(m_referencePerms[i][sid], m_distanceMetric); //cout << "Ref number: " << i << " distance: " << value << endl; if (value > max_val) { max_val = value; } } distanceValue = max_val; //cout << "MultRef strategy CLOSEST: max_val " << distanceValue << endl; } else { throw runtime_error("Unsupported reflength strategy"); } //SCOREROUT eg: 0.04546 distanceValue*=SCORE_MULTFACT; //SCOREROUT eg: 4546 to transform float into integer ostringstream tempStream; tempStream.precision(0); // decimal precision not needed as score was multiplied per SCORE_MULTFACT tempStream << std::fixed << distanceValue << " 1"; //use for final normalization over the amount of test sentences string str = tempStream.str(); entry.set(str); //cout << distanceValue << "=" << distanceValue << " (str:" << tempStream.str() << ")" << endl; }
void BleuScorer::prepareStats(size_t sid, const string& text, ScoreStats& entry) { // cerr << text << endl; // cerr << sid << endl; //dump_counts(*m_ref_counts[sid]); if (sid >= m_ref_counts.size()) { stringstream msg; msg << "Sentence id (" << sid << ") not found in reference set"; throw runtime_error(msg.str()); } counts_t testcounts; //stats for this line vector<float> stats(kLENGTH*2);; size_t length = countNgrams(text,testcounts,kLENGTH); //dump_counts(testcounts); if (m_ref_length_type == SHORTEST) { //cerr << reflengths.size() << " " << sid << endl; int shortest = *min_element(m_ref_lengths[sid].begin(), m_ref_lengths[sid].end()); stats.push_back(shortest); } else if (m_ref_length_type == AVERAGE) { int total = 0; for (size_t i = 0; i < m_ref_lengths[sid].size(); ++i) { total += m_ref_lengths[sid][i]; } const float mean = static_cast<float>(total) / m_ref_lengths[sid].size(); stats.push_back(mean); } else if (m_ref_length_type == CLOSEST) { int min_diff = INT_MAX; int min_idx = 0; for (size_t i = 0; i < m_ref_lengths[sid].size(); ++i) { const int reflength = m_ref_lengths[sid][i]; const int diff = reflength - static_cast<int>(length); const int absolute_diff = abs(diff) - abs(min_diff); if (absolute_diff < 0) { //look for the closest reference min_diff = diff; min_idx = i; } else if (absolute_diff == 0) { // if two references has the same closest length, take the shortest if (reflength < static_cast<int>(m_ref_lengths[sid][min_idx])) { min_idx = i; } } } stats.push_back(m_ref_lengths[sid][min_idx]); } else { throw runtime_error("Unsupported reflength strategy"); } //cerr << "computed length" << endl; //precision on each ngram type for (counts_iterator testcounts_it = testcounts.begin(); testcounts_it != testcounts.end(); ++testcounts_it) { counts_iterator refcounts_it = m_ref_counts[sid]->find(testcounts_it->first); int correct = 0; int guess = testcounts_it->second; if (refcounts_it != m_ref_counts[sid]->end()) { correct = min(refcounts_it->second,guess); } 
size_t len = testcounts_it->first.size(); stats[len*2-2] += correct; stats[len*2-1] += guess; } stringstream sout; copy(stats.begin(),stats.end(),ostream_iterator<float>(sout," ")); //TRACE_ERR(sout.str() << endl); string stats_str = sout.str(); entry.set(stats_str); }