void ScoreStats::Copy(const ScoreStats &stats) { m_available_size = stats.available(); m_entries = stats.size(); m_array = new ScoreStatsType[m_available_size]; memcpy(m_array, stats.getArray(), GetArraySizeWithBytes()); }
void Data::loadNBest(const string &file) { TRACE_ERR("loading nbest from " << file << endl); inputfilestream inp(file); // matches a stream with a file. Opens the file if (!inp.good()) throw runtime_error("Unable to open: " + file); ScoreStats scoreentry; string line, sentence_index, sentence, feature_str; while (getline(inp, line, '\n')) { if (line.empty()) continue; // adding statistics for error measures scoreentry.clear(); getNextPound(line, sentence_index, "|||"); // first field getNextPound(line, sentence, "|||"); // second field getNextPound(line, feature_str, "|||"); // third field m_scorer->prepareStats(sentence_index, sentence, scoreentry); m_score_data->add(scoreentry, sentence_index); // examine first line for name of features if (!existsFeatureNames()) { InitFeatureMap(feature_str); } AddFeatures(feature_str, sentence_index); } inp.close(); }
void Data::loadNBest(const string &file, bool oneBest) { TRACE_ERR("loading nbest from " << file << endl); util::FilePiece in(file.c_str()); ScoreStats scoreentry; string sentence, feature_str, alignment; int sentence_index; while (true) { try { StringPiece line = in.ReadLine(); if (line.empty()) continue; // adding statistics for error measures scoreentry.clear(); util::TokenIter<util::MultiCharacter> it(line, util::MultiCharacter("|||")); sentence_index = ParseInt(*it); if (oneBest && m_score_data->exists(sentence_index)) continue; ++it; sentence = it->as_string(); ++it; feature_str = it->as_string(); ++it; if (it) { ++it; // skip model score. if (it) { alignment = it->as_string(); //fifth field (if present) is either phrase or word alignment ++it; if (it) { alignment = it->as_string(); //sixth field (if present) is word alignment } } } //TODO check alignment exists if scorers need it if (m_scorer->useAlignment()) { sentence += "|||"; sentence += alignment; } m_scorer->prepareStats(sentence_index, sentence, scoreentry); m_score_data->add(scoreentry, sentence_index); // examine first line for name of features if (!existsFeatureNames()) { InitFeatureMap(feature_str); } AddFeatures(feature_str, sentence_index); } catch (util::EndOfFileException &e) { PrintUserTime("Loaded N-best lists"); break; } } }
/**
 * Two ScoreStats compare equal when they hold the same number of entries and
 * no pair of entries at the same position compares unequal.
 */
bool operator==(const ScoreStats& s1, const ScoreStats& s2)
{
  const size_t count = s1.size();
  if (count != s2.size()) {
    return false;
  }

  // advance until the first position whose entries differ
  size_t idx = 0;
  while (idx < count && !(s1.get(idx) != s2.get(idx))) {
    ++idx;
  }
  return idx == count;
}
// really not the right place... float sentenceLevelBleuPlusOne( ScoreStats &stats ) { float logbleu = 0.0; const unsigned int bleu_order = 4; for (unsigned int j=0; j<bleu_order; j++) { //cerr << (stats.get(2*j)+1) << "/" << (stats.get(2*j+1)+1) << " "; logbleu += log(stats.get(2*j)+1) - log(stats.get(2*j+1)+1); } logbleu /= bleu_order; float brevity = 1.0 - (float)stats.get(bleu_order*2)/stats.get(1); if (brevity < 0.0) { logbleu += brevity; } //cerr << brevity << " -> " << exp(logbleu) << endl; return exp(logbleu); }
void BleuScorer::prepareStats(size_t sid, const string& text, ScoreStats& entry) { if (sid >= m_references.size()) { stringstream msg; msg << "Sentence id (" << sid << ") not found in reference set"; throw runtime_error(msg.str()); } NgramCounts testcounts; // stats for this line vector<ScoreStatsType> stats(kBleuNgramOrder * 2); string sentence = preprocessSentence(text); const size_t length = CountNgrams(sentence, testcounts, kBleuNgramOrder, true); const int reference_len = CalcReferenceLength(sid, length); stats.push_back(reference_len); //precision on each ngram type for (NgramCounts::const_iterator testcounts_it = testcounts.begin(); testcounts_it != testcounts.end(); ++testcounts_it) { const NgramCounts::Value guess = testcounts_it->second; const size_t len = testcounts_it->first.size(); NgramCounts::Value correct = 0; NgramCounts::Value v = 0; if (m_references[sid]->get_counts()->Lookup(testcounts_it->first, &v)) { correct = min(v, guess); } stats[len * 2 - 2] += correct; stats[len * 2 - 1] += guess; } entry.set(stats); }
void InterpolatedScorer::prepareStats(size_t sid, const string& text, ScoreStats& entry) { stringstream buff; string align = text; string sentence = ""; size_t alignmentData = text.find("|||"); //Get sentence and alignment parts if(alignmentData != string::npos) { getNextPound(align,sentence, "|||"); } int i = 0; for (ScopedVector<Scorer>::iterator itsc = m_scorers.begin(); itsc != m_scorers.end(); ++itsc) { ScoreStats tempEntry; if ((*itsc)->useAlignment()) { (*itsc)->prepareStats(sid, text, tempEntry); } else { (*itsc)->prepareStats(sid, sentence, tempEntry); } if (i > 0) buff << " "; buff << tempEntry; i++; } //cout << " Scores for interpolated: " << buff << endl; string str = buff.str(); entry.set(str); }
void PerScorer::prepareStats(size_t sid, const string& text, ScoreStats& entry) { if (sid >= m_ref_lengths.size()) { stringstream msg; msg << "Sentence id (" << sid << ") not found in reference set"; throw runtime_error(msg.str()); } string sentence = this->preprocessSentence(text); // Calculate correct, output_length and ref_length for // the line and store it in entry vector<int> testtokens; TokenizeAndEncode(sentence, testtokens); multiset<int> testtokens_all(testtokens.begin(),testtokens.end()); set<int> testtokens_unique(testtokens.begin(),testtokens.end()); int correct = 0; for (set<int>::iterator i = testtokens_unique.begin(); i != testtokens_unique.end(); ++i) { int token = *i; correct += min(m_ref_tokens[sid].count(token), testtokens_all.count(token)); } ostringstream stats; stats << correct << " " << testtokens.size() << " " << m_ref_lengths[sid] << " " ; string stats_str = stats.str(); entry.set(stats_str); }
void CderScorer::prepareStats(size_t sid, const string& text, ScoreStats& entry) { string sentence = this->preprocessSentence(text); vector<ScoreStatsType> stats; prepareStatsVector(sid, sentence, stats); entry.set(stats); }
int main(int argc, char **argv) { if (argc == 1) { cerr << "Usage: ./sentence-bleu ref1 [ref2 ...] < candidate > bleu-scores" << endl; return 1; } vector<string> refFiles(argv + 1, argv + argc); // TODO all of these are empty for now string config; string factors; string filter; BleuScorer scorer(config); scorer.setFactors(factors); scorer.setFilter(filter); // initialize reference streams vector<boost::shared_ptr<ifstream> > refStreams; for (vector<string>::const_iterator refFile=refFiles.begin(); refFile!=refFiles.end(); ++refFile) { TRACE_ERR("Loading reference from " << *refFile << endl); boost::shared_ptr<ifstream> ifs(new ifstream(refFile->c_str())); UTIL_THROW_IF2(!ifs, "Cannot open " << *refFile); refStreams.push_back(ifs); } // load sentences, preparing statistics, score string hypothesisLine; size_t sid = 0; while (getline(std::cin, hypothesisLine)) { Reference ref; if (!scorer.GetNextReferenceFromStreams(refStreams, ref)) { UTIL_THROW2("Missing references"); } ScoreStats scoreStats; scorer.CalcBleuStats(ref, hypothesisLine, scoreStats); vector<float> stats(scoreStats.getArray(), scoreStats.getArray() + scoreStats.size()); std::cout << smoothedSentenceBleu(stats) << std::endl; ++sid; } return 0; }
/**
 * Attach `data` to this scorer and hand each sub-scorer its own slice of it.
 *
 * The statistics of every n-best entry are laid out scorer after scorer. For
 * each sub-scorer a new ScoreData is built that contains, for every sentence
 * and n-best entry, only the NumberOfScores() values starting at that
 * scorer's offset. The new objects are stored in m_scorers_score_data and
 * passed to the sub-scorer via setScoreData().
 */
void InterpolatedScorer::setScoreData(ScoreData* data)
{
  m_score_data = data;

  // offset of the current sub-scorer's first value within each ScoreStats
  size_t offset = 0;

  for (ScopedVector<Scorer>::iterator itsc = m_scorers.begin();
       itsc != m_scorers.end(); ++itsc) {
    int numScoresScorer = (*itsc)->NumberOfScores();
    ScoreData* scorerData = new ScoreData(*itsc);

    for (size_t sent = 0; sent < data->size(); ++sent) {
      ScoreArray fullArray = data->get(sent);
      ScoreArray slicedArray;

      // the sentence position, rendered as text, is the index of the new array
      std::stringstream indexStream;
      indexStream << sent;
      std::string indexStr;
      indexStr = indexStream.str();

      const size_t numNBest = fullArray.size();
      //cout << " Datasize " << data->size() << " NumNBest " << numNBest << endl ;
      for (size_t nbest = 0; nbest < numNBest; ++nbest) {
        ScoreStats fullStats = data->get(sent, nbest);
        //cout << "Scorestats " << fullStats << " i " << sent << " j " << nbest << endl;
        ScoreStats slicedStats;
        size_t k = offset;
        while (k < size_t(numScoresScorer + offset)) {
          ScoreStatsType score = fullStats.get(k);
          slicedStats.add(score);
          ++k;
        }
        //cout << " last " << offset << " NumScores " << numScoresScorer << "newScorestats " << slicedStats << endl;
        slicedArray.add(slicedStats);
      }

      slicedArray.setIndex(indexStr);
      scorerData->add(slicedArray);
    }
    //scorerData->dump();

    // NOTE: This class takes the ownership of the heap allocated
    // ScoreData objects to avoid the memory leak issues.
    m_scorers_score_data.push_back(scorerData);

    (*itsc)->setScoreData(scorerData);
    offset += numScoresScorer;
  }
}
void TerScorer::prepareStats ( size_t sid, const string& text, ScoreStats& entry ) { terAlignment result; result.numEdits = 0.0 ; result.numWords = 0.0 ; result.averageWords = 0.0; for ( int incRefs = 0; incRefs < ( int ) m_multi_references.size(); incRefs++ ) { if ( sid >= m_multi_references.at(incRefs).size() ) { stringstream msg; msg << "Sentence id (" << sid << ") not found in reference set"; throw runtime_error ( msg.str() ); } vector<int> testtokens; vector<int> reftokens; reftokens = m_multi_references.at ( incRefs ).at ( sid ); double averageLength=0.0; for ( int incRefsBis = 0; incRefsBis < ( int ) m_multi_references.size(); incRefsBis++ ) { if ( sid >= m_multi_references.at(incRefsBis).size() ) { stringstream msg; msg << "Sentence id (" << sid << ") not found in reference set"; throw runtime_error ( msg.str() ); } averageLength+=(double)m_multi_references.at ( incRefsBis ).at ( sid ).size(); } averageLength=averageLength/( double ) m_multi_references.size(); TokenizeAndEncode(text, testtokens); terCalc * evaluation=new terCalc(); evaluation->setDebugMode ( false ); terAlignment tmp_result = evaluation->TER ( reftokens, testtokens ); tmp_result.averageWords=averageLength; if ( ( result.numEdits == 0.0 ) && ( result.averageWords == 0.0 ) ) { result = tmp_result; } else if ( result.scoreAv() > tmp_result.scoreAv() ) { result = tmp_result; } delete evaluation; } ostringstream stats; // multiplication by 100 in order to keep the average precision // in the TER calculation. stats << result.numEdits*100.0 << " " << result.averageWords*100.0 << " " << result.scoreAv()*100.0 << " " ; string stats_str = stats.str(); entry.set ( stats_str ); }
void HwcmScorer::prepareStats(size_t sid, const string& text, ScoreStats& entry) { if (sid >= m_ref_trees.size()) { stringstream msg; msg << "Sentence id (" << sid << ") not found in reference set"; throw runtime_error(msg.str()); } string sentence = this->preprocessSentence(text); // if sentence has '|||', assume that tree is in second position (n-best-list); // otherwise, assume it is in first position (calling 'evaluate' with tree as reference) util::TokenIter<util::MultiCharacter> it(sentence, util::MultiCharacter("|||")); ++it; if (it) { sentence = it->as_string(); } TreePointer tree (boost::make_shared<InternalTree>(sentence)); vector<map<string, int> > hwc_test (kHwcmOrder); vector<string> history(kHwcmOrder); extractHeadWordChain(tree, history, hwc_test); ostringstream stats; for (size_t i = 0; i < kHwcmOrder; i++) { int correct = 0; int test_total = 0; for (map<string, int>::const_iterator it = hwc_test[i].begin(); it != hwc_test[i].end(); it++) { test_total += it->second; map<string, int>::const_iterator it2 = m_ref_hwc[sid][i].find(it->first); if (it2 != m_ref_hwc[sid][i].end()) { correct += std::min(it->second, it2->second); } } stats << correct << " " << test_total << " " << m_ref_lengths[sid][i] << " " ; } string stats_str = stats.str(); entry.set(stats_str); }
void SerScorer::prepareStats(size_t sid, const string& text, ScoreStats& entry) { if (sid >= m_ref_sentences[0].size()) { stringstream msg; msg << "Sentence id (" << sid << ") not found in reference set"; throw runtime_error(msg.str()); } string sentence = this->preprocessSentence(text); int n_ser = 1; // if we don't find a reference of the same length, the error is 1 // Check whether the guessed text is equal to the reference // for this line and store it in entry vector<int> testtokens; TokenizeAndEncode(sentence, testtokens); for (size_t rid = 0; rid < m_ref_sentences.size(); ++rid) { const sent_t& ref = m_ref_sentences[rid][sid]; // we can only have a perfect match if the sentence length is equal if (testtokens.size() == ref.size()) { int errors = 0; for (size_t tid = 0; tid < testtokens.size(); tid++) { // token mismatch: error 1 w.r.t. this reference; move to next ref. if (ref[tid] != testtokens[tid]) { errors = 1; break; } } if (errors == 0) { n_ser = 0; break; } } } ostringstream stats; stats << n_ser << " " << 1; // sentence error (0 or 1), number of sentences (1) string stats_str = stats.str(); entry.set(stats_str); }
void SemposScorer::prepareStats(size_t sid, const string& text, ScoreStats& entry) { vector<ScoreStatsType> stats; const string& sentence = preprocessSentence(text); str_sentence_t splitCandSentence; splitSentence(sentence, splitCandSentence); sentence_t encodedCandSentence; encodeSentence(splitCandSentence, encodedCandSentence); if (m_ref_sentences.size() == 1) { stats = m_ovr->prepareStats(encodedCandSentence, m_ref_sentences[0][sid]); } else { float max = -1.0f; for (size_t rid = 0; rid < m_ref_sentences.size(); ++rid) { const vector<ScoreStatsType>& tmp = m_ovr->prepareStats(encodedCandSentence, m_ref_sentences[rid][sid]); if (m_ovr->calculateScore(tmp) > max) { stats = tmp; } } } entry.set(stats); }
void PermutationScorer::prepareStats(size_t sid, const string& text, ScoreStats& entry) { //cout << "*******prepareStats" ; //cout << text << endl; //cout << sid << endl; //cout << "Reference0align:" << endl; //m_referencePerms[0][sid].dump(); string sentence = ""; string align = text; size_t alignmentData = text.find("|||"); //Get sentence and alignment parts if(alignmentData != string::npos) { getNextPound(align,sentence, "|||"); } else { align = text; } int translationLength = getNumberWords(sentence); //A vector of Permutations for each sentence vector< vector<Permutation> > nBestPerms; float distanceValue; //need to create permutations for each nbest line string standardFormat = Permutation::convertMosesToStandard(align); Permutation perm(standardFormat, m_sourceLengths[sid],translationLength); //perm.dump(); if (m_refChoiceStrategy == REFERENCE_CHOICE_AVERAGE) { float total = 0; for (size_t i = 0; i < m_referencePerms.size(); ++i) { float dist = perm.distance(m_referencePerms[i][sid], m_distanceMetric); total += dist; //cout << "Ref number: " << i << " distance: " << dist << endl; } float mean = (float)total/m_referencePerms.size(); //cout << "MultRef strategy AVERAGE: total " << total << " mean " << mean << " number " << m_referencePerms.size() << endl; distanceValue = mean; } else if (m_refChoiceStrategy == REFERENCE_CHOICE_CLOSEST) { float max_val = 0; for (size_t i = 0; i < m_referencePerms.size(); ++i) { //look for the closest reference float value = perm.distance(m_referencePerms[i][sid], m_distanceMetric); //cout << "Ref number: " << i << " distance: " << value << endl; if (value > max_val) { max_val = value; } } distanceValue = max_val; //cout << "MultRef strategy CLOSEST: max_val " << distanceValue << endl; } else { throw runtime_error("Unsupported reflength strategy"); } //SCOREROUT eg: 0.04546 ostringstream tempStream; tempStream.precision(SCORE_PRECISION); tempStream << distanceValue; string str = tempStream.str(); entry.set(str); //cout << 
tempStream.str(); }
void BleuScorer::prepareStats(size_t sid, const string& text, ScoreStats& entry) { // cerr << text << endl; // cerr << sid << endl; //dump_counts(*m_ref_counts[sid]); if (sid >= m_ref_counts.size()) { stringstream msg; msg << "Sentence id (" << sid << ") not found in reference set"; throw runtime_error(msg.str()); } counts_t testcounts; //stats for this line vector<float> stats(kLENGTH*2);; size_t length = countNgrams(text,testcounts,kLENGTH); //dump_counts(testcounts); if (m_ref_length_type == SHORTEST) { //cerr << reflengths.size() << " " << sid << endl; int shortest = *min_element(m_ref_lengths[sid].begin(), m_ref_lengths[sid].end()); stats.push_back(shortest); } else if (m_ref_length_type == AVERAGE) { int total = 0; for (size_t i = 0; i < m_ref_lengths[sid].size(); ++i) { total += m_ref_lengths[sid][i]; } const float mean = static_cast<float>(total) / m_ref_lengths[sid].size(); stats.push_back(mean); } else if (m_ref_length_type == CLOSEST) { int min_diff = INT_MAX; int min_idx = 0; for (size_t i = 0; i < m_ref_lengths[sid].size(); ++i) { const int reflength = m_ref_lengths[sid][i]; const int diff = reflength - static_cast<int>(length); const int absolute_diff = abs(diff) - abs(min_diff); if (absolute_diff < 0) { //look for the closest reference min_diff = diff; min_idx = i; } else if (absolute_diff == 0) { // if two references has the same closest length, take the shortest if (reflength < static_cast<int>(m_ref_lengths[sid][min_idx])) { min_idx = i; } } } stats.push_back(m_ref_lengths[sid][min_idx]); } else { throw runtime_error("Unsupported reflength strategy"); } //cerr << "computed length" << endl; //precision on each ngram type for (counts_iterator testcounts_it = testcounts.begin(); testcounts_it != testcounts.end(); ++testcounts_it) { counts_iterator refcounts_it = m_ref_counts[sid]->find(testcounts_it->first); int correct = 0; int guess = testcounts_it->second; if (refcounts_it != m_ref_counts[sid]->end()) { correct = min(refcounts_it->second,guess); } 
size_t len = testcounts_it->first.size(); stats[len*2-2] += correct; stats[len*2-1] += guess; } stringstream sout; copy(stats.begin(),stats.end(),ostream_iterator<float>(sout," ")); //TRACE_ERR(sout.str() << endl); string stats_str = sout.str(); entry.set(stats_str); }
/**
 * Load an n-best list from `file`, one hypothesis per non-empty line.
 *
 * Each line is split on "|||" into sentence index, hypothesis and feature
 * string. The scorer's statistics for the hypothesis are added to
 * `scoredata`; the feature string is parsed token by token into a
 * FeatureStats entry that is added to `featdata`. While no feature names
 * exist, the feature map is derived from the feature string of the current
 * line.
 *
 * Sets _sparse_flag as soon as one sparse feature is stored.
 *
 * @param file path of the n-best list
 * @throws runtime_error if the file cannot be opened
 */
void Data::loadnbest(const std::string &file)
{
  TRACE_ERR("loading nbest from " << file << std::endl);

  FeatureStats featentry;
  ScoreStats scoreentry;
  std::string sentence_index;

  inputfilestream inp(file); // matches a stream with a file. Opens the file

  if (!inp.good())
    throw runtime_error("Unable to open: " + file);

  std::string substring, subsubstring, stringBuf;
  std::string theSentence;
  std::string::size_type loc;

  while (getline(inp,stringBuf,'\n')) {
    if (stringBuf.empty()) continue;

    // TRACE_ERR("stringBuf: " << stringBuf << std::endl);

    getNextPound(stringBuf, substring, "|||"); //first field
    sentence_index = substring;

    getNextPound(stringBuf, substring, "|||"); //second field
    theSentence = substring;

    // adding statistics for error measures
    featentry.reset();
    scoreentry.clear();

    theScorer->prepareStats(sentence_index, theSentence, scoreentry);

    scoredata->add(scoreentry, sentence_index);

    getNextPound(stringBuf, substring, "|||"); //third field

    // examine first line for name of features
    if (!existsFeatureNames()) {
      // work on a copy: `substring` is consumed again by the loop below
      std::string stringsupport=substring;
      std::string features="";
      std::string tmpname="";

      // position of the next value under the current feature name
      size_t tmpidx=0;
      while (!stringsupport.empty()) {
        // TRACE_ERR("Decompounding: " << substring << std::endl);
        getNextPound(stringsupport, subsubstring);

        // string ending with ":" are skipped, because they are the names of the features
        // (a token whose last ':' is not its final character is a value and
        // gets the generated name "<tmpname>_<tmpidx>")
        if ((loc = subsubstring.find_last_of(":")) != subsubstring.length()-1) {
          features+=tmpname+"_"+stringify(tmpidx)+" ";
          tmpidx++;
        }
        // ignore sparse feature name
        else if (subsubstring.find("_") != string::npos) {
          // also ignore its value
          getNextPound(stringsupport, subsubstring);
        }
        // update current feature name
        else {
          tmpidx=0;
          // drop the last character (the trailing ':')
          tmpname=subsubstring.substr(0,subsubstring.size() - 1);
        }
      }

      featdata->setFeatureMap(features);
    }

    // adding features
    while (!substring.empty()) {
      // TRACE_ERR("Decompounding: " << substring << std::endl);
      getNextPound(substring, subsubstring);

      // no ':' -> feature value that needs to be stored
      if ((loc = subsubstring.find_last_of(":")) != subsubstring.length()-1) {
        featentry.add(ATOFST(subsubstring.c_str()));
      }
      // sparse feature name? store as well
      else if (subsubstring.find("_") != string::npos) {
        std::string name = subsubstring;
        // the token following a sparse feature name is its value
        getNextPound(substring, subsubstring);
        featentry.addSparse( name, atof(subsubstring.c_str()) );
        _sparse_flag = true;
      }
      // any other token ending in ':' is a dense feature name: nothing to store
    }
    //cerr << "number of sparse features: " << featentry.getSparse().size() << endl;
    featdata->add(featentry,sentence_index);
  }

  inp.close();
}
void StatisticsBasedScorer::score(const candidates_t& candidates, const diffs_t& diffs, statscores_t& scores) const { if (!m_score_data) { throw runtime_error("Score data not loaded"); } // calculate the score for the candidates if (m_score_data->size() == 0) { throw runtime_error("Score data is empty"); } if (candidates.size() == 0) { throw runtime_error("No candidates supplied"); } int numCounts = m_score_data->get(0,candidates[0]).size(); vector<int> totals(numCounts); for (size_t i = 0; i < candidates.size(); ++i) { ScoreStats stats = m_score_data->get(i,candidates[i]); if (stats.size() != totals.size()) { stringstream msg; msg << "Statistics for (" << "," << candidates[i] << ") have incorrect " << "number of fields. Found: " << stats.size() << " Expected: " << totals.size(); throw runtime_error(msg.str()); } for (size_t k = 0; k < totals.size(); ++k) { totals[k] += stats.get(k); } } scores.push_back(calculateScore(totals)); candidates_t last_candidates(candidates); // apply each of the diffs, and get new scores for (size_t i = 0; i < diffs.size(); ++i) { for (size_t j = 0; j < diffs[i].size(); ++j) { size_t sid = diffs[i][j].first; size_t nid = diffs[i][j].second; size_t last_nid = last_candidates[sid]; for (size_t k = 0; k < totals.size(); ++k) { int diff = m_score_data->get(sid,nid).get(k) - m_score_data->get(sid,last_nid).get(k); totals[k] += diff; } last_candidates[sid] = nid; } scores.push_back(calculateScore(totals)); } // Regularisation. This can either be none, or the min or average as described in // Cer, Jurafsky and Manning at WMT08. 
if (m_regularization_type == NONE || m_regularization_window <= 0) { // no regularisation return; } // window size specifies the +/- in each direction statscores_t raw_scores(scores); // copy scores for (size_t i = 0; i < scores.size(); ++i) { size_t start = 0; if (i >= m_regularization_window) { start = i - m_regularization_window; } const size_t end = min(scores.size(), i + m_regularization_window + 1); if (m_regularization_type == AVERAGE) { scores[i] = score_average(raw_scores,start,end); } else { scores[i] = score_min(raw_scores,start,end); } } }
void PermutationScorer::prepareStats(size_t sid, const string& text, ScoreStats& entry) { //bool debug= (verboselevel()>3); // TODO: fix verboselevel() bool debug=false; if (debug) { cout << "*******prepareStats" ; cout << text << endl; cout << sid << endl; cout << "Reference0align:" << endl; m_referencePerms[0][sid].dump(); } string sentence = ""; string align = text; size_t alignmentData = text.find("|||"); //Get sentence and alignment parts if(alignmentData != string::npos) { getNextPound(align,sentence, "|||"); } else { align = text; } int translationLength = getNumberWords(sentence); //A vector of Permutations for each sentence vector< vector<Permutation> > nBestPerms; float distanceValue; //need to create permutations for each nbest line //here we check if the alignments extracted from the nbest are phrase-based or word-based, in which case no conversion is needed bool isWordAlignment=true; string alignCopy = align; string align1; getNextPound(alignCopy,align1," "); if (align1.length() > 0) { size_t phraseDelimeter = align1.find("="); if(phraseDelimeter!= string::npos) isWordAlignment=false; } string standardFormat = align; if(!isWordAlignment) standardFormat= Permutation::convertMosesToStandard(align); if (debug) { cerr << "Nbest alignment: " << align << endl; cerr << "-->std alignment: " << standardFormat << endl; } Permutation perm(standardFormat, m_sourceLengths[sid],translationLength); //perm.dump(); if (m_refChoiceStrategy == REFERENCE_CHOICE_AVERAGE) { float total = 0; for (size_t i = 0; i < m_referencePerms.size(); ++i) { float dist = perm.distance(m_referencePerms[i][sid], m_distanceMetric); total += dist; //cout << "Ref number: " << i << " distance: " << dist << endl; } float mean = (float)total/m_referencePerms.size(); //cout << "MultRef strategy AVERAGE: total " << total << " mean " << mean << " number " << m_referencePerms.size() << endl; distanceValue = mean; } else if (m_refChoiceStrategy == REFERENCE_CHOICE_CLOSEST) { float max_val = 0; for 
(size_t i = 0; i < m_referencePerms.size(); ++i) { //look for the closest reference float value = perm.distance(m_referencePerms[i][sid], m_distanceMetric); //cout << "Ref number: " << i << " distance: " << value << endl; if (value > max_val) { max_val = value; } } distanceValue = max_val; //cout << "MultRef strategy CLOSEST: max_val " << distanceValue << endl; } else { throw runtime_error("Unsupported reflength strategy"); } //SCOREROUT eg: 0.04546 distanceValue*=SCORE_MULTFACT; //SCOREROUT eg: 4546 to transform float into integer ostringstream tempStream; tempStream.precision(0); // decimal precision not needed as score was multiplied per SCORE_MULTFACT tempStream << std::fixed << distanceValue << " 1"; //use for final normalization over the amount of test sentences string str = tempStream.str(); entry.set(str); //cout << distanceValue << "=" << distanceValue << " (str:" << tempStream.str() << ")" << endl; }
void DataAsiya::loadNBest(const string &file) { TRACE_ERR("loading nbest from DataAsiya " << file << endl); inputfilestream inp(file); // matches a stream with a file. Opens the file if (!inp.good()) throw runtime_error("Unable to open: " + file); ScoreStats scoreentry; string line, sentence_index, sentence, feature_str, alignment; AsiyaScorer* a_scorer = dynamic_cast<AsiyaScorer*>(m_scorer); /*todo. change this loop. instead of obtaining the score for each sentence, obtain all the scores at once!*/ while (getline(inp, line, '\n')) { if (line.empty()) continue; // adding statistics for error measures scoreentry.clear(); getNextPound(line, sentence_index, "|||"); // first field getNextPound(line, sentence, "|||"); // second field getNextPound(line, feature_str, "|||"); // third field if (line.length() > 0) { string temp; getNextPound(line, temp, "|||"); //fourth field sentence score if (line.length() > 0) { getNextPound(line, alignment, "|||"); //fourth field only there if alignment scorer } } //TODO check alignment exists if scorers need it if (a_scorer->useAlignment()) { sentence += "|||"; sentence += alignment; } // prepare stats gets all the scores for sentence_i of sentence_index // a_scorer->addCandidateSentence(sentence_index, sentence); a_scorer->prepareStats(atoi(sentence_index.c_str()), sentence, scoreentry); // examine first line for name of features if (!existsFeatureNames()) { InitFeatureMap(feature_str); } AddFeatures(feature_str, atoi(sentence_index.c_str())); } a_scorer->doScoring(); // TRACE_ERR("before getAllScoreStats" << endl); std::vector<std::vector <ScoreStats> > allScoreStats = a_scorer->getAllScoreStats(); for (int i = 0; i < allScoreStats.size(); ++i) for(int j = 0; j < allScoreStats[i].size(); ++j) { stringstream ss; ss << i; m_score_data->add(allScoreStats[i][j], atoi(ss.str().c_str())); // TRACE_ERR("allScoreStats[" << i << "].size() " << allScoreStats[i].size() << " " << allScoreStats[i][j] << endl); } inp.close(); // 
a_scorer->doScoring( m_score_data ); //score each sentence //a_scorer->prepareStats(sentence_index, sentence, scoreentry); // save the score for previous sentence. Do it aling with previous function //m_score_data->add(scoreentry, sentence_index); }