void ScoreStats::Copy(const ScoreStats &stats)
{
  m_available_size = stats.available();
  m_entries = stats.size();
  m_array = new ScoreStatsType[m_available_size];
  memcpy(m_array, stats.getArray(), GetArraySizeWithBytes());
}
Beispiel #2
0
void Data::loadNBest(const string &file)
{
  TRACE_ERR("loading nbest from " << file << endl);
  inputfilestream inp(file); // matches a stream with a file. Opens the file
  if (!inp.good())
    throw runtime_error("Unable to open: " + file);

  ScoreStats scoreentry;
  string line, sentence_index, sentence, feature_str;

  while (getline(inp, line, '\n')) {
    if (line.empty()) continue;
    // adding statistics for error measures
    scoreentry.clear();

    getNextPound(line, sentence_index, "|||"); // first field
    getNextPound(line, sentence, "|||");       // second field
    getNextPound(line, feature_str, "|||");    // third field

    m_scorer->prepareStats(sentence_index, sentence, scoreentry);
    m_score_data->add(scoreentry, sentence_index);

    // examine first line for name of features
    if (!existsFeatureNames()) {
      InitFeatureMap(feature_str);
    }
    AddFeatures(feature_str, sentence_index);
  }
  inp.close();
}
Beispiel #3
0
void Data::loadNBest(const string &file, bool oneBest)
{
  TRACE_ERR("loading nbest from " << file << endl);
  util::FilePiece in(file.c_str());

  ScoreStats scoreentry;
  string sentence, feature_str, alignment;
  int sentence_index;

  while (true) {
    try {
      StringPiece line = in.ReadLine();
      if (line.empty()) continue;
      // adding statistics for error measures
      scoreentry.clear();

      util::TokenIter<util::MultiCharacter> it(line, util::MultiCharacter("|||"));

      sentence_index = ParseInt(*it);
      if (oneBest && m_score_data->exists(sentence_index)) continue;
      ++it;
      sentence = it->as_string();
      ++it;
      feature_str = it->as_string();
      ++it;

      if (it) {
        ++it;                             // skip model score.

        if (it) {
          alignment = it->as_string(); //fifth field (if present) is either phrase or word alignment
          ++it;
          if (it) {
            alignment = it->as_string(); //sixth field (if present) is word alignment
          }
        }
      }
      //TODO check alignment exists if scorers need it

      if (m_scorer->useAlignment()) {
        sentence += "|||";
        sentence += alignment;
      }
      m_scorer->prepareStats(sentence_index, sentence, scoreentry);

      m_score_data->add(scoreentry, sentence_index);

      // examine first line for name of features
      if (!existsFeatureNames()) {
        InitFeatureMap(feature_str);
      }
      AddFeatures(feature_str, sentence_index);
    } catch (util::EndOfFileException &e) {
      PrintUserTime("Loaded N-best lists");
      break;
    }
  }
}
/**
 * Two ScoreStats compare equal when they report the same size() and every
 * entry returned by get(k) matches position by position.
 */
bool operator==(const ScoreStats& s1, const ScoreStats& s2)
{
  const size_t count = s1.size();
  if (s2.size() != count) {
    return false;
  }

  // Walk both objects in lock-step and stop at the first differing entry.
  bool identical = true;
  for (size_t idx = 0; identical && idx < count; ++idx) {
    identical = !(s1.get(idx) != s2.get(idx));
  }
  return identical;
}
Beispiel #5
0
// really not the right place...
/**
 * Sentence-level BLEU with add-one smoothing on every n-gram order.
 *
 * For each of the four orders j the entries at index 2*j and 2*j+1 are read,
 * both incremented by one, and the difference of their logs is averaged.
 * A penalty of 1 - stats[8]/stats[1] is added to the log score when that
 * value is negative.
 *
 * NOTE(review): presumably index 2*j holds matched n-grams, 2*j+1 total
 * n-grams and index 8 the reference length -- confirm with the scorer that
 * fills the statistics.
 */
float sentenceLevelBleuPlusOne( ScoreStats &stats ) {
	const unsigned int kOrder = 4;

	float logScore = 0.0;
	unsigned int n = 0;
	while (n < kOrder) {
		logScore += log(stats.get(2*n)+1) - log(stats.get(2*n+1)+1);
		++n;
	}
	logScore /= kOrder;

	// Only a negative value is applied, i.e. the score is never increased here.
	const float penalty = 1.0 - (float)stats.get(kOrder*2)/stats.get(1);
	if (penalty < 0.0) {
		logScore += penalty;
	}

	return exp(logScore);
}
void BleuScorer::prepareStats(size_t sid, const string& text, ScoreStats& entry)
{
  if (sid >= m_references.size()) {
    stringstream msg;
    msg << "Sentence id (" << sid << ") not found in reference set";
    throw runtime_error(msg.str());
  }
  NgramCounts testcounts;
  // stats for this line
  vector<ScoreStatsType> stats(kBleuNgramOrder * 2);
  string sentence = preprocessSentence(text);
  const size_t length = CountNgrams(sentence, testcounts, kBleuNgramOrder, true);

  const int reference_len = CalcReferenceLength(sid, length);
  stats.push_back(reference_len);

  //precision on each ngram type
  for (NgramCounts::const_iterator testcounts_it = testcounts.begin();
       testcounts_it != testcounts.end(); ++testcounts_it) {
    const NgramCounts::Value guess = testcounts_it->second;
    const size_t len = testcounts_it->first.size();
    NgramCounts::Value correct = 0;

    NgramCounts::Value v = 0;
    if (m_references[sid]->get_counts()->Lookup(testcounts_it->first, &v)) {
      correct = min(v, guess);
    }
    stats[len * 2 - 2] += correct;
    stats[len * 2 - 1] += guess;
  }
  entry.set(stats);
}
/**
 * Collects the statistics of every sub-scorer for hypothesis `text` and
 * stores them in `entry` as one space-separated string, in scorer order.
 *
 * When `text` contains "|||", the part extracted by getNextPound() is used
 * as the plain sentence for scorers that do not use alignments; scorers
 * that do use alignments always receive the full `text`.
 */
void InterpolatedScorer::prepareStats(size_t sid, const string& text, ScoreStats& entry)
{
  stringstream buff;
  string align = text;
  // Default to the whole text: without a "|||" separator there is nothing to
  // strip, and alignment-agnostic scorers must still see the hypothesis
  // (previously they were handed an empty string in that case).
  string sentence = text;
  if (text.find("|||") != string::npos) {
    getNextPound(align, sentence, "|||");
  }

  bool first = true;
  for (ScopedVector<Scorer>::iterator itsc = m_scorers.begin(); itsc != m_scorers.end(); ++itsc) {
    ScoreStats tempEntry;
    if ((*itsc)->useAlignment()) {
      (*itsc)->prepareStats(sid, text, tempEntry);
    } else {
      (*itsc)->prepareStats(sid, sentence, tempEntry);
    }
    // A single blank separates the statistics of consecutive scorers.
    if (!first) buff << " ";
    buff << tempEntry;
    first = false;
  }

  string str = buff.str();
  entry.set(str);
}
void PerScorer::prepareStats(size_t sid, const string& text, ScoreStats& entry)
{
  if (sid >= m_ref_lengths.size()) {
    stringstream msg;
    msg << "Sentence id (" << sid << ") not found in reference set";
    throw runtime_error(msg.str());
  }

  string sentence = this->preprocessSentence(text);

  // Calculate correct, output_length and ref_length for
  // the line and store it in entry
  vector<int> testtokens;
  TokenizeAndEncode(sentence, testtokens);
  multiset<int> testtokens_all(testtokens.begin(),testtokens.end());
  set<int> testtokens_unique(testtokens.begin(),testtokens.end());
  int correct = 0;
  for (set<int>::iterator i = testtokens_unique.begin();
       i != testtokens_unique.end(); ++i) {
    int token = *i;
    correct += min(m_ref_tokens[sid].count(token), testtokens_all.count(token));
  }

  ostringstream stats;
  stats << correct << " " << testtokens.size() << " " << m_ref_lengths[sid] << " " ;
  string stats_str = stats.str();
  entry.set(stats_str);
}
void CderScorer::prepareStats(size_t sid, const string& text, ScoreStats& entry)
{
  string sentence = this->preprocessSentence(text);

  vector<ScoreStatsType> stats;
  prepareStatsVector(sid, sentence, stats);
  entry.set(stats);
}
/**
 * Reads candidate translations from stdin, one per line, and prints the
 * smoothed sentence-level BLEU of each against the next reference line
 * taken from every reference file given on the command line.
 *
 * Returns 1 with a usage message when no reference file is supplied.
 */
int main(int argc, char **argv)
{
  if (argc == 1) {
    cerr << "Usage: ./sentence-bleu ref1 [ref2 ...] < candidate > bleu-scores" << endl;
    return 1;
  }

  vector<string> refFiles(argv + 1, argv + argc);

  // TODO all of these are empty for now
  string config;
  string factors;
  string filter;

  BleuScorer scorer(config);
  scorer.setFactors(factors);
  scorer.setFilter(filter);

  // initialize reference streams
  vector<boost::shared_ptr<ifstream> > refStreams;
  for (vector<string>::const_iterator refFile=refFiles.begin(); refFile!=refFiles.end(); ++refFile) {
    TRACE_ERR("Loading reference from " << *refFile << endl);
    boost::shared_ptr<ifstream> ifs(new ifstream(refFile->c_str()));
    // The smart pointer itself is never null after `new`; what has to be
    // checked is whether the underlying file could actually be opened.
    UTIL_THROW_IF2(!ifs->is_open(), "Cannot open " << *refFile);
    refStreams.push_back(ifs);
  }

  // load sentences, preparing statistics, score
  string hypothesisLine;
  while (getline(std::cin, hypothesisLine)) {
    Reference ref;
    if (!scorer.GetNextReferenceFromStreams(refStreams, ref)) {
      UTIL_THROW2("Missing references");
    }
    ScoreStats scoreStats;
    scorer.CalcBleuStats(ref, hypothesisLine, scoreStats);
    // smoothedSentenceBleu() takes a float vector, so copy the raw entries.
    vector<float> stats(scoreStats.getArray(), scoreStats.getArray() + scoreStats.size());
    std::cout << smoothedSentenceBleu(stats) << std::endl;
  }

  return 0;
}
/**
 * Stores `data` as this scorer's score data and hands every sub-scorer its
 * own ScoreData containing only that sub-scorer's columns.
 *
 * Sub-scorer statistics are laid out back to back in each ScoreStats of
 * `data`: the first scorer owns the first NumberOfScores() entries, the next
 * scorer the following ones, and so on. The per-scorer ScoreData objects are
 * heap allocated and recorded in m_scorers_score_data.
 */
void InterpolatedScorer::setScoreData(ScoreData* data)
{
  m_score_data = data;

  size_t offset = 0;  // first column belonging to the current sub-scorer
  ScopedVector<Scorer>::iterator sub = m_scorers.begin();
  while (sub != m_scorers.end()) {
    const int width = (*sub)->NumberOfScores();
    ScoreData* const slice = new ScoreData(*sub);
    const size_t stop = size_t(width + offset);

    for (size_t sent = 0; sent < data->size(); ++sent) {
      ScoreArray full_array = data->get(sent);
      const size_t n_candidates = full_array.size();

      // The index of the new array is the sentence position as text.
      std::stringstream index_stream;
      index_stream << sent;
      std::string index_label = index_stream.str();

      ScoreArray sliced_array;
      for (size_t cand = 0; cand < n_candidates; ++cand) {
        ScoreStats full_stats = data->get(sent, cand);
        ScoreStats sliced_stats;
        for (size_t col = offset; col < stop; ++col) {
          ScoreStatsType value = full_stats.get(col);
          sliced_stats.add(value);
        }
        sliced_array.add(sliced_stats);
      }
      sliced_array.setIndex(index_label);
      slice->add(sliced_array);
    }

    // NOTE: This class takes the ownership of the heap allocated
    // ScoreData objects to avoid the memory leak issues.
    m_scorers_score_data.push_back(slice);
    (*sub)->setScoreData(slice);

    offset += width;
    ++sub;
  }
}
Beispiel #12
0
void TerScorer::prepareStats ( size_t sid, const string& text, ScoreStats& entry )
{

  terAlignment result;
  result.numEdits = 0.0 ;
  result.numWords = 0.0 ;
  result.averageWords = 0.0;

  for ( int incRefs = 0; incRefs < ( int ) m_multi_references.size(); incRefs++ ) {
    if ( sid >= m_multi_references.at(incRefs).size() ) {
      stringstream msg;
      msg << "Sentence id (" << sid << ") not found in reference set";
      throw runtime_error ( msg.str() );
    }

    vector<int> testtokens;
    vector<int> reftokens;
    reftokens = m_multi_references.at ( incRefs ).at ( sid );
    double averageLength=0.0;
    for ( int incRefsBis = 0; incRefsBis < ( int ) m_multi_references.size(); incRefsBis++ ) {
      if ( sid >= m_multi_references.at(incRefsBis).size() ) {
        stringstream msg;
        msg << "Sentence id (" << sid << ") not found in reference set";
        throw runtime_error ( msg.str() );
      }
      averageLength+=(double)m_multi_references.at ( incRefsBis ).at ( sid ).size();
    }
    averageLength=averageLength/( double ) m_multi_references.size();
    TokenizeAndEncode(text, testtokens);
    terCalc * evaluation=new terCalc();
    evaluation->setDebugMode ( false );
    terAlignment tmp_result = evaluation->TER ( reftokens, testtokens );
    tmp_result.averageWords=averageLength;
    if ( ( result.numEdits == 0.0 ) && ( result.averageWords == 0.0 ) ) {
      result = tmp_result;
    } else if ( result.scoreAv() > tmp_result.scoreAv() ) {
      result = tmp_result;
    }
    delete evaluation;
  }
  ostringstream stats;
  // multiplication by 100 in order to keep the average precision
  // in the TER calculation.
  stats << result.numEdits*100.0 << " " << result.averageWords*100.0 << " " << result.scoreAv()*100.0 << " " ;
  string stats_str = stats.str();
  entry.set ( stats_str );
}
Beispiel #13
0
void HwcmScorer::prepareStats(size_t sid, const string& text, ScoreStats& entry)
{
  if (sid >= m_ref_trees.size()) {
    stringstream msg;
    msg << "Sentence id (" << sid << ") not found in reference set";
    throw runtime_error(msg.str());
  }

  string sentence = this->preprocessSentence(text);

  // if sentence has '|||', assume that tree is in second position (n-best-list);
  // otherwise, assume it is in first position (calling 'evaluate' with tree as reference)
  util::TokenIter<util::MultiCharacter> it(sentence, util::MultiCharacter("|||"));
  ++it;
  if (it) {
    sentence = it->as_string();
  }

  TreePointer tree (boost::make_shared<InternalTree>(sentence));
  vector<map<string, int> > hwc_test (kHwcmOrder);
  vector<string> history(kHwcmOrder);
  extractHeadWordChain(tree, history, hwc_test);

  ostringstream stats;
  for (size_t i = 0; i < kHwcmOrder; i++) {
    int correct = 0;
    int test_total = 0;
    for (map<string, int>::const_iterator it = hwc_test[i].begin(); it != hwc_test[i].end(); it++) {
      test_total += it->second;
      map<string, int>::const_iterator it2 = m_ref_hwc[sid][i].find(it->first);
      if (it2 != m_ref_hwc[sid][i].end()) {
        correct += std::min(it->second, it2->second);
      }
    }
    stats << correct << " " << test_total << " " << m_ref_lengths[sid][i] << " " ;
  }

  string stats_str = stats.str();
  entry.set(stats_str);
}
Beispiel #14
0
void SerScorer::prepareStats(size_t sid, const string& text, ScoreStats& entry)
{
  if (sid >= m_ref_sentences[0].size()) {
    stringstream msg;
    msg << "Sentence id (" << sid << ") not found in reference set";
    throw runtime_error(msg.str());
  }

  string sentence = this->preprocessSentence(text);
  int n_ser = 1; // if we don't find a reference of the same length, the error is 1

  // Check whether the guessed text is equal to the reference 
  // for this line and store it in entry
  vector<int> testtokens;
  TokenizeAndEncode(sentence, testtokens);
  for (size_t rid = 0; rid < m_ref_sentences.size(); ++rid) {
    const sent_t& ref = m_ref_sentences[rid][sid];

    // we can only have a perfect match if the sentence length is equal
    if (testtokens.size() == ref.size()) {
      int errors = 0;
      for (size_t tid = 0; tid < testtokens.size(); tid++) {
	// token mismatch: error 1 w.r.t. this reference; move to next ref.
	if (ref[tid] != testtokens[tid]) { 
	  errors = 1;
	  break;
	}
      }
      if (errors == 0) {
        n_ser = 0;
        break;
      }
    }
  }
  ostringstream stats;
  stats << n_ser << " " << 1; // sentence error (0 or 1), number of sentences (1)
  string stats_str = stats.str();
  entry.set(stats_str);
}
void SemposScorer::prepareStats(size_t sid, const string& text, ScoreStats& entry)
{
  vector<ScoreStatsType> stats;

  const string& sentence = preprocessSentence(text);
  str_sentence_t splitCandSentence;
  splitSentence(sentence, splitCandSentence);

  sentence_t encodedCandSentence;
  encodeSentence(splitCandSentence, encodedCandSentence);

  if (m_ref_sentences.size() == 1) {
    stats = m_ovr->prepareStats(encodedCandSentence, m_ref_sentences[0][sid]);
  } else {
    float max = -1.0f;
    for (size_t rid = 0; rid < m_ref_sentences.size(); ++rid) {
      const vector<ScoreStatsType>& tmp = m_ovr->prepareStats(encodedCandSentence, m_ref_sentences[rid][sid]);
      if (m_ovr->calculateScore(tmp) > max) {
        stats = tmp;
      }
    }
  }
  entry.set(stats);
}
void PermutationScorer::prepareStats(size_t sid, const string& text, ScoreStats& entry)
{
  //cout << "*******prepareStats" ;
  //cout << text << endl;
  //cout << sid << endl;
  //cout << "Reference0align:" << endl;
  //m_referencePerms[0][sid].dump();


  string sentence = "";
  string align = text;
  size_t alignmentData = text.find("|||");
  //Get sentence and alignment parts
  if(alignmentData != string::npos) {
    getNextPound(align,sentence, "|||");
  } else {
    align = text;
  }
  int translationLength = getNumberWords(sentence);


  //A vector of Permutations for each sentence
  vector< vector<Permutation> > nBestPerms;
  float distanceValue;

  //need to create permutations for each nbest line
  string standardFormat = Permutation::convertMosesToStandard(align);
  Permutation perm(standardFormat, m_sourceLengths[sid],translationLength);
  //perm.dump();

  if (m_refChoiceStrategy == REFERENCE_CHOICE_AVERAGE) {
    float total = 0;
    for (size_t i = 0; i < m_referencePerms.size(); ++i) {
      float dist = perm.distance(m_referencePerms[i][sid], m_distanceMetric);
      total += dist;
      //cout << "Ref number: " << i << " distance: " << dist << endl;
    }
    float mean = (float)total/m_referencePerms.size();
    //cout << "MultRef strategy AVERAGE: total " << total << " mean " << mean << " number " << m_referencePerms.size() << endl;
    distanceValue = mean;
  } else if (m_refChoiceStrategy == REFERENCE_CHOICE_CLOSEST)  {
    float max_val = 0;

    for (size_t i = 0; i < m_referencePerms.size(); ++i) {
      //look for the closest reference
      float value = perm.distance(m_referencePerms[i][sid], m_distanceMetric);
      //cout << "Ref number: " << i << " distance: " << value << endl;
      if (value > max_val) {
        max_val = value;
      }
    }
    distanceValue = max_val;
    //cout << "MultRef strategy CLOSEST: max_val " << distanceValue << endl;
  } else {
    throw runtime_error("Unsupported reflength strategy");
  }

  //SCOREROUT eg: 0.04546
  ostringstream tempStream;
  tempStream.precision(SCORE_PRECISION);
  tempStream << distanceValue;
  string str = tempStream.str();
  entry.set(str);

  //cout << tempStream.str();
}
void BleuScorer::prepareStats(size_t sid, const string& text, ScoreStats& entry)
{
//      cerr << text << endl;
//      cerr << sid << endl;
  //dump_counts(*m_ref_counts[sid]);
  if (sid >= m_ref_counts.size()) {
    stringstream msg;
    msg << "Sentence id (" << sid << ") not found in reference set";
    throw runtime_error(msg.str());
  }
  counts_t testcounts;
  //stats for this line
  vector<float> stats(kLENGTH*2);;
  size_t length = countNgrams(text,testcounts,kLENGTH);
  //dump_counts(testcounts);
  if (m_ref_length_type == SHORTEST) {
    //cerr << reflengths.size() << " " << sid << endl;
    int shortest = *min_element(m_ref_lengths[sid].begin(), m_ref_lengths[sid].end());
    stats.push_back(shortest);
  } else if (m_ref_length_type == AVERAGE) {
    int total = 0;
    for (size_t i = 0; i < m_ref_lengths[sid].size(); ++i) {
      total += m_ref_lengths[sid][i];
    }
    const float mean = static_cast<float>(total) / m_ref_lengths[sid].size();
    stats.push_back(mean);
  } else if (m_ref_length_type == CLOSEST)  {
    int min_diff = INT_MAX;
    int min_idx = 0;
    for (size_t i = 0; i < m_ref_lengths[sid].size(); ++i) {
      const int reflength = m_ref_lengths[sid][i];
      const int diff = reflength - static_cast<int>(length);
      const int absolute_diff = abs(diff) - abs(min_diff);

      if (absolute_diff < 0) { //look for the closest reference
        min_diff = diff;
        min_idx = i;
      } else if (absolute_diff == 0) { // if two references has the same closest length, take the shortest
        if (reflength < static_cast<int>(m_ref_lengths[sid][min_idx])) {
          min_idx = i;
        }
      }
    }
    stats.push_back(m_ref_lengths[sid][min_idx]);
  } else {
    throw runtime_error("Unsupported reflength strategy");
  }
  //cerr << "computed length" << endl;
  //precision on each ngram type
  for (counts_iterator testcounts_it = testcounts.begin();
       testcounts_it != testcounts.end(); ++testcounts_it) {
    counts_iterator refcounts_it = m_ref_counts[sid]->find(testcounts_it->first);
    int correct = 0;
    int guess = testcounts_it->second;
    if (refcounts_it != m_ref_counts[sid]->end()) {
      correct = min(refcounts_it->second,guess);
    }
    size_t len = testcounts_it->first.size();
    stats[len*2-2] += correct;
    stats[len*2-1] += guess;
  }
  stringstream sout;
  copy(stats.begin(),stats.end(),ostream_iterator<float>(sout," "));
  //TRACE_ERR(sout.str() << endl);
  string stats_str = sout.str();
  entry.set(stats_str);
}
Beispiel #18
0
// Loads an n-best list from `file`. For every non-empty line the first three
// "|||"-separated fields are read: sentence index, sentence, feature string.
// The scorer's statistics are added to `scoredata`, the parsed feature
// values (dense and sparse) to `featdata`.
//
// Throws runtime_error if the stream is not good() after being opened.
//
// NOTE(review): getNextPound() is assumed to remove the extracted field from
// its first argument (the loops below terminate on that argument becoming
// empty); its default delimiter is not visible here -- confirm in Util.
void Data::loadnbest(const std::string &file)
{
  TRACE_ERR("loading nbest from " << file << std::endl);

  FeatureStats featentry;
  ScoreStats scoreentry;
  std::string sentence_index;

  inputfilestream inp(file); // matches a stream with a file. Opens the file

  if (!inp.good())
    throw runtime_error("Unable to open: " + file);

  std::string substring, subsubstring, stringBuf;
  std::string theSentence;
  std::string::size_type loc;

  while (getline(inp,stringBuf,'\n')) {
    if (stringBuf.empty()) continue;

//		TRACE_ERR("stringBuf: " << stringBuf << std::endl);

    getNextPound(stringBuf, substring, "|||"); //first field
    sentence_index = substring;

    getNextPound(stringBuf, substring, "|||"); //second field
    theSentence = substring;

    // adding statistics for error measures; both entries are reused across
    // lines, so they are emptied first
    featentry.reset();
    scoreentry.clear();

    theScorer->prepareStats(sentence_index, theSentence, scoreentry);

    scoredata->add(scoreentry, sentence_index);

    getNextPound(stringBuf, substring, "|||"); //third field

    // examine first line for name of features
    if (!existsFeatureNames()) {
      // Work on a copy: `substring` is consumed again further down when the
      // feature values themselves are parsed.
      std::string stringsupport=substring;
      std::string features="";
      std::string tmpname="";

      // running index of the value within the current feature name
      size_t tmpidx=0;
      while (!stringsupport.empty()) {
        //			TRACE_ERR("Decompounding: " << substring << std::endl);
        getNextPound(stringsupport, subsubstring);

        // A token that does not end with ':' is a value: emit the generated
        // name "<current name>_<index>".
        // NOTE(review): for an empty token find_last_of() yields npos and
        // length()-1 wraps to the same value, so an empty token takes the
        // "ends with ':'" path -- confirm this is intended.
        if ((loc = subsubstring.find_last_of(":")) != subsubstring.length()-1) {
          features+=tmpname+"_"+stringify(tmpidx)+" ";
          tmpidx++;
        }
        // ignore sparse feature name (a name token containing '_')
        else if (subsubstring.find("_") != string::npos) {
          // also ignore its value
          getNextPound(stringsupport, subsubstring);
        }
        // update current feature name (token without its trailing character)
        // and restart the value index
        else {
          tmpidx=0;
          tmpname=subsubstring.substr(0,subsubstring.size() - 1);
        }
      }

      featdata->setFeatureMap(features);
    }

    // adding features
    while (!substring.empty()) {
//			TRACE_ERR("Decompounding: " << substring << std::endl);
      getNextPound(substring, subsubstring);

      // no trailing ':' -> feature value that needs to be stored
      if ((loc = subsubstring.find_last_of(":")) != subsubstring.length()-1) {
        featentry.add(ATOFST(subsubstring.c_str()));
      }
      // sparse feature name? store it together with the value that follows
      else if (subsubstring.find("_") != string::npos) {
        std::string name = subsubstring;
        getNextPound(substring, subsubstring);
        featentry.addSparse( name, atof(subsubstring.c_str()) );
        _sparse_flag = true;
      }
      // any other token ending with ':' (a dense feature name) is skipped
    }
    //cerr << "number of sparse features: " << featentry.getSparse().size() << endl;
    featdata->add(featentry,sentence_index);
  }

  inp.close();
}
Beispiel #19
0
void  StatisticsBasedScorer::score(const candidates_t& candidates, const diffs_t& diffs,
                                   statscores_t& scores) const
{
  if (!m_score_data) {
    throw runtime_error("Score data not loaded");
  }
  // calculate the score for the candidates
  if (m_score_data->size() == 0) {
    throw runtime_error("Score data is empty");
  }
  if (candidates.size() == 0) {
    throw runtime_error("No candidates supplied");
  }
  int numCounts = m_score_data->get(0,candidates[0]).size();
  vector<int> totals(numCounts);
  for (size_t i = 0; i < candidates.size(); ++i) {
    ScoreStats stats = m_score_data->get(i,candidates[i]);
    if (stats.size() != totals.size()) {
      stringstream msg;
      msg << "Statistics for (" << "," << candidates[i] << ") have incorrect "
          << "number of fields. Found: " << stats.size() << " Expected: "
          << totals.size();
      throw runtime_error(msg.str());
    }
    for (size_t k = 0; k < totals.size(); ++k) {
      totals[k] += stats.get(k);
    }
  }
  scores.push_back(calculateScore(totals));

  candidates_t last_candidates(candidates);
  // apply each of the diffs, and get new scores
  for (size_t i = 0; i < diffs.size(); ++i) {
    for (size_t j = 0; j < diffs[i].size(); ++j) {
      size_t sid = diffs[i][j].first;
      size_t nid = diffs[i][j].second;
      size_t last_nid = last_candidates[sid];
      for (size_t k  = 0; k < totals.size(); ++k) {
        int diff = m_score_data->get(sid,nid).get(k)
                   - m_score_data->get(sid,last_nid).get(k);
        totals[k] += diff;
      }
      last_candidates[sid] = nid;
    }
    scores.push_back(calculateScore(totals));
  }

  // Regularisation. This can either be none, or the min or average as described in
  // Cer, Jurafsky and Manning at WMT08.
  if (m_regularization_type == NONE || m_regularization_window <= 0) {
    // no regularisation
    return;
  }

  // window size specifies the +/- in each direction
  statscores_t raw_scores(scores);      // copy scores
  for (size_t i = 0; i < scores.size(); ++i) {
    size_t start = 0;
    if (i >= m_regularization_window) {
      start = i - m_regularization_window;
    }
    const size_t end = min(scores.size(), i + m_regularization_window + 1);
    if (m_regularization_type == AVERAGE) {
      scores[i] = score_average(raw_scores,start,end);
    } else {
      scores[i] = score_min(raw_scores,start,end);
    }
  }
}
void PermutationScorer::prepareStats(size_t sid, const string& text, ScoreStats& entry)
{
  //bool debug= (verboselevel()>3); // TODO: fix verboselevel()
  bool debug=false;
  if (debug) {
    cout << "*******prepareStats" ;
    cout << text << endl;
    cout << sid << endl;
    cout << "Reference0align:" << endl;
    m_referencePerms[0][sid].dump();
  }

  string sentence = "";
  string align = text;
  size_t alignmentData = text.find("|||");
  //Get sentence and alignment parts
  if(alignmentData != string::npos) {
    getNextPound(align,sentence, "|||");
  } else {
    align = text;
  }
  int translationLength = getNumberWords(sentence);


  //A vector of Permutations for each sentence
  vector< vector<Permutation> > nBestPerms;
  float distanceValue;

  //need to create permutations for each nbest line
  //here we check if the alignments extracted from the nbest are phrase-based or word-based, in which case no conversion is needed
  bool isWordAlignment=true;
  string alignCopy = align;
  string align1;
  getNextPound(alignCopy,align1," ");
  if (align1.length() > 0) {
    size_t phraseDelimeter = align1.find("=");
    if(phraseDelimeter!= string::npos)
      isWordAlignment=false;
  }
  string standardFormat = align;
  if(!isWordAlignment)
    standardFormat= Permutation::convertMosesToStandard(align);

  if (debug) {
    cerr << "Nbest alignment:  " << align << endl;
    cerr << "-->std alignment: " << standardFormat << endl;
  }

  Permutation perm(standardFormat, m_sourceLengths[sid],translationLength);
  //perm.dump();

  if (m_refChoiceStrategy == REFERENCE_CHOICE_AVERAGE) {
    float total = 0;
    for (size_t i = 0; i < m_referencePerms.size(); ++i) {
      float dist = perm.distance(m_referencePerms[i][sid], m_distanceMetric);
      total += dist;
      //cout << "Ref number: " << i << " distance: " << dist << endl;
    }
    float mean = (float)total/m_referencePerms.size();
    //cout << "MultRef strategy AVERAGE: total " << total << " mean " << mean << " number " << m_referencePerms.size() << endl;
    distanceValue = mean;
  } else if (m_refChoiceStrategy == REFERENCE_CHOICE_CLOSEST)  {
    float max_val = 0;

    for (size_t i = 0; i < m_referencePerms.size(); ++i) {
      //look for the closest reference
      float value = perm.distance(m_referencePerms[i][sid], m_distanceMetric);
      //cout << "Ref number: " << i << " distance: " << value << endl;
      if (value > max_val) {
        max_val = value;
      }
    }
    distanceValue = max_val;
    //cout << "MultRef strategy CLOSEST: max_val " << distanceValue << endl;
  } else {
    throw runtime_error("Unsupported reflength strategy");
  }

  //SCOREROUT eg: 0.04546
  distanceValue*=SCORE_MULTFACT; //SCOREROUT eg: 4546 to transform float into integer
  ostringstream tempStream;
  tempStream.precision(0);	// decimal precision not needed as score was multiplied per SCORE_MULTFACT
  tempStream << std::fixed << distanceValue << " 1"; //use for final normalization over the amount of test sentences
  string str = tempStream.str();
  entry.set(str);

//cout << distanceValue << "=" << distanceValue << " (str:" << tempStream.str() << ")" << endl;
}
Beispiel #21
0
void DataAsiya::loadNBest(const string &file)
{
  TRACE_ERR("loading nbest from DataAsiya " << file << endl);
  inputfilestream inp(file); // matches a stream with a file. Opens the file
  if (!inp.good())
    throw runtime_error("Unable to open: " + file);

  ScoreStats scoreentry;
  string line, sentence_index, sentence, feature_str, alignment;

  AsiyaScorer* a_scorer = dynamic_cast<AsiyaScorer*>(m_scorer);
  
  /*todo. change this loop. instead of obtaining the score for each sentence, obtain all the scores at once!*/
  while (getline(inp, line, '\n')) {
    if (line.empty()) continue;
    // adding statistics for error measures
    scoreentry.clear();

    getNextPound(line, sentence_index, "|||"); // first field
    getNextPound(line, sentence, "|||");       // second field
    getNextPound(line, feature_str, "|||");    // third field

    if (line.length() > 0) {
      string temp;
      getNextPound(line, temp, "|||"); //fourth field sentence score
      if (line.length() > 0) {
        getNextPound(line, alignment, "|||"); //fourth field only there if alignment scorer
      }
    }
    //TODO check alignment exists if scorers need it
    if (a_scorer->useAlignment()) {
      sentence += "|||";
      sentence += alignment;
    }
    // prepare stats gets all the scores for sentence_i of sentence_index
//    a_scorer->addCandidateSentence(sentence_index, sentence);
    a_scorer->prepareStats(atoi(sentence_index.c_str()), sentence, scoreentry);

    // examine first line for name of features
    if (!existsFeatureNames()) {
      InitFeatureMap(feature_str);
    }
    AddFeatures(feature_str, atoi(sentence_index.c_str()));
  }

  a_scorer->doScoring();
//  TRACE_ERR("before getAllScoreStats" << endl);

  std::vector<std::vector <ScoreStats> > allScoreStats = a_scorer->getAllScoreStats();
  for (int i = 0; i < allScoreStats.size(); ++i)
      for(int j = 0; j < allScoreStats[i].size(); ++j)
      {
          stringstream ss;
          ss << i;
          m_score_data->add(allScoreStats[i][j], atoi(ss.str().c_str()));
//          TRACE_ERR("allScoreStats[" << i << "].size() " << allScoreStats[i].size() << " " << allScoreStats[i][j] << endl);
      }


  inp.close();
//  a_scorer->doScoring( m_score_data );

  //score each sentence
  //a_scorer->prepareStats(sentence_index, sentence, scoreentry);
  // save the score for previous sentence. Do it aling with previous function
  //m_score_data->add(scoreentry, sentence_index);  
  
}