Beispiel #1
0
void PerScorer::setReferenceFiles(const vector<string>& referenceFiles)
{
  // PER uses exactly one reference translation per sentence.
  if (referenceFiles.size() != 1) {
    throw runtime_error("PER only supports a single reference");
  }

  // Drop any reference data loaded by a previous call.
  m_ref_tokens.clear();
  m_ref_lengths.clear();

  ifstream in(referenceFiles[0].c_str());
  if (!in) {
    throw runtime_error("Unable to open " + referenceFiles[0]);
  }

  // Each reference line is stored as a multiset of its encoded word
  // ids (bag of words) together with its token count.
  string line;
  for (int sid = 0; getline(in, line); ++sid) {
    line = this->preprocessSentence(line);
    vector<int> tokens;
    TokenizeAndEncode(line, tokens);
    m_ref_tokens.push_back(multiset<int>(tokens.begin(), tokens.end()));
    m_ref_lengths.push_back(tokens.size());
    // Progress indicator: one dot per 100 reference sentences.
    if (sid > 0 && sid % 100 == 0) {
      TRACE_ERR(".");
    }
  }
  TRACE_ERR(endl);
}
Beispiel #2
0
void PerScorer::prepareStats(size_t sid, const string& text, ScoreStats& entry)
{
  if (sid >= m_ref_lengths.size()) {
    stringstream msg;
    msg << "Sentence id (" << sid << ") not found in reference set";
    throw runtime_error(msg.str());
  }

  string sentence = this->preprocessSentence(text);

  // Calculate correct, output_length and ref_length for
  // the line and store it in entry
  vector<int> testtokens;
  TokenizeAndEncode(sentence, testtokens);
  multiset<int> testtokens_all(testtokens.begin(),testtokens.end());
  set<int> testtokens_unique(testtokens.begin(),testtokens.end());
  int correct = 0;
  for (set<int>::iterator i = testtokens_unique.begin();
       i != testtokens_unique.end(); ++i) {
    int token = *i;
    correct += min(m_ref_tokens[sid].count(token), testtokens_all.count(token));
  }

  ostringstream stats;
  stats << correct << " " << testtokens.size() << " " << m_ref_lengths[sid] << " " ;
  string stats_str = stats.str();
  entry.set(stats_str);
}
size_t BleuScorer::CountNgrams(const string& line, NgramCounts& counts,
                               unsigned int n, bool is_testing)
{
  assert(n > 0);
  vector<int> encoded_tokens;

  // When performing tokenization of a hypothesis translation, we don't
  // have to update the Scorer's word vocabulary. However, the
  // tokenization of reference translations requires modifying the
  // vocabulary, which means this procedure might be slower than the
  // tokenization of the hypothesis translation.
  if (is_testing) {
    TokenizeAndEncodeTesting(line, encoded_tokens);
  } else {
    TokenizeAndEncode(line, encoded_tokens);
  }
  const size_t len = encoded_tokens.size();

  // Scratch buffer reused for every ngram. An ngram never holds more
  // than n tokens, so one up-front reservation suffices (the original
  // re-reserved sentence length inside the inner loop on every pass).
  vector<int> ngram;
  ngram.reserve(n);

  for (size_t k = 1; k <= n; ++k) {
    // Ngram order longer than the sentence yields nothing; since k
    // only grows, no later order can fit either, so stop rather than
    // continue scanning the remaining orders.
    if (k > len) {
      break;
    }
    // Slide a window of width k across the token sequence.
    for (size_t i = 0; i + k <= len; ++i) {
      ngram.assign(encoded_tokens.begin() + i, encoded_tokens.begin() + i + k);
      counts.Add(ngram);
    }
  }
  return len;
}
size_t BleuScorer::countNgrams(const string& line, counts_t& counts, unsigned int n)
{
  // Tokenize and encode the sentence, then tally every ngram of order
  // 1..n into `counts`. Returns the sentence length in tokens.
  vector<int> encoded_tokens;
  TokenizeAndEncode(line, encoded_tokens);
  const size_t len = encoded_tokens.size();

  // Scratch buffer reused for every ngram (max size is n).
  vector<int> ngram;
  ngram.reserve(n);

  for (size_t k = 1; k <= n; ++k) {
    // Ngram order longer than the sentence yields nothing; k only
    // grows, so no later order can fit either — stop early.
    if (k > len) {
      break;
    }
    for (size_t i = 0; i + k <= len; ++i) {
      ngram.assign(encoded_tokens.begin() + i, encoded_tokens.begin() + i + k);
      // operator[] value-initializes an absent count to 0, so this
      // single lookup replaces the original find-then-assign pair
      // (which performed two map traversals per ngram).
      ++counts[ngram];
    }
  }
  return len;
}
Beispiel #5
0
void SerScorer::setReferenceFiles(const vector<string>& referenceFiles)
{
  // Discard reference data from any previous call.
  m_ref_sentences.clear();

  // One outer slot per reference file; each slot holds that file's
  // encoded sentences in line order.
  for (size_t rid = 0; rid < referenceFiles.size(); ++rid) {
    ifstream refin(referenceFiles[rid].c_str());
    if (!refin) {
      throw runtime_error("Unable to open: " + referenceFiles[rid]);
    }
    m_ref_sentences.push_back(vector<sent_t>());
    string line;
    while (getline(refin, line)) {
      sent_t encoded;
      TokenizeAndEncode(this->preprocessSentence(line), encoded);
      m_ref_sentences.back().push_back(encoded);
    }
  }
}
void CderScorer::prepareStatsVector(size_t sid, const string& text, vector<ScoreStatsType>& stats)
{
  // Encode the candidate once; it is compared against every reference
  // available for this sentence id.
  sent_t cand;
  TokenizeAndEncode(text, cand);

  // Track the stats of the best-scoring reference. The first
  // reference always seeds the result, so -2 is never compared.
  float max = -2;
  vector<ScoreStatsType> tmp;
  for (size_t rid = 0; rid < m_ref_sentences.size(); ++rid) {
    const sent_t& ref = m_ref_sentences[rid][sid];
    tmp.clear();
    computeCD(cand, ref, tmp);
    const int score = calculateScore(tmp);
    // First reference, or strictly better than the best so far.
    if (rid == 0 || score > max) {
      stats = tmp;
      max = score;
    }
  }
}
Beispiel #7
0
void SerScorer::prepareStats(size_t sid, const string& text, ScoreStats& entry)
{
  // Reject hypothesis ids beyond the loaded reference set.
  if (sid >= m_ref_sentences[0].size()) {
    stringstream msg;
    msg << "Sentence id (" << sid << ") not found in reference set";
    throw runtime_error(msg.str());
  }

  string sentence = this->preprocessSentence(text);

  // Sentence error is 0 only if the hypothesis exactly matches some
  // reference token-for-token; otherwise it stays 1.
  int n_ser = 1;

  vector<int> testtokens;
  TokenizeAndEncode(sentence, testtokens);

  // Stop scanning references as soon as a perfect match is found.
  for (size_t rid = 0; rid < m_ref_sentences.size() && n_ser == 1; ++rid) {
    const sent_t& ref = m_ref_sentences[rid][sid];

    // A perfect match is only possible when lengths agree.
    if (testtokens.size() != ref.size()) {
      continue;
    }

    bool mismatch = false;
    for (size_t tid = 0; tid < testtokens.size(); ++tid) {
      if (ref[tid] != testtokens[tid]) {
        mismatch = true;  // differs from this reference; try the next one
        break;
      }
    }
    if (!mismatch) {
      n_ser = 0;  // exact match found
    }
  }

  // Stats layout: sentence error (0 or 1), number of sentences (1).
  ostringstream stats;
  stats << n_ser << " " << 1; // sentence error (0 or 1), number of sentences (1)
  string stats_str = stats.str();
  entry.set(stats_str);
}