void BleuScorer::prepareStats(size_t sid, const string& text, ScoreStats& entry)
{
  if (sid >= m_references.size()) {
    stringstream msg;
    msg << "Sentence id (" << sid << ") not found in reference set";
    throw runtime_error(msg.str());
  }
  NgramCounts testcounts;
  // Stats for this line, laid out as
  // [correct_1, total_1, ..., correct_n, total_n]; the reference length
  // is appended as the final element below.
  vector<ScoreStatsType> stats(kBleuNgramOrder * 2);
  string sentence = preprocessSentence(text);
  const size_t length = CountNgrams(sentence, testcounts, kBleuNgramOrder, true);

  const int reference_len = CalcReferenceLength(sid, length);
  stats.push_back(reference_len);

  // precision on each ngram type
  for (NgramCounts::const_iterator testcounts_it = testcounts.begin();
       testcounts_it != testcounts.end(); ++testcounts_it) {
    const NgramCounts::Value guess = testcounts_it->second;
    const size_t len = testcounts_it->first.size();
    NgramCounts::Value correct = 0;

    NgramCounts::Value v = 0;
    if (m_references[sid]->get_counts()->Lookup(testcounts_it->first, &v)) {
      // Clip the count: an n-gram is only correct up to the number of
      // times it occurs in the reference.
      correct = min(v, guess);
    }
    stats[len * 2 - 2] += correct;
    stats[len * 2 - 1] += guess;
  }
  entry.set(stats);
}
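
// Illustrative sketch (not this scorer's own calculateScore()): how a
// smoothed sentence-level BLEU could be derived from the stats vector
// filled above, i.e. [correct_1, total_1, ..., correct_n, total_n, ref_len].
// The +1 smoothing and the shape of the brevity penalty are assumptions
// made for this example, not necessarily what this scorer uses.
//
//   #include <cmath>
//   #include <vector>
//
//   float SentenceBleuSketch(const std::vector<float>& stats, size_t n) {
//     float log_prec = 0.0f;
//     for (size_t i = 0; i < n; ++i) {
//       // Smoothed precision for (i+1)-grams.
//       log_prec += std::log((stats[2 * i] + 1.0f) / (stats[2 * i + 1] + 1.0f));
//     }
//     const float hyp_len = stats[1];      // unigram total == hypothesis length
//     const float ref_len = stats[2 * n];  // appended reference length
//     const float bp = hyp_len < ref_len   // brevity penalty
//                      ? std::exp(1.0f - ref_len / hyp_len) : 1.0f;
//     return bp * std::exp(log_prec / n);
//   }
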
size_t BleuScorer::CountNgrams(const string& line, NgramCounts& counts,
                               unsigned int n, bool is_testing)
{
  assert(n > 0);
  vector<int> encoded_tokens;
  // When tokenizing a hypothesis translation, we don't have to update the
  // Scorer's word vocabulary. However, tokenizing a reference translation
  // requires modifying the vocabulary, which means this procedure might be
  // slower than the tokenization of a hypothesis translation.
  if (is_testing) {
    TokenizeAndEncodeTesting(line, encoded_tokens);
  } else {
    TokenizeAndEncode(line, encoded_tokens);
  }
  const size_t len = encoded_tokens.size();
  vector<int> ngram;

  for (size_t k = 1; k <= n; ++k) {
    // n-gram order longer than the sentence - nothing to count
    if (k > len) {
      continue;
    }
    for (size_t i = 0; i < len - k + 1; ++i) {
      ngram.clear();
      ngram.reserve(len);
      for (size_t j = i; j < i + k && j < len; ++j) {
        ngram.push_back(encoded_tokens[j]);
      }
      counts.Add(ngram);
    }
  }
  return len;
}
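
// Standalone sketch of the same enumeration, stripped of the vocabulary /
// encoding machinery (the function name is made up for this example): for
// tokens {1, 2, 3} and n = 2 it produces counts for (1) (2) (3) (1 2) (2 3).
//
//   #include <map>
//   #include <vector>
//
//   std::map<std::vector<int>, int> CountNgramsSketch(
//       const std::vector<int>& tokens, unsigned int n) {
//     std::map<std::vector<int>, int> counts;
//     for (size_t k = 1; k <= n && k <= tokens.size(); ++k) {
//       for (size_t i = 0; i + k <= tokens.size(); ++i) {
//         // Count the k-gram starting at position i.
//         ++counts[std::vector<int>(tokens.begin() + i,
//                                   tokens.begin() + i + k)];
//       }
//     }
//     return counts;
//   }
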
void BleuScorer::DumpCounts(ostream* os, const NgramCounts& counts) const
{
  for (NgramCounts::const_iterator it = counts.begin();
       it != counts.end(); ++it) {
    *os << "(";
    const NgramCounts::Key& keys = it->first;
    for (size_t i = 0; i < keys.size(); ++i) {
      if (i != 0) {
        *os << " ";
      }
      *os << keys[i];
    }
    *os << ") : " << it->second << ", ";
  }
  *os << endl;
}
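
// Example output, assuming two entries whose encoded token ids are 3 and 7:
// a count map {(3): 2, (3 7): 1} would be printed as
//
//   (3) : 2, (3 7) : 1,
//
// (entries appear in whatever order NgramCounts' iterator yields them).
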
bool BleuScorer::OpenReferenceStream(istream* is, size_t file_id)
{
  if (is == NULL) return false;

  string line;
  size_t sid = 0;
  while (getline(*is, line)) {
    line = preprocessSentence(line);
    if (file_id == 0) {
      Reference* ref = new Reference;
      m_references.push_back(ref); // Take ownership of the Reference object.
    }
    if (m_references.size() <= sid) {
      cerr << "Reference " << file_id << " has too many sentences." << endl;
      return false;
    }
    NgramCounts counts;
    size_t length = CountNgrams(line, counts, kBleuNgramOrder);

    // For any counts larger than those already there, merge them in.
    for (NgramCounts::const_iterator ci = counts.begin();
         ci != counts.end(); ++ci) {
      const NgramCounts::Key& ngram = ci->first;
      const NgramCounts::Value newcount = ci->second;

      NgramCounts::Value oldcount = 0;
      m_references[sid]->get_counts()->Lookup(ngram, &oldcount);
      if (newcount > oldcount) {
        m_references[sid]->get_counts()->operator[](ngram) = newcount;
      }
    }
    // Add in the length of this reference.
    m_references[sid]->push_back(length);

    if (sid > 0 && sid % 100 == 0) {
      TRACE_ERR(".");
    }
    ++sid;
  }
  return true;
}
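
// Effect of the max-merge above, sketched with two reference files for the
// same sentence id (words stand in for their encoded token ids):
//
//   ref file 0: "the the cat"  ->  counts[(the)] = 2, counts[(cat)] = 1
//   ref file 1: "the cat cat"  ->  (the) stays at 2, (cat) raised to 2
//
// Each n-gram's stored count ends up as the maximum over all reference
// files, which is exactly the clipping limit BLEU needs, and one length is
// pushed per reference so a reference length can later be picked per the
// configured reference-length strategy.
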