void PerScorer::setReferenceFiles(const vector<string>& referenceFiles)
{
  // For each line in the reference file, create a multiset of the word ids.
  if (referenceFiles.size() != 1) {
    throw runtime_error("PER only supports a single reference");
  }
  m_ref_tokens.clear();
  m_ref_lengths.clear();
  ifstream in(referenceFiles[0].c_str());
  if (!in) {
    throw runtime_error("Unable to open " + referenceFiles[0]);
  }
  string line;
  int sid = 0;
  while (getline(in, line)) {
    line = this->preprocessSentence(line);
    vector<int> tokens;
    TokenizeAndEncode(line, tokens);
    m_ref_tokens.push_back(multiset<int>());
    for (size_t i = 0; i < tokens.size(); ++i) {
      m_ref_tokens.back().insert(tokens[i]);
    }
    m_ref_lengths.push_back(tokens.size());
    if (sid > 0 && sid % 100 == 0) {
      TRACE_ERR(".");
    }
    ++sid;
  }
  TRACE_ERR(endl);
}
void PerScorer::prepareStats(size_t sid, const string& text, ScoreStats& entry)
{
  if (sid >= m_ref_lengths.size()) {
    stringstream msg;
    msg << "Sentence id (" << sid << ") not found in reference set";
    throw runtime_error(msg.str());
  }
  string sentence = this->preprocessSentence(text);

  // Calculate correct, output_length and ref_length for
  // the line and store it in entry
  vector<int> testtokens;
  TokenizeAndEncode(sentence, testtokens);
  multiset<int> testtokens_all(testtokens.begin(), testtokens.end());
  set<int> testtokens_unique(testtokens.begin(), testtokens.end());
  int correct = 0;
  for (set<int>::iterator i = testtokens_unique.begin();
       i != testtokens_unique.end(); ++i) {
    int token = *i;
    correct += min(m_ref_tokens[sid].count(token), testtokens_all.count(token));
  }

  ostringstream stats;
  stats << correct << " " << testtokens.size() << " " << m_ref_lengths[sid] << " ";
  string stats_str = stats.str();
  entry.set(stats_str);
}
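// A minimal sketch of how the three statistics emitted above (correct,
// output_length, ref_length) could be folded into a PER-style score,
// following the standard position-independent error rate definition: clip
// the match count by any length overshoot, then normalize by the reference
// length. The standalone helper perFromStats is an assumption for
// illustration, not part of PerScorer.
static float perFromStats(float correct, float output_length, float ref_length)
{
  if (ref_length == 0) {
    return 0.0f;
  }
  // Penalize hypotheses that are longer than the reference.
  float num = correct - max(0.0f, output_length - ref_length);
  return num / ref_length;  // higher is better; 1 minus this is the error rate
}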
size_t BleuScorer::CountNgrams(const string& line, NgramCounts& counts,
                               unsigned int n, bool is_testing)
{
  assert(n > 0);
  vector<int> encoded_tokens;

  // When tokenizing a hypothesis translation, we don't have to update the
  // Scorer's word vocabulary. However, tokenizing reference translations
  // requires modifying the vocabulary, which means this procedure might be
  // slower than the tokenization of the hypothesis translation.
  if (is_testing) {
    TokenizeAndEncodeTesting(line, encoded_tokens);
  } else {
    TokenizeAndEncode(line, encoded_tokens);
  }

  const size_t len = encoded_tokens.size();
  vector<int> ngram;
  for (size_t k = 1; k <= n; ++k) {
    // ngram order longer than sentence - no point
    if (k > len) {
      continue;
    }
    for (size_t i = 0; i < len - k + 1; ++i) {
      ngram.clear();
      ngram.reserve(len);
      for (size_t j = i; j < i + k && j < len; ++j) {
        ngram.push_back(encoded_tokens[j]);
      }
      counts.Add(ngram);
    }
  }
  return len;
}
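// A minimal sketch of the clipped-precision step that consumes n-gram counts
// like those gathered above. NgramCounts is abstracted here as a plain
// map<vector<int>, int>, and the helper name clippedPrecision is an
// assumption for illustration; neither reflects the actual NgramCounts
// interface. Each hypothesis n-gram count is clipped by its count in the
// reference, which is what makes BLEU's precision "modified".
static double clippedPrecision(const map<vector<int>, int>& hyp,
                               const map<vector<int>, int>& ref)
{
  int matched = 0, total = 0;
  for (map<vector<int>, int>::const_iterator it = hyp.begin();
       it != hyp.end(); ++it) {
    total += it->second;
    map<vector<int>, int>::const_iterator r = ref.find(it->first);
    if (r != ref.end()) {
      // Count a hypothesis n-gram at most as often as the reference has it.
      matched += min(it->second, r->second);
    }
  }
  return total == 0 ? 0.0 : double(matched) / total;
}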
size_t BleuScorer::countNgrams(const string& line, counts_t& counts, unsigned int n)
{
  vector<int> encoded_tokens;
  TokenizeAndEncode(line, encoded_tokens);
  for (size_t k = 1; k <= n; ++k) {
    // ngram order longer than sentence - no point
    if (k > encoded_tokens.size()) {
      continue;
    }
    for (size_t i = 0; i < encoded_tokens.size() - k + 1; ++i) {
      vector<int> ngram;
      for (size_t j = i; j < i + k && j < encoded_tokens.size(); ++j) {
        ngram.push_back(encoded_tokens[j]);
      }
      int count = 1;
      counts_iterator oldcount = counts.find(ngram);
      if (oldcount != counts.end()) {
        count = (oldcount->second) + 1;
      }
      counts[ngram] = count;
    }
  }
  return encoded_tokens.size();
}
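// A side note on the find-then-assign accumulation above: assuming counts_t
// behaves like std::map<vector<int>, int>, operator[] value-initializes a
// missing entry to 0, so a single increment covers both the first and the
// repeat occurrence of an n-gram. A minimal equivalent sketch (the helper
// name addNgram is illustrative only):
static void addNgram(map<vector<int>, int>& counts, const vector<int>& ngram)
{
  ++counts[ngram];  // inserts the entry with value 0 first if ngram is unseen
}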
void SerScorer::setReferenceFiles(const vector<string>& referenceFiles)
{
  // make sure reference data is clear
  m_ref_sentences.clear();
  // load reference data
  for (size_t rid = 0; rid < referenceFiles.size(); ++rid) {
    ifstream refin(referenceFiles[rid].c_str());
    if (!refin) {
      throw runtime_error("Unable to open: " + referenceFiles[rid]);
    }
    m_ref_sentences.push_back(vector<sent_t>());
    string line;
    while (getline(refin, line)) {
      line = this->preprocessSentence(line);
      sent_t encoded;
      TokenizeAndEncode(line, encoded);
      m_ref_sentences[rid].push_back(encoded);
    }
  }
}
void CderScorer::prepareStatsVector(size_t sid, const string& text,
                                    vector<ScoreStatsType>& stats)
{
  sent_t cand;
  TokenizeAndEncode(text, cand);

  float max = -2;
  vector<ScoreStatsType> tmp;
  for (size_t rid = 0; rid < m_ref_sentences.size(); ++rid) {
    const sent_t& ref = m_ref_sentences[rid][sid];
    tmp.clear();
    computeCD(cand, ref, tmp);
    // Use float here: calculateScore() yields a fractional score, and
    // truncating it to int could select the wrong reference.
    float score = calculateScore(tmp);
    // Keep the statistics of the best-scoring reference seen so far.
    if (rid == 0 || score > max) {
      stats = tmp;
      max = score;
    }
  }
}
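// computeCD() is not shown here. As a point of reference, this is a minimal
// sketch of the plain word-level edit distance (Levenshtein) that CDER
// extends; the actual CDER measure additionally allows "long jump" block
// movements over the candidate, which this sketch does not model. The helper
// name editDistance is an assumption for illustration.
static int editDistance(const sent_t& cand, const sent_t& ref)
{
  vector<vector<int> > d(cand.size() + 1, vector<int>(ref.size() + 1, 0));
  for (size_t i = 0; i <= cand.size(); ++i) d[i][0] = int(i);
  for (size_t j = 0; j <= ref.size(); ++j) d[0][j] = int(j);
  for (size_t i = 1; i <= cand.size(); ++i) {
    for (size_t j = 1; j <= ref.size(); ++j) {
      int sub = d[i - 1][j - 1] + (cand[i - 1] == ref[j - 1] ? 0 : 1);
      // minimum over deletion, insertion, and (possibly free) substitution
      d[i][j] = min(min(d[i - 1][j] + 1, d[i][j - 1] + 1), sub);
    }
  }
  return d[cand.size()][ref.size()];
}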
void SerScorer::prepareStats(size_t sid, const string& text, ScoreStats& entry)
{
  if (sid >= m_ref_sentences[0].size()) {
    stringstream msg;
    msg << "Sentence id (" << sid << ") not found in reference set";
    throw runtime_error(msg.str());
  }
  string sentence = this->preprocessSentence(text);
  int n_ser = 1; // if we don't find a reference of the same length, the error is 1

  // Check whether the guessed text is equal to the reference
  // for this line and store it in entry
  vector<int> testtokens;
  TokenizeAndEncode(sentence, testtokens);
  for (size_t rid = 0; rid < m_ref_sentences.size(); ++rid) {
    const sent_t& ref = m_ref_sentences[rid][sid];
    // we can only have a perfect match if the sentence length is equal
    if (testtokens.size() == ref.size()) {
      int errors = 0;
      for (size_t tid = 0; tid < testtokens.size(); tid++) {
        // token mismatch: error 1 w.r.t. this reference; move to next ref.
        if (ref[tid] != testtokens[tid]) {
          errors = 1;
          break;
        }
      }
      if (errors == 0) {
        n_ser = 0;
        break;
      }
    }
  }

  ostringstream stats;
  stats << n_ser << " " << 1; // sentence error (0 or 1), number of sentences (1)
  string stats_str = stats.str();
  entry.set(stats_str);
}
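// A minimal sketch of how the per-sentence statistics emitted above (error
// flag, sentence count) could be aggregated into a corpus-level sentence
// error rate. The standalone helper serFromStats and its pair-based input
// are assumptions for illustration, not part of SerScorer.
static float serFromStats(const vector<pair<int, int> >& perSentence)
{
  int errors = 0, sentences = 0;
  for (size_t i = 0; i < perSentence.size(); ++i) {
    errors += perSentence[i].first;     // 0 or 1 per sentence
    sentences += perSentence[i].second; // always 1 per sentence here
  }
  // fraction of sentences that are not an exact match to any reference
  return sentences == 0 ? 0.0f : float(errors) / sentences;
}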