/**
 * Load the reference permutations used for scoring.
 *
 * The plain reference text files are read only to obtain the number of words
 * (via getNumberWords) of every target sentence. The permutations themselves
 * are built from the alignment files listed in m_referenceAlignments, one
 * Permutation per line, using the source length stored in m_sourceLengths and
 * the target length taken from the reference text file with the same index.
 *
 * @param referenceFiles  paths of the reference text files; file i supplies
 *                        the target lengths for alignment file i.
 * @throws runtime_error  if a file cannot be opened, if an alignment file has
 *                        a line with no corresponding source or reference
 *                        sentence, or if a permutation's length differs from
 *                        the stored source length.
 */
void PermutationScorer::setReferenceFiles(const vector<string>& referenceFiles)
{
  cout << "*******setReferenceFiles" << endl;
  //make sure reference data is clear
  m_referencePerms.clear();

  //Just getting target length from reference text file:
  //targetLengths[file][sentence] = number of words in that reference sentence
  vector< vector<int> > targetLengths;
  for (size_t i = 0; i < referenceFiles.size(); ++i) {
    vector<int> lengths;
    cout << "Loading reference from " << referenceFiles[i] << endl;
    ifstream refin(referenceFiles[i].c_str());
    if (!refin) {
      cerr << "Unable to open: " << referenceFiles[i] << endl;
      //this is the reference text, not an alignment file
      throw runtime_error("Unable to open reference file");
    }
    string line;
    while (getline(refin,line)) {
      int count = getNumberWords(line);
      lengths.push_back(count);
    }
    targetLengths.push_back(lengths);
  }

  //load reference data
  //NOTE ignoring normal reference file, only using previously saved alignment reference files
  for (size_t i = 0; i < m_referenceAlignments.size(); ++i) {
    vector<Permutation> referencePerms;
    cout << "Loading reference from " << m_referenceAlignments[i] << endl;
    ifstream refin(m_referenceAlignments[i].c_str());
    if (!refin) {
      cerr << "Unable to open: " << m_referenceAlignments[i] << endl;
      throw runtime_error("Unable to open alignment file");
    }
    string line;
    size_t sid = 0; //sentence counter
    while (getline(refin,line)) {
      //Guard the lookups below: an alignment file with more lines than the
      //source or the reference text would otherwise index past the end.
      if (sid >= m_sourceLengths.size()) {
        cerr << "Alignment file " << m_referenceAlignments[i]
             << " has a sentence with sid " << sid
             << " but only " << m_sourceLengths.size()
             << " source lengths are known" << endl;
        throw runtime_error("Alignment file has more sentences than the source");
      }
      if (i >= targetLengths.size() || sid >= targetLengths[i].size()) {
        cerr << "No reference sentence for sid " << sid
             << " of alignment file " << m_referenceAlignments[i] << endl;
        throw runtime_error("Alignment file has no matching reference sentence");
      }

      //Line needs to be of the format: 0-0 1-1 1-2 etc source-target
      Permutation perm(line, m_sourceLengths[sid], targetLengths[i][sid]);
      referencePerms.push_back(perm);

      //check the source sentence length is the same for previous file
      if (perm.getLength() != m_sourceLengths[sid]) {
        cerr << "Permutation Length: " << perm.getLength() << endl;
        cerr << "Source length: " << m_sourceLengths[sid] << " for sid " << sid << endl;
        throw runtime_error("Source sentence lengths not the same: ");
      }

      sid++;
    }
    m_referencePerms.push_back(referencePerms);
  }
}
void SemposScorer::setReferenceFiles(const vector<string>& referenceFiles)
{
  //make sure reference data is clear
  m_ref_sentences.clear();

  //load reference data
  for (size_t rid = 0; rid < referenceFiles.size(); ++rid) {
    ifstream refin(referenceFiles[rid].c_str());
    if (!refin) {
      throw runtime_error("Unable to open: " + referenceFiles[rid]);
    }
    m_ref_sentences.push_back(vector<sentence_t>());
    string line;
    while (getline(refin,line)) {
      line = preprocessSentence(line);

      str_sentence_t sentence;
      splitSentence(line, sentence);

      sentence_t encodedSentence;
      encodeSentence(sentence, encodedSentence);

      m_ref_sentences[rid].push_back(encodedSentence);
    }
  }
}
Example #3
0
void BleuScorer::setReferenceFiles(const vector<string>& referenceFiles)
{
  //make sure reference data is clear
  m_ref_counts.reset();
  m_ref_lengths.clear();
  ClearEncoder();

  //load reference data
  for (size_t i = 0; i < referenceFiles.size(); ++i) {
    TRACE_ERR("Loading reference from " << referenceFiles[i] << endl);
    ifstream refin(referenceFiles[i].c_str());
    if (!refin) {
      throw runtime_error("Unable to open: " + referenceFiles[i]);
    }
    string line;
    size_t sid = 0; //sentence counter
    while (getline(refin,line)) {
      //cerr << line << endl;
      if (i == 0) {
        counts_t *counts = new counts_t; //these get leaked
        m_ref_counts.push_back(counts);
        vector<size_t> lengths;
        m_ref_lengths.push_back(lengths);
      }
      if (m_ref_counts.size() <= sid) {
        throw runtime_error("File " + referenceFiles[i] + " has too many sentences");
      }
      counts_t counts;
      size_t length = countNgrams(line,counts,kLENGTH);
      //for any counts larger than those already there, merge them in
      for (counts_iterator ci = counts.begin(); ci != counts.end(); ++ci) {
        counts_iterator oldcount_it = m_ref_counts[sid]->find(ci->first);
        int oldcount = 0;
        if (oldcount_it != m_ref_counts[sid]->end()) {
          oldcount = oldcount_it->second;
        }
        int newcount = ci->second;
        if (newcount > oldcount) {
          m_ref_counts[sid]->operator[](ci->first) = newcount;
        }
      }
      //add in the length
      m_ref_lengths[sid].push_back(length);
      if (sid > 0 && sid % 100 == 0) {
        TRACE_ERR(".");
      }
      ++sid;
    }
    TRACE_ERR(endl);
  }
}