void PermutationScorer::setReferenceFiles(const vector<string>& referenceFiles) { cout << "*******setReferenceFiles" << endl; //make sure reference data is clear m_referencePerms.clear(); vector< vector< int> > targetLengths; //Just getting target length from reference text file for (size_t i = 0; i < referenceFiles.size(); ++i) { vector <int> lengths; cout << "Loading reference from " << referenceFiles[i] << endl; ifstream refin(referenceFiles[i].c_str()); if (!refin) { cerr << "Unable to open: " << referenceFiles[i] << endl; throw runtime_error("Unable to open alignment file"); } string line; while (getline(refin,line)) { int count = getNumberWords(line); lengths.push_back(count); } targetLengths.push_back(lengths); } //load reference data //NOTE ignoring normal reference file, only using previously saved alignment reference files for (size_t i = 0; i < m_referenceAlignments.size(); ++i) { vector<Permutation> referencePerms; cout << "Loading reference from " << m_referenceAlignments[i] << endl; ifstream refin(m_referenceAlignments[i].c_str()); if (!refin) { cerr << "Unable to open: " << m_referenceAlignments[i] << endl; throw runtime_error("Unable to open alignment file"); } string line; size_t sid = 0; //sentence counter while (getline(refin,line)) { //cout << line << endl; //Line needs to be of the format: 0-0 1-1 1-2 etc source-target Permutation perm(line, m_sourceLengths[sid],targetLengths[i][sid]); //perm.dump(); referencePerms.push_back(perm); //check the source sentence length is the same for previous file if (perm.getLength() != m_sourceLengths[sid]) { cerr << "Permutation Length: " << perm.getLength() << endl; cerr << "Source length: " << m_sourceLengths[sid] << " for sid " << sid << endl; throw runtime_error("Source sentence lengths not the same: "); } sid++; } m_referencePerms.push_back(referencePerms); } }
void SemposScorer::setReferenceFiles(const vector<string>& referenceFiles) { //make sure reference data is clear m_ref_sentences.clear(); //load reference data for (size_t rid = 0; rid < referenceFiles.size(); ++rid) { ifstream refin(referenceFiles[rid].c_str()); if (!refin) { throw runtime_error("Unable to open: " + referenceFiles[rid]); } m_ref_sentences.push_back(vector<sentence_t>()); string line; while (getline(refin,line)) { line = preprocessSentence(line); str_sentence_t sentence; splitSentence(line, sentence); sentence_t encodedSentence; encodeSentence(sentence, encodedSentence); m_ref_sentences[rid].push_back(encodedSentence); } } }
void BleuScorer::setReferenceFiles(const vector<string>& referenceFiles) { //make sure reference data is clear m_ref_counts.reset(); m_ref_lengths.clear(); ClearEncoder(); //load reference data for (size_t i = 0; i < referenceFiles.size(); ++i) { TRACE_ERR("Loading reference from " << referenceFiles[i] << endl); ifstream refin(referenceFiles[i].c_str()); if (!refin) { throw runtime_error("Unable to open: " + referenceFiles[i]); } string line; size_t sid = 0; //sentence counter while (getline(refin,line)) { //cerr << line << endl; if (i == 0) { counts_t *counts = new counts_t; //these get leaked m_ref_counts.push_back(counts); vector<size_t> lengths; m_ref_lengths.push_back(lengths); } if (m_ref_counts.size() <= sid) { throw runtime_error("File " + referenceFiles[i] + " has too many sentences"); } counts_t counts; size_t length = countNgrams(line,counts,kLENGTH); //for any counts larger than those already there, merge them in for (counts_iterator ci = counts.begin(); ci != counts.end(); ++ci) { counts_iterator oldcount_it = m_ref_counts[sid]->find(ci->first); int oldcount = 0; if (oldcount_it != m_ref_counts[sid]->end()) { oldcount = oldcount_it->second; } int newcount = ci->second; if (newcount > oldcount) { m_ref_counts[sid]->operator[](ci->first) = newcount; } } //add in the length m_ref_lengths[sid].push_back(length); if (sid > 0 && sid % 100 == 0) { TRACE_ERR("."); } ++sid; } TRACE_ERR(endl); } }