void Data::InitFeatureMap(const string& str) { string buf = str; string substr; string features = ""; string tmp_name = ""; size_t tmp_index = 0; string::size_type loc; char tmp[64]; // for snprintf(); while (!buf.empty()) { getNextPound(buf, substr); // string ending with ":" are skipped, because they are the names of the features if ((loc = substr.find_last_of(":")) != substr.length()-1) { snprintf(tmp, sizeof(tmp), "%s_%lu ", tmp_name.c_str(), tmp_index); features.append(tmp); tmp_index++; } else if (substr.find("_") != string::npos) { // ignore sparse feature name and its value getNextPound(buf, substr); } else { // update current feature name tmp_index = 0; tmp_name = substr.substr(0, substr.size() - 1); } } m_feature_data->setFeatureMap(features); }
void Data::InitFeatureMap(const string& str) { string buf = str; string substr; string features = ""; string tmp_name = ""; size_t tmp_index = 0; while (!buf.empty()) { getNextPound(buf, substr); // string ending with "=" are skipped, because they are the names of the features if (!EndsWith(substr, "=")) { stringstream ss; ss << tmp_name << "_" << tmp_index << " "; features.append(ss.str()); tmp_index++; } else if (substr.find("_") != string::npos) { // ignore sparse feature name and its value getNextPound(buf, substr); } else { // update current feature name tmp_index = 0; tmp_name = substr.substr(0, substr.size() - 1); } } m_feature_data->setFeatureMap(features); }
/**
 * Reads an n-best list and records score statistics and features for every
 * non-empty line.
 *
 * Each line is split on "|||" into sentence index, hypothesis and feature
 * field. The feature map is initialised from the first line processed while
 * existsFeatureNames() is still false.
 *
 * @param file path of the n-best list.
 * @throws runtime_error if the file cannot be opened.
 */
void Data::loadNBest(const string &file)
{
  TRACE_ERR("loading nbest from " << file << endl);

  inputfilestream inp(file); // opens the file
  if (!inp.good()) {
    throw runtime_error("Unable to open: " + file);
  }

  ScoreStats stats;
  string row;
  string sent_id;
  string hypothesis;
  string feature_field;

  while (getline(inp, row, '\n')) {
    if (!row.empty()) {
      // statistics for the error measure
      stats.clear();

      getNextPound(row, sent_id, "|||");        // field 1
      getNextPound(row, hypothesis, "|||");     // field 2
      getNextPound(row, feature_field, "|||");  // field 3

      m_scorer->prepareStats(sent_id, hypothesis, stats);
      m_score_data->add(stats, sent_id);

      // feature names are taken from the first line that carries them
      if (!existsFeatureNames()) {
        InitFeatureMap(feature_field);
      }
      AddFeatures(feature_field, sent_id);
    }
  }

  inp.close();
}
// TODO: This is too long. Consider creating a function for // initialization such as Init(). InterpolatedScorer::InterpolatedScorer(const string& name, const string& config) : Scorer(name,config) { // name would be: HAMMING,BLEU or similar string scorers = name; while (scorers.length() > 0) { string scorertype = ""; getNextPound(scorers, scorertype,","); Scorer *scorer = ScorerFactory::getScorer(scorertype,config); m_scorers.push_back(scorer); } if (m_scorers.size() == 0) { throw runtime_error("There are no scorers"); } cerr << "Number of scorers: " << m_scorers.size() << endl; //TODO debug this string wtype = getConfig("weights",""); //Default weights set to uniform ie. if two weights 0.5 each //weights should add to 1 if (wtype.length() == 0) { float weight = 1.0 / m_scorers.size() ; //cout << " Default weights:" << weight << endl; for (size_t i = 0; i < m_scorers.size(); i ++) { m_scorer_weights.push_back(weight); } } else { float tot=0; //cout << "Defined weights:" << endl; while (wtype.length() > 0) { string scoreweight = ""; getNextPound(wtype,scoreweight,"+"); float weight = atof(scoreweight.c_str()); m_scorer_weights.push_back(weight); tot += weight; //cout << " :" << weight ; } //cout << endl; if (tot != float(1)) { // TODO: fix this checking in terms of readability. for (vector<float>::iterator it = m_scorer_weights.begin(); it != m_scorer_weights.end(); ++it) { *it /= tot; } } if (m_scorers.size() != m_scorer_weights.size()) { throw runtime_error("The number of weights does not equal the number of scorers!"); } } cerr << "The weights for the interpolated scorers are: " << endl; for (vector<float>::iterator it = m_scorer_weights.begin(); it < m_scorer_weights.end(); it++) { cerr << *it << " " ; } cerr <<endl; }
void ScoreArray::load(istream* is) { size_t number_of_entries = 0; bool binmode = false; string substring, stringBuf; string::size_type loc; getline(*is, stringBuf); if (!is->good()) { return; } if (!stringBuf.empty()) { if ((loc = stringBuf.find(SCORES_TXT_BEGIN)) == 0) { binmode=false; } else if ((loc = stringBuf.find(SCORES_BIN_BEGIN)) == 0) { binmode=true; } else { TRACE_ERR("ERROR: ScoreArray::load(): Wrong header"); return; } getNextPound(stringBuf, substring); getNextPound(stringBuf, substring); m_index = atoi(substring.c_str()); getNextPound(stringBuf, substring); number_of_entries = atoi(substring.c_str()); getNextPound(stringBuf, substring); m_num_scores = atoi(substring.c_str()); getNextPound(stringBuf, substring); m_score_type = substring; } if (binmode) { loadbin(is, number_of_entries); } else { loadtxt(is, number_of_entries); } getline(*is, stringBuf); if (!stringBuf.empty()) { if ((loc = stringBuf.find(SCORES_TXT_END)) != 0 && (loc = stringBuf.find(SCORES_BIN_END)) != 0) { TRACE_ERR("ERROR: ScoreArray::load(): Wrong footer"); return; } } }
void InterpolatedScorer::prepareStats(size_t sid, const string& text, ScoreStats& entry) { stringstream buff; string align = text; string sentence = ""; size_t alignmentData = text.find("|||"); //Get sentence and alignment parts if(alignmentData != string::npos) { getNextPound(align,sentence, "|||"); } int i = 0; for (ScopedVector<Scorer>::iterator itsc = m_scorers.begin(); itsc != m_scorers.end(); ++itsc) { ScoreStats tempEntry; if ((*itsc)->useAlignment()) { (*itsc)->prepareStats(sid, text, tempEntry); } else { (*itsc)->prepareStats(sid, sentence, tempEntry); } if (i > 0) buff << " "; buff << tempEntry; i++; } //cout << " Scores for interpolated: " << buff << endl; string str = buff.str(); entry.set(str); }
void Data::AddFeatures(const string& str, int sentence_index) { string buf = str; string substr; FeatureStats feature_entry; feature_entry.reset(); while (!buf.empty()) { getNextPound(buf, substr); // no ':' -> feature value that needs to be stored if (!EndsWith(substr, ":")) { feature_entry.add(ConvertStringToFeatureStatsType(substr)); } else if (substr.find("_") != string::npos) { // sparse feature name? store as well string name = substr; getNextPound(buf, substr); feature_entry.addSparse(name, atof(substr.c_str())); } } m_feature_data->add(feature_entry, sentence_index); }
void Data::AddFeatures(const string& str, const string& sentence_index) { string::size_type loc; string buf = str; string substr; FeatureStats feature_entry; feature_entry.reset(); while (!buf.empty()) { getNextPound(buf, substr); // no ':' -> feature value that needs to be stored if ((loc = substr.find_last_of(":")) != substr.length()-1) { feature_entry.add(ConvertStringToFeatureStatsType(substr)); } else if (substr.find("_") != string::npos) { // sparse feature name? store as well string name = substr; getNextPound(buf, substr); feature_entry.addSparse(name, atof(substr.c_str())); m_sparse_flag = true; } } m_feature_data->add(feature_entry, sentence_index); }
void FeatureStats::set(string &theString) { string substring, stringBuf; reset(); while (!theString.empty()) { getNextPound(theString, substring); // regular feature if (substring.find(":") == string::npos) { add(ConvertStringToFeatureStatsType(substring)); } // sparse feature else { size_t separator = substring.find_last_of(":"); addSparse(substring.substr(0,separator), atof(substring.substr(separator+1).c_str()) ); } } }
void PermutationScorer::prepareStats(size_t sid, const string& text, ScoreStats& entry) { //cout << "*******prepareStats" ; //cout << text << endl; //cout << sid << endl; //cout << "Reference0align:" << endl; //m_referencePerms[0][sid].dump(); string sentence = ""; string align = text; size_t alignmentData = text.find("|||"); //Get sentence and alignment parts if(alignmentData != string::npos) { getNextPound(align,sentence, "|||"); } else { align = text; } int translationLength = getNumberWords(sentence); //A vector of Permutations for each sentence vector< vector<Permutation> > nBestPerms; float distanceValue; //need to create permutations for each nbest line string standardFormat = Permutation::convertMosesToStandard(align); Permutation perm(standardFormat, m_sourceLengths[sid],translationLength); //perm.dump(); if (m_refChoiceStrategy == REFERENCE_CHOICE_AVERAGE) { float total = 0; for (size_t i = 0; i < m_referencePerms.size(); ++i) { float dist = perm.distance(m_referencePerms[i][sid], m_distanceMetric); total += dist; //cout << "Ref number: " << i << " distance: " << dist << endl; } float mean = (float)total/m_referencePerms.size(); //cout << "MultRef strategy AVERAGE: total " << total << " mean " << mean << " number " << m_referencePerms.size() << endl; distanceValue = mean; } else if (m_refChoiceStrategy == REFERENCE_CHOICE_CLOSEST) { float max_val = 0; for (size_t i = 0; i < m_referencePerms.size(); ++i) { //look for the closest reference float value = perm.distance(m_referencePerms[i][sid], m_distanceMetric); //cout << "Ref number: " << i << " distance: " << value << endl; if (value > max_val) { max_val = value; } } distanceValue = max_val; //cout << "MultRef strategy CLOSEST: max_val " << distanceValue << endl; } else { throw runtime_error("Unsupported reflength strategy"); } //SCOREROUT eg: 0.04546 ostringstream tempStream; tempStream.precision(SCORE_PRECISION); tempStream << distanceValue; string str = tempStream.str(); entry.set(str); //cout << 
tempStream.str(); }
/**
 * Configures a permutation scorer.
 *
 * Reads three config entries:
 *  - "refchoice": "average" or "closest" (default "closest");
 *  - "refalign":  '+'-separated reference alignment files, appended to
 *                 m_referenceAlignments;
 *  - "source":    source file; the number of space-separated tokens of each
 *                 line is appended to m_sourceLengths.
 *
 * @param distanceMetric "HAMMING" or "KENDALL".
 * @param config         scorer configuration string.
 * @throws runtime_error on an unknown reference choice strategy, an unknown
 *         distance metric, or a source file that cannot be opened.
 */
PermutationScorer::PermutationScorer(const string &distanceMetric, const string &config)
  :StatisticsBasedScorer(distanceMetric,config)
{
  //configure regularisation
  static string KEY_REFCHOICE = "refchoice";
  static string REFCHOICE_AVERAGE = "average";
  static string REFCHOICE_CLOSEST = "closest";

  string refchoice = getConfig(KEY_REFCHOICE,REFCHOICE_CLOSEST);
  if (refchoice == REFCHOICE_AVERAGE) {
    m_refChoiceStrategy = REFERENCE_CHOICE_AVERAGE;
  } else if (refchoice == REFCHOICE_CLOSEST) {
    m_refChoiceStrategy = REFERENCE_CHOICE_CLOSEST;
  } else {
    throw runtime_error("Unknown reference choice strategy: " + refchoice);
  }
  cerr << "Using reference choice strategy: " << refchoice << endl;

  if (distanceMetric.compare("HAMMING") == 0) {
    m_distanceMetric = HAMMING_DISTANCE;
  } else if (distanceMetric.compare("KENDALL") == 0) {
    m_distanceMetric = KENDALL_DISTANCE;
  } else {
    // Any other name would leave m_distanceMetric unassigned by this
    // constructor while still being reported as "in use" below; reject it
    // the same way an unknown refchoice is rejected.
    throw runtime_error("Unknown permutation distance metric: " + distanceMetric);
  }
  cerr << "Using permutation distance metric: " << distanceMetric << endl;

  //Get reference alignments from scconfig refalign option
  static string KEY_ALIGNMENT_FILES = "refalign";
  string refalign = getConfig(KEY_ALIGNMENT_FILES,"");
  if (refalign.length() > 0) {
    string substring;
    while (!refalign.empty()) {
      getNextPound(refalign, substring, "+");
      m_referenceAlignments.push_back(substring);
    }
  }

  //Get length of source sentences read in from scconfig source option
  // this is essential for extractor but unnecessary for mert executable
  static string KEY_SOURCE_FILE = "source";
  string sourceFile = getConfig(KEY_SOURCE_FILE,"");
  if (sourceFile.length() > 0) {
    cerr << "Loading source sentence lengths from " << sourceFile << endl;
    ifstream sourcein(sourceFile.c_str());
    if (!sourcein) {
      throw runtime_error("Unable to open: " + sourceFile);
    }
    string line;
    while (getline(sourcein,line)) {
      // one getNextPound() call per token consumes the line
      size_t wordNumber = 0;
      string word;
      while (!line.empty()) {
        getNextPound(line, word, " ");
        wordNumber++;
      }
      m_sourceLengths.push_back(wordNumber);
    }
    sourcein.close();
  }
}
void PermutationScorer::prepareStats(size_t sid, const string& text, ScoreStats& entry) { //bool debug= (verboselevel()>3); // TODO: fix verboselevel() bool debug=false; if (debug) { cout << "*******prepareStats" ; cout << text << endl; cout << sid << endl; cout << "Reference0align:" << endl; m_referencePerms[0][sid].dump(); } string sentence = ""; string align = text; size_t alignmentData = text.find("|||"); //Get sentence and alignment parts if(alignmentData != string::npos) { getNextPound(align,sentence, "|||"); } else { align = text; } int translationLength = getNumberWords(sentence); //A vector of Permutations for each sentence vector< vector<Permutation> > nBestPerms; float distanceValue; //need to create permutations for each nbest line //here we check if the alignments extracted from the nbest are phrase-based or word-based, in which case no conversion is needed bool isWordAlignment=true; string alignCopy = align; string align1; getNextPound(alignCopy,align1," "); if (align1.length() > 0) { size_t phraseDelimeter = align1.find("="); if(phraseDelimeter!= string::npos) isWordAlignment=false; } string standardFormat = align; if(!isWordAlignment) standardFormat= Permutation::convertMosesToStandard(align); if (debug) { cerr << "Nbest alignment: " << align << endl; cerr << "-->std alignment: " << standardFormat << endl; } Permutation perm(standardFormat, m_sourceLengths[sid],translationLength); //perm.dump(); if (m_refChoiceStrategy == REFERENCE_CHOICE_AVERAGE) { float total = 0; for (size_t i = 0; i < m_referencePerms.size(); ++i) { float dist = perm.distance(m_referencePerms[i][sid], m_distanceMetric); total += dist; //cout << "Ref number: " << i << " distance: " << dist << endl; } float mean = (float)total/m_referencePerms.size(); //cout << "MultRef strategy AVERAGE: total " << total << " mean " << mean << " number " << m_referencePerms.size() << endl; distanceValue = mean; } else if (m_refChoiceStrategy == REFERENCE_CHOICE_CLOSEST) { float max_val = 0; for 
(size_t i = 0; i < m_referencePerms.size(); ++i) { //look for the closest reference float value = perm.distance(m_referencePerms[i][sid], m_distanceMetric); //cout << "Ref number: " << i << " distance: " << value << endl; if (value > max_val) { max_val = value; } } distanceValue = max_val; //cout << "MultRef strategy CLOSEST: max_val " << distanceValue << endl; } else { throw runtime_error("Unsupported reflength strategy"); } //SCOREROUT eg: 0.04546 distanceValue*=SCORE_MULTFACT; //SCOREROUT eg: 4546 to transform float into integer ostringstream tempStream; tempStream.precision(0); // decimal precision not needed as score was multiplied per SCORE_MULTFACT tempStream << std::fixed << distanceValue << " 1"; //use for final normalization over the amount of test sentences string str = tempStream.str(); entry.set(str); //cout << distanceValue << "=" << distanceValue << " (str:" << tempStream.str() << ")" << endl; }
/**
 * Reads an n-best list and stores score statistics and features for every
 * non-empty line.
 *
 * Each line is split on "|||" into sentence index, sentence and feature
 * field. While existsFeatureNames() is false, the feature map is first built
 * from the feature field ("<name>_<index> " per dense value) and handed to
 * featdata->setFeatureMap(). The field is then parsed again to collect dense
 * values and sparse name/value pairs; a sparse pair sets _sparse_flag.
 *
 * @param file path of the n-best list.
 * @throws runtime_error if the file cannot be opened.
 */
void Data::loadnbest(const std::string &file)
{
  TRACE_ERR("loading nbest from " << file << std::endl);

  FeatureStats featentry;
  ScoreStats scoreentry;
  std::string sentence_index;

  inputfilestream inp(file); // opens the file
  if (!inp.good()) {
    throw runtime_error("Unable to open: " + file);
  }

  std::string field;
  std::string token;
  std::string row;
  std::string theSentence;

  while (getline(inp, row, '\n')) {
    if (row.empty()) {
      continue;
    }

    getNextPound(row, field, "|||"); // first field
    sentence_index = field;
    getNextPound(row, field, "|||"); // second field
    theSentence = field;

    // adding statistics for error measures
    featentry.reset();
    scoreentry.clear();
    theScorer->prepareStats(sentence_index, theSentence, scoreentry);
    scoredata->add(scoreentry, sentence_index);

    getNextPound(row, field, "|||"); // third field: features

    // examine first line for name of features
    if (!existsFeatureNames()) {
      std::string name_scan = field; // scanned on a copy; `field` is parsed below
      std::string features = "";
      std::string current_name = "";
      size_t value_index = 0;

      while (!name_scan.empty()) {
        getNextPound(name_scan, token);

        // Equivalent to find_last_of(":") != length()-1, empty token included.
        const bool is_value = !token.empty() && token[token.size() - 1] != ':';

        if (is_value) {
          features += current_name;
          features += "_";
          features += stringify(value_index);
          features += " ";
          value_index++;
        } else if (token.find("_") != string::npos) {
          // ignore sparse feature name, and also its value
          getNextPound(name_scan, token);
        } else {
          // update current feature name
          value_index = 0;
          current_name = token.substr(0, token.size() - 1);
        }
      }

      featdata->setFeatureMap(features);
    }

    // adding features
    while (!field.empty()) {
      getNextPound(field, token);

      const bool is_value = !token.empty() && token[token.size() - 1] != ':';

      if (is_value) {
        // no ':' -> feature value that needs to be stored
        featentry.add(ATOFST(token.c_str()));
      } else if (token.find("_") != string::npos) {
        // sparse feature name: store it with the value that follows
        std::string name = token;
        getNextPound(field, token);
        featentry.addSparse(name, atof(token.c_str()));
        _sparse_flag = true;
      }
    }

    featdata->add(featentry, sentence_index);
  }

  inp.close();
}
/**
 * Reads an n-best list for an AsiyaScorer.
 *
 * Every non-empty line is split on "|||" into sentence index, sentence,
 * feature field and, when present, a sentence score and an alignment. The
 * sentence is passed to the scorer's prepareStats() and its features are
 * stored. After the whole file has been read, doScoring() is called and all
 * score statistics returned by getAllScoreStats() are added to m_score_data,
 * keyed by their outer index.
 *
 * @param file path of the n-best list.
 * @throws runtime_error if the file cannot be opened or if m_scorer is not
 *         an AsiyaScorer.
 */
void DataAsiya::loadNBest(const string &file)
{
  TRACE_ERR("loading nbest from DataAsiya " << file << endl);
  inputfilestream inp(file); // matches a stream with a file. Opens the file
  if (!inp.good())
    throw runtime_error("Unable to open: " + file);

  // The cast yields a null pointer for any other scorer type; fail loudly
  // instead of dereferencing it below.
  AsiyaScorer* a_scorer = dynamic_cast<AsiyaScorer*>(m_scorer);
  if (!a_scorer) {
    throw runtime_error("DataAsiya::loadNBest requires an AsiyaScorer");
  }

  ScoreStats scoreentry;
  string line, sentence_index, sentence, feature_str, alignment;

  /* TODO: change this loop. Instead of obtaining the score for each
     sentence, obtain all the scores at once! */
  while (getline(inp, line, '\n')) {
    if (line.empty()) continue;

    // adding statistics for error measures
    scoreentry.clear();
    // Reset per line so that a line without an alignment field does not
    // silently reuse the alignment of the previous line.
    alignment.clear();

    getNextPound(line, sentence_index, "|||"); // first field
    getNextPound(line, sentence, "|||");       // second field
    getNextPound(line, feature_str, "|||");    // third field

    if (line.length() > 0) {
      string temp;
      getNextPound(line, temp, "|||"); // fourth field: sentence score (unused)
      if (line.length() > 0) {
        getNextPound(line, alignment, "|||"); // fifth field: only there for alignment scorers
      }
    }

    //TODO check alignment exists if scorers need it
    if (a_scorer->useAlignment()) {
      sentence += "|||";
      sentence += alignment;
    }

    const int sid = atoi(sentence_index.c_str());
    a_scorer->prepareStats(sid, sentence, scoreentry);

    // examine first line for name of features
    if (!existsFeatureNames()) {
      InitFeatureMap(feature_str);
    }
    AddFeatures(feature_str, sid);
  }

  a_scorer->doScoring();

  std::vector<std::vector <ScoreStats> > allScoreStats = a_scorer->getAllScoreStats();
  for (size_t i = 0; i < allScoreStats.size(); ++i) {
    for (size_t j = 0; j < allScoreStats[i].size(); ++j) {
      // The outer index is the key; it is passed as int, as before.
      m_score_data->add(allScoreStats[i][j], static_cast<int>(i));
    }
  }

  inp.close();
}