Esempio n. 1
0
// Builds the dense-feature name map from the feature field of one n-best
// entry and passes it to m_feature_data->setFeatureMap().
//
// The field is split into tokens with getNextPound(). A token ending in ':'
// is a feature name; any other token is a value of the most recently seen
// name. Every value contributes one "<name>_<index> " entry, the index
// restarting at 0 for each new name. A name containing '_' is treated as a
// sparse feature: the name and the token that follows it are skipped.
void Data::InitFeatureMap(const string& str) {
  string buf = str;
  string substr;
  string features = "";
  string tmp_name = "";
  size_t tmp_index = 0;
  char index_buf[32];                   // holds only the decimal index

  while (!buf.empty()) {
    getNextPound(buf, substr);

    // Tokens ending with ":" are feature names; everything else is a value.
    // For an empty token both sides of the comparison are npos, so it is
    // handled by the name branches below rather than counted as a value.
    if (substr.find_last_of(":") != substr.length()-1) {
      // Only the index goes through snprintf(); the name is appended as a
      // string so that a long feature name can never be truncated by a
      // fixed-size buffer (which would also swallow the separating space).
      snprintf(index_buf, sizeof(index_buf), "%lu",
               static_cast<unsigned long>(tmp_index));
      features.append(tmp_name);
      features.append("_");
      features.append(index_buf);
      features.append(" ");

      tmp_index++;
    } else if (substr.find("_") != string::npos) {
      // ignore sparse feature name and its value
      getNextPound(buf, substr);
    } else {                              // update current feature name
      tmp_index = 0;
      tmp_name = substr.substr(0, substr.size() - 1);
    }
  }
  m_feature_data->setFeatureMap(features);
}
Esempio n. 2
0
void Data::InitFeatureMap(const string& str)
{
  string buf = str;
  string substr;
  string features = "";
  string tmp_name = "";
  size_t tmp_index = 0;

  while (!buf.empty()) {
    getNextPound(buf, substr);

    // string ending with "=" are skipped, because they are the names of the features
    if (!EndsWith(substr, "=")) {
      stringstream ss;
      ss << tmp_name << "_" << tmp_index << " ";
      features.append(ss.str());

      tmp_index++;
    } else if (substr.find("_") != string::npos) {
      // ignore sparse feature name and its value
      getNextPound(buf, substr);
    } else {                              // update current feature name
      tmp_index = 0;
      tmp_name = substr.substr(0, substr.size() - 1);
    }
  }
  m_feature_data->setFeatureMap(features);
}
Esempio n. 3
0
void Data::loadNBest(const string &file)
{
  TRACE_ERR("loading nbest from " << file << endl);
  inputfilestream inp(file); // matches a stream with a file. Opens the file
  if (!inp.good())
    throw runtime_error("Unable to open: " + file);

  ScoreStats scoreentry;
  string line, sentence_index, sentence, feature_str;

  while (getline(inp, line, '\n')) {
    if (line.empty()) continue;
    // adding statistics for error measures
    scoreentry.clear();

    getNextPound(line, sentence_index, "|||"); // first field
    getNextPound(line, sentence, "|||");       // second field
    getNextPound(line, feature_str, "|||");    // third field

    m_scorer->prepareStats(sentence_index, sentence, scoreentry);
    m_score_data->add(scoreentry, sentence_index);

    // examine first line for name of features
    if (!existsFeatureNames()) {
      InitFeatureMap(feature_str);
    }
    AddFeatures(feature_str, sentence_index);
  }
  inp.close();
}
Esempio n. 4
0
// TODO: This is too long. Consider creating a function for
// initialization such as Init().
//
// Builds an interpolated scorer.
//
// `name` is a comma-separated list of scorer types (e.g. "HAMMING,BLEU");
// one scorer per entry is obtained from ScorerFactory::getScorer() with the
// same `config`. The "weights" config value, when present, is a
// '+'-separated list with one weight per scorer; the weights are rescaled
// so that they sum to 1. Without it every scorer gets the weight 1/N.
//
// Throws runtime_error when no scorer was named, when the number of weights
// differs from the number of scorers, or when the weights sum to zero.
InterpolatedScorer::InterpolatedScorer(const string& name, const string& config)
    : Scorer(name,config)
{
  // name would be: HAMMING,BLEU or similar
  string scorers = name;
  while (scorers.length() > 0) {
    string scorertype = "";
    getNextPound(scorers, scorertype,",");
    Scorer *scorer = ScorerFactory::getScorer(scorertype,config);
    m_scorers.push_back(scorer);
  }
  if (m_scorers.size() == 0) {
    throw runtime_error("There are no scorers");
  }
  cerr << "Number of scorers: " << m_scorers.size() << endl;

  //TODO debug this
  string wtype = getConfig("weights","");
  if (wtype.length() == 0) {
    // Default: uniform weights, i.e. 0.5 each for two scorers.
    float weight = 1.0 / m_scorers.size() ;
    for (size_t i = 0; i < m_scorers.size(); i ++) {
      m_scorer_weights.push_back(weight);
    }
  } else {
    float tot=0;
    while (wtype.length() > 0) {
      string scoreweight = "";
      getNextPound(wtype,scoreweight,"+");
      float weight = atof(scoreweight.c_str());
      m_scorer_weights.push_back(weight);
      tot += weight;
    }

    // Validate the configuration before touching the values.
    if (m_scorers.size() != m_scorer_weights.size()) {
      throw runtime_error("The number of weights does not equal the number of scorers!");
    }
    // A zero sum cannot be normalised: dividing by it would silently turn
    // every weight into NaN or infinity.
    if (tot == 0) {
      throw runtime_error("The scorer weights sum to zero and cannot be normalised");
    }

    // Weights should add up to 1. Dividing by a total that already is
    // exactly 1 leaves the values unchanged, so no special case is needed.
    for (vector<float>::iterator it = m_scorer_weights.begin();
         it != m_scorer_weights.end(); ++it) {
      *it /= tot;
    }
  }
  cerr << "The weights for the interpolated scorers are: " << endl;
  for (vector<float>::iterator it = m_scorer_weights.begin();
       it != m_scorer_weights.end(); ++it) {
    cerr << *it << " " ;
  }
  cerr <<endl;
}
Esempio n. 5
0
// Loads one score array from `is`: a header line, the entries, and a footer
// line.
//
// The header must start with SCORES_TXT_BEGIN or SCORES_BIN_BEGIN, which
// selects loadtxt() or loadbin(). The tokens after the marker set m_index,
// the number of entries, m_num_scores and m_score_type, in that order.
// Returns without loading anything when the first read leaves the stream in
// a non-good state or when the header marker is not recognised. A footer
// that starts with neither end marker is only reported.
void ScoreArray::load(istream* is)
{
  string line;
  getline(*is, line);
  if (!is->good()) {
    return;
  }

  size_t entry_count = 0;
  bool is_binary = false;

  if (!line.empty()) {
    const bool text_header = (line.find(SCORES_TXT_BEGIN) == 0);
    if (!text_header) {
      if (line.find(SCORES_BIN_BEGIN) != 0) {
        TRACE_ERR("ERROR: ScoreArray::load(): Wrong header");
        return;
      }
      is_binary = true;
    }

    string field;
    getNextPound(line, field);            // begin marker, value not used
    getNextPound(line, field);
    m_index = atoi(field.c_str());
    getNextPound(line, field);
    entry_count = atoi(field.c_str());
    getNextPound(line, field);
    m_num_scores = atoi(field.c_str());
    getNextPound(line, field);
    m_score_type = field;
  }

  if (is_binary) {
    loadbin(is, entry_count);
  } else {
    loadtxt(is, entry_count);
  }

  // The header buffer is deliberately reused for the footer line.
  getline(*is, line);
  if (!line.empty()
      && line.find(SCORES_TXT_END) != 0
      && line.find(SCORES_BIN_END) != 0) {
    TRACE_ERR("ERROR: ScoreArray::load(): Wrong footer");
  }
}
Esempio n. 6
0
void InterpolatedScorer::prepareStats(size_t sid, const string& text, ScoreStats& entry)
{
  stringstream buff;
  string align = text;
  string sentence = "";
  size_t alignmentData = text.find("|||");
  //Get sentence and alignment parts
  if(alignmentData != string::npos) {
    getNextPound(align,sentence, "|||");
  }

  int i = 0;
  for (ScopedVector<Scorer>::iterator itsc = m_scorers.begin(); itsc != m_scorers.end(); ++itsc) {
    ScoreStats tempEntry;
    if ((*itsc)->useAlignment()) {
      (*itsc)->prepareStats(sid, text, tempEntry);
    }
    else {
      (*itsc)->prepareStats(sid, sentence, tempEntry);
    }
    if (i > 0) buff <<  " ";
    buff << tempEntry;
    i++;
  }
  //cout << " Scores for interpolated: " << buff << endl;
  string str = buff.str();
  entry.set(str);
}
void Data::AddFeatures(const string& str,
                       int sentence_index) {
  string buf = str;
  string substr;
  FeatureStats feature_entry;
  feature_entry.reset();

  while (!buf.empty()) {
    getNextPound(buf, substr);

    // no ':' -> feature value that needs to be stored
    if (!EndsWith(substr, ":")) {
      feature_entry.add(ConvertStringToFeatureStatsType(substr));
    } else if (substr.find("_") != string::npos) {
      // sparse feature name? store as well
      string name = substr;
      getNextPound(buf, substr);
      feature_entry.addSparse(name, atof(substr.c_str()));
    }
  }
  m_feature_data->add(feature_entry, sentence_index);
}
Esempio n. 8
0
void Data::AddFeatures(const string& str,
                       const string& sentence_index) {
  string::size_type loc;
  string buf = str;
  string substr;
  FeatureStats feature_entry;
  feature_entry.reset();

  while (!buf.empty()) {
    getNextPound(buf, substr);

    // no ':' -> feature value that needs to be stored
    if ((loc = substr.find_last_of(":")) != substr.length()-1) {
      feature_entry.add(ConvertStringToFeatureStatsType(substr));
    } else if (substr.find("_") != string::npos) {
      // sparse feature name? store as well
      string name = substr;
      getNextPound(buf, substr);
      feature_entry.addSparse(name, atof(substr.c_str()));
      m_sparse_flag = true;
    }
  }
  m_feature_data->add(feature_entry, sentence_index);
}
Esempio n. 9
0
// Resets this object and refills it from `theString`, which is consumed
// in the process.
//
// Tokens are taken with getNextPound(). A token without ':' is a regular
// feature value. Otherwise the text before the last ':' is the sparse
// feature name and the text after it is its value.
void FeatureStats::set(string &theString)
{
  reset();

  string token;
  while (!theString.empty()) {
    getNextPound(theString, token);

    const size_t colon = token.find_last_of(":");
    if (colon == string::npos) {
      // regular feature
      add(ConvertStringToFeatureStatsType(token));
      continue;
    }

    // sparse feature
    addSparse(token.substr(0, colon), atof(token.substr(colon + 1).c_str()));
  }
}
void PermutationScorer::prepareStats(size_t sid, const string& text, ScoreStats& entry)
{
  //cout << "*******prepareStats" ;
  //cout << text << endl;
  //cout << sid << endl;
  //cout << "Reference0align:" << endl;
  //m_referencePerms[0][sid].dump();


  string sentence = "";
  string align = text;
  size_t alignmentData = text.find("|||");
  //Get sentence and alignment parts
  if(alignmentData != string::npos) {
    getNextPound(align,sentence, "|||");
  } else {
    align = text;
  }
  int translationLength = getNumberWords(sentence);


  //A vector of Permutations for each sentence
  vector< vector<Permutation> > nBestPerms;
  float distanceValue;

  //need to create permutations for each nbest line
  string standardFormat = Permutation::convertMosesToStandard(align);
  Permutation perm(standardFormat, m_sourceLengths[sid],translationLength);
  //perm.dump();

  if (m_refChoiceStrategy == REFERENCE_CHOICE_AVERAGE) {
    float total = 0;
    for (size_t i = 0; i < m_referencePerms.size(); ++i) {
      float dist = perm.distance(m_referencePerms[i][sid], m_distanceMetric);
      total += dist;
      //cout << "Ref number: " << i << " distance: " << dist << endl;
    }
    float mean = (float)total/m_referencePerms.size();
    //cout << "MultRef strategy AVERAGE: total " << total << " mean " << mean << " number " << m_referencePerms.size() << endl;
    distanceValue = mean;
  } else if (m_refChoiceStrategy == REFERENCE_CHOICE_CLOSEST)  {
    float max_val = 0;

    for (size_t i = 0; i < m_referencePerms.size(); ++i) {
      //look for the closest reference
      float value = perm.distance(m_referencePerms[i][sid], m_distanceMetric);
      //cout << "Ref number: " << i << " distance: " << value << endl;
      if (value > max_val) {
        max_val = value;
      }
    }
    distanceValue = max_val;
    //cout << "MultRef strategy CLOSEST: max_val " << distanceValue << endl;
  } else {
    throw runtime_error("Unsupported reflength strategy");
  }

  //SCOREROUT eg: 0.04546
  ostringstream tempStream;
  tempStream.precision(SCORE_PRECISION);
  tempStream << distanceValue;
  string str = tempStream.str();
  entry.set(str);

  //cout << tempStream.str();
}
// Configures a permutation scorer.
//
// `distanceMetric` must be "HAMMING" or "KENDALL". The config values read
// here are:
//   refchoice - "average" or "closest" (default "closest")
//   refalign  - '+'-separated list of reference alignment files
//   source    - file whose lines are counted into m_sourceLengths
//
// Throws runtime_error for an unknown refchoice value, an unknown distance
// metric, or a source file that cannot be opened.
PermutationScorer::PermutationScorer(const string &distanceMetric, const string &config)
  :StatisticsBasedScorer(distanceMetric,config)
{
  //configure regularisation

  static string KEY_REFCHOICE = "refchoice";
  static string REFCHOICE_AVERAGE = "average";
  static string REFCHOICE_CLOSEST = "closest";

  string refchoice = getConfig(KEY_REFCHOICE,REFCHOICE_CLOSEST);
  if (refchoice == REFCHOICE_AVERAGE) {
    m_refChoiceStrategy = REFERENCE_CHOICE_AVERAGE;
  } else if (refchoice == REFCHOICE_CLOSEST) {
    m_refChoiceStrategy = REFERENCE_CHOICE_CLOSEST;
  } else {
    throw runtime_error("Unknown reference choice strategy: " + refchoice);
  }
  cerr << "Using reference choice strategy: " << refchoice << endl;

  if (distanceMetric.compare("HAMMING") == 0) {
    m_distanceMetric = HAMMING_DISTANCE;
  } else if (distanceMetric.compare("KENDALL") == 0) {
    m_distanceMetric = KENDALL_DISTANCE;
  } else {
    // Reject the name instead of announcing a metric below that was never
    // assigned here.
    throw runtime_error("Unknown permutation distance metric: " + distanceMetric);
  }
  cerr << "Using permutation distance metric: " << distanceMetric << endl;

  //Get reference alignments from scconfig refalign option
  static string KEY_ALIGNMENT_FILES = "refalign";
  string refalign = getConfig(KEY_ALIGNMENT_FILES,"");
  string substring;
  while (!refalign.empty()) {
    getNextPound(refalign, substring, "+");
    m_referenceAlignments.push_back(substring);
  }

  //Get length of source sentences read in from scconfig source option
  // this is essential for extractor but unnecessary for mert executable
  static string KEY_SOURCE_FILE = "source";
  string sourceFile = getConfig(KEY_SOURCE_FILE,"");
  if (sourceFile.length() > 0) {
    cerr << "Loading source sentence lengths from " << sourceFile << endl;
    ifstream sourcein(sourceFile.c_str());
    if (!sourcein) {
      throw runtime_error("Unable to open: " + sourceFile);
    }
    string line;
    while (getline(sourcein,line)) {
      // one getNextPound() call per space-separated word on the line
      size_t wordNumber = 0;
      string word;
      while(!line.empty()) {
        getNextPound(line, word, " ");
        wordNumber++;
      }
      m_sourceLengths.push_back(wordNumber);
    }
    sourcein.close();
  }
}
Esempio n. 12
0
void PermutationScorer::prepareStats(size_t sid, const string& text, ScoreStats& entry)
{
  //bool debug= (verboselevel()>3); // TODO: fix verboselevel()
  bool debug=false;
  if (debug) {
    cout << "*******prepareStats" ;
    cout << text << endl;
    cout << sid << endl;
    cout << "Reference0align:" << endl;
    m_referencePerms[0][sid].dump();
  }

  string sentence = "";
  string align = text;
  size_t alignmentData = text.find("|||");
  //Get sentence and alignment parts
  if(alignmentData != string::npos) {
    getNextPound(align,sentence, "|||");
  } else {
    align = text;
  }
  int translationLength = getNumberWords(sentence);


  //A vector of Permutations for each sentence
  vector< vector<Permutation> > nBestPerms;
  float distanceValue;

  //need to create permutations for each nbest line
  //here we check if the alignments extracted from the nbest are phrase-based or word-based, in which case no conversion is needed
  bool isWordAlignment=true;
  string alignCopy = align;
  string align1;
  getNextPound(alignCopy,align1," ");
  if (align1.length() > 0) {
    size_t phraseDelimeter = align1.find("=");
    if(phraseDelimeter!= string::npos)
      isWordAlignment=false;
  }
  string standardFormat = align;
  if(!isWordAlignment)
    standardFormat= Permutation::convertMosesToStandard(align);

  if (debug) {
    cerr << "Nbest alignment:  " << align << endl;
    cerr << "-->std alignment: " << standardFormat << endl;
  }

  Permutation perm(standardFormat, m_sourceLengths[sid],translationLength);
  //perm.dump();

  if (m_refChoiceStrategy == REFERENCE_CHOICE_AVERAGE) {
    float total = 0;
    for (size_t i = 0; i < m_referencePerms.size(); ++i) {
      float dist = perm.distance(m_referencePerms[i][sid], m_distanceMetric);
      total += dist;
      //cout << "Ref number: " << i << " distance: " << dist << endl;
    }
    float mean = (float)total/m_referencePerms.size();
    //cout << "MultRef strategy AVERAGE: total " << total << " mean " << mean << " number " << m_referencePerms.size() << endl;
    distanceValue = mean;
  } else if (m_refChoiceStrategy == REFERENCE_CHOICE_CLOSEST)  {
    float max_val = 0;

    for (size_t i = 0; i < m_referencePerms.size(); ++i) {
      //look for the closest reference
      float value = perm.distance(m_referencePerms[i][sid], m_distanceMetric);
      //cout << "Ref number: " << i << " distance: " << value << endl;
      if (value > max_val) {
        max_val = value;
      }
    }
    distanceValue = max_val;
    //cout << "MultRef strategy CLOSEST: max_val " << distanceValue << endl;
  } else {
    throw runtime_error("Unsupported reflength strategy");
  }

  //SCOREROUT eg: 0.04546
  distanceValue*=SCORE_MULTFACT; //SCOREROUT eg: 4546 to transform float into integer
  ostringstream tempStream;
  tempStream.precision(0);	// decimal precision not needed as score was multiplied per SCORE_MULTFACT
  tempStream << std::fixed << distanceValue << " 1"; //use for final normalization over the amount of test sentences
  string str = tempStream.str();
  entry.set(str);

//cout << distanceValue << "=" << distanceValue << " (str:" << tempStream.str() << ")" << endl;
}
Esempio n. 13
0
// Reads an n-best list from `file`, one candidate per non-empty line.
//
// Each line is split on "|||" into sentence index, sentence and feature
// field. The scorer's statistics go to scoredata, the parsed features to
// featdata. While existsFeatureNames() is false, the feature field is also
// used to build the feature map ("<name>_<index> " per dense value).
//
// In the feature field a token ending in ':' is a feature name and any
// other non-empty token a dense value. A name containing "_" is a sparse
// feature whose value is the following token; storing one sets
// _sparse_flag.
//
// Throws runtime_error when the file cannot be opened.
void Data::loadnbest(const std::string &file)
{
  TRACE_ERR("loading nbest from " << file << std::endl);

  FeatureStats feature_stats;
  ScoreStats score_stats;
  std::string sid;

  inputfilestream nbest(file); // opens the file
  if (!nbest.good()) {
    throw runtime_error("Unable to open: " + file);
  }

  std::string field, token, row;
  std::string hypothesis;

  while (getline(nbest, row, '\n')) {
    if (row.empty()) {
      continue;
    }

    getNextPound(row, field, "|||");   // field 1: sentence index
    sid = field;

    getNextPound(row, field, "|||");   // field 2: sentence
    hypothesis = field;

    // statistics for the error measures
    feature_stats.reset();
    score_stats.clear();

    theScorer->prepareStats(sid, hypothesis, score_stats);
    scoredata->add(score_stats, sid);

    getNextPound(row, field, "|||");   // field 3: features

    // The feature names are taken from the data on first use.
    if (!existsFeatureNames()) {
      std::string pending = field;     // work on a copy, `field` is parsed below
      std::string names;
      std::string current;
      size_t position = 0;

      while (!pending.empty()) {
        getNextPound(pending, token);

        // An empty token is grouped with the names, i.e. it is not a value.
        const bool is_label = token.empty() || token[token.size() - 1] == ':';
        if (!is_label) {
          names += current;
          names += "_";
          names += stringify(position);
          names += " ";
          ++position;
        } else if (token.find("_") != string::npos) {
          // sparse feature name: skip its value as well
          getNextPound(pending, token);
        } else {
          // new dense feature name, without the trailing ':'
          position = 0;
          current = token.substr(0, token.size() - 1);
        }
      }

      featdata->setFeatureMap(names);
    }

    // Store the feature values of this candidate.
    while (!field.empty()) {
      getNextPound(field, token);

      const bool is_label = token.empty() || token[token.size() - 1] == ':';
      if (!is_label) {
        feature_stats.add(ATOFST(token.c_str()));
        continue;
      }
      if (token.find("_") == string::npos) {
        continue;
      }

      // sparse feature: the next token carries its value
      std::string name = token;
      getNextPound(field, token);
      feature_stats.addSparse( name, atof(token.c_str()) );
      _sparse_flag = true;
    }
    featdata->add(feature_stats, sid);
  }

  nbest.close();
}
Esempio n. 14
0
void DataAsiya::loadNBest(const string &file)
{
  TRACE_ERR("loading nbest from DataAsiya " << file << endl);
  inputfilestream inp(file); // matches a stream with a file. Opens the file
  if (!inp.good())
    throw runtime_error("Unable to open: " + file);

  ScoreStats scoreentry;
  string line, sentence_index, sentence, feature_str, alignment;

  AsiyaScorer* a_scorer = dynamic_cast<AsiyaScorer*>(m_scorer);
  
  /*todo. change this loop. instead of obtaining the score for each sentence, obtain all the scores at once!*/
  while (getline(inp, line, '\n')) {
    if (line.empty()) continue;
    // adding statistics for error measures
    scoreentry.clear();

    getNextPound(line, sentence_index, "|||"); // first field
    getNextPound(line, sentence, "|||");       // second field
    getNextPound(line, feature_str, "|||");    // third field

    if (line.length() > 0) {
      string temp;
      getNextPound(line, temp, "|||"); //fourth field sentence score
      if (line.length() > 0) {
        getNextPound(line, alignment, "|||"); //fourth field only there if alignment scorer
      }
    }
    //TODO check alignment exists if scorers need it
    if (a_scorer->useAlignment()) {
      sentence += "|||";
      sentence += alignment;
    }
    // prepare stats gets all the scores for sentence_i of sentence_index
//    a_scorer->addCandidateSentence(sentence_index, sentence);
    a_scorer->prepareStats(atoi(sentence_index.c_str()), sentence, scoreentry);

    // examine first line for name of features
    if (!existsFeatureNames()) {
      InitFeatureMap(feature_str);
    }
    AddFeatures(feature_str, atoi(sentence_index.c_str()));
  }

  a_scorer->doScoring();
//  TRACE_ERR("before getAllScoreStats" << endl);

  std::vector<std::vector <ScoreStats> > allScoreStats = a_scorer->getAllScoreStats();
  for (int i = 0; i < allScoreStats.size(); ++i)
      for(int j = 0; j < allScoreStats[i].size(); ++j)
      {
          stringstream ss;
          ss << i;
          m_score_data->add(allScoreStats[i][j], atoi(ss.str().c_str()));
//          TRACE_ERR("allScoreStats[" << i << "].size() " << allScoreStats[i].size() << " " << allScoreStats[i][j] << endl);
      }


  inp.close();
//  a_scorer->doScoring( m_score_data );

  //score each sentence
  //a_scorer->prepareStats(sentence_index, sentence, scoreentry);
  // save the score for previous sentence. Do it aling with previous function
  //m_score_data->add(scoreentry, sentence_index);  
  
}