Esempio n. 1
0
void Data::createShards(size_t shard_count, float shard_size, const string& scorerconfig,
      std::vector<Data>& shards) 
{
  assert(shard_count);
  assert(shard_size >=0);
  assert(shard_size <= 1);

  size_t data_size = scoredata->size();
  assert(data_size == featdata->size());

  shard_size *=  data_size;

  for (size_t shard_id = 0; shard_id < shard_count; ++shard_id) {
    vector<size_t> shard_contents;
    if (shard_size == 0) {
      //split into roughly equal size shards
      size_t shard_start = floor(0.5 + shard_id * (float)data_size / shard_count);
      size_t shard_end = floor(0.5 + (shard_id+1) * (float)data_size / shard_count);
      for (size_t i = shard_start; i < shard_end; ++i) {
        shard_contents.push_back(i);
      }
    } else {
      //create shards by randomly sampling
      for (size_t i = 0; i < floor(shard_size+0.5); ++i) {
        shard_contents.push_back(rand() % data_size);
      }
    }
    
    ScorerFactory SF;
    Scorer* scorer = SF.getScorer(score_type, scorerconfig);

    shards.push_back(Data(*scorer));
    shards.back().score_type = score_type;
    shards.back().number_of_scores = number_of_scores;
    shards.back()._sparse_flag = _sparse_flag;
    for (size_t i = 0; i < shard_contents.size(); ++i) {
      shards.back().featdata->add(featdata->get(shard_contents[i]));
      shards.back().scoredata->add(scoredata->get(shard_contents[i]));
    }
    //cerr << endl;
    
  }
}
Esempio n. 2
0
int main(int argc, char** argv) {
    //defaults
    string scorerType("BLEU");
    string scorerConfig("");
    string referenceFile("");
    string nbestFile("");
    string scoreDataFile("statscore.data");
    string featureDataFile("features.data");
    string prevScoreDataFile("");
    string prevFeatureDataFile("");
    bool binmode = false;
    int verbosity = 0;
    int c;
    while ((c=getopt_long (argc,argv, "s:r:n:S:F:R:E:v:hb", long_options, &option_index)) != -1) {
        switch(c) {
            case 's':
                scorerType = string(optarg);
                break;
            case 'c':
                scorerConfig = string(optarg);
                break;
            case 'r':
                referenceFile = string(optarg);
                break;
            case 'b':
                binmode = true;
                break;
            case 'n':
                nbestFile = string(optarg);
                break;
            case 'S':
                scoreDataFile = string(optarg);
                break;
            case 'F':
                featureDataFile = string(optarg);
                break;
            case 'E':
                prevFeatureDataFile = string(optarg);
                break;
            case 'R':
                prevScoreDataFile = string(optarg);
                break;
            case 'v':
                verbosity = atoi(optarg);
                break;
            default:
                usage();
        }
    }
    try {

//check whether score statistics file is specified
    if (scoreDataFile.length() == 0){
	throw runtime_error("Error: output score statistics file is not specified");
    }

//check wheter feature file is specified
    if (featureDataFile.length() == 0){
        throw runtime_error("Error: output feature file is not specified");
    }

//check whether reference file is specified when nbest is specified
    if ((nbestFile.length() > 0 && referenceFile.length() == 0)){
        throw runtime_error("Error: reference file is not specified; you can not score the nbest");

    }
 
    vector<string> nbestFiles;
    if (nbestFile.length() > 0){
        std::string substring;
        while (!nbestFile.empty()){
                getNextPound(nbestFile, substring, ",");
                nbestFiles.push_back(substring);
        }
    }

    vector<string> referenceFiles;
    if (referenceFile.length() > 0){
			std::string substring;
			while (!referenceFile.empty()){
				getNextPound(referenceFile, substring, ",");
				referenceFiles.push_back(substring);
			}
    }

    vector<string> prevScoreDataFiles;
    if (prevScoreDataFile.length() > 0){
			std::string substring;
			while (!prevScoreDataFile.empty()){
				getNextPound(prevScoreDataFile, substring, ",");
				prevScoreDataFiles.push_back(substring);
			}
    }

    vector<string> prevFeatureDataFiles;
    if (prevFeatureDataFile.length() > 0){
        std::string substring;
        while (!prevFeatureDataFile.empty()){
                getNextPound(prevFeatureDataFile, substring, ",");
                prevFeatureDataFiles.push_back(substring);
        }
    }

    if (prevScoreDataFiles.size() != prevFeatureDataFiles.size()){
			throw runtime_error("Error: there is a different number of previous score and feature files");
    }

		
		if (binmode) cerr << "Binary write mode is selected" << endl;
		else cerr << "Binary write mode is NOT selected" << endl;
			
		TRACE_ERR("Scorer type: " << scorerType << endl);
		ScorerFactory sfactory;
		Scorer* scorer = sfactory.getScorer(scorerType,scorerConfig);
				
		Timer timer;
		timer.start("Starting...");

		//load references        
		if (referenceFiles.size() > 0)
			scorer->setReferenceFiles(referenceFiles);

		Data data(*scorer);
		
	//load old data
		for (size_t i=0;i < prevScoreDataFiles.size(); i++){
			data.load(prevFeatureDataFiles.at(i), prevScoreDataFiles.at(i));
		}

	//computing score statistics of each nbest file
		for (size_t i=0;i < nbestFiles.size(); i++){
			data.loadnbest(nbestFiles.at(i));
		}
								
		if (binmode)
			cerr << "Binary write mode is selected" << endl;
		else
			cerr << "Binary write mode is NOT selected" << endl;
			
		data.save(featureDataFile, scoreDataFile, binmode);
		timer.stop("Stopping...");
		return EXIT_SUCCESS;
    } catch (const exception& e) {
			cerr << "Exception: " << e.what() << endl;
			return EXIT_FAILURE;
    }

}