void Data::createShards(size_t shard_count, float shard_size, const string& scorerconfig, std::vector<Data>& shards) { assert(shard_count); assert(shard_size >=0); assert(shard_size <= 1); size_t data_size = scoredata->size(); assert(data_size == featdata->size()); shard_size *= data_size; for (size_t shard_id = 0; shard_id < shard_count; ++shard_id) { vector<size_t> shard_contents; if (shard_size == 0) { //split into roughly equal size shards size_t shard_start = floor(0.5 + shard_id * (float)data_size / shard_count); size_t shard_end = floor(0.5 + (shard_id+1) * (float)data_size / shard_count); for (size_t i = shard_start; i < shard_end; ++i) { shard_contents.push_back(i); } } else { //create shards by randomly sampling for (size_t i = 0; i < floor(shard_size+0.5); ++i) { shard_contents.push_back(rand() % data_size); } } ScorerFactory SF; Scorer* scorer = SF.getScorer(score_type, scorerconfig); shards.push_back(Data(*scorer)); shards.back().score_type = score_type; shards.back().number_of_scores = number_of_scores; shards.back()._sparse_flag = _sparse_flag; for (size_t i = 0; i < shard_contents.size(); ++i) { shards.back().featdata->add(featdata->get(shard_contents[i])); shards.back().scoredata->add(scoredata->get(shard_contents[i])); } //cerr << endl; } }
int main(int argc, char** argv) { //defaults string scorerType("BLEU"); string scorerConfig(""); string referenceFile(""); string nbestFile(""); string scoreDataFile("statscore.data"); string featureDataFile("features.data"); string prevScoreDataFile(""); string prevFeatureDataFile(""); bool binmode = false; int verbosity = 0; int c; while ((c=getopt_long (argc,argv, "s:r:n:S:F:R:E:v:hb", long_options, &option_index)) != -1) { switch(c) { case 's': scorerType = string(optarg); break; case 'c': scorerConfig = string(optarg); break; case 'r': referenceFile = string(optarg); break; case 'b': binmode = true; break; case 'n': nbestFile = string(optarg); break; case 'S': scoreDataFile = string(optarg); break; case 'F': featureDataFile = string(optarg); break; case 'E': prevFeatureDataFile = string(optarg); break; case 'R': prevScoreDataFile = string(optarg); break; case 'v': verbosity = atoi(optarg); break; default: usage(); } } try { //check whether score statistics file is specified if (scoreDataFile.length() == 0){ throw runtime_error("Error: output score statistics file is not specified"); } //check wheter feature file is specified if (featureDataFile.length() == 0){ throw runtime_error("Error: output feature file is not specified"); } //check whether reference file is specified when nbest is specified if ((nbestFile.length() > 0 && referenceFile.length() == 0)){ throw runtime_error("Error: reference file is not specified; you can not score the nbest"); } vector<string> nbestFiles; if (nbestFile.length() > 0){ std::string substring; while (!nbestFile.empty()){ getNextPound(nbestFile, substring, ","); nbestFiles.push_back(substring); } } vector<string> referenceFiles; if (referenceFile.length() > 0){ std::string substring; while (!referenceFile.empty()){ getNextPound(referenceFile, substring, ","); referenceFiles.push_back(substring); } } vector<string> prevScoreDataFiles; if (prevScoreDataFile.length() > 0){ std::string substring; while (!prevScoreDataFile.empty()){ getNextPound(prevScoreDataFile, substring, ","); prevScoreDataFiles.push_back(substring); } } vector<string> prevFeatureDataFiles; if (prevFeatureDataFile.length() > 0){ std::string substring; while (!prevFeatureDataFile.empty()){ getNextPound(prevFeatureDataFile, substring, ","); prevFeatureDataFiles.push_back(substring); } } if (prevScoreDataFiles.size() != prevFeatureDataFiles.size()){ throw runtime_error("Error: there is a different number of previous score and feature files"); } if (binmode) cerr << "Binary write mode is selected" << endl; else cerr << "Binary write mode is NOT selected" << endl; TRACE_ERR("Scorer type: " << scorerType << endl); ScorerFactory sfactory; Scorer* scorer = sfactory.getScorer(scorerType,scorerConfig); Timer timer; timer.start("Starting..."); //load references if (referenceFiles.size() > 0) scorer->setReferenceFiles(referenceFiles); Data data(*scorer); //load old data for (size_t i=0;i < prevScoreDataFiles.size(); i++){ data.load(prevFeatureDataFiles.at(i), prevScoreDataFiles.at(i)); } //computing score statistics of each nbest file for (size_t i=0;i < nbestFiles.size(); i++){ data.loadnbest(nbestFiles.at(i)); } if (binmode) cerr << "Binary write mode is selected" << endl; else cerr << "Binary write mode is NOT selected" << endl; data.save(featureDataFile, scoreDataFile, binmode); timer.stop("Stopping..."); return EXIT_SUCCESS; } catch (const exception& e) { cerr << "Exception: " << e.what() << endl; return EXIT_FAILURE; } }