Exemplo n.º 1
0
// Main driver for Elude: trains or loads a retention-time model (or selects
// one automatically from the library), optionally saves it / appends it to
// the library / writes the retention index, then predicts retention times
// for the test set, applies optional linear calibration, and reports
// performance measures.
// Returns 0 in all paths; error conditions are reported to stderr subject
// to the VERB verbosity level.
int EludeCaller::Run() {
  // (library index, score) of the best model when automatic selection runs;
  // index -1 means "no model selected".
  pair<int, double> best_model(-1, -1.0);
  if (!train_file_.empty() && !load_model_file_.empty() && VERB >= 4
      && !linear_calibration_) {
    cerr << "Warning: a model can be either trained or loaded from a file. "
         << "The two options should not be used together, unless linear calibration "
         << "should be carried out. In such a case please use the -j option. "
         << "The model will be trained using the peptides in " << train_file_ << endl;
  }
  // train a retention model
  if (!train_file_.empty()) {
    ProcessTrainData();
    // initialize the feature table
    train_features_table_ = DataManager::InitFeatureTable(
         RetentionFeatures::kMaxNumberFeatures, train_psms_);
    if (automatic_model_sel_) {
      best_model = AutomaticModelSelection();
    } else if (only_hydrophobicity_index_) {
      // Only a hydrophobicity index was requested: compute it, write or
      // print it, and stop here.
      map<string, double> custom_hydrophobicity_index = TrainRetentionIndex();
      if (!index_file_.empty()) {
        SaveRetentionIndexToFile(index_file_, custom_hydrophobicity_index);
      } else {
        PrintHydrophobicityIndex(custom_hydrophobicity_index);
      }
      cerr << "Now I saved the index" << endl;
      return 0;
    } else if (load_model_file_.empty()) {
      TrainRetentionModel();
    }
  } else if (automatic_model_sel_) {
    if (!test_file_.empty()) {
      ProcessTestData();
      processed_test_ = true;
    }
    best_model = AutomaticModelSelection();
  }

  // load a model from a file
  if (!load_model_file_.empty() && !automatic_model_sel_) {
    // NOTE(review): raw owning new; presumably rt_model_ is a member freed
    // in the destructor — if a model was already assigned this leaks. Verify.
    rt_model_ = new RetentionModel(the_normalizer_);
    rt_model_->LoadModelFromFile(load_model_file_);
  }
  // save the model
  if (!save_model_file_.empty()) {
    if (rt_model_ != NULL && !rt_model_->IsModelNull()) {
      rt_model_->SaveModelToFile(save_model_file_);
    } else if (VERB >= 2) {
      cerr << "Warning: No trained model available. Nothing to save to "
           << save_model_file_ << endl;
    }
  }
  // append a file to the library
  if (append_model_) {
    if (automatic_model_sel_) {
      if (VERB >= 3) {
        cerr << "Warning: The model should already be in the library if "
             << "the automatic model selection option is employed. No model "
             << "will be appended to the library"<< endl;
      }
    } else if (rt_model_ == NULL) {
      if (VERB >= 3) {
        cerr << "Warning: No model available, nothing to append to the library."
             << endl;
      }
    } else {
      AddModelLibrary();
    }
  }
  // save the retention index to a file
  if (!index_file_.empty()) {
    SaveIndexToFile(best_model.first);
  }
  // test a model
  if (!test_file_.empty()) {
    // process the test data (may already have been done for model selection)
    if (!processed_test_) {
      ProcessTestData();
    }
    if (test_psms_.size() <= 0) {
      // BUG FIX: the early return was previously nested inside the
      // verbosity check, so at VERB < 3 execution continued with an empty
      // psm set. The return must happen regardless of verbosity.
      if (VERB >= 3) {
        cerr << "Warning: no test psms available, nothing to do. " << endl;
      }
      return 0;
    }
    // initialize the feature table
    test_features_table_ = DataManager::InitFeatureTable(
            RetentionFeatures::kMaxNumberFeatures, test_psms_);
    int ret = 1;
    if (automatic_model_sel_) {
      int index = best_model.first;
      if (index < 0) {
        if (VERB >= 2) {
          cerr << "Error: No model available to predict rt. Execution aborted." << endl;
        }
        return 0;
      }
      rt_models_[index]->PredictRT(test_aa_alphabet_, ignore_ptms_, "test psms",
          test_psms_);
      if (linear_calibration_ && train_psms_.size() > 1) {
        rt_models_[index]->PredictRT(train_aa_alphabet_, ignore_ptms_, "calibration psms",
            train_psms_);
      }
    } else {
      // BUG FIX: 'ret' was re-declared here, shadowing the variable above
      // and leaving the outer one dead; assign to the existing variable.
      ret = rt_model_->PredictRT(test_aa_alphabet_, ignore_ptms_, "test psms",
          test_psms_);
      if (ret != 0) {
        if (VERB >= 2) {
          cerr << "Error: the amino acids alphabet in the test data does not match "
               <<"the ones used to train the model. Please use the -p option to ignore the ptms "
               <<"in the test data that were not present in the training set " << endl;
        }
        return 0;
      }
      if (linear_calibration_ && train_psms_.size() > 1) {
        ret = rt_model_->PredictRT(train_aa_alphabet_, ignore_ptms_, "training psms",
            train_psms_);
        if (ret != 0) {
          if (VERB >= 2) {
            cerr << "Error: the amino acids alphabet in training data does not match "
                 <<"the one used to train the model. Please use the -p option to ignore the ptms "
                 <<"that were not present in the set used to train the model " << endl;
          }
          return 0;
        }
      }
    }
    // linear calibration is performed only for automatic model selection or when
    // loading a model from a file
    if (linear_calibration_ && (automatic_model_sel_ || (!load_model_file_.empty() &&
        train_psms_.size() >= 2))) {
      if (train_psms_.size() <= 1 && !automatic_model_sel_) {
        if (VERB >= 3) {
          cerr << "Warning: at least 2 training psms are needed to calibrate the model. "
               << "No calibration performed. " << endl;
        }
      } else {
        // get the a and b coefficients
        if (train_psms_.size() < 2) {
          if (VERB >= 4) {
            cerr << "Warning: No (enough) calibration peptides. Linear calibration "
                 << "cannot be performed " << endl;
          }
        } else {
          // Fit a robust (least trimmed squares) line: observed rt vs
          // predicted rt of the calibration psms, then adjust predictions.
          pair<vector<double> , vector<double> > rts = GetRTs(train_psms_);
          // NOTE(review): raw owning new; assumes lts is a member released
          // elsewhere — confirm, otherwise this leaks on repeated calls.
          lts = new LTSRegression();
          lts->setData(rts.first, rts.second);
          lts->runLTS();
          AdjustLinearly(test_psms_);
        }
      }
    }
    // compute performance measures
    if (test_includes_rt_) {
      double rank_correl = ComputeRankCorrelation(test_psms_);
      double pearson_correl = ComputePearsonCorrelation(test_psms_);
      double win = ComputeWindow(test_psms_);
      if (VERB >= 3) {
        cerr << "Performance measures for the test data: " << endl;
        cerr << "  Pearson's correlation r = " << pearson_correl << endl;
        cerr << "  Spearman's rank correlation rho = " << rank_correl << endl;
        cerr << "  Delta_t 95% = " << win << endl;
      }
    }
    // write the predictions to file
    if (!output_file_.empty()) {
      DataManager::WriteOutFile(output_file_, test_psms_, test_includes_rt_);
    } else {
      if (VERB >= 2 && !supress_print_) {
        PrintPredictions(test_psms_);
      }
    }
  }
  return 0;
}
Exemplo n.º 2
0
// Perceptron prediction driver (backpropagation with gradient descent).
// Command-line layout (as read below): argv[2]=I/O prefix, argv[3]=mode,
// argv[4]=norm, argv[5]=lrate, argv[6]=eta, argv[7]=stopErr,
// argv[8]=stopIter, argv[9..]=mode-dependent extras (architecture or
// error-data filename).
// Returns 0 when usage help should be shown (too few arguments), the
// delegated sub-command's result for modes 4-7, and 1 on success.
int PredictPtron(int argc, char **argv, ostream& errMsg){
  errMsg << endl;
  errMsg << "Perceptron prediction using backpropagation with gradient descent:\n";
  errMsg << "[mode]: continue training(-1) training(0), testing(1), both(2), Anomaly Detection(3), AD (4), ADAM (5), clean AD (6), clean ADAM(7)\n";
  errMsg << "[norm]: unnormalized(0)/normalized(1) data\n";
  errMsg << endl;

  if ( argc < 4 ) return(0);

  string test_filename;
  string train_filename;
  //
  string prefix = argv[2];
  cout << "# I/O prefix: " << prefix << endl;
  //
  int mode = atoi(argv[3]);
  cout << "# testing/training mode is: " << mode << endl;
  // BUG FIX: the usage text documents modes -1..7 only, but the original
  // check accepted mode == 8 and silently fell into the testing-only path.
  if( -1 > mode || mode > 7 ){
    cerr << "Assert: Invalid parameter [mode]\n\n";
    exit(-1);
  }
  // Modes 4-7 are handled by dedicated drivers.
  if( mode == 4 || mode==5) return( Ptron_edam( argc, argv, errMsg ) );
  if( mode == 6 || mode==7) return( Ptron_clean( argc, argv, errMsg ) );
  errMsg << "[lrate]: learning rate. \n";
  errMsg << "[eta]: momentum factor.\n";
  errMsg << "[stopErr]: Stopping criterion: MSE < stopErr.\n";
  errMsg << "[stopIter]: Stopping criterion: maximum number of iterations.\n";
  errMsg << "<numIn>: Size of input vector.\n";
  errMsg << "<nOut> number of nodes in output layer\n";
  errMsg << "[fname]: name of file with error data\n";
  // BUG FIX: argv[8] (stopIter) is read unconditionally below, so at least
  // 9 arguments are required. The original "argc < 8" guard allowed
  // atoi(argv[8]) to be called on argv[argc] (the terminating NULL).
  if ( argc < 9 )return(0);
  //
  int norm = atoi(argv[4]);
  cout << "# using normalized data: " << norm << endl;
  //
  train_filename = prefix + "-train.dat";
  test_filename = prefix + "-test.dat";
  string npfname = prefix + "-norm_param.dat";
  //
  double lrate = atof(argv[5]);
  cout << "# lrate is: " << lrate << endl;
  double eta = atof( argv[6] );
  cout << "# eta is: " << eta << endl;
  float stopErr = atof( argv[7] );
  int stopIter = atoi( argv[8] );
  cout << "# Training will terminate either when minimum MSE change is: " << stopErr << endl;
  cout << "#   or when " << stopIter << " training iterations have been performed.\n";
  //
  int num_inputs;
  int num_outputs;
  if( mode == -1 || mode == 0 || mode == 2 ){
    // Training modes need the network architecture: <numIn> <nOut>.
    if( argc != 11 ){
      cerr << "ERROR: PredictPtron(int argc, char **argv) -- need architecture information.\n";
      return( 0 );
    }
    cout << "# Neural Network Architecture is: ";
    num_inputs = atoi( argv[9] );
    cout << num_inputs << " ";
    num_outputs = atoi( argv[10] );
    cout << num_outputs << endl;
  }
  else if( mode == 3 || mode == 4){
    // NOTE(review): mode 4 already returned above via Ptron_edam, so only
    // mode 3 can reach this branch.
    // BUG FIX: argv[9] is read below, which requires argc >= 10; the
    // original check demanded argc == 9, making argv[9] the terminating
    // NULL pointer.
    if( argc != 10 ) {
      cerr << "Assert: Need to input error datafile\n";
      exit(-1);
    }
    //mode = 2;  // testing only
    test_filename = argv[9];
  }
 
  //
  ////////////////////////////////////////////////////////////
  //  Begin Test/Train
  ////////////////////////////////////////////////////////////
  adet_ptron model;
  //
  // if training only or both training and testing
  // train naive predictor n1
  if( mode == -1 || mode == 0 || mode == 2 )
  {
    //
    //  Read in training data
    vector< double > jdate_train;
    vector< double > jdate_test;
    vector< vector< float > > TrainExamples;
    vector< vector< float > > TestExamples;
    vector< vector< float > > normParam;
    
    //ReadTSData( train_filename, norm, jdate_train, TrainExamples, normParam);
    GetTTExamples( npfname, train_filename, jdate_train, TrainExamples, normParam  );
    GetTTExamples( npfname, test_filename, jdate_test, TestExamples, normParam );
    if( norm == 1 ){
      NormalizeExamples( 1, TrainExamples, normParam );
    }
    //
    //
    if( mode == -1 ){
      // Continue training: restore a previously saved predictor.
      string ifile_name = prefix + "-ptron_predictor.out";
      ifstream ifile( ifile_name.c_str() );
      if( !ifile )
      {
        cerr << "Assert: could not open file " << ifile_name << endl;
        exit(-1);
      }
      model = adet_ptron( ifile );
      ifile.close();
      model.ResetStopCrit( double(stopErr), stopIter );
    }
    else{
      model = adet_ptron( num_inputs, num_outputs, stopErr, stopIter );
    }
    //
    // Train network (10-fold cross-validation)
    //model.TrainXV( TrainExamples, TestExamples, lrate, eta, 1., 1. );
    model.k_FoldXV( 10, TrainExamples, lrate, eta, 1., 1. );
    string ofile_name = prefix + "-ptron_predictor.out";
    ofstream ofile( ofile_name.c_str() );
    if( !ofile )
    {
      cerr << "Assert: could not open file " << ofile_name << endl;
      exit(-1);
    }
    model.Print( ofile );
    ofile.close();
    // 
    // Evaluate predictor performance on training set
    vector< vector< float > > Results_Train;
    Results_Train = model.Test( TrainExamples );
    //
    // if using normalized values, unnormalize the results
    if( norm == 1 ){
      //cout << "Attempting to UnNormalize the results " << endl;
      UnnormalizeResults( Results_Train, normParam );
    }
    //
    // Print out training error
    cout << "#   Anomaly Detection Results:\n";
    PrintError( Results_Train );
    //PrintPredictions( Results_Train, jdate_train );
  }
  //
  // Testing only, so initialize ann predictor from file
  else
  {
    string ifile_name = prefix + "-ptron_predictor.out";
    ifstream ifile( ifile_name.c_str() );
    if( !ifile )
    {
      cerr << "Assert: could not open file " << ifile_name << endl;
      exit(-1);
    }
    model = adet_ptron( ifile );
    ifile.close();
//    model.Print(cout);
//    ofstream ofile( "test_percep.out" );
//    if( !ofile )
//    {
//      cerr << "Assert: could not open file test_percep.out" << endl;
//      exit(-1);
//    }
//    p1.Output( ofile );
//    ofile.close();
  }
  //
  //  Testing only, both Train/Test, and Anomaly Detection
  //  Evaluate performance of predictor on Testing set
  if ( mode == 1 || mode == 2 || mode == 3)
  {
    //
    //  Read in testing data
    vector< double > jdate_test;
    vector< vector< float > > TestExamples;
    vector< vector< float > > normParam;
    //ReadTSData( test_filename, norm, jdate_test, TestExamples, normParam); 01/06
    GetTTExamples( npfname, test_filename, jdate_test, TestExamples, normParam  );
    if( norm == 1 ){
      NormalizeExamples( 1, TestExamples, normParam );
    }
    // 
    // Evaluate predictor performance on testing set
    vector< vector< float > > Results_Test;
    Results_Test = model.Test( TestExamples );
    //
    // if using normalized values, unnormalize the results
    if( norm == 1 )
    {
      UnnormalizeResults( Results_Test, normParam );
    }
    //
    if( mode == 3 )
    {
      //
      // Print out anomalies found
      FindAnomalies( Results_Test, jdate_test );
      PrintPredictions( Results_Test, jdate_test );
    }
    else
    {
      //
      // Print out testing error
      cout << "#   Anomaly Detection Results:\n";
      PrintError( Results_Test );
      PrintPredictions( Results_Test, jdate_test );
    }
  }
  return( 1 );
}