Пример #1
0
// Trains a GBDT model with LOG_LIKELIHOOD loss on a training file, writes the
// serialized model to "<train_file>.model", then scores a test file, writes
// one "<score> <sample>" line per test sample to "<test_file>.predict" and
// prints the AUC and confusion table.
//
// Positional arguments:
//   argv[1] training data file         (required)
//   argv[2] test data file             (required)
//   argv[3] g_conf.max_depth           (optional, default 6)
//   argv[4] g_conf.iterations          (optional, default 10)
//   argv[5] g_conf.shrinkage           (optional, default 0.1)
//   argv[6] g_conf.feature_sample_ratio (optional)
//   argv[7] g_conf.data_sample_ratio   (optional)
//   argv[8] debug switch, > 0 enables  (optional, default 0)
//   argv[9] feature cost file          (optional)
//
// Returns 0 on success, -1 when the two required arguments are missing, and 1
// when a data file cannot be loaded or an output file cannot be opened.
// NOTE: boost::lexical_cast throws boost::bad_lexical_cast on a malformed
// numeric argument; that exception is not caught here.
int main(int argc, char *argv[]) {
  std::srand ( unsigned ( std::time(0) ) );

#ifdef USE_OPENMP
  const int threads_wanted = 4;
  omp_set_num_threads(threads_wanted);
#endif

  // Defaults; the optional command-line arguments below override some of them.
  g_conf.number_of_feature = 79;
  g_conf.max_depth = 6;
  g_conf.iterations = 10;
  g_conf.shrinkage = 0.1F;

  if (argc < 3) return -1;

  std::string train_file(argv[1]);
  std::string test_file(argv[2]);

  if (argc > 3) {
    g_conf.max_depth = boost::lexical_cast<int>(argv[3]);
  }

  if (argc > 4) {
    g_conf.iterations = boost::lexical_cast<int>(argv[4]);
  }

  if (argc > 5) {
    g_conf.shrinkage = boost::lexical_cast<float>(argv[5]);
  }

  if (argc > 6) {
    g_conf.feature_sample_ratio = boost::lexical_cast<float>(argv[6]);
  }

  if (argc > 7) {
    g_conf.data_sample_ratio = boost::lexical_cast<float>(argv[7]);
  }

  int debug = 0;
  if (argc > 8) {
    debug = boost::lexical_cast<int>(argv[8]);
  }

  g_conf.loss = LOG_LIKELIHOOD;
  g_conf.debug = debug > 0;

  // Check the load result explicitly: an assert() would vanish in NDEBUG
  // builds and let training run on missing data.
  DataVector d;
  if (!LoadDataFromFile(train_file, &d)) {
    std::cerr << "failed to load training data: " << train_file << std::endl;
    return 1;
  }

  g_conf.min_leaf_size = d.size() / 40;
  std::cout << "configure: " << std::endl
            << g_conf.ToString() << std::endl;

  if (argc > 9) {
    g_conf.LoadFeatureCost(argv[9]);
  }

  GBDT gbdt;
  Elapsed elapsed;
  gbdt.Fit(&d);
  std::cout << "fit time: " << elapsed.Tell() << std::endl;

  std::string model_file = train_file + ".model";
  std::ofstream model_output(model_file.c_str());
  if (!model_output) {
    std::cerr << "failed to open model file for writing: " << model_file
              << std::endl;
    CleanDataVector(&d);
    return 1;
  }
  model_output << gbdt.Save();

  // Release the training data before the test data is loaded.
  CleanDataVector(&d);
  FreeVector(&d);

  DataVector d2;
  if (!LoadDataFromFile(test_file, &d2)) {
    std::cerr << "failed to load test data: " << test_file << std::endl;
    return 1;
  }

  // Score every test sample; only this loop is covered by "predict time".
  elapsed.Reset();
  PredictVector predict;
  for (DataVector::iterator iter = d2.begin(); iter != d2.end(); ++iter) {
    ValueType p = Logit(gbdt.Predict(**iter));
    predict.push_back(p);
  }
  std::cout << "predict time: " << elapsed.Tell() << std::endl;

  std::string predict_file = test_file + ".predict";
  std::ofstream predict_output(predict_file.c_str());
  if (!predict_output) {
    std::cerr << "failed to open prediction file for writing: "
              << predict_file << std::endl;
    CleanDataVector(&d2);
    return 1;
  }

  Auc auc;
  for (size_t i = 0; i < d2.size(); ++i) {
    // '\n' instead of std::endl: avoids flushing the file once per sample.
    predict_output << predict[i] << " " << d2[i]->ToString() << '\n';
    auc.Add(predict[i], d2[i]->label);
  }
  std::cout << "auc: " << auc.CalculateAuc() << std::endl;
  auc.PrintConfusionTable();

  CleanDataVector(&d2);

  return 0;
}
Пример #2
0
// Trains a GBDT model with SQUARED_ERROR loss on a training file, writes the
// serialized model to "<train_file>.model", reloads it into a second GBDT
// instance from the serialized form, predicts on a test file with that second
// instance and prints the RMSE.
//
// Positional arguments:
//   argv[1] training data file          (required)
//   argv[2] test data file              (required)
//   argv[3] g_conf.max_depth            (optional, default 4)
//   argv[4] g_conf.iterations           (optional, default 100)
//   argv[5] g_conf.shrinkage            (optional, default 0.1)
//   argv[6] g_conf.feature_sample_ratio (optional)
//   argv[7] g_conf.data_sample_ratio    (optional)
//
// Returns 0 on success, -1 when the two required arguments are missing, and 1
// when a data file cannot be loaded or the model file cannot be opened.
// NOTE: boost::lexical_cast throws boost::bad_lexical_cast on a malformed
// numeric argument; that exception is not caught here.
int main(int argc, char *argv[]) {
  std::srand ( unsigned ( std::time(0) ) );

  // Defaults; the optional command-line arguments below override some of them.
  g_conf.number_of_feature = 3;
  g_conf.max_depth = 4;
  g_conf.iterations = 100;
  g_conf.shrinkage = 0.1F;

  if (argc < 3) return -1;

  std::string train_file(argv[1]);
  std::string test_file(argv[2]);

  if (argc > 3) {
    g_conf.max_depth = boost::lexical_cast<int>(argv[3]);
  }

  if (argc > 4) {
    g_conf.iterations = boost::lexical_cast<int>(argv[4]);
  }

  if (argc > 5) {
    g_conf.shrinkage = boost::lexical_cast<float>(argv[5]);
  }

  if (argc > 6) {
    g_conf.feature_sample_ratio = boost::lexical_cast<float>(argv[6]);
  }

  if (argc > 7) {
    g_conf.data_sample_ratio = boost::lexical_cast<float>(argv[7]);
  }

  g_conf.debug = true;
  // g_conf.loss = LOG_LIKELIHOOD;
  g_conf.loss = SQUARED_ERROR;

  // Check the load result explicitly: an assert() would vanish in NDEBUG
  // builds and let training run on missing data.
  DataVector d;
  if (!LoadDataFromFile(train_file, &d)) {
    std::cerr << "failed to load training data: " << train_file << std::endl;
    return 1;
  }

  // g_conf.min_leaf_size = d.size() / 10;

  std::cout << g_conf.ToString() << std::endl;

  GBDT gbdt;

  Elapsed elapsed;
  gbdt.Fit(&d);
  std::cout << "fit time: " << elapsed.Tell() << std::endl;
  CleanDataVector(&d);
  FreeVector(&d);

  std::string model_file = train_file + ".model";
  std::ofstream model_output(model_file.c_str());
  if (!model_output) {
    std::cerr << "failed to open model file for writing: " << model_file
              << std::endl;
    return 1;
  }
  model_output << gbdt.Save();

  // Predict with a model rebuilt from the serialized form, so the Save/Load
  // path is exercised as well.
  GBDT gbdt2;
  gbdt2.Load(gbdt.Save());

  DataVector d2;
  if (!LoadDataFromFile(test_file, &d2)) {
    std::cerr << "failed to load test data: " << test_file << std::endl;
    return 1;
  }

  elapsed.Reset();
  PredictVector predict;
  for (DataVector::iterator iter = d2.begin(); iter != d2.end(); ++iter) {
    ValueType p;
    if (g_conf.loss == SQUARED_ERROR) {
      p = gbdt2.Predict(**iter);
      predict.push_back(p);
    } else if (g_conf.loss == LOG_LIKELIHOOD) {
      // Map the Logit-transformed score to a hard label of +1 / -1.
      p = Logit(gbdt2.Predict(**iter));
      p = (p >= 0.5) ? 1 : -1;
      predict.push_back(p);
    }
    // NOTE(review): for any other loss value nothing is appended, so
    // `predict` would end up shorter than `d2`.
    // std::cout << (*iter)->ToString() << std::endl
    //           << p << std::endl;
  }

  std::cout << "predict time: " << elapsed.Tell() << std::endl;
  std::cout << "rmse: " << RMSE(d2, predict) << std::endl;

  CleanDataVector(&d2);

  return 0;
}
Пример #3
0
// Command-line front end for GBDT training and prediction.
//
// Options (parsed with getopt):
//   -i <file>  input data file                      (required)
//   -f <type>  input format, "cvs" or anything else for L2R (default "l2r")
//   -c <file>  config file                          (default "./gbrt.conf")
//   -m <file>  model file                           (default "./gbrt.model")
//   -d <n>     max dimension passed to the L2R reader (default 1024)
//   -t         train: fit on the input and save weights to the model file
//   -p         predict: load weights and write one prediction per data line
//              to "<input_file>.prediction"
//
// Returns 0 on success and 1 on any error (missing options, unreadable input,
// config load failure, unopenable files, or more data lines than predictions).
int main(int argc, char ** argv)
{
    std::string input_file = "";
    std::string input_type = "l2r";
    std::string config_file = "./gbrt.conf";
    std::string act_type = "";
    std::string model_file = "./gbrt.model";
    int dimention = 1024;

    //----parse command line
    // getopt() signals the end of the options with -1 (POSIX), not EOF.
    int opt_c;
    while ( (opt_c = getopt( argc, argv, "d:f:i:c:m:tp")) != -1 )
    {
        switch (opt_c)
        {
        case 'i':
            input_file = optarg;
            break;
        case 'f':
            input_type = optarg;
            break;
        case 'c':
            config_file = optarg;
            break;
        case 'm':
            model_file = optarg;
            break;
        case 't':
            act_type = "t";
            break;
        case 'p':
            act_type = "p";
            break;
        case 'd':
            dimention = atoi(optarg);
            break;  // explicit break instead of silently falling into default
        default:
            break;
        }

    }

    //check options: an action (-t or -p) and an input file are mandatory
    if ( act_type.length() == 0
        || input_file.length() == 0 )
    {
        std::cerr << "miss parameter!!" << endl;
        Usage();
        return 1;
    }

    cout << "parameters--------" << endl;
    cout << "  input file: " << input_file << endl;
    cout << "  input format (cvs, l2r): " << input_type<< endl;
    cout << "  config file: " << config_file << endl;
    cout << "  act type(t for train,p for predict): " << act_type << endl;
    cout << "  model file: " << model_file << endl;
    cout << "  max dimention(for L2R format): " << dimention << endl;
    cout << endl;

    Data data;
    DataReader dr;

    if ( input_type == "cvs")
    {
        if ( false == dr.ReadDataFromCVS(input_file, data))
        {
            std::cerr << "error: read CVS file failed! " << input_file << std::endl;
            return 1;
        }
    }
    else
    {
        if ( false == dr.ReadDataFromL2R(input_file, data, dimention))
        {
            std::cerr << "error: read L2R file failed! " << input_file << std::endl;
            return 1;
        }
    }

    GBDT gbdt;

    if (!gbdt.LoadConfig(config_file))
        return 1;

    if (act_type == "t")
    {
        gbdt.Init();
        gbdt.Train(data);
        gbdt.SaveWeights(model_file);
    }
    else if( act_type == "p" )
    {
        T_VECTOR predictions;
        // NOTE(review): the result of LoadWeights is not checked here; its
        // return type is not visible in this file.
        gbdt.LoadWeights(model_file);
        gbdt.PredictAllOutputs(data, predictions);

        //----output prediction----
        std::ifstream fs(input_file.c_str());
        if (!fs)
        {
            std::cerr << "error: cannot reopen input file! " << input_file << std::endl;
            return 1;
        }

        std::string prediction_file = input_file + ".prediction";
        std::ofstream fs_out(prediction_file.c_str());
        if (!fs_out)
        {
            std::cerr << "error: cannot open prediction file! " << prediction_file << std::endl;
            return 1;
        }

        // Walk the input again so that one prediction is written per data
        // line; lines shorter than two characters are skipped.
        std::string strLine;
        unsigned int line_num = 0;
        while (getline(fs, strLine))
        {
            if (strLine.length() < 2)
            {
                continue;
            }
            // Guard against indexing past the end of the predictions.
            if (line_num >= predictions.size())
            {
                std::cerr << "error: more data lines than predictions! " << input_file << std::endl;
                return 1;
            }
            // '\n' instead of std::endl: avoids flushing once per line.
            fs_out << predictions[line_num] << '\n';
            //for debug
            //cout << strLine << "\t" << predictions[line_num] << std::endl;
            line_num++;
        }
    }

    return 0;
}