void VFPProdTable::init(int table_num,
                        double datum_depth,
                        FLO_TYPE flo_type,
                        WFR_TYPE wfr_type,
                        GFR_TYPE gfr_type,
                        ALQ_TYPE alq_type,
                        const std::vector<double>& flo_data,
                        const std::vector<double>& thp_data,
                        const std::vector<double>& wfr_data,
                        const std::vector<double>& gfr_data,
                        const std::vector<double>& alq_data,
                        const array_type& data) {
    m_table_num = table_num;
    m_datum_depth = datum_depth;
    m_flo_type = flo_type;
    m_wfr_type = wfr_type;
    m_gfr_type = gfr_type;
    m_alq_type = alq_type;

    m_flo_data = flo_data;
    m_thp_data = thp_data;
    m_wfr_data = wfr_data;
    m_gfr_data = gfr_data;
    m_alq_data = alq_data;

    // Copy the source extents before assigning: multi-array assignment
    // requires the destination to already have the same shape as the source.
    extents shape;
    shape[0] = data.shape()[0];
    shape[1] = data.shape()[1];
    shape[2] = data.shape()[2];
    shape[3] = data.shape()[3];
    shape[4] = data.shape()[4];
    m_data.resize(shape);
    m_data = data;

    //check();
}
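// A standalone sketch of the resize-then-assign pattern used in init() above,
// assuming array_type is a boost::multi_array (an assumption; the actual
// typedef is not shown here). boost::multi_array::operator= requires both
// sides to already have the same shape, which is why the destination must be
// resized to the source's extents before the assignment.
#include <algorithm>
#include <cstddef>
#include <boost/array.hpp>
#include <boost/multi_array.hpp>

void copy_multi_array(const boost::multi_array<double, 5>& src,
                      boost::multi_array<double, 5>& dst) {
    // collect the source extents
    boost::array<std::size_t, 5> shape;
    std::copy(src.shape(), src.shape() + 5, shape.begin());
    // shape the destination to match, then assign element-wise
    dst.resize(shape);
    dst = src;
}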
key_type key(const array_type & df, const std::size_t index) const {
    if (df.shape().second == 0) {
        return {0, 0};
    } else {
        const int prod_id = df[df.row(index)][0]; // TODO, hardcoded
        const char segment = df[df.row(index)][8]; // TODO, hardcoded
        return {prod_id, segment};
    }
}
GroupBy(const array_type & df) {
    // bucket row indices by their (prod_id, segment) key
    for (std::size_t ix{0}; ix < df.shape().first; ++ix) {
        const key_type k = key(df, ix);
        if (m_groups.count(k)) {
            m_groups.at(k).push_back(ix);
        } else {
            m_groups[k] = {ix};
        }
    }
    m_current = m_groups.begin();
}
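// Hedged sketch of how the m_current iterator set up above might be consumed:
// a yield()-style accessor that returns one (key, row indices) group per call
// and advances. This member is an assumption added for illustration, not part
// of the original class.
std::pair<key_type, std::vector<std::size_t>> yield() {
    // precondition: m_current != m_groups.end()
    const auto group = *m_current;
    ++m_current;
    return group;
}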
std::unique_ptr<void, int (*)(BoosterHandle)> fit(
    const array_type & train_data,
    const std::vector<float> & train_y,
    const std::map<const std::string, const std::string> & params,
    _StopCondition stop_condition) {
    // prepare placeholder for raw matrix later used by xgboost
    std::vector<float> train_vec = train_data.tovector();
    std::cerr << "train_vec size: " << train_vec.size() << std::endl;
    // assert(std::none_of(train_vec.cbegin(), train_vec.cend(), [](float x){return std::isnan(x);}));

    std::unique_ptr<void, int (*)(DMatrixHandle)> tr_dmat(
        XGDMatrixCreateFromMat(
            train_vec.data(),
            train_data.shape().first,
            train_data.shape().second,
            XGB_MISSING),
        XGDMatrixFree);

    // attach response vector to tr_dmat
    XGDMatrixSetFloatInfo(tr_dmat.get(), "label", train_y.data(), train_y.size());

    const DMatrixHandle cache[] = {tr_dmat.get()};

    // create Booster with attached tr_dmat
    std::unique_ptr<void, int (*)(BoosterHandle)> booster(
        XGBoosterCreate(cache, 1UL),
        XGBoosterFree);

    for (const auto & kv : params) {
        std::cerr << kv.first << " => " << kv.second << std::endl;
        XGBoosterSetParam(booster.get(), kv.first.c_str(), kv.second.c_str());
    }

    // run boosting rounds until the caller-supplied stop condition fires
    for (int iter{0}; stop_condition() == false; ++iter) {
        XGBoosterUpdateOneIter(booster.get(), iter, tr_dmat.get());
    }

    return booster;
}
std::vector<float> predict(BoosterHandle booster, const array_type & test_data) {
    std::vector<float> test_vec = test_data.tovector();
    std::cerr << "test_vec size: " << test_vec.size() << std::endl;

    std::unique_ptr<void, int (*)(DMatrixHandle)> te_dmat(
        XGDMatrixCreateFromMat(
            test_vec.data(),
            test_data.shape().first,
            test_data.shape().second,
            XGB_MISSING),
        XGDMatrixFree);

    bst_ulong y_hat_len{0};
    const float * y_hat_proba{nullptr};

    // option_mask = 0 (normal prediction), ntree_limit = 0 (use all trees)
    XGBoosterPredict(booster, te_dmat.get(), 0, 0, &y_hat_len, &y_hat_proba);
    std::cerr << "Got y_hat_proba of length " << y_hat_len << std::endl;

    // copy out of the xgboost-owned buffer before it is invalidated
    std::vector<float> y_hat(y_hat_proba, y_hat_proba + y_hat_len);

    return y_hat;
}
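// Usage sketch tying fit() and predict() together (hypothetical; the parameter
// values and the plain iteration cap are assumptions, chosen to mirror how
// run_binary_estimators() below drives the same pair with an added time budget).
std::vector<float> example_fit_predict(
    const array_type & train_data,
    const std::vector<float> & train_y,
    const array_type & test_data) {
    const std::map<const std::string, const std::string> params{
        {"objective", "binary:logistic"},
        {"max_depth", "6"}};
    int iter{0};
    // stop condition returns true once 100 boosting rounds have run
    auto booster = fit(train_data, train_y, params,
        [&iter]() -> bool { return ++iter > 100; });
    return predict(booster.get(), test_data);
}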
std::vector<std::size_t> run_binary_estimators(
    const Iterator begin,
    const Iterator end,
    const long int time0,
    const array_type & train_data,
    const std::vector<float> & train_y,
    const array_type & test_data) {
    constexpr int TIME_MARGIN{60};
    constexpr int MAX_TIME{600};
    const int MAX_TIMESTAMP = time0 + MAX_TIME - TIME_MARGIN;

    std::cerr << std::endl << "Training " << std::distance(begin, end) << " estimator(s)" << std::endl;
    std::cerr << "Total time limit: " << MAX_TIME << " secs" << std::endl;

    // collection of probabilities predicted by each estimator
    std::vector<std::vector<float>> y_hat_proba_set;

    for (auto it = begin; it != end; ++it) {
        const auto & PARAMS_p = *it;
        const int MAX_ITER = std::stoi(PARAMS_p->at("n_estimators"));
        int iter{0};

        // stop when either the iteration budget or the time budget is spent;
        // the lambda returns true when training should stop
        auto booster = XGB::fit(train_data, train_y, *PARAMS_p,
            [&iter, MAX_ITER, MAX_TIMESTAMP]() -> bool {
                const bool running = (iter < MAX_ITER) && (timestamp() < MAX_TIMESTAMP);
                ++iter;
                return running == false;
            });

        // iter <= MAX_ITER here means the time budget, not the iteration
        // budget, ended training (a full run leaves iter == MAX_ITER + 1)
        if (iter <= MAX_ITER) {
            std::cerr << "Exceeded allocated time limit after iteration " << iter
                << " of " << MAX_ITER << " for estimator ["
                << y_hat_proba_set.size() + 1 << "]" << std::endl;
            // but we'll make the prediction anyway if it's our first estimator :)
            if (y_hat_proba_set.empty()) {
                y_hat_proba_set.push_back(XGB::predict(booster.get(), test_data));
            }
            break;
        }

        auto proba = XGB::predict(booster.get(), test_data);
        y_hat_proba_set.push_back(proba);

        std::cerr << "Elapsed time: " << timestamp() - time0 << std::endl;
    }

    // array of probabilities accumulated from completed estimators
    std::vector<float> y_hat_proba_cumm(y_hat_proba_set.front().size(), 0.f);

    for (std::size_t idx{0}; idx < y_hat_proba_set.size(); ++idx) {
        std::transform(y_hat_proba_set[idx].cbegin(), y_hat_proba_set[idx].cend(),
            y_hat_proba_cumm.begin(), y_hat_proba_cumm.begin(),
            [](const float x, const float a) { return a + x; });
    }

    // quantized prediction: positive class iff the mean probability across
    // completed estimators exceeds 0.5, i.e. the sum exceeds 0.5 * N
    // (comparing the raw sum against a bare 0.5 would mislabel whenever
    // more than one estimator completed)
    std::vector<std::size_t> y_hat(test_data.shape().first);
    for (std::size_t ix{0}; ix < y_hat.size(); ++ix) {
        y_hat[ix] = y_hat_proba_cumm[ix] > 0.5f * y_hat_proba_set.size();
    }

    return y_hat;
}
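// Equivalent formulation of the accumulate-and-threshold step above, shown for
// clarity (hypothetical helper, not part of the original source): averaging
// the per-estimator probabilities and cutting at 0.5 yields the same labels
// as comparing the accumulated sum against 0.5 * N.
std::vector<std::size_t> quantize_mean(
    const std::vector<std::vector<float>> & proba_set) {
    std::vector<std::size_t> y_hat(proba_set.front().size());
    for (std::size_t ix{0}; ix < y_hat.size(); ++ix) {
        float sum{0.f};
        for (const auto & proba : proba_set) {
            sum += proba[ix];
        }
        // positive class iff the ensemble mean exceeds 0.5
        y_hat[ix] = (sum / proba_set.size()) > 0.5f;
    }
    return y_hat;
}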