Example #1
void VFPProdTable::init(int table_num,
                        double datum_depth,
                        FLO_TYPE flo_type,
                        WFR_TYPE wfr_type,
                        GFR_TYPE gfr_type,
                        ALQ_TYPE alq_type,
                        const std::vector<double>& flo_data,
                        const std::vector<double>& thp_data,
                        const std::vector<double>& wfr_data,
                        const std::vector<double>& gfr_data,
                        const std::vector<double>& alq_data,
                        const array_type& data) {
    m_table_num = table_num;
    m_datum_depth = datum_depth;
    m_flo_type = flo_type;
    m_wfr_type = wfr_type;
    m_gfr_type = gfr_type;
    m_alq_type = alq_type;
    m_flo_data = flo_data;
    m_thp_data = thp_data;
    m_wfr_data = wfr_data;
    m_gfr_data = gfr_data;
    m_alq_data = alq_data;

    // Copy the 5-D BHP table: size m_data to the incoming array's extents so
    // the assignment below copies into a matching shape.
    extents shape;
    shape[0] = data.shape()[0];
    shape[1] = data.shape()[1];
    shape[2] = data.shape()[2];
    shape[3] = data.shape()[3];
    shape[4] = data.shape()[4];
    m_data.resize(shape);
    m_data = data;

    //check();
}
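A minimal call sketch for init, assuming array_type is a boost::multi_array<double, 5> (consistent with the five shape() indices above); the enumerator names, axis ordering and numeric values are illustrative placeholders, not taken from the source.

#include <algorithm>
#include <boost/multi_array.hpp>
#include <vector>

void init_example(VFPProdTable& table)
{
    // Placeholder axis vectors; units and values are illustrative only.
    const std::vector<double> flo = {1000.0, 5000.0, 10000.0};
    const std::vector<double> thp = {50.0, 100.0};
    const std::vector<double> wfr = {0.0, 0.5};
    const std::vector<double> gfr = {100.0, 200.0};
    const std::vector<double> alq = {0.0};

    // BHP samples, assumed to be indexed [thp][wfr][gfr][alq][flo].
    boost::multi_array<double, 5> bhp(
        boost::extents[thp.size()][wfr.size()][gfr.size()][alq.size()][flo.size()]);
    std::fill_n(bhp.data(), bhp.num_elements(), 200.0);

    table.init(1,                       // table_num
               2500.0,                  // datum_depth
               VFPProdTable::FLO_LIQ,   // placeholder enumerators
               VFPProdTable::WFR_WCT,
               VFPProdTable::GFR_GOR,
               VFPProdTable::ALQ_UNDEF,
               flo, thp, wfr, gfr, alq, bhp);
}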
    // Build the composite (producer id, segment) key used to group rows.
    key_type key(const array_type & df, const std::size_t index) const
    {
        if (df.shape().second == 0)
        {
            return {0, 0};
        }
        else
        {
            const int prod_id = df[df.row(index)][0]; // TODO, hardcoded
            const char segment = df[df.row(index)][8]; // TODO, hardcoded

            return {prod_id, segment};
        }
    }
    // Bucket every row index of df under its (producer id, segment) key.
    GroupBy(const array_type & df)
    {
        for (std::size_t ix{0}; ix < df.shape().first; ++ix)
        {
            const key_type k = key(df, ix);

            if (m_groups.count(k))
            {
                m_groups.at(k).push_back(ix);
            }
            else
            {
                m_groups[k] = {ix};
            }
        }

        m_current = m_groups.begin();
    }
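The two members above belong to a containing class that the fragment does not show; below is a sketch of the surrounding declaration inferred from their usage (key_type, m_groups and m_current are reconstructions, not copied from the source; array_type is the same unshown matrix type used above).

#include <cstddef>
#include <map>
#include <utility>
#include <vector>

class GroupBy
{
public:
    // Key = (producer id, segment), the two hardcoded columns read in key().
    using key_type = std::pair<int, char>;

    GroupBy(const array_type & df);                                       // defined above

private:
    key_type key(const array_type & df, const std::size_t index) const;  // defined above

    std::map<key_type, std::vector<std::size_t>> m_groups;               // key -> member row indices
    std::map<key_type, std::vector<std::size_t>>::iterator m_current;    // iteration cursor
};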
std::unique_ptr<void, int (*)(BoosterHandle)>
fit(const array_type & train_data,
    const std::vector<float> & train_y,
    const std::map<const std::string, const std::string> & params,
    _StopCondition stop_condition)
{
    // flatten the training data into the contiguous float buffer xgboost expects
    std::vector<float> train_vec = train_data.tovector();
    std::cerr << "train_vec size: " << train_vec.size() << std::endl;
//    assert(std::none_of(train_vec.cbegin(), train_vec.cend(), [](float x){return std::isnan(x);}));

    // wrap the DMatrix handle in a unique_ptr so XGDMatrixFree runs automatically
    std::unique_ptr<void, int (*)(DMatrixHandle)> tr_dmat(
        XGDMatrixCreateFromMat(
            train_vec.data(),
            train_data.shape().first,
            train_data.shape().second, XGB_MISSING),
        XGDMatrixFree);

    // attach response vector to tr_dmat
    XGDMatrixSetFloatInfo(tr_dmat.get(), "label", train_y.data(), train_y.size());

    const DMatrixHandle cache[] = {tr_dmat.get()};

    // create Booster with attached tr_dmat
    std::unique_ptr<void, int (*)(BoosterHandle)> booster(
            XGBoosterCreate(cache, 1UL),
            XGBoosterFree);

    for (const auto & kv : params)
    {
        std::cerr << kv.first << " => " << kv.second << std::endl;
        XGBoosterSetParam(booster.get(), kv.first.c_str(), kv.second.c_str());
    }


    // run boosting rounds until the caller-supplied stop condition fires
    for (int iter{0}; stop_condition() == false; ++iter)
    {
        XGBoosterUpdateOneIter(booster.get(), iter, tr_dmat.get());
    }

    return booster;
}
std::vector<float>
predict(
    BoosterHandle booster,
    const array_type & test_data)
{
    std::vector<float> test_vec = test_data.tovector();
    std::cerr << "test_vec size: " << test_vec.size() << std::endl;

    std::unique_ptr<void, int (*)(DMatrixHandle)> te_dmat(
        XGDMatrixCreateFromMat(
            test_vec.data(),
            test_data.shape().first,
            test_data.shape().second, XGB_MISSING),
        XGDMatrixFree);

    // XGBoosterPredict hands back a pointer into booster-owned memory,
    // so the probabilities are copied out into a vector below
    bst_ulong y_hat_len{0};
    const float * y_hat_proba{nullptr};
    XGBoosterPredict(booster, te_dmat.get(), 0, 0, &y_hat_len, &y_hat_proba);
    std::cerr << "Got y_hat_proba of length " << y_hat_len << std::endl;

    std::vector<float> y_hat(y_hat_proba, y_hat_proba + y_hat_len);

    return y_hat;
}
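Taken together, fit and predict form a small train-and-score round trip; a hedged usage sketch, assuming both live in the XGB namespace referenced below, that array_type provides the tovector()/shape() members used above, and with purely illustrative hyperparameter values.

#include <map>
#include <string>
#include <vector>

std::vector<float> train_and_score(const array_type & train_data,
                                   const std::vector<float> & train_y,
                                   const array_type & test_data)
{
    // Illustrative parameter values only; keys follow the usual xgboost spelling.
    const std::map<const std::string, const std::string> params = {
        {"objective", "binary:logistic"},
        {"max_depth", "4"},
        {"eta", "0.1"},
        {"n_estimators", "100"}};

    // Stop after a fixed number of boosting rounds.
    int round{0};
    auto booster = XGB::fit(train_data, train_y, params,
                            [&round]() { return ++round > 100; });

    // Positive-class probabilities, one per test row.
    return XGB::predict(booster.get(), test_data);
}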
std::vector<std::size_t>
run_binary_estimators(
    const Iterator begin,
    const Iterator end,
    const long int time0,
    const array_type & train_data,
    const std::vector<float> & train_y,
    const array_type & test_data)
{
    constexpr int   TIME_MARGIN{60};
    constexpr int   MAX_TIME{600};
    const int       MAX_TIMESTAMP = time0 + MAX_TIME - TIME_MARGIN;

    std::cerr << std::endl << "Training " << std::distance(begin, end) << " estimator(s)" << std::endl;
    std::cerr << "Total time limit: " << MAX_TIME << " secs" << std::endl;

    // collection of probabilities predicted by each estimator
    std::vector<std::vector<float>> y_hat_proba_set;

    for (auto it = begin; it != end; ++it)
    {
        const auto & PARAMS_p = *it;

        const int MAX_ITER = std::stoi(PARAMS_p->at("n_estimators"));
        int iter{0};

        auto booster = XGB::fit(train_data, train_y, *PARAMS_p,
            [&iter, MAX_ITER, MAX_TIMESTAMP]() -> bool
            {
                const bool running = (iter < MAX_ITER) && (timestamp() < MAX_TIMESTAMP);
                ++iter;
                return running == false;
            }
        );

        if (iter <= MAX_ITER)
        {
            // the stop condition bumps iter once more after the final completed
            // round, so iter <= MAX_ITER here means the time budget ran out early
            std::cerr << "Exceeded allocated time limit after iteration " << iter << " of " << MAX_ITER << " for estimator [" << y_hat_proba_set.size() + 1 << "]" << std::endl;

            // but we'll make the prediction anyway if it's our first estimator :)
            if (y_hat_proba_set.empty())
            {
                y_hat_proba_set.push_back(XGB::predict(booster.get(), test_data));
            }
            break;
        }

        auto proba = XGB::predict(booster.get(), test_data);

        y_hat_proba_set.push_back(proba);

        std::cerr << "Elapsed time: " << timestamp() - time0 << std::endl;
    }

    // array of probabilities accumulated from the completed estimators
    std::vector<float> y_hat_proba_cumm(y_hat_proba_set.front().size(), 0.);

    for (std::size_t idx{0}; idx < y_hat_proba_set.size(); ++idx)
    {
        std::transform(y_hat_proba_set[idx].cbegin(), y_hat_proba_set[idx].cend(), y_hat_proba_cumm.begin(),
            y_hat_proba_cumm.begin(),
            [](const float x, const float a)
            {
                return a + x;
            });
    }

    // quantized prediction
    std::vector<std::size_t> y_hat(test_data.shape().first);

    for (std::size_t ix{0}; ix < y_hat.size(); ++ix)
    {
        y_hat[ix] = y_hat_proba_cumm[ix] > 0.5;
    }

    return y_hat;
}
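Finally, a hedged sketch of how run_binary_estimators might be driven: the iterator range is assumed to yield pointers to parameter maps, matching the *PARAMS_p and PARAMS_p->at("n_estimators") accesses above, and time0 is assumed to come from the same clock as timestamp(); the parameter values are illustrative only.

#include <ctime>
#include <map>
#include <string>
#include <vector>

std::vector<std::size_t> classify(const array_type & train_data,
                                  const std::vector<float> & train_y,
                                  const array_type & test_data)
{
    using params_type = std::map<const std::string, const std::string>;

    // Two illustrative configurations; "n_estimators" bounds the boosting rounds.
    const params_type conservative = {
        {"objective", "binary:logistic"}, {"max_depth", "3"},
        {"eta", "0.05"}, {"n_estimators", "200"}};
    const params_type aggressive = {
        {"objective", "binary:logistic"}, {"max_depth", "6"},
        {"eta", "0.3"}, {"n_estimators", "50"}};

    const std::vector<const params_type *> configs = {&conservative, &aggressive};

    const long int time0 = std::time(nullptr);   // assumed to match timestamp()

    return run_binary_estimators(configs.cbegin(), configs.cend(), time0,
                                 train_data, train_y, test_data);
}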