Ejemplo n.º 1
0
/**
 * Computes predictions for every fitted label.
 *
 * X is first expanded by ModelUtil::GenerateFeatureMatrix. The returned
 * matrix has one row per row of that feature matrix and one column per
 * entry of weights_; column i is the feature matrix multiplied by
 * weights_[i].
 *
 * @param X   raw input handed to the feature generator
 * @param bsz not used by this model
 * @return    matrix of predictions, one column per label
 */
matrix_eig TimeSeriesLinearReg::Predict(const matrix_eig &X,
                                        UNUSED_ATTRIBUTE int bsz) const {
  matrix_eig features;
  ModelUtil::GenerateFeatureMatrix(*this, X, features);

  matrix_eig predictions(features.rows(), weights_.size());
  const long num_labels = predictions.cols();
  for (long col = 0; col < num_labels; ++col) {
    const auto &label_weights = weights_[col];
    predictions.col(col) = (features * label_weights).transpose();
  }
  return predictions;
}
Ejemplo n.º 2
0
void TimeSeriesLinearReg::Fit(const matrix_eig &X, const matrix_eig &y,
                              UNUSED_ATTRIBUTE int bsz) {
  matrix_eig X_proc;
  ModelUtil::GenerateFeatureMatrix(*this, X, X_proc);
  matrix_eig XTX = X_proc.transpose() * X_proc;
  XTX += matrix_eig::Identity(XTX.rows(), XTX.rows()) * epsilon_;
  XTX = (XTX.inverse() * (X_proc.transpose()));
  matrix_eig y_hat(y.rows(), y.cols());
  for (long label_idx = 0; label_idx < y.cols(); label_idx++) {
    weights_.emplace_back(XTX * y.col(label_idx));
  }
}
/**
 * Runs `booster` over `test_data` and returns the predicted values.
 *
 * The array is flattened with tovector() and wrapped in a DMatrix whose
 * handle is released through XGDMatrixFree when this function returns.
 *
 * @param booster   booster handle passed through to XGBoosterPredict
 * @param test_data samples to score; shape() supplies the row/column counts
 * @return          copy of the values reported by XGBoosterPredict, or an
 *                  empty vector when the call reports a non-zero status or
 *                  yields no result buffer
 */
std::vector<float>
predict(
    BoosterHandle booster,
    const array_type & test_data)
{
    std::vector<float> test_vec = test_data.tovector();
    std::cerr << "test_vec size: " << test_vec.size() << std::endl;

    // unique_ptr + XGDMatrixFree: the DMatrix is freed on every exit path
    std::unique_ptr<void, int (*)(DMatrixHandle)> te_dmat(
        XGDMatrixCreateFromMat(
            test_vec.data(),
            test_data.shape().first,
            test_data.shape().second, XGB_MISSING),
        XGDMatrixFree);

    bst_ulong y_hat_len{0};
    const float * y_hat_proba{nullptr};
    const int status = XGBoosterPredict(booster, te_dmat.get(), 0, 0, &y_hat_len, &y_hat_proba);
    if (status != 0)
    {
        // do not touch the out-parameters after a failed call
        std::cerr << "XGBoosterPredict failed with status " << status << std::endl;
        return {};
    }
    std::cerr << "Got y_hat_proba of length " << y_hat_len << std::endl;

    if (y_hat_proba == nullptr)
    {
        // nothing to copy; avoids pointer arithmetic on a null buffer
        return {};
    }

    // the buffer is not owned by this function, so hand back a copy
    return std::vector<float>(y_hat_proba, y_hat_proba + y_hat_len);
}
/**
 * Trains a binary model on the labelled records and renders a prediction
 * line for every product found in the test records.
 *
 * Steps:
 *  1. both inputs are parsed by num::loadtxt with ',' as delimiter and the
 *     per-column converters declared below;
 *  2. gen_features() derives the model features and their column names;
 *  3. the SPECIAL_PART column is split off the training features and
 *     quantized to {0,1} (values >= 0.5 become 1);
 *  4. run_binary_estimators() produces one 0/1 flag per test feature row;
 *  5. the flags are consumed in the order GroupBy yields groups of the raw
 *     test data and are collected per PRODUCT_NUMBER.
 *
 * Each returned string is "<PRODUCT_NUMBER>,<a>,<b>", where <a> is filled
 * when the group's CUSTOMER_SEGMENT1 value converts to 0 and <b> otherwise;
 * a slot that never receives a prediction stays "NA". Lines are ordered by
 * ascending product number (std::map iteration order). Because the flags are
 * 0 or 1, only "No" and "Maybe" are ever emitted from yes_no_maybe.
 *
 * @param i_training comma-separated training records in raw_colnames order;
 *                   handed to num::loadtxt via std::move, so treat the
 *                   vector as consumed afterwards
 * @param i_testing  comma-separated test records; a SPECIAL_PART converter
 *                   is not registered for them; likewise handed over via
 *                   std::move
 * @return           one formatted line per distinct product number
 */
std::vector<std::string>
ElectronicPartsClassification::classifyParts(
    std::vector<std::string> & i_training,
    std::vector<std::string> & i_testing) const
{

    const std::vector<std::string> raw_colnames{"PRODUCT_NUMBER", "CUSTOMER_NUMBER", "TRANSACTION_DATE",
        "PRODUCT_PRICE", "GROSS_SALES", "REGION", "WAREHOUSE", "CUSTOMER_ZIP", "CUSTOMER_SEGMENT1",
        "CUSTOMER_SEGMENT2", "CUSTOMER_TYPE1", "CUSTOMER_TYPE2", "CUSTOMER_MANAGED_LEVEL",
        "CUSTOMER_ACCOUNT_TYPE", "CUSTOMER_FIRST_ORDER_DATE", "PRODUCT_CLASS_ID1",
        "PRODUCT_CLASS_ID2", "PRODUCT_CLASS_ID3", "PRODUCT_CLASS_ID4","BRAND",
        "PRODUCT_ATTRIBUTE_X", "PRODUCT_SALES_UNIT", "SHIPPING_WEIGHT", "TOTAL_BOXES_SOLD",
        "PRODUCT_COST1", "PRODUCT_UNIT_OF_MEASURE", "ORDER_SOURCE", "PRICE_METHOD", "SPECIAL_PART"};


    // start of the time budget that run_binary_estimators() enforces
    const auto time0 = timestamp();

    // converters for the non-numeric columns of the training records
    const num::loadtxtCfg<real_type>::converters_type converters_train =
        {
            {colidx(raw_colnames, "TRANSACTION_DATE"), date_xlt},
            {colidx(raw_colnames, "CUSTOMER_SEGMENT1"), [](const char * str){return from_list_xlt({"A", "B"}, str);}},
            {colidx(raw_colnames, "CUSTOMER_TYPE2"), [](const char * str){return from_list_xlt({"A", "B", "C"}, str);}},
            {colidx(raw_colnames, "CUSTOMER_MANAGED_LEVEL"), [](const char * str){return from_list_xlt({"N", "L"}, str);}},
            {colidx(raw_colnames, "CUSTOMER_ACCOUNT_TYPE"), [](const char * str){return from_list_xlt({"ST", "DM"}, str);}},
            {colidx(raw_colnames, "CUSTOMER_FIRST_ORDER_DATE"), date_xlt},
            {colidx(raw_colnames, "BRAND"), [](const char * str){return from_list_xlt({"IN_HOUSE", "NOT_IN_HOUSE"}, str);}},
            {colidx(raw_colnames, "PRODUCT_SALES_UNIT"), [](const char * str){return from_list_xlt({"Y", "N"}, str);}},
            {colidx(raw_colnames, "PRODUCT_UNIT_OF_MEASURE"), [](const char * str){return from_list_xlt({"B", "LB", "EA"}, str);}},
            {colidx(raw_colnames, "ORDER_SOURCE"), [](const char * str){return from_list_xlt({"A", "B"}, str);}},
            {colidx(raw_colnames, "SPECIAL_PART"), [](const char * str){return from_list_xlt({"No", "Maybe", "Yes"}, str);}},
        };
    // same table without SPECIAL_PART, which is only converted for training
    const num::loadtxtCfg<real_type>::converters_type converters_test =
        {
            {colidx(raw_colnames, "TRANSACTION_DATE"), date_xlt},
            {colidx(raw_colnames, "CUSTOMER_SEGMENT1"), [](const char * str){return from_list_xlt({"A", "B"}, str);}},
            {colidx(raw_colnames, "CUSTOMER_TYPE2"), [](const char * str){return from_list_xlt({"A", "B", "C"}, str);}},
            {colidx(raw_colnames, "CUSTOMER_MANAGED_LEVEL"), [](const char * str){return from_list_xlt({"N", "L"}, str);}},
            {colidx(raw_colnames, "CUSTOMER_ACCOUNT_TYPE"), [](const char * str){return from_list_xlt({"ST", "DM"}, str);}},
            {colidx(raw_colnames, "CUSTOMER_FIRST_ORDER_DATE"), date_xlt},
            {colidx(raw_colnames, "BRAND"), [](const char * str){return from_list_xlt({"IN_HOUSE", "NOT_IN_HOUSE"}, str);}},
            {colidx(raw_colnames, "PRODUCT_SALES_UNIT"), [](const char * str){return from_list_xlt({"Y", "N"}, str);}},
            {colidx(raw_colnames, "PRODUCT_UNIT_OF_MEASURE"), [](const char * str){return from_list_xlt({"B", "LB", "EA"}, str);}},
            {colidx(raw_colnames, "ORDER_SOURCE"), [](const char * str){return from_list_xlt({"A", "B"}, str);}},
        };

    ////////////////////////////////////////////////////////////////////////////

    const array_type i_train_data =
        num::loadtxt(
            std::move(i_training),
            std::move(
                num::loadtxtCfg<real_type>()
                .delimiter(',')
                .converters(num::loadtxtCfg<real_type>::converters_type{converters_train})
            )
        );

    const array_type i_test_data =
        num::loadtxt(
            std::move(i_testing),
            std::move(
                num::loadtxtCfg<real_type>()
                .delimiter(',')
                .converters(num::loadtxtCfg<real_type>::converters_type{converters_test})
            )
        );

    std::vector<std::string> colnames;
    array_type train_data;
    array_type test_data;

    std::tie(colnames, train_data, test_data) = gen_features(raw_colnames, i_train_data, i_test_data);

//    std::cerr << train_data.shape() << test_data.shape() << std::endl;
//    std::copy(colnames.cbegin(), colnames.cend(), std::ostream_iterator<std::string>(std::cerr, "\n"));

    // the training features still carry the label column at this point
    assert(train_data.shape().second == test_data.shape().second + 1);

    // split the label column off the training features
    const auto label_col = colidx(colnames, "SPECIAL_PART");
    const array_type::varray_type train_y_valarr = train_data[train_data.column(label_col)];
    const std::vector<float> train_y(std::begin(train_y_valarr), std::end(train_y_valarr));

    std::cerr << "train_y size: " << train_y.size() << std::endl;

    train_data = num::del_column(train_data, label_col);
    colnames.erase(std::find(colnames.begin(), colnames.end(), "SPECIAL_PART"));
    assert(colnames.size() == train_data.shape().second);

    std::cerr << "train_data shape: " << train_data.shape() << std::endl;
    std::cerr << "test_data shape: " << test_data.shape() << std::endl;


    const std::map<const std::string, const std::string> * PARAMS_SET__no[] = {&params::no::prov47};
    std::vector<float> train_y__no;
    train_y__no.reserve(train_y.size());
    std::transform(train_y.cbegin(), train_y.cend(), std::back_inserter(train_y__no),
        [](const float what)
        {
            // quantize train y vector into {0,1}
            return what >= 0.5 ? 1.f : 0.f;
        }
    );

    const auto y_hat_no = run_binary_estimators(
        std::begin(PARAMS_SET__no), std::end(PARAMS_SET__no),
        time0, train_data, train_y__no, test_data);


    ////////////////////////////////////////////////////////////////////////////

    const std::string yes_no_maybe[] = {"No", "Maybe", "Yes"};

    // column positions do not change while iterating, so resolve them once
    const auto product_number_col = colidx(raw_colnames, "PRODUCT_NUMBER");
    const auto customer_segment_col = colidx(raw_colnames, "CUSTOMER_SEGMENT1");

    std::map<int, std::pair<std::string, std::string>> responses;
    GroupBy gb_test(i_test_data);
    std::size_t ix{0};
    for (auto group = gb_test.yield(); group.size() != 0; group = gb_test.yield())
    {
        // the first row of a group supplies its product number and segment
        const std::valarray<real_type> row = i_test_data[i_test_data.row(group.front())];
        const int prod_id = row[product_number_col];
        const char segment = row[customer_segment_col];

        // one prediction is consumed per yielded group
        assert(ix < y_hat_no.size());
        const std::string & label = yes_no_maybe[y_hat_no[ix++]];

        // single lookup: a new product starts as {"NA", "NA"}, an existing
        // one keeps whatever its other slot already holds
        auto & answer = responses.emplace(
            prod_id, std::pair<std::string, std::string>("NA", "NA")).first->second;
        if (segment == 0)
        {
            answer.first = label;
        }
        else
        {
            answer.second = label;
        }
    }

    std::vector<std::string> str_y_hat;
    str_y_hat.reserve(responses.size());
    std::transform(responses.cbegin(), responses.cend(), std::back_inserter(str_y_hat),
        // bind to the map's own value_type (pair<const int, ...>); a
        // pair<int, ...> parameter would copy every entry into a temporary
        [](const std::map<int, std::pair<std::string, std::string>>::value_type & kv)
        {
            return std::to_string(kv.first) + ',' + kv.second.first + ',' + kv.second.second;
        }
    );

    return str_y_hat;
}
/**
 * Trains one booster per parameter set in [begin, end) and returns a 0/1
 * prediction for every row of `test_data`.
 *
 * Training is bounded by a deadline of time0 + MAX_TIME - TIME_MARGIN,
 * compared against timestamp(). When an estimator is stopped by the deadline
 * before reaching its "n_estimators" iterations, the remaining parameter sets
 * are skipped; the interrupted estimator still contributes a prediction only
 * if it is the first one.
 *
 * The predictions of all contributing estimators are summed per row, and a
 * row is flagged 1 when the mean exceeds 0.5 (i.e. sum > 0.5 * count). With
 * a single estimator this is the plain "probability > 0.5" rule.
 *
 * @param begin      first parameter-set pointer; each must provide at("n_estimators")
 * @param end        one past the last parameter-set pointer
 * @param time0      timestamp() value at which the time budget started
 * @param train_data training features
 * @param train_y    training labels
 * @param test_data  rows to predict; shape().first sizes the result
 * @return           one 0/1 flag per test row; all zeros when no estimator
 *                   was supplied
 */
std::vector<std::size_t>
run_binary_estimators(
    const Iterator begin,
    const Iterator end,
    const long int time0,
    const array_type & train_data,
    const std::vector<float> & train_y,
    const array_type & test_data)
{
    constexpr int   TIME_MARGIN{60};
    constexpr int   MAX_TIME{600};
    // keep the deadline as wide as time0; storing it in an int could truncate it
    const long int  MAX_TIMESTAMP = time0 + MAX_TIME - TIME_MARGIN;

    std::cerr << std::endl << "Training " << std::distance(begin, end) << " estimator(s)" << std::endl;
    std::cerr << "Total time limit: " << MAX_TIME << " secs" << std::endl;

    // collection of probabilities predicted by each estimator
    std::vector<std::vector<float>> y_hat_proba_set;

    for (auto it = begin; it != end; ++it)
    {
        const auto & PARAMS_p = *it;

        const int MAX_ITER = std::stoi(PARAMS_p->at("n_estimators"));
        int iter{0};

        // The callback counts its own invocations and returns true once the
        // iteration budget or the deadline is exhausted.
        auto booster = XGB::fit(train_data, train_y, *PARAMS_p,
            [&iter, MAX_ITER, MAX_TIMESTAMP]() -> bool
            {
                const bool running = (iter < MAX_ITER) && (timestamp() < MAX_TIMESTAMP);
                ++iter;
                return running == false;
            }
        );

        // A run that used its whole iteration budget leaves iter at
        // MAX_ITER + 1, so anything smaller means the deadline stopped it.
        if (iter <= MAX_ITER)
        {
            // time exceeded
            std::cerr << "Exceeded allocated time limit after iteration " << iter << " of " << MAX_ITER << " for estimator [" << y_hat_proba_set.size() + 1 << "]" << std::endl;

            // but we'll make the prediction anyway if it's our first estimator :)
            if (y_hat_proba_set.empty())
            {
                y_hat_proba_set.push_back(XGB::predict(booster.get(), test_data));
            }
            break;
        }

        // push the temporary directly so the prediction vector is moved, not copied
        y_hat_proba_set.push_back(XGB::predict(booster.get(), test_data));

        std::cerr << "Elapsed time: " << timestamp() - time0 << std::endl;
    }

    // quantized prediction, value-initialised to 0 for every test row
    std::vector<std::size_t> y_hat(test_data.shape().first);

    if (y_hat_proba_set.empty())
    {
        // no parameter sets were supplied: nothing to accumulate, and
        // front() below would be undefined on an empty collection
        return y_hat;
    }

    // array of probabilities accumulated from completed estimators
    std::vector<float> y_hat_proba_cumm(y_hat_proba_set.front().size(), 0.);

    for (const auto & proba : y_hat_proba_set)
    {
        std::transform(proba.cbegin(), proba.cend(), y_hat_proba_cumm.begin(),
            y_hat_proba_cumm.begin(),
            [](const float x, const float a)
            {
                return a + x;
            });
    }

    // Compare the sum against 0.5 per contributing estimator, i.e. threshold
    // the mean probability; a fixed 0.5 on the raw sum would skew towards 1
    // as soon as more than one estimator contributes.
    const double threshold = 0.5 * y_hat_proba_set.size();

    for (std::size_t ix{0}; ix < y_hat.size(); ++ix)
    {
        y_hat[ix] = y_hat_proba_cumm[ix] > threshold;
    }

    return y_hat;
}