/**
 * Builds the grouping key for the row at position `index` of `df`.
 *
 * @param df     frame to read from
 * @param index  position of the row whose key is wanted
 * @return {0, 0} when `df` reports zero columns; otherwise the values found in
 *         column 0 and column 8 of the row, converted to int and char.
 *
 * NOTE(review): columns 0 and 8 presumably hold the product number and the
 * customer segment - confirm against the layout of the frames passed in.
 */
key_type key(const array_type & df, const std::size_t index) const
{
    // A frame without columns has nothing to build a key from.
    if (df.shape().second == 0)
    {
        return {0, 0};
    }

    // TODO: both column positions are hardcoded.
    const int product = df[df.row(index)][0];
    const char customer_segment = df[df.row(index)][8];

    return {product, customer_segment};
}
std::vector<std::string> ElectronicPartsClassification::classifyParts( std::vector<std::string> & i_training, std::vector<std::string> & i_testing) const { const std::vector<std::string> raw_colnames{"PRODUCT_NUMBER", "CUSTOMER_NUMBER", "TRANSACTION_DATE", "PRODUCT_PRICE", "GROSS_SALES", "REGION", "WAREHOUSE", "CUSTOMER_ZIP", "CUSTOMER_SEGMENT1", "CUSTOMER_SEGMENT2", "CUSTOMER_TYPE1", "CUSTOMER_TYPE2", "CUSTOMER_MANAGED_LEVEL", "CUSTOMER_ACCOUNT_TYPE", "CUSTOMER_FIRST_ORDER_DATE", "PRODUCT_CLASS_ID1", "PRODUCT_CLASS_ID2", "PRODUCT_CLASS_ID3", "PRODUCT_CLASS_ID4","BRAND", "PRODUCT_ATTRIBUTE_X", "PRODUCT_SALES_UNIT", "SHIPPING_WEIGHT", "TOTAL_BOXES_SOLD", "PRODUCT_COST1", "PRODUCT_UNIT_OF_MEASURE", "ORDER_SOURCE", "PRICE_METHOD", "SPECIAL_PART"}; const auto time0 = timestamp(); const num::loadtxtCfg<real_type>::converters_type converters_train = { {colidx(raw_colnames, "TRANSACTION_DATE"), date_xlt}, {colidx(raw_colnames, "CUSTOMER_SEGMENT1"), [](const char * str){return from_list_xlt({"A", "B"}, str);}}, {colidx(raw_colnames, "CUSTOMER_TYPE2"), [](const char * str){return from_list_xlt({"A", "B", "C"}, str);}}, {colidx(raw_colnames, "CUSTOMER_MANAGED_LEVEL"), [](const char * str){return from_list_xlt({"N", "L"}, str);}}, {colidx(raw_colnames, "CUSTOMER_ACCOUNT_TYPE"), [](const char * str){return from_list_xlt({"ST", "DM"}, str);}}, {colidx(raw_colnames, "CUSTOMER_FIRST_ORDER_DATE"), date_xlt}, {colidx(raw_colnames, "BRAND"), [](const char * str){return from_list_xlt({"IN_HOUSE", "NOT_IN_HOUSE"}, str);}}, {colidx(raw_colnames, "PRODUCT_SALES_UNIT"), [](const char * str){return from_list_xlt({"Y", "N"}, str);}}, {colidx(raw_colnames, "PRODUCT_UNIT_OF_MEASURE"), [](const char * str){return from_list_xlt({"B", "LB", "EA"}, str);}}, {colidx(raw_colnames, "ORDER_SOURCE"), [](const char * str){return from_list_xlt({"A", "B"}, str);}}, {colidx(raw_colnames, "SPECIAL_PART"), [](const char * str){return from_list_xlt({"No", "Maybe", "Yes"}, str);}}, }; const 
num::loadtxtCfg<real_type>::converters_type converters_test = { {colidx(raw_colnames, "TRANSACTION_DATE"), date_xlt}, {colidx(raw_colnames, "CUSTOMER_SEGMENT1"), [](const char * str){return from_list_xlt({"A", "B"}, str);}}, {colidx(raw_colnames, "CUSTOMER_TYPE2"), [](const char * str){return from_list_xlt({"A", "B", "C"}, str);}}, {colidx(raw_colnames, "CUSTOMER_MANAGED_LEVEL"), [](const char * str){return from_list_xlt({"N", "L"}, str);}}, {colidx(raw_colnames, "CUSTOMER_ACCOUNT_TYPE"), [](const char * str){return from_list_xlt({"ST", "DM"}, str);}}, {colidx(raw_colnames, "CUSTOMER_FIRST_ORDER_DATE"), date_xlt}, {colidx(raw_colnames, "BRAND"), [](const char * str){return from_list_xlt({"IN_HOUSE", "NOT_IN_HOUSE"}, str);}}, {colidx(raw_colnames, "PRODUCT_SALES_UNIT"), [](const char * str){return from_list_xlt({"Y", "N"}, str);}}, {colidx(raw_colnames, "PRODUCT_UNIT_OF_MEASURE"), [](const char * str){return from_list_xlt({"B", "LB", "EA"}, str);}}, {colidx(raw_colnames, "ORDER_SOURCE"), [](const char * str){return from_list_xlt({"A", "B"}, str);}}, }; //////////////////////////////////////////////////////////////////////////// const array_type i_train_data = num::loadtxt( std::move(i_training), std::move( num::loadtxtCfg<real_type>() .delimiter(',') .converters(num::loadtxtCfg<real_type>::converters_type{converters_train}) ) ); const array_type i_test_data = num::loadtxt( std::move(i_testing), std::move( num::loadtxtCfg<real_type>() .delimiter(',') .converters(num::loadtxtCfg<real_type>::converters_type{converters_test}) ) ); std::vector<std::string> colnames; array_type train_data; array_type test_data; std::tie(colnames, train_data, test_data) = gen_features(raw_colnames, i_train_data, i_test_data); // std::cerr << train_data.shape() << test_data.shape() << std::endl; // std::copy(colnames.cbegin(), colnames.cend(), std::ostream_iterator<std::string>(std::cerr, "\n")); assert(train_data.shape().second == test_data.shape().second + 1); const 
array_type::varray_type train_y_valarr = train_data[train_data.column(colidx(colnames, "SPECIAL_PART"))]; const std::vector<float> train_y(std::begin(train_y_valarr), std::end(train_y_valarr)); std::cerr << "train_y size: " << train_y.size() << std::endl; train_data = num::del_column(train_data, colidx(colnames, "SPECIAL_PART")); colnames.erase(std::find(colnames.begin(), colnames.end(), "SPECIAL_PART")); assert(colnames.size() == train_data.shape().second); std::cerr << "train_data shape: " << train_data.shape() << std::endl; std::cerr << "test_data shape: " << test_data.shape() << std::endl; const std::map<const std::string, const std::string> * PARAMS_SET__no[] = {¶ms::no::prov47}; std::vector<float> train_y__no; std::transform(train_y.cbegin(), train_y.cend(), std::back_inserter(train_y__no), [](const float what) { // quantize train y vector into {0,1} return what >= 0.5 ? 1. : 0.; } ); const auto y_hat_no = run_binary_estimators( std::begin(PARAMS_SET__no), std::end(PARAMS_SET__no), time0, train_data, train_y__no, test_data); //////////////////////////////////////////////////////////////////////////// std::vector<std::size_t> y_hat(y_hat_no); const std::string yes_no_maybe[] = {"No", "Maybe", "Yes"}; std::map<int, std::pair<std::string, std::string>> responses; GroupBy gb_test(i_test_data); int ix{0}; for (auto group = gb_test.yield(); group.size() != 0; group = gb_test.yield()) { const std::valarray<real_type> row = i_test_data[i_test_data.row(group.front())]; const int prod_id = row[colidx(raw_colnames, "PRODUCT_NUMBER")]; const char segment = row[colidx(raw_colnames, "CUSTOMER_SEGMENT1")]; const auto response = y_hat[ix++]; if (responses.count(prod_id)) { if (segment == 0) { responses[prod_id].first = yes_no_maybe[response]; } else { responses[prod_id].second = yes_no_maybe[response]; } } else { if (segment == 0) { responses[prod_id] = {yes_no_maybe[response], "NA"}; } else { responses[prod_id] = {"NA", yes_no_maybe[response]}; } } } 
std::vector<std::string> str_y_hat; std::transform(responses.cbegin(), responses.cend(), std::back_inserter(str_y_hat), [](const std::pair<int, std::pair<std::string, std::string>> & kv) { return std::to_string(kv.first) + ',' + kv.second.first + ',' + kv.second.second; } ); return str_y_hat; }