Пример #1
0
// Kernel operating on the local matrix A; only "Stage 0" (Dirichlet row detection)
// is visible in this excerpt.
// NOTE(review): the function body is not closed within this excerpt — any later
// stages and the closing brace are missing here.
void kernel_coalesce_drop(local_matrix_type A) {
  auto numRows = A.numRows();

  // NOTE(review): eps is not read anywhere in the visible part; presumably a
  // threshold for a later stage — confirm against the full source.
  scalar_type eps = 0.05;

  // Stage 0: detect Dirichlet rows
  // One flag per row; a row stays flagged unless it has an off-diagonal entry
  // whose magnitude exceeds 1e-13.
  boundary_nodes_type boundaryNodes("boundaryNodes", numRows);
  Kokkos::parallel_for("MueLu:Utils::DetectDirichletRows", numRows, KOKKOS_LAMBDA(const local_ordinal_type row) {
    auto rowView = A.row (row);
    auto length  = rowView.length;

    // Start by flagging the row; clear the flag on the first significant
    // off-diagonal entry and stop scanning.
    boundaryNodes(row) = true;
    for (decltype(length) colID = 0; colID < length; colID++)
      if ((rowView.colidx(colID) != row) && (ATS::magnitude(rowView.value(colID)) > 1e-13)) {
        boundaryNodes(row) = false;
        break;
      }
  });
/**
 * Predict the SPECIAL_PART answer ("No"/"Maybe"/"Yes") for every product in the test set.
 *
 * Pipeline, as implemented below:
 *   1. parse both comma-delimited inputs with num::loadtxt, mapping date and
 *      categorical columns to numbers through per-column converters
 *      (the test converters omit SPECIAL_PART);
 *   2. build per-group feature matrices with gen_features();
 *   3. split the SPECIAL_PART column off the training matrix and quantize it to {0,1};
 *   4. run run_binary_estimators() with the params::no::prov47 parameter set;
 *   5. walk the test groups in GroupBy order, consuming one prediction per group,
 *      and collect the answers per product number.
 *
 * @param i_training  raw training lines; passed to num::loadtxt via std::move, so the
 *                    caller's vector is left in a moved-from state
 * @param i_testing   raw test lines; moved from in the same way
 * @return one "<PRODUCT_NUMBER>,<answer for CUSTOMER_SEGMENT1 == 0>,<answer otherwise>"
 *         string per product, in ascending product-number order; a segment for which
 *         no test group was seen is reported as "NA"
 */
std::vector<std::string>
ElectronicPartsClassification::classifyParts(
    std::vector<std::string> & i_training,
    std::vector<std::string> & i_testing) const
{

    const std::vector<std::string> raw_colnames{"PRODUCT_NUMBER", "CUSTOMER_NUMBER", "TRANSACTION_DATE",
        "PRODUCT_PRICE", "GROSS_SALES", "REGION", "WAREHOUSE", "CUSTOMER_ZIP", "CUSTOMER_SEGMENT1",
        "CUSTOMER_SEGMENT2", "CUSTOMER_TYPE1", "CUSTOMER_TYPE2", "CUSTOMER_MANAGED_LEVEL",
        "CUSTOMER_ACCOUNT_TYPE", "CUSTOMER_FIRST_ORDER_DATE", "PRODUCT_CLASS_ID1",
        "PRODUCT_CLASS_ID2", "PRODUCT_CLASS_ID3", "PRODUCT_CLASS_ID4","BRAND",
        "PRODUCT_ATTRIBUTE_X", "PRODUCT_SALES_UNIT", "SHIPPING_WEIGHT", "TOTAL_BOXES_SOLD",
        "PRODUCT_COST1", "PRODUCT_UNIT_OF_MEASURE", "ORDER_SOURCE", "PRICE_METHOD", "SPECIAL_PART"};


    // Start of processing; handed to run_binary_estimators() below.
    const auto time0 = timestamp();

    // Column index -> text-to-number converter for the training file.
    const num::loadtxtCfg<real_type>::converters_type converters_train =
        {
            {colidx(raw_colnames, "TRANSACTION_DATE"), date_xlt},
            {colidx(raw_colnames, "CUSTOMER_SEGMENT1"), [](const char * str){return from_list_xlt({"A", "B"}, str);}},
            {colidx(raw_colnames, "CUSTOMER_TYPE2"), [](const char * str){return from_list_xlt({"A", "B", "C"}, str);}},
            {colidx(raw_colnames, "CUSTOMER_MANAGED_LEVEL"), [](const char * str){return from_list_xlt({"N", "L"}, str);}},
            {colidx(raw_colnames, "CUSTOMER_ACCOUNT_TYPE"), [](const char * str){return from_list_xlt({"ST", "DM"}, str);}},
            {colidx(raw_colnames, "CUSTOMER_FIRST_ORDER_DATE"), date_xlt},
            {colidx(raw_colnames, "BRAND"), [](const char * str){return from_list_xlt({"IN_HOUSE", "NOT_IN_HOUSE"}, str);}},
            {colidx(raw_colnames, "PRODUCT_SALES_UNIT"), [](const char * str){return from_list_xlt({"Y", "N"}, str);}},
            {colidx(raw_colnames, "PRODUCT_UNIT_OF_MEASURE"), [](const char * str){return from_list_xlt({"B", "LB", "EA"}, str);}},
            {colidx(raw_colnames, "ORDER_SOURCE"), [](const char * str){return from_list_xlt({"A", "B"}, str);}},
            {colidx(raw_colnames, "SPECIAL_PART"), [](const char * str){return from_list_xlt({"No", "Maybe", "Yes"}, str);}},
        };
    // Same converters for the test file, minus the SPECIAL_PART target column.
    const num::loadtxtCfg<real_type>::converters_type converters_test =
        {
            {colidx(raw_colnames, "TRANSACTION_DATE"), date_xlt},
            {colidx(raw_colnames, "CUSTOMER_SEGMENT1"), [](const char * str){return from_list_xlt({"A", "B"}, str);}},
            {colidx(raw_colnames, "CUSTOMER_TYPE2"), [](const char * str){return from_list_xlt({"A", "B", "C"}, str);}},
            {colidx(raw_colnames, "CUSTOMER_MANAGED_LEVEL"), [](const char * str){return from_list_xlt({"N", "L"}, str);}},
            {colidx(raw_colnames, "CUSTOMER_ACCOUNT_TYPE"), [](const char * str){return from_list_xlt({"ST", "DM"}, str);}},
            {colidx(raw_colnames, "CUSTOMER_FIRST_ORDER_DATE"), date_xlt},
            {colidx(raw_colnames, "BRAND"), [](const char * str){return from_list_xlt({"IN_HOUSE", "NOT_IN_HOUSE"}, str);}},
            {colidx(raw_colnames, "PRODUCT_SALES_UNIT"), [](const char * str){return from_list_xlt({"Y", "N"}, str);}},
            {colidx(raw_colnames, "PRODUCT_UNIT_OF_MEASURE"), [](const char * str){return from_list_xlt({"B", "LB", "EA"}, str);}},
            {colidx(raw_colnames, "ORDER_SOURCE"), [](const char * str){return from_list_xlt({"A", "B"}, str);}},
        };

    ////////////////////////////////////////////////////////////////////////////

    const array_type i_train_data =
        num::loadtxt(
            std::move(i_training),
            std::move(
                num::loadtxtCfg<real_type>()
                .delimiter(',')
                .converters(num::loadtxtCfg<real_type>::converters_type{converters_train})
            )
        );

    const array_type i_test_data =
        num::loadtxt(
            std::move(i_testing),
            std::move(
                num::loadtxtCfg<real_type>()
                .delimiter(',')
                .converters(num::loadtxtCfg<real_type>::converters_type{converters_test})
            )
        );

    std::vector<std::string> colnames;
    array_type train_data;
    array_type test_data;

    std::tie(colnames, train_data, test_data) = gen_features(raw_colnames, i_train_data, i_test_data);

//    std::cerr << train_data.shape() << test_data.shape() << std::endl;
//    std::copy(colnames.cbegin(), colnames.cend(), std::ostream_iterator<std::string>(std::cerr, "\n"));

    // The training matrix carries exactly one extra column: the SPECIAL_PART target.
    assert(train_data.shape().second == test_data.shape().second + 1);

    const array_type::varray_type train_y_valarr = train_data[train_data.column(colidx(colnames, "SPECIAL_PART"))];
    const std::vector<float> train_y(std::begin(train_y_valarr), std::end(train_y_valarr));

    std::cerr << "train_y size: " << train_y.size() << std::endl;

    // Strip the target from the feature matrix and from the column names.
    train_data = num::del_column(train_data, colidx(colnames, "SPECIAL_PART"));
    colnames.erase(std::find(colnames.begin(), colnames.end(), "SPECIAL_PART"));
    assert(colnames.size() == train_data.shape().second);

    std::cerr << "train_data shape: " << train_data.shape() << std::endl;
    std::cerr << "test_data shape: " << test_data.shape() << std::endl;


    const std::map<const std::string, const std::string> * PARAMS_SET__no[] = {&params::no::prov47};
    std::vector<float> train_y__no;
    train_y__no.reserve(train_y.size());
    std::transform(train_y.cbegin(), train_y.cend(), std::back_inserter(train_y__no),
        [](const float what)
        {
            // quantize train y vector into {0,1}
            return what >= 0.5 ? 1. : 0.;
        }
    );

    const auto y_hat_no = run_binary_estimators(
        std::begin(PARAMS_SET__no), std::end(PARAMS_SET__no),
        time0, train_data, train_y__no, test_data);


    ////////////////////////////////////////////////////////////////////////////

    std::vector<std::size_t> y_hat(y_hat_no);

    // product number -> (answer for segment 0, answer for the other segment)
    typedef std::map<int, std::pair<std::string, std::string>> responses_type;

    const std::string yes_no_maybe[] = {"No", "Maybe", "Yes"};
    const std::string not_available{"NA"};

    // Column positions do not change between groups; look them up once.
    const auto product_cix = colidx(raw_colnames, "PRODUCT_NUMBER");
    const auto segment_cix = colidx(raw_colnames, "CUSTOMER_SEGMENT1");

    responses_type responses;
    GroupBy gb_test(i_test_data);
    std::size_t ix{0};
    for (auto group = gb_test.yield(); group.size() != 0; group = gb_test.yield())
    {
        // The first row of the group supplies the product number and segment.
        const std::valarray<real_type> row = i_test_data[i_test_data.row(group.front())];
        const int prod_id = row[product_cix];
        const char segment = row[segment_cix];

        // Predictions are consumed in group order, one per group.
        assert(ix < y_hat.size());
        const auto response = y_hat[ix++];
        assert(response < sizeof(yes_no_maybe) / sizeof(yes_no_maybe[0]));

        // Single lookup: a new product starts as ("NA", "NA"); an existing entry is
        // returned untouched. Then only the slot for this segment is overwritten.
        auto & answers =
            responses.emplace(prod_id, std::make_pair(not_available, not_available)).first->second;
        (segment == 0 ? answers.first : answers.second) = yes_no_maybe[response];
    }

    std::vector<std::string> str_y_hat;
    str_y_hat.reserve(responses.size());
    std::transform(responses.cbegin(), responses.cend(), std::back_inserter(str_y_hat),
        // Bind to the map's real value_type (pair<const int, ...>); spelling the key
        // as plain int would force a converting copy of every element.
        [](const responses_type::value_type & kv)
        {
            return std::to_string(kv.first) + ',' + kv.second.first + ',' + kv.second.second;
        }
    );

    return str_y_hat;
}
/**
 * Build per-group feature matrices for the training and the test data.
 *
 * Rows of each input are grouped with GroupBy; every group becomes one output row.
 * Output columns, in order:
 *   - 4 attributes taken from the first row of the group
 *     (PRODUCT_CLASS_ID1, BRAND, PRODUCT_SALES_UNIT, PRODUCT_UNIT_OF_MEASURE);
 *   - for each entry of dist_db, the normalized histogram of that column's values
 *     within the group (columns named "<NAME>_1" .. "<NAME>_<dict_sz>");
 *   - 10 cost/price/commission statistics (see stat_colnames);
 *   - training matrix only: SPECIAL_PART, taken from the first row of the group.
 *
 * @param i_colnames    names of the columns of the raw input matrices
 * @param i_train_data  raw training matrix (must contain SPECIAL_PART)
 * @param i_test_data   raw test matrix
 * @return (feature column names incl. trailing "SPECIAL_PART",
 *          training feature matrix, test feature matrix);
 *          the test matrix has one column fewer than the names/training matrix
 */
std::tuple<std::vector<std::string>, array_type, array_type>
gen_features(
    const std::vector<std::string> & i_colnames,
    const array_type & i_train_data,
    const array_type & i_test_data)
{
    typedef std::valarray<real_type> varray_type;

    /*
     * generate feature vector across all groups for a single attribute column;
     * the value is read from the first row of each group
     */
    auto gen_attribute = [&i_colnames](GroupBy & gb, const array_type & arr, const std::string & colname) -> varray_type
    {
        const std::size_t cix = colidx(i_colnames, colname);
        varray_type vec(NAN, gb.size());

        std::size_t vix{0};
        for (auto group = gb.yield(); group.size() != 0; group = gb.yield())
        {
            const varray_type row = arr[arr.row(group.front())];
            vec[vix++] = row[cix];
        }

        // presumably resets gb so that the next pass can iterate the groups again
        gb.rewind();

        return vec;
    };

    /*
     * stack the per-group vectors of several attribute columns side by side;
     * att_names must not be empty
     */
    auto gen_attributes = [&gen_attribute](GroupBy & gb, const array_type & arr, const std::vector<std::string> & att_names) -> array_type
    {
        assert(!att_names.empty());

        const varray_type att1 = gen_attribute(gb, arr, att_names.front());
        array_type result({att1.size(), 1}, att1);

        for (auto it = std::next(att_names.cbegin()); it != att_names.cend(); ++it)
        {
            const varray_type att = gen_attribute(gb, arr, *it);
            result = num::add_column(result, att);
        }
        return result;
    };


    const std::vector<std::string> att_names{"PRODUCT_CLASS_ID1", "BRAND", "PRODUCT_SALES_UNIT", "PRODUCT_UNIT_OF_MEASURE"};

    ////////////////////////////////////////////////////////////////////////////

    GroupBy gb_train(i_train_data);

    array_type train_data = gen_attributes(gb_train, i_train_data, att_names);

    std::vector<std::string> colnames(att_names.cbegin(), att_names.cend());

    ////////////////////////////////////////////////////////////////////////////

    GroupBy gb_test(i_test_data);

    array_type test_data = gen_attributes(gb_test, i_test_data, att_names);

    ////////////////////////////////////////////////////////////////////////////

    /*
     * per group: relative frequency of each value of a categorical column;
     * values are expected in [offset, offset + dict_sz)
     */
    auto gen_distribution = [&i_colnames](GroupBy & gb, const array_type & arr, const std::string & colname, const int dict_sz, const int offset) -> array_type
    {
        const std::size_t cix = colidx(i_colnames, colname);

        array_type result({gb.size(), dict_sz}, 0.);

        std::size_t gbix{0};
        for (auto group = gb.yield(); group.size() != 0; group = gb.yield())
        {
            varray_type dict(0., dict_sz);

            for (const auto rix : group)
            {
                const int item = arr[arr.row(rix)][cix] - offset;
                assert(0 <= item && item < dict_sz);

                ++dict[item];
            }

            // counts -> frequencies; the group is non-empty, so the sum is positive
            dict /= dict.sum();
            result[result.row(gbix++)] = dict;
        }

        gb.rewind();

        return result;
    };

    // (column name, number of distinct values, smallest value)
    const std::tuple<std::string, int, int> dist_db[] =
        {
            std::make_tuple("PRICE_METHOD", 5, 1),
            std::make_tuple("ORDER_SOURCE", 2, 0),
            std::make_tuple("CUSTOMER_ACCOUNT_TYPE", 2, 0),
            std::make_tuple("CUSTOMER_MANAGED_LEVEL", 2, 0),
            std::make_tuple("CUSTOMER_TYPE2", 3, 0),
            std::make_tuple("CUSTOMER_TYPE1", 3, 1),
        };

    for (const auto & descriptor : dist_db)
    {
        const auto dist = gen_distribution(gb_train, i_train_data,
            std::get<0>(descriptor),
            std::get<1>(descriptor),
            std::get<2>(descriptor));
        train_data = num::add_columns(train_data, dist);
    }

    for (const auto & descriptor : dist_db)
    {
        const auto dist = gen_distribution(gb_test, i_test_data,
            std::get<0>(descriptor),
            std::get<1>(descriptor),
            std::get<2>(descriptor));
        test_data = num::add_columns(test_data, dist);
    }

    for (const auto & descriptor : dist_db)
    {
        for (int ix{0}; ix < std::get<1>(descriptor); ++ix)
        {
            colnames.push_back(std::get<0>(descriptor) + '_' + std::to_string(ix + 1));
        }
    }

    ////////////////////////////////////////////////////////////////////////////

    /*
     * per group: relative min / max / std of per-item cost and of price, plus
     * mean and relative min / max / std of a "commission" ratio
     */
    auto min_max_std = [&i_colnames](GroupBy & gb, const array_type & arr) -> array_type
    {
        const std::size_t pcost1_cix = colidx(i_colnames, "PRODUCT_COST1");
        const std::size_t boxes_cix = colidx(i_colnames, "TOTAL_BOXES_SOLD");
        const std::size_t price_cix = colidx(i_colnames, "PRODUCT_PRICE");
        const std::size_t gross_cix = colidx(i_colnames, "GROSS_SALES");
        const std::size_t unit_cix = colidx(i_colnames, "PRODUCT_UNIT_OF_MEASURE");

        constexpr std::size_t NFEAT = 3 + 3 + 4;

        array_type result({gb.size(), NFEAT}, 0.);

        std::size_t gbix{0};
        for (auto group = gb.yield(); group.size() != 0; group = gb.yield())
        {
            varray_type features(0., NFEAT);
            std::size_t fix{0};

            // row indices of this group, used to gather the group's values from a column
            const std::valarray<std::size_t> indirect(group.data(), group.size());

            const varray_type boxes_sold = arr[arr.column(boxes_cix)][indirect];
            const varray_type signed_pcost = arr[arr.column(pcost1_cix)][indirect];
            const varray_type pcost1 = std::abs(signed_pcost);
            const varray_type pcost1_per_item = pcost1 / boxes_sold;
            const real_type pcost1_mean = num::mean(pcost1_per_item);
            const real_type pcost1_std = num::std(pcost1_per_item);

            features[fix++] = pcost1_per_item.min() / pcost1_mean;
            features[fix++] = pcost1_per_item.max() / pcost1_mean;
            features[fix++] = pcost1_std / pcost1_mean;

            const varray_type signed_price = arr[arr.column(price_cix)][indirect];
            const varray_type price = std::abs(signed_price);
            const real_type price_mean = num::mean(price);
            const real_type price_std = num::std(price);

            features[fix++] = price.min() / price_mean;
            features[fix++] = price.max() / price_mean;
            features[fix++] = price_std / price_mean;

            // The ratio used depends on the unit of measure of the group's first row.
            // The casts materialize the valarray expressions so that both branches of
            // the conditional have the same type.
            const varray_type signed_gross_sales = arr[arr.column(gross_cix)][indirect];
            const varray_type commision = arr[arr.row(group.front())][unit_cix] < 2 ?
                static_cast<varray_type>(price / pcost1_per_item) :
                static_cast<varray_type>(std::abs(signed_gross_sales) / pcost1);
            const real_type commision_mean = num::mean(commision);
            const real_type commision_std = num::std(commision);

            features[fix++] = commision_mean;
            features[fix++] = commision.min() / commision_mean;
            features[fix++] = commision.max() / commision_mean;
            features[fix++] = commision_std / commision_mean;

            assert(fix == NFEAT);

            // Advance to the next output row for the next group; without the increment
            // every group would overwrite row 0 and the other rows would stay zero.
            result[result.row(gbix++)] = features;
        }

        gb.rewind();

        return result;
    };

    {
        const auto stat = min_max_std(gb_train, i_train_data);
        train_data = num::add_columns(train_data, stat);
    }
    {
        const auto stat = min_max_std(gb_test, i_test_data);
        test_data = num::add_columns(test_data, stat);
    }

    const std::string stat_colnames[] = {
        "PCOST1_REL_MIN", "PCOST1_REL_MAX", "PCOST1_REL_STD",
        "PRICE_REL_MIN", "PRICE_REL_MAX", "PRICE_REL_STD",
        "COMMN_MEAN", "COMMN_REL_MIN", "COMMN_REL_MAX", "COMMN_REL_STD"};
    colnames.insert(colnames.end(), std::begin(stat_colnames), std::end(stat_colnames));

    ////////////////////////////////////////////////////////////////////////////

    // The target goes last and only into the training matrix.
    const varray_type special_part = gen_attribute(gb_train, i_train_data, "SPECIAL_PART");
    colnames.push_back("SPECIAL_PART");
    train_data = num::add_column(train_data, special_part);

    // The locals are not used afterwards: move them into the tuple instead of copying.
    return std::make_tuple(std::move(colnames), std::move(train_data), std::move(test_data));
}