void gmm_compute_p (int n, const float * v, const gmm_t * g, float * p, int flags) { if(n==0) return; /* sgemm doesn't like empty matrices */ long i, j, l; double dtmp; long d=g->d, k=g->k; /* p_i(x|\lambda)'s denominator, eq (7) */ float * logdetnr = fvec_new(k); for (j = 0 ; j < k ; j++) { logdetnr[j] = -d / 2.0 * log (2 * M_PI); for (i = 0 ; i < d ; i++) logdetnr[j] -= 0.5 * log (g->sigma[j * d + i]); } /* compute all probabilities in log domain */ /* compute squared Mahalanobis distances (result in p), log of numerator eq (7) */ if(0) { /* simple & slow */ for (i = 0 ; i < n ; i++) { for (j = 0 ; j < k ; j++) { dtmp = 0; for (l = 0 ; l < d ; l++) { dtmp += sqr (v[i * d + l] - g->mu[j * d + l]) / g->sigma[j * d + l]; } p[i * k + j] = dtmp; } } } else { /* complicated & fast */ compute_mahalanobis_sqr(n,k,d,g->mu,g->sigma,v,p); } float *lg = (float*)malloc(sizeof(float) * k); if(flags & GMM_FLAGS_W) { for (j = 0 ; j < k ; j++) lg[j] = log(g->w[j]); } else memset(lg, 0, sizeof(float) * k); for (i = 0 ; i < n ; i++) { /* p contains log(p_j(x|\lambda)) eq (7) */ for (j = 0 ; j < k ; j++) { p[i * k + j] = logdetnr[j] - 0.5 * p[i * k + j] + lg[j]; } } free(lg); softmax_ref(k, n, p, p, NULL); free(logdetnr); }
bool run_softmax_test( const nn_device_interface_0_t &di, uint_least32_t num_samples, uint_least32_t num_batches) // length of input to be processed (softmax normalize) { // Input generation (input feature maps to have pooling run on it) float *input = nullptr; generate_input_data( input, num_samples, 1, 1, num_batches ); // length of output is the same as input float *cpu_outputs; init_data( cpu_outputs, num_samples * num_batches, 0.0f ); float *gpu_outputs; init_data( gpu_outputs, num_samples * num_batches, 0.0f ); softmax_ref( cpu_outputs, input, num_samples, num_batches ); // First workload item is input one (entity producing input data) nn_gpu_workload_item *input_workload_item = nullptr; initialize_input_workload_item( input_workload_item); // Specify layout of softmax workload nn_workload_data_layout_t workload_layout = { { 0, 0, 0, 0, 0, 0 }, // tile in log2(size) { 0, 0, 0, 0, 0, 0 }, // alignment { NN_DATA_COORD_x, NN_DATA_COORD_y, NN_DATA_COORD_z, NN_DATA_COORD_p, NN_DATA_COORD_n, NN_DATA_COORD_q }, NN_DATATYPE_FLOAT }; // specify dimensions of input, output nn_workload_data_coords_t workload_coords = { num_batches, num_samples, 1, 1, 1, 1 }; size_t output_coords[2] = {num_samples, num_batches}; // Now create softmax workload_item giving as input input_workload_item nn_gpu_workload_item *softmax_workload_item = nullptr; initialize_layer_workload_item( softmax_workload_item, input_workload_item, workload_layout, workload_coords ); softmax_workload_item->type = NN_WORK_ITEM_TYPE_SOFTMAX; // Now create output workload_item giving softmax workload item as precedessor nn_gpu_workload_item *output_workload_item = nullptr; initialize_output_workload_item( output_workload_item, softmax_workload_item ); // Make a workload using two above created workload_items nn_gpu_workload *gpu_workload = nullptr; create_workload_using_workload_items( di, gpu_workload, num_batches, NN_WORKLOAD_DATA_TYPE_F32_1D_BATCH, NN_WORKLOAD_DATA_TYPE_F32_1D_BATCH, input_workload_item, softmax_workload_item, output_workload_item ); using io_data = std::unique_ptr<nn::data<float, 0>>; io_data execute_inputs[1]; io_data execute_outputs[1]; execute_inputs[0] = io_data(new nn::data<float, 0>(input, output_coords, 2)); execute_outputs[0] = io_data(new nn::data<float, 0>(gpu_outputs, output_coords, 2)); EXPECT_EQ( NN_API_STATUS_OK, di.workload_execute_function( ( nn_workload * )gpu_workload, ( void ** )execute_inputs, ( void ** )execute_outputs, nullptr ) ); nn_workload_data_coords_t output_view_begin(0, 0, 0, 0, 0, 0); nn_workload_data_coords_t output_view_end(num_batches - 1, num_samples - 1, 0, 0, 0, 0); // Compare CPU(reference) output with the one returned by GPU EXPECT_EQ( true, verify_output( execute_outputs[0], cpu_outputs ) ); EXPECT_EQ( NN_API_STATUS_OK, di.workload_delete_function(( nn_workload * )gpu_workload)); #ifdef __linux__ free( cpu_outputs ); cpu_outputs = nullptr; free( gpu_outputs ); gpu_outputs = nullptr; free( input ); input = nullptr; #else _aligned_free( cpu_outputs ); cpu_outputs = nullptr; _aligned_free( gpu_outputs ); gpu_outputs = nullptr; _aligned_free( input ); input = nullptr; #endif //__linux__ return true; }