Example #1
0
File: gmm.c  Project: Erotemic/yael
/* Compute, for n d-dimensional vectors v (row-major, n x d), the posterior
 * probabilities p(j | x_i) under the k-component diagonal-covariance GMM g.
 * Results are written to p (row-major, n x k: row i holds the k posteriors
 * of vector i).  If GMM_FLAGS_W is set in flags, the mixture weights g->w
 * enter the log-likelihood; otherwise components are weighted uniformly.
 * Equation numbers in comments refer to the paper this implements
 * (presumably Perronnin et al.'s Fisher-vector/GMM formulation — TODO confirm). */
void gmm_compute_p (int n, const float * v, 
                    const gmm_t * g, 
                    float * p,
                    int flags)
{
  if(n==0) return; /* sgemm doesn't like empty matrices */

  long i, j;
  long d=g->d, k=g->k;

  /* log of p_j(x|\lambda)'s denominator, eq (7): per-component Gaussian
   * normalization constant, -d/2*log(2*pi) - 1/2*sum_l log(sigma_jl) */
  float * logdetnr = fvec_new(k);

  for (j = 0 ; j < k ; j++) {
    logdetnr[j] = -d / 2.0 * log (2 * M_PI);
    for (i = 0 ; i < d ; i++)
      logdetnr[j] -= 0.5 * log (g->sigma[j * d + i]);
  }

  /* Squared Mahalanobis distances (result in p); this is the log of the
   * numerator of eq (7) up to the -1/2 factor applied below.
   * Reference (slow) formulation, computed by the optimized helper:
   *   p[i*k+j] = sum_l sqr(v[i*d+l] - mu[j*d+l]) / sigma[j*d+l]            */
  compute_mahalanobis_sqr(n,k,d,g->mu,g->sigma,v,p); 

  /* per-component log mixture weight; 0 when weights are ignored
   * (allocated with fvec_new for consistency with logdetnr above) */
  float *lg = fvec_new(k);

  if(flags & GMM_FLAGS_W) {
    for (j = 0 ; j < k ; j++) 
      lg[j] = log(g->w[j]);      
  } else
    memset(lg, 0, sizeof(float) * k);
  
  for (i = 0 ; i < n ; i++) {      
    /* p now contains log(w_j * p_j(x|\lambda)), eq (7) */
    for (j = 0 ; j < k ; j++) {
      p[i * k + j] = logdetnr[j] - 0.5 * p[i * k + j] + lg[j];
    }
  }

  free(lg);
  /* normalize each row of p from log domain to posteriors, eq (7)/(8) */
  softmax_ref(k, n, p, p, NULL);

  free(logdetnr);
}
bool run_softmax_test( const nn_device_interface_0_t &di,
                       uint_least32_t                num_samples,
                       uint_least32_t                num_batches) // length of input to be  processed (softmax normalize)
{
    // Input generation (input feature maps to have pooling run on it)
    float *input = nullptr;
    generate_input_data( input, num_samples, 1, 1, num_batches );

    // length of output is the same as input

    float *cpu_outputs;
    init_data( cpu_outputs, num_samples * num_batches, 0.0f );

    float *gpu_outputs;
    init_data( gpu_outputs, num_samples * num_batches, 0.0f );

    softmax_ref( cpu_outputs, input, num_samples, num_batches );

    // First workload item is input one (entity producing input data)
    nn_gpu_workload_item *input_workload_item = nullptr;
    initialize_input_workload_item( input_workload_item);

    // Specify layout of softmax workload
    nn_workload_data_layout_t workload_layout = {
        { 0, 0, 0, 0, 0, 0 }, // tile in log2(size)
        { 0, 0, 0, 0, 0, 0 }, // alignment
        { NN_DATA_COORD_x, NN_DATA_COORD_y, NN_DATA_COORD_z, NN_DATA_COORD_p, NN_DATA_COORD_n, NN_DATA_COORD_q },
        NN_DATATYPE_FLOAT
    };

    // specify dimensions of input, output
    nn_workload_data_coords_t workload_coords =
    {
        num_batches,
        num_samples,
        1,
        1,
        1,
        1
    };

    size_t output_coords[2] = {num_samples, num_batches};

    // Now create softmax workload_item giving as input input_workload_item
    nn_gpu_workload_item *softmax_workload_item = nullptr;
    initialize_layer_workload_item( softmax_workload_item, input_workload_item, workload_layout, workload_coords );
    softmax_workload_item->type        = NN_WORK_ITEM_TYPE_SOFTMAX;

    // Now create output workload_item giving softmax workload item as precedessor
    nn_gpu_workload_item *output_workload_item = nullptr;
    initialize_output_workload_item( output_workload_item, softmax_workload_item );

    // Make a workload using two above created workload_items
    nn_gpu_workload *gpu_workload = nullptr;
    create_workload_using_workload_items( di, gpu_workload, num_batches, NN_WORKLOAD_DATA_TYPE_F32_1D_BATCH, NN_WORKLOAD_DATA_TYPE_F32_1D_BATCH, input_workload_item, softmax_workload_item, output_workload_item );

    using io_data = std::unique_ptr<nn::data<float, 0>>;
    io_data execute_inputs[1];
    io_data execute_outputs[1];

    execute_inputs[0]  = io_data(new nn::data<float, 0>(input, output_coords, 2));
    execute_outputs[0] = io_data(new nn::data<float, 0>(gpu_outputs, output_coords, 2));

    EXPECT_EQ( NN_API_STATUS_OK, di.workload_execute_function( ( nn_workload * )gpu_workload,
                                             ( void ** )execute_inputs,
                                             ( void ** )execute_outputs, nullptr ) );

    nn_workload_data_coords_t output_view_begin(0, 0, 0, 0, 0, 0);
    nn_workload_data_coords_t output_view_end(num_batches - 1, num_samples - 1, 0, 0, 0, 0);

    // Compare CPU(reference) output with the one returned by GPU
    EXPECT_EQ( true, verify_output( execute_outputs[0], cpu_outputs ) );

    EXPECT_EQ( NN_API_STATUS_OK, di.workload_delete_function(( nn_workload * )gpu_workload));

#ifdef __linux__
    free( cpu_outputs );
    cpu_outputs = nullptr;
    free( gpu_outputs );
    gpu_outputs = nullptr;
    free( input );
    input = nullptr;
#else
    _aligned_free( cpu_outputs );
    cpu_outputs = nullptr;
    _aligned_free( gpu_outputs );
    gpu_outputs = nullptr;
    _aligned_free( input );
    input = nullptr;
#endif //__linux__

    return true;

}