示例#1
0
// CPU forward pass: top = bottom + rand_vec_.
// When sigma_ is positive, rand_vec_ is refilled by caffe_rng_gaussian with
// arguments Dtype(0) and sigma_. Otherwise rand_vec_ is zeroed, except when
// the layer runs in place (bottom[0] == top[0]); in that case the buffer is
// left exactly as it is and is still added to the input below.
void NoiseLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom,
    const vector<Blob<Dtype>*>& top) {
  const Dtype* input = bottom[0]->cpu_data();
  Dtype* output = top[0]->mutable_cpu_data();
  Dtype* noise = rand_vec_.mutable_cpu_data();
  const int num_elements = bottom[0]->count();
  const bool runs_in_place = (bottom[0] == top[0]);

  if (sigma_ > 0) {
    // Fresh noise for this pass.
    caffe_rng_gaussian(num_elements, Dtype(0), sigma_, noise);
  } else if (!runs_in_place) {
    // No noise requested and separate output blob: add zeros.
    caffe_set(num_elements, Dtype(0), noise);
  }

  // The sum is always computed, whichever branch was taken above.
  caffe_add(num_elements, noise, input, output);
}
// CPU forward pass of the dropout layer.
// Outside the TRAIN phase the input is copied to the output unchanged.
// During training a mask is sampled into rand_vec_ according to drop_type_
// (Bernoulli, Gaussian clipped to [0, 1], or uniform). With drop_batch_ set,
// the whole input is scaled by scale_ * mask[0]; otherwise bottom[0] and
// rand_vec_ are forwarded through scale_layer_ and the result is scaled by
// scale_.
void DropoutLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom,
    const vector<Blob<Dtype>*>& top) {
  const Dtype* input = bottom[0]->cpu_data();
  Dtype* output = top[0]->mutable_cpu_data();
  Dtype* mask = rand_vec_->mutable_cpu_data();
  const int mask_size = rand_vec_->count();

  // Inference: plain pass-through.
  if (this->phase_ != TRAIN) {
    caffe_copy(bottom[0]->count(), input, output);
    return;
  }

  // Sample the mask; an unrecognised drop_type_ leaves the mask untouched.
  if (drop_type_ == DropoutParameter_DropType_BERNOULLI) {
    caffe_rng_bernoulli(mask_size, Dtype(1. - threshold_), mask);
  } else if (drop_type_ == DropoutParameter_DropType_GAUSSIAN) {
    caffe_rng_gaussian(mask_size, Dtype(mu_), Dtype(sigma_), mask);
    // Clip every sample into [0, 1].
    for (int idx = 0; idx < mask_size; ++idx) {
      if (mask[idx] > 1) {
        mask[idx] = 1;
      } else if (mask[idx] < 0) {
        mask[idx] = 0;
      }
    }
  } else if (drop_type_ == DropoutParameter_DropType_UNIFORM) {
    caffe_rng_uniform(mask_size, Dtype(a_), Dtype(b_), mask);
  }

  if (drop_batch_) {
    // A single mask value decides the scaling of the complete batch.
    const Dtype batch_factor = Dtype(scale_ * mask[0]);
    caffe_copy(top[0]->count(), input, output);
    caffe_scal(top[0]->count(), batch_factor, output);
    return;
  }

  // Per-element path: hand input and mask to the internal scale layer.
  vector<Blob<Dtype>*> scale_inputs;
  scale_inputs.push_back(bottom[0]);
  scale_inputs.push_back(rand_vec_);
  const vector<Blob<Dtype>*> scale_outputs(1, top[0]);
  scale_layer_->Forward(scale_inputs, scale_outputs);
  caffe_scal(top[0]->count(), scale_, output);
}
 // Writes sample_size_ values produced by caffe_rng_gaussian(mu, sigma) into
 // the buffer behind cpu_data, which is interpreted as an array of Dtype.
 void RngGaussianFill(const Dtype mu, const Dtype sigma, void* cpu_data) {
   caffe_rng_gaussian(sample_size_, mu, sigma,
                      static_cast<Dtype*>(cpu_data));
 }
示例#4
0
void Device::rng_gaussian_double(const uint_tp n, const double mu,
                                     const double sigma, vptr<double> r) {
  vector<double> random(n);  // NOLINT
  caffe_rng_gaussian(n, mu, sigma, &random[0]);
  this->memcpy(sizeof(double) * n, &random[0], vptr<void>(r));
}
示例#5
0
void Device::rng_gaussian_half(const uint_tp n, const half_fp mu,
                               const half_fp sigma, vptr<half_fp> r) {
  vector<half_fp> random(n);  // NOLINT
  caffe_rng_gaussian(n, mu, sigma, &random[0]);
  this->memcpy(sizeof(half_fp) * n, &random[0], vptr<void>(r));
}
示例#6
0
void Device::rng_gaussian_float(const uint_tp n, const float mu,
                                   const float sigma, vptr<float> r) {
  vector<float> random(n);  // NOLINT
  caffe_rng_gaussian(n, mu, sigma, &random[0]);
  this->memcpy(sizeof(float) * n, &random[0], vptr<void>(r));
}
示例#7
0
// Cross-checks the LibDNN BLAS gemv of the default device against caffe_gemv
// on 25 randomly sized problems. Matrix dimensions, transposition and the
// presence/values of alpha and beta are drawn at random for every case.
TYPED_TEST(LibDNNBlasTest, TestGemvComparativeCPUGPU) {
  Device *dc = Caffe::GetDefaultDevice();

  // Comparison tolerance, selected by element type.
  TypeParam eps = 0.0;
  if (std::is_same<TypeParam, half_fp>::value) {
    eps = EPS_HALF;
  }
  if (std::is_same<TypeParam, float>::value) {
    eps = EPS_FLOAT;
  }
  if (std::is_same<TypeParam, double>::value) {
    eps = EPS_DOUBLE;
  }

  std::random_device rdev;
  std::mt19937 rngen(rdev());

  std::uniform_int_distribution<int_tp> dimsRand(1, 256);
  std::uniform_int_distribution<int_tp> boolRand(0, 1);
  std::uniform_int_distribution<int_tp> factorRand(-25, 25);

  for (int_tp testIdx = 0; testIdx < 25; ++testIdx) {
    int_tp M = dimsRand(rngen);
    int_tp N = dimsRand(rngen);

    CBLAS_TRANSPOSE trans_A = boolRand(rngen) ? CblasTrans : CblasNoTrans;
    const bool transposed = (trans_A == CblasTrans);

    bool has_alpha = boolRand(rngen);
    TypeParam alpha_val = factorRand(rngen) / 100.0;
    bool has_beta = boolRand(rngen);
    TypeParam beta_val = factorRand(rngen) / 100.0;

    // Effective scalars: alpha defaults to 1 and beta to 0 when "absent".
    const TypeParam alpha = has_alpha ? alpha_val : (TypeParam)1.0;
    const TypeParam beta = has_beta ? beta_val : (TypeParam)0.0;

    // Vector lengths depend on whether A is used transposed.
    const int_tp x_len = transposed ? M : N;
    const int_tp y_len = transposed ? N : M;

    vector<int_tp> A_shape(4, 1);
    vector<int_tp> x_shape(4, 1);
    vector<int_tp> y_shape(4, 1);

    A_shape[2] = M;
    A_shape[3] = N;
    x_shape[3] = x_len;
    y_shape[3] = y_len;

    Blob<TypeParam> A(A_shape, dc);
    Blob<TypeParam> x(x_shape, dc);
    Blob<TypeParam> y_GPU(y_shape, dc);
    Blob<TypeParam> y_CPU(y_shape, dc);

    // Random inputs; both result vectors start from the same y values.
    caffe_rng_gaussian(M * N, (TypeParam)0.0, (TypeParam)0.25,
                       A.mutable_cpu_data());
    caffe_rng_gaussian(x_len, (TypeParam)0.0,
                       (TypeParam)0.25, x.mutable_cpu_data());
    caffe_rng_gaussian(y_len, (TypeParam)0.0,
                       (TypeParam)0.25, y_CPU.mutable_cpu_data());
    caffe_copy(y_len, y_CPU.cpu_data(), y_GPU.mutable_cpu_data());

    std::cout << "==== Test Case " << testIdx << " ====" << std::endl;
    std::cout << "M: " << M << " N: " << N << std::endl;
    std::cout << "alpha: " << alpha << " "
              << "beta: " << beta
              << std::endl;
    std::cout << "trans A: " << transposed << std::endl;

    // Implementation under test.
    dc->GetLibDNNBlas<TypeParam, TypeParam>()->gemv(
                trans_A,
                M, N,
                alpha,
                A.gpu_data(), x.gpu_data(),
                beta,
                y_GPU.mutable_gpu_data());

    // Reference implementation.
    caffe_gemv<TypeParam>(
                trans_A,
                M, N,
                alpha,
                A.cpu_data(), x.cpu_data(),
                beta,
                y_CPU.mutable_cpu_data());

    const TypeParam* expected = y_CPU.cpu_data();
    const TypeParam* actual = y_GPU.cpu_data();
    for (int_tp i = 0; i < y_len; ++i) {
      EXPECT_NEAR(expected[i], actual[i], eps);
      // One error is enough to abort. The condition is the complement of
      // EXPECT_NEAR's "<= eps", so a value exactly at the tolerance does not
      // silently end the comparison.
      if (fabs(expected[i] - actual[i]) > eps) {
        break;
      }
    }
  }
}
示例#8
0
// Compares a quantized axpby (y = alpha * x + beta * y) against the float
// reference caffe_axpby<float> on 25 random problems. Inputs and scalars are
// quantized with observing quantizers, the quantized result is converted back
// with yq.Backward_cpu and must match the reference within 5% of the largest
// observed |y| value. In CPU mode the test returns early because the
// quantized CPU axpby is not implemented yet.
TYPED_TEST(QuantBlasTest, TestAxpbyComparativeFloatQuant) {
  typedef typename TypeParam::Dtype Dtype;

  // Expect at most 5% error
  float percentile_eps = 0.05;

  std::random_device rdev;
  std::mt19937 rngen(rdev());

  // Need to test > 64 dimension
  std::uniform_int_distribution<int_tp> dimsRand(1, 256);
  std::uniform_int_distribution<int_tp> boolRand(0, 1);
  std::uniform_real_distribution<float> valRand(-2.0, 2.0);

  for (int_tp testIdx = 0; testIdx < 25; ++testIdx) {
    int_tp N = dimsRand(rngen);

    // At least one of alpha/beta is always present.
    bool has_alpha = boolRand(rngen);
    bool has_beta = has_alpha ? boolRand(rngen) : true;

    bool alpha_with_quant = boolRand(rngen) && has_alpha;
    bool beta_with_quant = boolRand(rngen) && has_beta;

    // Scalar value: 0 when absent, 1 when present but not quantized,
    // otherwise a random value from valRand.
    float alpha_val = 0.0;
    if (has_alpha) {
      alpha_val = alpha_with_quant ? valRand(rngen) : float(1.0);
    }

    float beta_val = 0.0;
    if (has_beta) {
      beta_val = beta_with_quant ? valRand(rngen) : float(1.0);
    }

    vector<int_tp> x_shape(1, N);
    vector<int_tp> y_shape(1, N);

    Blob<float> x(x_shape, Caffe::GetDefaultDevice());
    Blob<float> y(y_shape, Caffe::GetDefaultDevice());
    Blob<float> y_result(y_shape, Caffe::GetDefaultDevice());

    Blob<Dtype> x_quant(x_shape, Caffe::GetDefaultDevice());
    Blob<Dtype> y_quant(y_shape, Caffe::GetDefaultDevice());

    Blob<float> y_unquant(y_shape, Caffe::GetDefaultDevice());

    caffe_rng_gaussian(N, (float)0.0, (float)0.5, x.mutable_cpu_data());
    caffe_rng_gaussian(N, (float)0.0, (float)0.5, y.mutable_cpu_data());

    // The reference computation works on its own copy of y.
    caffe_copy(N, y.cpu_data(), y_result.mutable_cpu_data());

    QuantizerParameter qpm_x;
    QuantizerParameter qpm_y;
    QuantizerParameter qpm_alpha;
    QuantizerParameter qpm_beta;
    qpm_x.set_mode(CAFFE_QUANT_OBSERVE);
    qpm_y.set_mode(CAFFE_QUANT_OBSERVE);
    qpm_alpha.set_mode(CAFFE_QUANT_OBSERVE);
    qpm_beta.set_mode(CAFFE_QUANT_OBSERVE);

    Quantizer<float, Dtype> xq(qpm_x);
    Quantizer<float, Dtype> yq(qpm_y);
    Quantizer<float, Dtype> alphaq(qpm_alpha);
    Quantizer<float, Dtype> betaq(qpm_beta);

    // Float reference AXPBY
    caffe_axpby<float>(N, alpha_val, x.cpu_data(), beta_val,
                       y_result.mutable_cpu_data());

    // Observe all values that will be relevant for quantization; yq sees
    // both the initial y and the reference result.
    xq.ObserveIn_cpu(N, x.cpu_data());
    yq.ObserveIn_cpu(N, y.cpu_data());
    yq.ObserveIn_cpu(N, y_result.cpu_data());
    alphaq.ObserveIn_cpu(1, &alpha_val);
    betaq.ObserveIn_cpu(1, &beta_val);

    // Apply observed values to the quantizer
    xq.update();
    yq.update();
    alphaq.update();
    betaq.update();

    // Quantize x and y
    xq.Forward_cpu(N, x.cpu_data(), x_quant.mutable_cpu_data());
    yq.Forward_cpu(N, y.cpu_data(), y_quant.mutable_cpu_data());

    // Unquantized scalars are passed as plain 0/1 flags.
    Dtype alpha_val_quant = has_alpha;
    Dtype beta_val_quant = has_beta;

    // Quantize alpha
    if (alpha_with_quant) {
      alphaq.Forward_cpu(1, &alpha_val, &alpha_val_quant);
    }

    // Quantize beta
    if (beta_with_quant) {
      betaq.Forward_cpu(1, &beta_val, &beta_val_quant);
    }

    if (Caffe::mode() == Caffe::Brew::CPU) {
      // TODO: Not implemented yet
      return;
      /*caffe_axpby<Dtype>(N, alpha_val_quant, x_quant.cpu_data(),
                  beta_val_quant, y_quant.mutable_cpu_data(),
                  alpha_with_quant ? &(alphaq.out_quantizer_values()) : nullptr,
                  &(xq.out_quantizer_values()),
                  beta_with_quant ? &(betaq.out_quantizer_values()) : nullptr,
                  &(yq.out_quantizer_values()));*/
    } else {
      Caffe::GetDefaultDevice()->template axpby<Dtype>(N,
                  alpha_val_quant, x_quant.gpu_data(),
                  beta_val_quant, y_quant.mutable_gpu_data(),
                  alpha_with_quant ? &(alphaq.out_quantizer_values()) : nullptr,
                  &(xq.out_quantizer_values()),
                  beta_with_quant ? &(betaq.out_quantizer_values()) : nullptr,
                  &(yq.out_quantizer_values()));
    }

    // Convert the quantized result back to float for the comparison.
    yq.Backward_cpu(N, y_quant.cpu_data(), y_unquant.mutable_cpu_data());

    // Absolute tolerance: 5% of the largest magnitude yq observed.
    const QuantizerValues cqv = yq.in_quantizer_values();
    float eps = std::max(std::abs(cqv.get_max<float>()),
                         std::abs(cqv.get_min<float>())) * percentile_eps;

    const float* actual = y_unquant.cpu_data();
    const float* expected = y_result.cpu_data();
    for (int_tp i = 0; i < N; ++i) {
      EXPECT_NEAR(actual[i], expected[i], eps);
      // One error is enough to abort. The condition is the complement of
      // EXPECT_NEAR's "<= eps", so a value exactly at the tolerance does not
      // silently end the comparison.
      if (fabs(actual[i] - expected[i]) > eps) {
        break;
      }
    }
  }
}
示例#9
0
// Compares a quantized gemm against the float reference caffe_gemm<float> on
// 25 random problems. A, B, C and the scalars are quantized with observing
// quantizers; the quantized gemm runs through caffe_gemm<Dtype> in CPU mode
// and through the default device otherwise. The result is converted back with
// cq.Backward_cpu and must match the reference within 5% of the largest
// observed |C| value.
TYPED_TEST(QuantBlasTest, TestGemmComparativeFloatQuant) {
  typedef typename TypeParam::Dtype Dtype;

  // Expect at most 5% error
  float percentile_eps = 0.05;

  std::random_device rdev;
  std::mt19937 rngen(rdev());

  // Need to test > 64 dimension
  std::uniform_int_distribution<int_tp> dimsRand(1, 256);
  std::uniform_int_distribution<int_tp> boolRand(0, 1);
  std::uniform_real_distribution<float> valRand(-2.0, 2.0);

  for (int_tp testIdx = 0; testIdx < 25; ++testIdx) {
    int_tp M = dimsRand(rngen);
    int_tp N = dimsRand(rngen);
    int_tp K = dimsRand(rngen);

    CBLAS_TRANSPOSE trans_A = boolRand(rngen) ? CblasTrans : CblasNoTrans;
    CBLAS_TRANSPOSE trans_B = boolRand(rngen) ? CblasTrans : CblasNoTrans;

    // At least one of alpha/beta is always present.
    bool has_alpha = boolRand(rngen);
    bool has_beta = has_alpha ? boolRand(rngen) : true;

    bool alpha_with_quant = boolRand(rngen) && has_alpha;
    bool beta_with_quant = boolRand(rngen) && has_beta;

    // Scalar value: 0 when absent, 1 when present but not quantized,
    // otherwise a random value from valRand.
    float alpha_val = 0.0;
    if (has_alpha) {
      alpha_val = alpha_with_quant ? valRand(rngen) : float(1.0);
    }

    float beta_val = 0.0;
    if (has_beta) {
      beta_val = beta_with_quant ? valRand(rngen) : float(1.0);
    }

    // Element counts of the three matrices.
    const int_tp size_A = M * K;
    const int_tp size_B = K * N;
    const int_tp size_C = M * N;

    vector<int_tp> A_shape(4, 1);
    vector<int_tp> B_shape(4, 1);
    vector<int_tp> C_shape(4, 1);

    A_shape[2] = M;
    A_shape[3] = K;
    B_shape[2] = K;
    B_shape[3] = N;
    C_shape[2] = M;
    C_shape[3] = N;

    Blob<float> A(A_shape, Caffe::GetDefaultDevice());
    Blob<float> B(B_shape, Caffe::GetDefaultDevice());
    Blob<float> C(C_shape, Caffe::GetDefaultDevice());
    Blob<float> C_result(C_shape, Caffe::GetDefaultDevice());

    Blob<Dtype> A_quant(A_shape, Caffe::GetDefaultDevice());
    Blob<Dtype> B_quant(B_shape, Caffe::GetDefaultDevice());
    Blob<Dtype> C_quant(C_shape, Caffe::GetDefaultDevice());

    Blob<float> C_unquant(C_shape, Caffe::GetDefaultDevice());

    caffe_rng_gaussian(size_A, (float)0.0, (float)0.5,
                       A.mutable_cpu_data());
    caffe_rng_gaussian(size_B, (float)0.0, (float)0.5,
                       B.mutable_cpu_data());
    caffe_rng_gaussian(size_C, (float)0.0, (float)0.5,
                       C.mutable_cpu_data());

    // The reference computation works on its own copy of C.
    caffe_copy(size_C, C.cpu_data(), C_result.mutable_cpu_data());

    QuantizerParameter qpm_a;
    QuantizerParameter qpm_b;
    QuantizerParameter qpm_c;
    QuantizerParameter qpm_alpha;
    QuantizerParameter qpm_beta;
    qpm_a.set_mode(CAFFE_QUANT_OBSERVE);
    qpm_b.set_mode(CAFFE_QUANT_OBSERVE);
    qpm_c.set_mode(CAFFE_QUANT_OBSERVE);
    qpm_alpha.set_mode(CAFFE_QUANT_OBSERVE);
    qpm_beta.set_mode(CAFFE_QUANT_OBSERVE);

    Quantizer<float, Dtype> aq(qpm_a);
    Quantizer<float, Dtype> bq(qpm_b);
    Quantizer<float, Dtype> cq(qpm_c);
    Quantizer<float, Dtype> alphaq(qpm_alpha);
    Quantizer<float, Dtype> betaq(qpm_beta);

    // Float reference GEMM
    caffe_gemm<float>(
                trans_A, trans_B,
                M, N, K,
                alpha_val,
                A.cpu_data(), B.cpu_data(),
                beta_val,
                C_result.mutable_cpu_data());

    // Observe all values that will be relevant for quantization; cq sees
    // both the initial C and the reference result.
    aq.ObserveIn_cpu(size_A, A.cpu_data());
    bq.ObserveIn_cpu(size_B, B.cpu_data());
    cq.ObserveIn_cpu(size_C, C.cpu_data());
    cq.ObserveIn_cpu(size_C, C_result.cpu_data());
    alphaq.ObserveIn_cpu(1, &alpha_val);
    betaq.ObserveIn_cpu(1, &beta_val);

    // Apply observed values to the quantizer
    aq.update();
    bq.update();
    cq.update();
    alphaq.update();
    betaq.update();

    // Quantize A, B and C
    aq.Forward_cpu(size_A, A.cpu_data(), A_quant.mutable_cpu_data());
    bq.Forward_cpu(size_B, B.cpu_data(), B_quant.mutable_cpu_data());
    cq.Forward_cpu(size_C, C.cpu_data(), C_quant.mutable_cpu_data());

    // Unquantized scalars are passed as plain 0/1 flags.
    Dtype alpha_val_quant = has_alpha;
    Dtype beta_val_quant = has_beta;

    // Quantize alpha
    if (alpha_with_quant) {
      alphaq.Forward_cpu(1, &alpha_val, &alpha_val_quant);
    }

    // Quantize beta
    if (beta_with_quant) {
      betaq.Forward_cpu(1, &beta_val, &beta_val_quant);
    }

    /*
    std::cout << "C max:" << cq.in_quantizer_values().max << std::endl;
    std::cout << "C min:" << cq.in_quantizer_values().min << std::endl;
    std::cout << "C zero:" << cq.in_quantizer_values().zero << std::endl;
    std::cout << "C scale:" << cq.in_quantizer_values().scale << std::endl;
    std::cout << "C max:" << cq.out_quantizer_values().max << std::endl;
    std::cout << "C min:" << cq.out_quantizer_values().min << std::endl;
    std::cout << "C zero:" << cq.out_quantizer_values().zero << std::endl;
    std::cout << "C scale:" <<  cq.out_quantizer_values().scale << std::endl;
    */

    // Quantized GEMM, on the CPU or on the default device.
    if (Caffe::mode() == Caffe::Brew::CPU) {
      caffe_gemm<Dtype>(
                  trans_A, trans_B,
                  M, N, K,
                  alpha_val_quant,
                  A_quant.cpu_data(), B_quant.cpu_data(),
                  beta_val_quant,
                  C_quant.mutable_cpu_data(),
                  alpha_with_quant ? &(alphaq.out_quantizer_values()) : nullptr,
                  &(aq.out_quantizer_values()),
                  &(bq.out_quantizer_values()),
                  beta_with_quant ? &(betaq.out_quantizer_values()) : nullptr,
                  &(cq.out_quantizer_values()));
    } else {
      Caffe::GetDefaultDevice()->template gemm<Dtype>(trans_A, trans_B,
                  M, N, K,
                  alpha_val_quant,
                  A_quant.gpu_data(), B_quant.gpu_data(),
                  beta_val_quant,
                  C_quant.mutable_gpu_data(),
                  alpha_with_quant ? &(alphaq.out_quantizer_values()) : nullptr,
                  &(aq.out_quantizer_values()),
                  &(bq.out_quantizer_values()),
                  beta_with_quant ? &(betaq.out_quantizer_values()) : nullptr,
                  &(cq.out_quantizer_values()));
    }

    // Convert the quantized result back to float for the comparison.
    cq.Backward_cpu(size_C, C_quant.cpu_data(), C_unquant.mutable_cpu_data());

    // print_matrix(A_quant.cpu_data(), M, K);
    // print_matrix(B_quant.cpu_data(), K, N);

    // print_matrix(C_quant.cpu_data(), M, N);
    // print_matrix(C_result.cpu_data(), M, N);
    // print_matrix(C_unquant.cpu_data(), M, N);

    // Absolute tolerance: 5% of the largest magnitude cq observed.
    const QuantizerValues cqv = cq.in_quantizer_values();
    float eps = std::max(std::abs(cqv.get_max<float>()),
                         std::abs(cqv.get_min<float>())) * percentile_eps;

    const float* actual = C_unquant.cpu_data();
    const float* expected = C_result.cpu_data();
    for (int_tp i = 0; i < size_C; ++i) {
      EXPECT_NEAR(actual[i], expected[i], eps);
      // One error is enough to abort. The condition is the complement of
      // EXPECT_NEAR's "<= eps", so a value exactly at the tolerance does not
      // silently end the comparison.
      if (fabs(actual[i] - expected[i]) > eps) {
        break;
      }
    }
  }
}
示例#10
0
// Compares a quantized gemv against the float reference caffe_gemv<float> on
// 25 random problems. A, x, y and the scalars are quantized with observing
// quantizers; the quantized gemv runs through caffe_gemv<Dtype> in CPU mode
// and through the default device otherwise. The result is converted back with
// yq.Backward_cpu and must match the reference within 5% of the largest
// observed |y| value.
TYPED_TEST(QuantBlasTest, TestGemvComparativeFloatQuant) {
  typedef typename TypeParam::Dtype Dtype;

  // Expect at most 5% error
  float percentile_eps = 0.05;

  std::random_device rdev;
  std::mt19937 rngen(rdev());

  // Need to test > 64 dimension
  std::uniform_int_distribution<int_tp> dimsRand(1, 256);
  std::uniform_int_distribution<int_tp> boolRand(0, 1);
  std::uniform_real_distribution<float> valRand(-2.0, 2.0);

  for (int_tp testIdx = 0; testIdx < 25; ++testIdx) {
    int_tp M = dimsRand(rngen);
    int_tp N = dimsRand(rngen);

    CBLAS_TRANSPOSE trans_A = boolRand(rngen) ? CblasTrans : CblasNoTrans;

    // At least one of alpha/beta is always present.
    bool has_alpha = boolRand(rngen);
    bool has_beta = has_alpha ? boolRand(rngen) : true;

    bool alpha_with_quant = boolRand(rngen) && has_alpha;
    bool beta_with_quant = boolRand(rngen) && has_beta;

    // Scalar value: 0 when absent, 1 when present but not quantized,
    // otherwise a random value from valRand.
    float alpha_val = 0.0;
    if (has_alpha) {
      alpha_val = alpha_with_quant ? valRand(rngen) : float(1.0);
    }

    float beta_val = 0.0;
    if (has_beta) {
      beta_val = beta_with_quant ? valRand(rngen) : float(1.0);
    }

    // Element counts; the vector lengths depend on whether A is transposed.
    const int_tp size_A = M * N;
    const int_tp x_len = trans_A == CblasTrans ? M : N;
    const int_tp y_len = trans_A == CblasTrans ? N : M;

    vector<int_tp> A_shape(4, 1);
    vector<int_tp> x_shape(4, 1);
    vector<int_tp> y_shape(4, 1);

    A_shape[2] = M;
    A_shape[3] = N;
    x_shape[3] = x_len;
    y_shape[3] = y_len;

    Blob<float> A(A_shape, Caffe::GetDefaultDevice());
    Blob<float> x(x_shape, Caffe::GetDefaultDevice());
    Blob<float> y(y_shape, Caffe::GetDefaultDevice());
    Blob<float> y_result(y_shape, Caffe::GetDefaultDevice());

    Blob<Dtype> A_quant(A_shape, Caffe::GetDefaultDevice());
    Blob<Dtype> x_quant(x_shape, Caffe::GetDefaultDevice());
    Blob<Dtype> y_quant(y_shape, Caffe::GetDefaultDevice());

    Blob<float> y_unquant(y_shape, Caffe::GetDefaultDevice());

    caffe_rng_gaussian(size_A, (float)0.0, (float)0.5,
                       A.mutable_cpu_data());
    caffe_rng_gaussian(x_len,
                       (float)0.0, (float)0.5, x.mutable_cpu_data());
    caffe_rng_gaussian(y_len,
                       (float)0.0, (float)0.5, y.mutable_cpu_data());

    // The reference computation works on its own copy of y.
    caffe_copy(y_len, y.cpu_data(), y_result.mutable_cpu_data());

    QuantizerParameter qpm_a;
    QuantizerParameter qpm_x;
    QuantizerParameter qpm_y;
    QuantizerParameter qpm_alpha;
    QuantizerParameter qpm_beta;
    qpm_a.set_mode(CAFFE_QUANT_OBSERVE);
    qpm_x.set_mode(CAFFE_QUANT_OBSERVE);
    qpm_y.set_mode(CAFFE_QUANT_OBSERVE);
    qpm_alpha.set_mode(CAFFE_QUANT_OBSERVE);
    qpm_beta.set_mode(CAFFE_QUANT_OBSERVE);

    Quantizer<float, Dtype> aq(qpm_a);
    Quantizer<float, Dtype> xq(qpm_x);
    Quantizer<float, Dtype> yq(qpm_y);
    Quantizer<float, Dtype> alphaq(qpm_alpha);
    Quantizer<float, Dtype> betaq(qpm_beta);

    // Float reference GEMV
    caffe_gemv<float>(
                trans_A,
                M, N,
                alpha_val,
                A.cpu_data(), x.cpu_data(),
                beta_val,
                y_result.mutable_cpu_data());

    // Observe all values that will be relevant for quantization; yq sees
    // both the initial y and the reference result.
    aq.ObserveIn_cpu(size_A, A.cpu_data());
    xq.ObserveIn_cpu(x_len, x.cpu_data());
    yq.ObserveIn_cpu(y_len, y.cpu_data());
    yq.ObserveIn_cpu(y_len, y_result.cpu_data());
    alphaq.ObserveIn_cpu(1, &alpha_val);
    betaq.ObserveIn_cpu(1, &beta_val);

    // Apply observed values to the quantizer
    aq.update();
    xq.update();
    yq.update();
    alphaq.update();
    betaq.update();

    // Quantize A, x and y
    aq.Forward_cpu(size_A, A.cpu_data(), A_quant.mutable_cpu_data());
    xq.Forward_cpu(x_len, x.cpu_data(), x_quant.mutable_cpu_data());
    yq.Forward_cpu(y_len, y.cpu_data(), y_quant.mutable_cpu_data());

    // Unquantized scalars are passed as plain 0/1 flags.
    Dtype alpha_val_quant = has_alpha;
    Dtype beta_val_quant = has_beta;

    // Quantize alpha
    if (alpha_with_quant) {
      alphaq.Forward_cpu(1, &alpha_val, &alpha_val_quant);
    }

    // Quantize beta
    if (beta_with_quant) {
      betaq.Forward_cpu(1, &beta_val, &beta_val_quant);
    }

    // Quantized GEMV, on the CPU or on the default device.
    if (Caffe::mode() == Caffe::Brew::CPU) {
      caffe_gemv<Dtype>(trans_A, M, N,
                  alpha_val_quant,
                  A_quant.cpu_data(), x_quant.cpu_data(),
                  beta_val_quant,
                  y_quant.mutable_cpu_data(),
                  alpha_with_quant ? &(alphaq.out_quantizer_values()) : nullptr,
                  &(aq.out_quantizer_values()),
                  &(xq.out_quantizer_values()),
                  beta_with_quant ? &(betaq.out_quantizer_values()) : nullptr,
                  &(yq.out_quantizer_values()));
    } else {
      Caffe::GetDefaultDevice()->template gemv<Dtype>(trans_A, M, N,
                  alpha_val_quant,
                  A_quant.gpu_data(), x_quant.gpu_data(),
                  beta_val_quant,
                  y_quant.mutable_gpu_data(),
                  alpha_with_quant ? &(alphaq.out_quantizer_values()) : nullptr,
                  &(aq.out_quantizer_values()),
                  &(xq.out_quantizer_values()),
                  beta_with_quant ? &(betaq.out_quantizer_values()) : nullptr,
                  &(yq.out_quantizer_values()));
    }

    // Convert the quantized result back to float for the comparison.
    yq.Backward_cpu(y_len, y_quant.cpu_data(), y_unquant.mutable_cpu_data());

    // Absolute tolerance: 5% of the largest magnitude yq observed.
    const QuantizerValues cqv = yq.in_quantizer_values();
    float eps = std::max(std::abs(cqv.get_max<float>()),
                         std::abs(cqv.get_min<float>())) * percentile_eps;

    const float* actual = y_unquant.cpu_data();
    const float* expected = y_result.cpu_data();
    for (int_tp i = 0; i < y_len; ++i) {
      EXPECT_NEAR(actual[i], expected[i], eps);
      // One error is enough to abort. The condition is the complement of
      // EXPECT_NEAR's "<= eps", so a value exactly at the tolerance does not
      // silently end the comparison.
      if (fabs(actual[i] - expected[i]) > eps) {
        break;
      }
    }
  }
}