int dnn_batchnorm_op(PyGpuArrayObject *inp, PyGpuArrayObject *scale,
                     PyGpuArrayObject *bias, PyGpuArrayObject **outp,
                     PyGpuArrayObject **x_mean, PyGpuArrayObject **x_invstd,
                     PyGpuContextObject *c) {
  if (c_set_tensorNd(inp, bn_input) != 0)
    return 1;
  if (c_set_tensorNd(scale, bn_params) != 0)
    return 1;

  if (theano_prep_output(outp, inp->ga.nd, inp->ga.dimensions,
                         inp->ga.typecode, GA_C_ORDER, c) != 0)
    return 1;
  if (theano_prep_output(x_mean, scale->ga.nd, scale->ga.dimensions,
                         scale->ga.typecode, GA_C_ORDER, c) != 0)
    return 1;
  if (theano_prep_output(x_invstd, scale->ga.nd, scale->ga.dimensions,
                         scale->ga.typecode, GA_C_ORDER, c) != 0)
    return 1;

  if (c_set_tensorNd(*outp, bn_output) != 0)
    return 1;

  {
    const float falpha = 1.;
    const float fbeta = 0.;
    const double dalpha = 1.;
    const double dbeta = 0.;
    void *alpha;
    void *beta;
    if (inp->ga.typecode == GA_DOUBLE) {
      alpha = (void *)&dalpha;
      beta = (void *)&dbeta;
    } else {
      alpha = (void *)&falpha;
      beta = (void *)&fbeta;
    }
    cudnnStatus_t err = cudnnBatchNormalizationForwardTraining(
        APPLY_SPECIFIC(_handle), MODE, alpha, beta,
        bn_input, PyGpuArray_DEV_DATA(inp),
        bn_output, PyGpuArray_DEV_DATA(*outp),
        bn_params,
        PyGpuArray_DEV_DATA(scale),
        PyGpuArray_DEV_DATA(bias),
        0,
        NULL,  // running mean, deliberately unused
        NULL,  // running var, deliberately unused
        EPSILON,
        PyGpuArray_DEV_DATA(*x_mean),
        PyGpuArray_DEV_DATA(*x_invstd));
    if (err != CUDNN_STATUS_SUCCESS) {
      PyErr_Format(PyExc_RuntimeError, "Error during batchnorm: %s\n",
                   cudnnGetErrorString(err));
      return 1;
    }
  }
  return 0;
}
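The following is a minimal CPU sketch, not taken from any of the frameworks in this section, of what the training-mode cuDNN call above computes in spatial mode for an NCHW float tensor; function and parameter names are illustrative only.

/* Per-channel batch mean and inverse standard deviation, then
 * y = scale * (x - mean) * invstd + bias, matching the saved mean /
 * saved inv-std outputs of cudnnBatchNormalizationForwardTraining.
 * Illustrative reference only; names are not from any framework above. */
#include <math.h>
#include <stddef.h>

static void batchnorm_forward_train_ref(const float *x, float *y,
                                        const float *scale, const float *bias,
                                        float *save_mean, float *save_invstd,
                                        int n, int c, int h, int w,
                                        double epsilon)
{
    size_t spatial = (size_t)h * w;
    size_t count = (size_t)n * spatial;        /* elements per channel */
    for (int ch = 0; ch < c; ++ch) {
        /* per-channel mean over N, H, W */
        double sum = 0.0;
        for (int b = 0; b < n; ++b)
            for (size_t i = 0; i < spatial; ++i)
                sum += x[((size_t)b * c + ch) * spatial + i];
        double mean = sum / count;

        /* per-channel (biased) variance */
        double var = 0.0;
        for (int b = 0; b < n; ++b)
            for (size_t i = 0; i < spatial; ++i) {
                double d = x[((size_t)b * c + ch) * spatial + i] - mean;
                var += d * d;
            }
        var /= count;

        double invstd = 1.0 / sqrt(var + epsilon);
        save_mean[ch] = (float)mean;
        save_invstd[ch] = (float)invstd;

        /* normalize, then scale and shift */
        for (int b = 0; b < n; ++b)
            for (size_t i = 0; i < spatial; ++i) {
                size_t idx = ((size_t)b * c + ch) * spatial + i;
                y[idx] = scale[ch] * (float)((x[idx] - mean) * invstd) + bias[ch];
            }
    }
}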
void forward_batchnorm_layer_gpu(layer l, network_state state)
{
    if (l.type == BATCHNORM) copy_ongpu(l.outputs*l.batch, state.input, 1, l.output_gpu, 1);
    if (l.type == CONNECTED) {
        l.out_c = l.outputs;
        l.out_h = l.out_w = 1;
    }
    if (state.train) {
#ifdef CUDNN
        copy_ongpu(l.outputs*l.batch, l.output_gpu, 1, l.x_gpu, 1);
        float one = 1;
        float zero = 0;
        cudnnBatchNormalizationForwardTraining(cudnn_handle(),
            CUDNN_BATCHNORM_SPATIAL,
            &one,
            &zero,
            l.dstTensorDesc,
            l.x_gpu,                    // pre-normalization activations, kept for backward
            l.dstTensorDesc,
            l.output_gpu,               // normalized, scaled, shifted output
            l.normTensorDesc,
            l.scales_gpu,
            l.biases_gpu,
            .01,                        // exponential average factor for rolling stats
            l.rolling_mean_gpu,
            l.rolling_variance_gpu,
            .00001,                     // epsilon
            l.mean_gpu,                 // saved batch statistics cache for backward
            l.variance_gpu);
#else
        fast_mean_gpu(l.output_gpu, l.batch, l.out_c, l.out_h*l.out_w, l.mean_gpu);
        fast_variance_gpu(l.output_gpu, l.mean_gpu, l.batch, l.out_c, l.out_h*l.out_w, l.variance_gpu);

        scal_ongpu(l.out_c, .99, l.rolling_mean_gpu, 1);
        axpy_ongpu(l.out_c, .01, l.mean_gpu, 1, l.rolling_mean_gpu, 1);
        scal_ongpu(l.out_c, .99, l.rolling_variance_gpu, 1);
        axpy_ongpu(l.out_c, .01, l.variance_gpu, 1, l.rolling_variance_gpu, 1);

        copy_ongpu(l.outputs*l.batch, l.output_gpu, 1, l.x_gpu, 1);
        normalize_gpu(l.output_gpu, l.mean_gpu, l.variance_gpu, l.batch, l.out_c, l.out_h*l.out_w);
        copy_ongpu(l.outputs*l.batch, l.output_gpu, 1, l.x_norm_gpu, 1);

        scale_bias_gpu(l.output_gpu, l.scales_gpu, l.batch, l.out_c, l.out_h*l.out_w);
        add_bias_gpu(l.output_gpu, l.biases_gpu, l.batch, l.out_c, l.out_w*l.out_h);
#endif
    } else {
        normalize_gpu(l.output_gpu, l.rolling_mean_gpu, l.rolling_variance_gpu, l.batch, l.out_c, l.out_h*l.out_w);
        scale_bias_gpu(l.output_gpu, l.scales_gpu, l.batch, l.out_c, l.out_h*l.out_w);
        add_bias_gpu(l.output_gpu, l.biases_gpu, l.batch, l.out_c, l.out_w*l.out_h);
    }
}
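The following is a minimal sketch, with illustrative names rather than Darknet API, of the two pieces of the non-cuDNN path above: the exponential-moving-average update of the rolling statistics (matching the scal_ongpu(.99)/axpy_ongpu(.01) pairs) and the inference-time normalization that later consumes them.

/* Illustrative CPU equivalents of the rolling-statistics update and the
 * inference branch; momentum 0.01 and epsilon 1e-5 mirror the constants
 * used in the Darknet code above. */
#include <math.h>

static void update_rolling_stats(float *rolling_mean, float *rolling_var,
                                 const float *batch_mean, const float *batch_var,
                                 int channels, float momentum /* e.g. 0.01f */)
{
    for (int ch = 0; ch < channels; ++ch) {
        rolling_mean[ch] = (1.0f - momentum) * rolling_mean[ch] + momentum * batch_mean[ch];
        rolling_var[ch]  = (1.0f - momentum) * rolling_var[ch]  + momentum * batch_var[ch];
    }
}

static void normalize_inference(float *x, const float *rolling_mean,
                                const float *rolling_var, const float *scales,
                                const float *biases, int batch, int channels,
                                int spatial, float epsilon /* e.g. 1e-5f */)
{
    for (int b = 0; b < batch; ++b)
        for (int ch = 0; ch < channels; ++ch)
            for (int i = 0; i < spatial; ++i) {
                int idx = (b * channels + ch) * spatial + i;
                x[idx] = scales[ch] * (x[idx] - rolling_mean[ch])
                         / sqrtf(rolling_var[ch] + epsilon) + biases[ch];
            }
}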
std::tuple<Tensor, Tensor, Tensor> cudnn_batch_norm(
    const Tensor& input_t, const Tensor& weight_t, const Tensor& bias_t,
    const Tensor& running_mean_t, const Tensor& running_var_t,
    bool training, double exponential_average_factor, double epsilon)
{
  TensorArg input{ input_t, "input", 1 },
            weight{ weight_t, "weight", 2 },
            bias{ bias_t, "bias", 3 },
            running_mean{ running_mean_t, "running_mean", 4 },
            running_var{ running_var_t, "running_var", 5 };
  CheckedFrom c = "cudnn_batch_norm";
  setCuDNNStreamToCurrent();

  checkAllDefined(c, {input, weight, bias});
  if (!training) {
    checkAllDefined(c, {running_mean, running_var});
  }
  checkAllSameGPU(c, {input, weight, bias, running_mean, running_var});
  if (input->type().scalarType() == ScalarType::Half) {
    checkScalarType(c, weight, ScalarType::Float);
  } else {
    checkAllSameType(c, {input, weight});
  }
  checkAllSameType(c, {weight, bias, running_mean, running_var});
  // TODO: is weight required to be contiguous?
  checkAllContiguous(c, {input, weight, bias, running_mean, running_var});
  checkDimRange(c, input, 2, 6 /* exclusive */);
  auto num_features = input->size(1);
  for (auto t : {weight, bias, running_mean, running_var}) {
    if (t->defined()) {
      checkNumel(c, t, num_features);
    }
  }

  cudnnBatchNormMode_t mode;
  if (input->dim() == 2) {
    mode = CUDNN_BATCHNORM_PER_ACTIVATION;
  } else {
    mode = CUDNN_BATCHNORM_SPATIAL;
#if CUDNN_VERSION >= 7003
    if (training)
      mode = CUDNN_BATCHNORM_SPATIAL_PERSISTENT;
#endif
  }

  auto output_t = input->type().tensor(input->sizes());
  TensorArg output{ output_t, "output", 0 };

  auto handle = getCudnnHandle();
  auto dataType = getCudnnDataType(*input);
  TensorDescriptor idesc{ *input, 4 };  // input descriptor
  TensorDescriptor wdesc{ expandScale(*weight, input->dim()), 4 };  // descriptor for weight, bias, running_mean, etc.

  Constant one(dataType, 1);
  Constant zero(dataType, 0);
  Tensor save_mean, save_var;

  if (training) {
    int64_t num_features = input_t.size(1);
    save_mean = weight_t.type().tensor({ num_features });
    save_var = weight_t.type().tensor({ num_features });
    CUDNN_CHECK(cudnnBatchNormalizationForwardTraining(
        handle, mode, &one, &zero,
        idesc.desc(), input->data_ptr(),
        idesc.desc(), output->data_ptr(),
        wdesc.desc(),
        weight->data_ptr(),
        bias->data_ptr(),
        exponential_average_factor,
        at::maybe_data_ptr(running_mean),
        at::maybe_data_ptr(running_var),
        epsilon,
        save_mean.data_ptr(),
        save_var.data_ptr()));
  } else {
    CUDNN_CHECK(cudnnBatchNormalizationForwardInference(
        handle, mode, &one, &zero,
        idesc.desc(), input->data_ptr(),
        idesc.desc(), output->data_ptr(),
        wdesc.desc(),
        weight->data_ptr(),
        bias->data_ptr(),
        running_mean->data_ptr(),
        running_var->data_ptr(),
        epsilon));
  }

  // save_mean and save_var can be undefined
  // If this causes problems, we can initialize them to empty tensors
  // of the correct type
  return std::tuple<Tensor, Tensor, Tensor>{output_t, save_mean, save_var};
}
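All three snippets pass a separate per-channel descriptor for scale, bias, and the mean/variance buffers (bn_params, l.normTensorDesc, wdesc). The following is a minimal standalone sketch of how such a descriptor can be derived from the input descriptor with cudnnDeriveBNTensorDescriptor; the tensor shape and the CHECK_CUDNN macro are illustrative, not taken from any of the codebases above.

/* Descriptor setup sketch: derive the 1xCx1x1 batch-norm parameter
 * descriptor from an NCHW input descriptor in spatial mode. */
#include <cudnn.h>
#include <stdio.h>
#include <stdlib.h>

#define CHECK_CUDNN(call) do {                                        \
    cudnnStatus_t s = (call);                                         \
    if (s != CUDNN_STATUS_SUCCESS) {                                  \
        fprintf(stderr, "cuDNN error: %s\n", cudnnGetErrorString(s)); \
        exit(1);                                                      \
    }                                                                 \
} while (0)

int main(void)
{
    cudnnTensorDescriptor_t xdesc, bndesc;
    CHECK_CUDNN(cudnnCreateTensorDescriptor(&xdesc));
    CHECK_CUDNN(cudnnCreateTensorDescriptor(&bndesc));

    /* example NCHW input: batch 32, 64 channels, 28x28 spatial */
    CHECK_CUDNN(cudnnSetTensor4dDescriptor(xdesc, CUDNN_TENSOR_NCHW,
                                           CUDNN_DATA_FLOAT, 32, 64, 28, 28));

    /* derive the per-channel descriptor used for scale, bias, running
       mean/variance, and the saved mean/inv-std outputs */
    CHECK_CUDNN(cudnnDeriveBNTensorDescriptor(bndesc, xdesc,
                                              CUDNN_BATCHNORM_SPATIAL));

    CHECK_CUDNN(cudnnDestroyTensorDescriptor(bndesc));
    CHECK_CUDNN(cudnnDestroyTensorDescriptor(xdesc));
    return 0;
}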