/*
** Backward pass of a batch-normalization layer on the GPU.
** Computes gradients w.r.t. the normalized input, the per-channel scale
** (gamma) and bias (beta), accumulating the parameter gradients into
** l.scale_updates_gpu / l.bias_updates_gpu, and leaves the input gradient
** in l.delta_gpu. For a standalone BATCHNORM layer the gradient is also
** copied back into the previous layer's delta (state.delta).
*/
void backward_batchnorm_layer_gpu(const layer l, network_state state)
{
#ifdef CUDNN
    float one = 1;
    float zero = 0;
    /* cuDNN path: single fused kernel.
    ** Argument pairs are (alphaDataDiff, betaDataDiff) = (1, 0) — overwrite
    ** the data gradient — and (alphaParamDiff, betaParamDiff) = (1, 1).
    ** NOTE(review): betaParamDiff = &one means cuDNN ACCUMULATES into
    ** scale_updates_gpu / bias_updates_gpu rather than overwriting them —
    ** confirm the updates buffers are zeroed (or meant to accumulate)
    ** before this call.
    ** NOTE(review): the last two arguments are cuDNN's savedMean /
    ** savedInvVariance slots; l.variance_gpu is passed — per the cuDNN API
    ** this slot expects the saved INVERSE variance from the forward pass.
    ** Verify what the forward pass stores in variance_gpu.
    ** The input gradient (dx) is written into l.x_norm_gpu (scratch reuse),
    ** then copied into l.delta_gpu below.
    ** Epsilon is hard-coded to .00001 here — presumably it must match the
    ** epsilon used in the forward call; TODO confirm. */
    cudnnBatchNormalizationBackward(cudnn_handle(),
        CUDNN_BATCHNORM_SPATIAL,
        &one,
        &zero,
        &one,
        &one,
        l.dstTensorDesc,
        l.x_gpu,                /* x: pre-normalization activations saved in forward */
        l.dstTensorDesc,
        l.delta_gpu,            /* dy: incoming gradient */
        l.dstTensorDesc,
        l.x_norm_gpu,           /* dx: written here, copied to delta_gpu below */
        l.normTensorDesc,
        l.scales_gpu,           /* gamma */
        l.scale_updates_gpu,    /* d(gamma), accumulated */
        l.bias_updates_gpu,     /* d(beta), accumulated */
        .00001,
        l.mean_gpu,
        l.variance_gpu);
    /* Move the input gradient from the scratch buffer into delta_gpu. */
    copy_ongpu(l.outputs*l.batch, l.x_norm_gpu, 1, l.delta_gpu, 1);
#else
    /* Hand-rolled path: same math as cuDNN, in separate kernels.
    ** Order matters: bias/scale gradients are taken from the raw incoming
    ** delta BEFORE delta is scaled by gamma. */
    backward_bias_gpu(l.bias_updates_gpu, l.delta_gpu, l.batch, l.out_c, l.out_w*l.out_h);
    backward_scale_gpu(l.x_norm_gpu, l.delta_gpu, l.batch, l.out_c, l.out_w*l.out_h, l.scale_updates_gpu);

    /* delta <- delta * gamma (gradient through the scale). */
    scale_bias_gpu(l.delta_gpu, l.scales_gpu, l.batch, l.out_c, l.out_h*l.out_w);

    /* Gradients of the loss w.r.t. the batch mean and variance, then the
    ** full normalization backward combining all three terms into delta. */
    fast_mean_delta_gpu(l.delta_gpu, l.variance_gpu, l.batch, l.out_c, l.out_w*l.out_h, l.mean_delta_gpu);
    fast_variance_delta_gpu(l.x_gpu, l.delta_gpu, l.mean_gpu, l.variance_gpu, l.batch, l.out_c, l.out_w*l.out_h, l.variance_delta_gpu);
    normalize_delta_gpu(l.x_gpu, l.mean_gpu, l.variance_gpu, l.mean_delta_gpu, l.variance_delta_gpu, l.batch, l.out_c, l.out_w*l.out_h, l.delta_gpu);
#endif
    /* Standalone BATCHNORM layer: propagate the gradient to the previous
    ** layer. (When fused into a conv layer, the caller handles this.) */
    if(l.type == BATCHNORM) copy_ongpu(l.outputs*l.batch, l.delta_gpu, 1, state.delta, 1);
}
// NB: CuDNN only implements the backward algorithm for batchnorm // in training mode (evaluation mode batchnorm has a different algorithm), // which is why this doesn't accept a 'training' parameter. std::tuple<Tensor, Tensor, Tensor> cudnn_batch_norm_backward( const Tensor& input_t, const Tensor& grad_output_t, const Tensor& weight_t, // Unused: but we require them to be passed so that double backwards // has access const Tensor& running_mean, const Tensor& running_var, const Tensor& save_mean_t, const Tensor& save_var_t, double epsilon) { TensorArg input{ input_t, "input", 1 }, grad_output{ grad_output_t, "grad_output", 2 }, weight{ weight_t, "weight", 3 }, save_mean{ save_mean_t, "save_mean", 4 }, save_var{ save_var_t, "save_var", 5 }; CheckedFrom c = "cudnn_batch_norm_backward"; setCuDNNStreamToCurrent(); checkAllDefined(c, {input, grad_output, weight, save_mean, save_var}); checkAllSameGPU(c, {input, grad_output, weight, save_mean, save_var}); if (input->type().scalarType() == ScalarType::Half) { checkScalarType(c, weight, ScalarType::Float); } else { checkAllSameType(c, {input, weight}); } checkAllSameType(c, {input, grad_output}); checkAllSameType(c, {weight, save_mean, save_var}); // TODO: is weight required to be contiguous? 
checkAllContiguous(c, {input, grad_output, save_mean, save_var}); checkDimRange(c, input, 2, 6 /* exclusive */); checkSameSize(c, input, grad_output); auto num_features = input->size(1); for (auto t : {weight, save_mean, save_var}) { checkNumel(c, t, num_features); } cudnnBatchNormMode_t mode; if (input->dim() == 2) { mode = CUDNN_BATCHNORM_PER_ACTIVATION; } else { #if CUDNN_VERSION >= 7003 mode = CUDNN_BATCHNORM_SPATIAL_PERSISTENT; #else mode = CUDNN_BATCHNORM_SPATIAL; #endif } auto grad_input_t = input->type().tensor(input->sizes()); auto grad_weight_t = weight->type().tensor(weight->sizes()); auto grad_bias_t = weight->type().tensor(weight->sizes()); auto handle = getCudnnHandle(); auto dataType = getCudnnDataType(*input); TensorDescriptor idesc{ *input, 4 }; // input, output, grad_output descriptor TensorDescriptor wdesc{ expandScale(*weight, input->dim()), 4 }; // descriptor for weight, bias, save_mean, etc. Constant one(dataType, 1); Constant zero(dataType, 0); CUDNN_CHECK(cudnnBatchNormalizationBackward( handle, mode, &one, &zero, &one, &zero, idesc.desc(), input->data_ptr(), idesc.desc(), grad_output->data_ptr(), idesc.desc(), grad_input_t.data_ptr(), wdesc.desc(), weight->data_ptr(), grad_weight_t.data_ptr(), grad_bias_t.data_ptr(), epsilon, save_mean->data_ptr(), save_var->data_ptr())); return std::tuple<Tensor,Tensor,Tensor>{grad_input_t, grad_weight_t, grad_bias_t}; }