template <class ElemType>
/*virtual*/ void ReduceElementsNode<ElemType>::BackpropTo(const size_t inputIndex, const FrameRange& fr) /*override*/
{
    assert(inputIndex == 0), inputIndex; // reference inputIndex so that release builds do not warn about an unused parameter

    // get the args
    size_t rank = DetermineElementwiseTensorRank();
    auto sliceOutputGrad = GradientTensorFor(rank, fr);             // propagate from this one...
    auto sliceInputGrad  = InputRef(0).GradientTensorFor(rank, fr); // ...to this one

    // gradients are not as simple as passing an op-code, unfortunately
    switch (m_reductionOp)
    {
    case ElementWiseOperator::opSum:
        // "Sum":  broadcast the gradient
        // "Mean": same as "Sum", with scaling by 1/#dims
        sliceInputGrad.AddCopyOf(sliceOutputGrad, m_scale);
        break;

    case ElementWiseOperator::opLogSum:
    {
        auto input  = InputRef(inputIndex).ValueTensorFor(rank, fr);
        auto output = ValueTensorFor(rank, fr.AllowBroadcast());
        // Let: f(x, y, z) = log(exp x + exp y + exp z)
        // For the derivative we get:
        //   df / dx = exp(x) / exp(f)
        //           = exp(x - f)
        sliceInputGrad.AddElementwiseProductWithExpOfDiffOf(sliceOutputGrad, input, output);
    }
    break;

    case ElementWiseOperator::opMin:
    case ElementWiseOperator::opMax:
    {
        auto input  = InputRef(inputIndex).ValueTensorFor(rank, fr);
        auto output = ValueTensorFor(rank, fr.AllowBroadcast());
        // POTENTIAL PROBLEM:
        // For ReduceMin/ReduceMax there are combinations of input values where the gradient is not defined,
        // because the function has an edge at these points. E.g. for ReduceMin this is the case when the
        // minimum input value is attained by several inputs at the same time. In these cases there is no
        // correct gradient. The question is whether this could lead to any problems.
        // Let's look at two scenarios where this might happen:
        //
        //  * Scenario 1: The input comes from a layer of nodes such as ReLU, and some of them might operate
        //    in the regime where they clip to a constant value. In this case it is not a problem that the
        //    input gradient is somewhat arbitrary, as the derivative of the affected input nodes will be
        //    zero anyway.
        //
        //  * Scenario 2: The input data comes directly from the training data. Here bad gradients don't matter,
        //    as we wouldn't want to propagate gradients to the training data anyway.
        //
        // So, as we don't have a better solution yet and it probably has no impact, let's stay with the
        // current solution. Also note that Clip, Min, Max, and ReLU have the same kind of problem.
        sliceInputGrad.AddCopyIfEqualOf(input, output, sliceOutputGrad);
    }
    break;

    // more coming
    }
}
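// Illustrative sketch (not part of CNTK): a minimal, standalone check of the opLogSum gradient formula
// used above, df/dx_i = exp(x_i - f) with f = log(sum_j exp x_j), i.e. the softmax of x. The analytic
// gradient is compared against a central finite difference. The helpers below (logSumExp, analyticGrad,
// numericGrad) are hypothetical names introduced only for this sketch.
#include <algorithm>
#include <cmath>
#include <cstdio>
#include <vector>

static double logSumExp(const std::vector<double>& x)
{
    double m = x[0];
    for (double v : x) m = std::max(m, v);          // subtract the max for numerical stability
    double s = 0;
    for (double v : x) s += std::exp(v - m);
    return m + std::log(s);
}

int main()
{
    std::vector<double> x = { 0.5, -1.0, 2.0 };
    double f = logSumExp(x);
    for (size_t i = 0; i < x.size(); i++)
    {
        double analyticGrad = std::exp(x[i] - f);   // df/dx_i = exp(x_i - f) = softmax(x)_i
        double eps = 1e-6;
        auto xp = x; xp[i] += eps;
        auto xm = x; xm[i] -= eps;
        double numericGrad = (logSumExp(xp) - logSumExp(xm)) / (2 * eps);
        std::printf("i=%zu analytic=%.6f numeric=%.6f\n", i, analyticGrad, numericGrad);
    }
    return 0;
}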
template <class ElemType>
/*virtual*/ void ReduceElementsNode<ElemType>::BackpropTo(const size_t inputIndex, const FrameRange& fr) /*override*/
{
    assert(inputIndex == 0), inputIndex; // reference inputIndex so that release builds do not warn about an unused parameter

    // get the args
    size_t rank = DetermineElementwiseTensorRank();
    auto sliceOutputGrad = GradientTensorFor(rank, fr);           // propagate from this one...
    auto sliceInputGrad  = Input(0)->GradientTensorFor(rank, fr); // ...to this one

    // gradients are not as simple as passing an op-code, unfortunately
    switch (m_reductionOp)
    {
    case ElementWiseOperator::opSum:
        // "Sum": broadcast the gradient
        sliceInputGrad.AddCopyOf(sliceOutputGrad);
        break;

    case ElementWiseOperator::opMax:
    case ElementWiseOperator::opMin:
    {
        auto input  = Input(inputIndex)->ValueTensorFor(rank, fr);
        auto output = ValueTensorFor(rank, fr.AllowBroadcast());
        // POTENTIAL PROBLEM:
        // For ReduceMin/ReduceMax there are combinations of input values where the gradient is not defined,
        // because the function has an edge at these points. E.g. for ReduceMin this is the case when the
        // minimum input value is attained by several inputs at the same time. In these cases there is no
        // correct gradient. The question is whether this could lead to any problems.
        // Let's look at two scenarios where this might happen:
        //
        //  * Scenario 1: The input comes from a layer of nodes such as ReLU, and some of them might operate
        //    in the regime where they clip to a constant value. In this case it is not a problem that the
        //    input gradient is somewhat arbitrary, as the derivative of the affected input nodes will be
        //    zero anyway.
        //
        //  * Scenario 2: The input data comes directly from the training data. Here bad gradients don't matter,
        //    as we wouldn't want to propagate gradients to the training data anyway.
        //
        // So, as we don't have a better solution yet and it probably has no impact, let's stay with the
        // current solution. Also note that Clip, Min, Max, and ReLU have the same kind of problem.
        sliceInputGrad.AddCopyIfEqualOf(input, output, sliceOutputGrad);
    }
    break;

    // more coming:
    //
    // "LogPlus": softmax
    //   f(x) = log(sum_i exp x_i), hence the gradient is:
    //   df / dx_i = 1 / (sum_j exp x_j) * exp x_i = (Softmax(x))_i = exp(x_i - ReduceLogPlus(x))
    //   targetGradient = gradientFromTop .* Exp(inputValue - outputValue)   --TODO: verify
    //   i.e. compute the difference of input and output, then Exp in place. No, that would need temp memory,
    //   so it needs its own opcode AddScaledExpOfDiff(). Ternary.
    }
}
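// Illustrative sketch (not part of CNTK): the Min/Max backprop above routes the output gradient to every
// input element whose value equals the reduced result, which is what AddCopyIfEqualOf expresses. At a tie,
// all tied elements therefore receive the full gradient; this is exactly the "POTENTIAL PROBLEM" discussed
// in the comments, since the true gradient is undefined there. reduceMinBackprop below is a hypothetical
// helper written only for this sketch, not a CNTK function.
#include <algorithm>
#include <cstdio>
#include <vector>

static std::vector<double> reduceMinBackprop(const std::vector<double>& input, double outputGrad)
{
    double minVal = input[0];
    for (double v : input) minVal = std::min(minVal, v);
    std::vector<double> inputGrad(input.size(), 0.0);
    for (size_t i = 0; i < input.size(); i++)
        if (input[i] == minVal)      // ties: every minimizer receives the gradient
            inputGrad[i] = outputGrad;
    return inputGrad;
}

int main()
{
    std::vector<double> x = { 3.0, 1.0, 1.0, 2.0 };  // the minimum 1.0 is attained twice
    auto g = reduceMinBackprop(x, 1.0);
    for (size_t i = 0; i < g.size(); i++)
        std::printf("dL/dx[%zu] = %.1f\n", i, g[i]); // both tied positions receive 1.0
    return 0;
}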