示例#1
0
/*virtual*/ void ReduceElementsNode<ElemType>::BackpropTo(const size_t inputIndex, const FrameRange& fr) /*override*/
{
    // This node has exactly one input; referencing the parameter after the
    // assert keeps release (NDEBUG) builds from warning that it is unused.
    assert(inputIndex == 0), inputIndex;

    // Fetch gradient views at the shared elementwise tensor rank.
    size_t rank = DetermineElementwiseTensorRank();
    auto gradFromOutput =           GradientTensorFor(rank, fr); // gradient arriving from above...
    auto gradIntoInput  = Input(0)->GradientTensorFor(rank, fr); // ...accumulated into the input's gradient

    // Each reduction op needs its own backward formula; a single op-code does not suffice.
    if (m_op == ElementWiseOperator::opSum)
    {
        // "Sum": the gradient is simply broadcast back to the input's shape.
        gradIntoInput.AddCopyOf(gradFromOutput);
    }
    // Other reduction ops are not implemented yet (their gradient is silently dropped):
    //
    // "LogPlus": softmax-style gradient
    //   f(x) = log(sum_i exp x_i), hence:
    //   df / dx_i = 1 / (sum_j exp x_j) * exp x_i = (Softmax(x))_i = exp(x_i - ReduceLogPlus(x))
    //   targetGradient = gradientFromTop .* Exp (inputValue - outputValue)   --TODO: verify
    //   Computing the difference of input and output, then Exp in-place, would need temp
    //   memory; so this needs its own ternary opcode, e.g. AddScaledExpOfDiff().
    //
    // "Max": copy the gradient only to the element(s) attaining the maximum:
    //   targetGradient += gradientFromTop .* (outputValue == inputValue).
    //   Needs its own opcode. --TODO: verify
}
示例#2
0
/*virtual*/ void ReduceElementsNode<ElemType>::BackpropTo(const size_t inputIndex, const FrameRange& fr) /*override*/
{
    // This node has exactly one input; the trailing comma-expression references
    // the parameter so release (NDEBUG) builds do not warn about it being unused.
    assert(inputIndex == 0), inputIndex;

    // Get gradient views at the shared elementwise tensor rank.
    size_t rank = DetermineElementwiseTensorRank();
    auto sliceOutputGrad =             GradientTensorFor(rank, fr); // propagate from this one...
    auto sliceInputGrad  = InputRef(0).GradientTensorFor(rank, fr); // ...to this one

    // Gradients are op-specific; they are not as simple as passing an op-code, unfortunately.
    switch (m_reductionOp)
    {
    case ElementWiseOperator::opSum:
        // "Sum":  broadcast the gradient
        // "Mean": same as "Sum" with scaling by 1/#dims (folded into m_scale)
        sliceInputGrad.AddCopyOf(sliceOutputGrad, m_scale);
        break;

    case ElementWiseOperator::opLogSum:
        {
            auto input  = InputRef(inputIndex).ValueTensorFor(rank, fr);
            auto output = ValueTensorFor(rank, fr.AllowBroadcast());
            // Let: f(x, y, z) = log(exp x + exp y + exp z)
            // For the derivative we get:
            // df / dx = exp(x)/exp(f)
            //         = exp(x - f)
            sliceInputGrad.AddElementwiseProductWithExpOfDiffOf(sliceOutputGrad, input, output);
        }
        break;

    case ElementWiseOperator::opMin:
    case ElementWiseOperator::opMax:
        {
            // NOTE: the braces are required. These case-scoped locals carry
            // initializers, so without their own scope, appending any further
            // case to this switch ("more coming" below) would be a compile
            // error (jump past variable initialization).
            auto input  = InputRef(inputIndex).ValueTensorFor(rank, fr);
            auto output = ValueTensorFor(rank, fr.AllowBroadcast());

            // POTENTIAL PROBLEM:
            // For ReduceMin/Max there are combinations of input values where the gradient is not defined,
            // because the function has an edge at these points. E.g. for ReduceMin this is the case when
            // the minimum input value is attained by several inputs at the same time.
            // In these cases there is no correct gradient. The question is if this could lead to any problems.
            // Let's look at two scenarios where this might happen:
            //
            // * Scenario 1: The input comes from a layer of nodes like e.g. ReLU and some of them might
            //   operate in the regime where they clip to a constant value. In this case it's not a problem
            //   that the input gradient is kind of bad, as the derivative of the concerning input nodes
            //   will be zero anyway.
            //
            // * Scenario 2: The input data is directly coming from training data. Here bad gradients don't
            //   matter, as we wouldn't want to propagate gradients to the training data.
            //
            // So as we don't have a better solution yet and it probably doesn't have impact, let's stay with
            // the current solution. Also note that for Clip, Min, Max and ReLU we have the same kind of problem.
            sliceInputGrad.AddCopyIfEqualOf(input, output, sliceOutputGrad);
        }
        break;

        // more coming
    }
}
示例#3
0
/*virtual*/ void TraceNode<ElemType>::BackpropTo(const size_t inputIndex, const FrameRange& fr) /*override*/
{
    // Trace has a single input; the bare reference after the assert silences
    // unused-parameter warnings in release (NDEBUG) builds.
    assert(inputIndex == 0); inputIndex;

    // Gradient views at the shared elementwise tensor rank.
    size_t rank = DetermineElementwiseTensorRank();
    auto gradFromAbove =             GradientTensorFor(rank, fr); // gradient arriving at this node...
    auto gradToInput   = InputRef(0).GradientTensorFor(rank, fr); // ...forwarded to its input

    // Identity backward pass: accumulate the output gradient into the input gradient unchanged.
    gradToInput.AddCopyOf(gradFromAbove);

    // Optionally trace the gradient as it flows through.
    if (m_logGradientToo)
        Log(fr, true /*true = log the gradient rather than the value*/);
}
/*virtual*/ void ReduceElementsNode<ElemType>::BackpropTo(const size_t inputIndex, const FrameRange& fr) /*override*/
{
    // This node has exactly one input; referencing the parameter after the
    // assert keeps release (NDEBUG) builds from warning that it is unused.
    assert(inputIndex == 0), inputIndex;

    // Get gradient views at the shared elementwise tensor rank.
    size_t rank = DetermineElementwiseTensorRank();
    auto sliceOutputGrad =           GradientTensorFor(rank, fr); // propagate from this one...
    auto sliceInputGrad  = Input(0)->GradientTensorFor(rank, fr); // ...to this one

    // Gradients are op-specific; they are not as simple as passing an op-code, unfortunately.
    switch (m_reductionOp)
    {
    case ElementWiseOperator::opSum:
        // "Sum": broadcast the gradient back to the input's shape.
        sliceInputGrad.AddCopyOf(sliceOutputGrad);
        break;

    case ElementWiseOperator::opMax:
    case ElementWiseOperator::opMin:
        {
            // NOTE: the braces are required. These case-scoped locals carry
            // initializers, so without their own scope, appending any further
            // case to this switch ("more coming" below) would be a compile
            // error (jump past variable initialization).
            auto input  = Input(inputIndex)->ValueTensorFor(rank, fr);
            auto output = ValueTensorFor(rank, fr.AllowBroadcast());

            // POTENTIAL PROBLEM:
            // For ReduceMin/Max there are combinations of input values where the gradient is not defined,
            // because the function has an edge at these points. E.g. for ReduceMin this is the case when
            // the minimum input value is attained by several inputs at the same time.
            // In these cases there is no correct gradient. The question is if this could lead to any problems.
            // Let's look at two scenarios where this might happen:
            //
            // * Scenario 1: The input comes from a layer of nodes like e.g. ReLU and some of them might
            //   operate in the regime where they clip to a constant value. In this case it's not a problem
            //   that the input gradient is kind of bad, as the derivative of the concerning input nodes
            //   will be zero anyway.
            //
            // * Scenario 2: The input data is directly coming from training data. Here bad gradients don't
            //   matter, as we wouldn't want to propagate gradients to the training data.
            //
            // So as we don't have a better solution yet and it probably doesn't have impact, let's stay with
            // the current solution. Also note that for Clip, Min, Max and ReLU we have the same kind of problem.
            sliceInputGrad.AddCopyIfEqualOf(input, output, sliceOutputGrad);
        }
        break;

        // more coming

        // "LogPlus": softmax
        //   f(x) = log(sum_i exp x_i), hence gradient is:
        //   df / dx_i = 1 / (sum_j exp x_j) * exp x_i = (Softmax(x))_i = exp(x_i - ReduceLogPlus(x))
        // targetGradient = gradientFromTop .* Exp (inputValue - outputValue)   --TODO: verify
        // i.e. compute difference of input and output, then Exp in-place. No, would need temp memory.
        // So it needs its own ternary opcode, e.g. AddScaledExpOfDiff().
    }
}