Example #1
/*virtual*/ void ReduceElementsNode<ElemType>::BackpropTo(const size_t inputIndex, const FrameRange& fr) /*override*/
{
    assert(inputIndex == 0), inputIndex; // the trailing ", inputIndex" keeps the parameter referenced when assert() is compiled out in release builds

    // get the args
    size_t rank = DetermineElementwiseTensorRank();
    auto sliceOutputGrad =             GradientTensorFor(rank, fr); // propagate from this one...
    auto sliceInputGrad  = InputRef(0).GradientTensorFor(rank, fr); // ...to this one

    // gradients are not as simple as passing an op-code, unfortunately
    switch (m_reductionOp)
    {
    case ElementWiseOperator::opSum:
        // "Sum":  broadcast the gradient
        // "Mean": same as "Sum" with scaling by 1/#dims
        sliceInputGrad.AddCopyOf(sliceOutputGrad, m_scale);
        break;

    case ElementWiseOperator::opLogSum:
        {
            auto input = InputRef(inputIndex).ValueTensorFor(rank, fr);
            auto output = ValueTensorFor(rank, fr.AllowBroadcast());
            // Let: f(x, y, z) = log(exp x + exp y + exp z)
            // For the derivative we get:
            // df / dx = exp(x)/exp(f)
            //         = exp(x - f)
            sliceInputGrad.AddElementwiseProductWithExpOfDiffOf(sliceOutputGrad, input, output);
        }
        break;

    case ElementWiseOperator::opMin:
    case ElementWiseOperator::opMax:
        {
            auto input = InputRef(inputIndex).ValueTensorFor(rank, fr);
            auto output = ValueTensorFor(rank, fr.AllowBroadcast());

            // POTENTIAL PROBLEM:
            // For ReduceMin/Max there are combinations of input values where the gradient is not defined, because the function has an edge at these points.
            // E.g. for ReduceMin this is the case when the minimum input value is attained by several inputs at the same time.
            // In these cases there is no correct gradient. The question is whether this could lead to any problems.
            // Let's look at two scenarios where this might happen:
            //
            // * Scenario 1: The input comes from a layer of nodes, e.g. ReLU, some of which might operate in the regime where they clip to a constant value.
            //   In this case an ill-defined input gradient is not a problem, as the derivative of the affected input nodes will be zero anyway.
            //
            // * Scenario 2: The input comes directly from training data. Here ill-defined gradients don't matter, as we wouldn't want to propagate gradients into the training data anyway.
            //
            // As we don't have a better solution yet and it probably has no impact, we stay with the current solution.
            // Also note that Clip, Min, Max, and ReLU have the same kind of problem.
            sliceInputGrad.AddCopyIfEqualOf(input, output, sliceOutputGrad);
        }
        break;

        // more coming
    }
}
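A minimal standalone sketch of the opLogSum gradient path above, using a plain std::vector instead of the CNTK tensor API (the function names below are hypothetical): each input element receives the incoming gradient scaled by exp(x_i - f), i.e. by its softmax weight.

#include <algorithm>
#include <cmath>
#include <cstddef>
#include <vector>

// Hypothetical flat-vector counterpart of the opLogSum forward: f = log(sum_i exp(x_i)),
// shifted by the maximum for numerical stability.
float ReduceLogSumForward(const std::vector<float>& input)
{
    float m = *std::max_element(input.begin(), input.end());
    float s = 0;
    for (float x : input)
        s += std::exp(x - m);
    return m + std::log(s);
}

// Hypothetical flat-vector counterpart of AddElementwiseProductWithExpOfDiffOf(...):
// inputGrad[i] += outputGrad * exp(input[i] - output), i.e. the softmax weight of x_i.
void ReduceLogSumBackward(const std::vector<float>& input, float output, float outputGrad,
                          std::vector<float>& inputGrad)
{
    for (std::size_t i = 0; i < input.size(); i++)
        inputGrad[i] += outputGrad * std::exp(input[i] - output);
}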
Example #2
/*virtual*/ void TraceNode<ElemType>::ForwardProp(const FrameRange& fr) /*override*/
{
    size_t rank = DetermineElementwiseTensorRank();
    auto result =             ValueTensorFor(rank, fr);
    auto input  = InputRef(0).ValueTensorFor(rank, fr);
    result.AssignCopyOf(input);

    // do the tracing
    Log(fr, false/*means log value*/);
}
Example #3
/*virtual*/ void ReduceElementsNode<ElemType>::ForwardProp(const FrameRange& fr) /*override*/
{
    // get the args
    size_t rank = DetermineElementwiseTensorRank();
    auto result =           ValueTensorFor(rank, fr);
    auto input  = Input(0)->ValueTensorFor(rank, fr);

    // the actual operation is a Copy with reduction, where the magic is in the reduction op
    result.DoUnaryOpOf(0, input, 1, ElementWiseOperator::opCopy, m_reductionOp);
    // note: we can implement "Mean" by passing 1/dim for alpha
}
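A hedged sketch of what "Copy with reduction" means, reduced to a flat std::vector outside the CNTK tensor library (ReduceAll and ReductionOp below are made-up names, not CNTK API): the per-element operation is a scaled copy, and the reduction op decides how the copies are combined; passing alpha = 1/#elements to a Sum reduction yields Mean, as the comment above notes.

#include <algorithm>
#include <limits>
#include <vector>

enum class ReductionOp { Sum, Max, Min };

// Hypothetical flat-vector analogue of result.DoUnaryOpOf(0, input, alpha, opCopy, reductionOp):
// copy each element scaled by alpha, and let the reduction op merge the copies.
float ReduceAll(const std::vector<float>& input, float alpha, ReductionOp op)
{
    float acc = (op == ReductionOp::Sum) ? 0.0f
              : (op == ReductionOp::Max) ? -std::numeric_limits<float>::infinity()
              :                             std::numeric_limits<float>::infinity();
    for (float x : input)
    {
        float v = alpha * x; // the elementwise "copy", scaled by alpha
        switch (op)
        {
        case ReductionOp::Sum: acc += v; break;
        case ReductionOp::Max: acc = std::max(acc, v); break;
        case ReductionOp::Min: acc = std::min(acc, v); break;
        }
    }
    return acc;
}

For example, ReduceAll(x, 1.0f / x.size(), ReductionOp::Sum) computes the mean of x, while ReduceAll(x, 1.0f, ReductionOp::Max) computes its maximum.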
Example #4
/*virtual*/ void ReduceElementsNode<ElemType>::ForwardProp(const FrameRange& fr) /*override*/
{
    // get the args
    size_t rank = DetermineElementwiseTensorRank();
    auto result =             ValueTensorFor(rank, fr);
    auto input  = InputRef(0).ValueTensorFor(rank, fr);

    // the actual operation is a Copy with reduction, where the magic is in the reduction op
    // For "Mean", m_scale is 1/#elements, and 1 otherwise.
    result.DoUnaryOpOf(0, input, m_scale, ElementWiseOperator::opCopy, m_reductionOp);
}
/*virtual*/ void ReduceElementsNode<ElemType>::BackpropTo(const size_t inputIndex, const FrameRange& fr) /*override*/
{
    assert(inputIndex == 0), inputIndex; // the trailing ", inputIndex" keeps the parameter referenced when assert() is compiled out in release builds

    // get the args
    size_t rank = DetermineElementwiseTensorRank();
    auto sliceOutputGrad =           GradientTensorFor(rank, fr); // propagate from this one...
    auto sliceInputGrad  = Input(0)->GradientTensorFor(rank, fr); // ...to this one

    // gradients are not as simple as passing an op-code, unfortunately
    switch (m_reductionOp)
    {
    case ElementWiseOperator::opSum:
        // "Sum": broadcast the gradient
        sliceInputGrad.AddCopyOf(sliceOutputGrad);
        break;

    case ElementWiseOperator::opMax:
    case ElementWiseOperator::opMin:
        {
            auto input = Input(inputIndex)->ValueTensorFor(rank, fr);
            auto output = ValueTensorFor(rank, fr.AllowBroadcast());

            // POTENTIAL PROBLEM:
            // For ReduceMin/Max there are combinations of input values where the gradient is not defined, because the function has an edge at these points.
            // E.g. for ReduceMin this is the case when the minimum input value is attained by several inputs at the same time.
            // In these cases there is no correct gradient. The question is whether this could lead to any problems.
            // Let's look at two scenarios where this might happen:
            //
            // * Scenario 1: The input comes from a layer of nodes, e.g. ReLU, some of which might operate in the regime where they clip to a constant value.
            //   In this case an ill-defined input gradient is not a problem, as the derivative of the affected input nodes will be zero anyway.
            //
            // * Scenario 2: The input comes directly from training data. Here ill-defined gradients don't matter, as we wouldn't want to propagate gradients into the training data anyway.
            //
            // As we don't have a better solution yet and it probably has no impact, we stay with the current solution.
            // Also note that Clip, Min, Max, and ReLU have the same kind of problem.
            sliceInputGrad.AddCopyIfEqualOf(input, output, sliceOutputGrad);
        }
        break;

        // more coming

        // "LogPlus": softmax
        //   f(x) = log(sum_i exp x_i), hence gradient is:
        //   df / dx_i = 1 / (sum_j exp x_j) * exp x_i = (Softmax(x))_i = exp(x_i - ReduceLogPlus(x))
        // targetGradient = gradientFromTop .* Exp (inputValue - outputValue)   --TODO: verify
        // i.e. compute the difference of input and output, then Exp in-place. No, that would need temp memory. So it needs its own opcode, AddScaledExpOfDiff(). Ternary.
    }
}
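A hedged flat-vector sketch of the Min/Max gradient path above (hypothetical names, not the CNTK API): the incoming gradient is routed only to the inputs that equal the reduced output value, which is exactly where the tie problem discussed in the comments shows up, since every tied element receives the full gradient.

#include <cstddef>
#include <vector>

// Hypothetical flat-vector analogue of AddCopyIfEqualOf(input, output, sliceOutputGrad)
// for a vector reduced to a single min/max value 'output': route the gradient to every
// element that attains the reduced value (all tied elements get the full gradient).
void ReduceMinMaxBackward(const std::vector<float>& input, float output, float outputGrad,
                          std::vector<float>& inputGrad)
{
    for (std::size_t i = 0; i < input.size(); i++)
        if (input[i] == output)
            inputGrad[i] += outputGrad;
}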