Example #1
int Scale_arm::forward(const Mat& bottom_blob, Mat& top_blob) const
{
    int w = bottom_blob.w;
    int h = bottom_blob.h;
    int channels = bottom_blob.c;
    int size = w * h;

    top_blob.create(w, h, channels);
    if (top_blob.empty())
        return -100;

    if (bias_term)
    {
        const float* scale_ptr = scale_data;
        const float* bias_ptr = bias_data;
        #pragma omp parallel for
        for (int q=0; q<channels; q++)
        {
            const float* ptr = bottom_blob.channel(q);
            float* outptr = top_blob.channel(q);

            float s = scale_ptr[q];
            float bias = bias_ptr[q];

#if __ARM_NEON
            int nn = size >> 2;
            int remain = size - (nn << 2);
#else
            int remain = size;
#endif // __ARM_NEON

#if __ARM_NEON
            float32x4_t _s = vdupq_n_f32(s);
            float32x4_t _bias = vdupq_n_f32(bias);
            for (; nn>0; nn--)
            {
                float32x4_t _p = vld1q_f32(ptr);
                _p = vmlaq_f32(_bias, _p, _s);
                vst1q_f32(outptr, _p);

                ptr += 4;
                outptr += 4;
            }
#endif // __ARM_NEON

            for (; remain>0; remain--)
            {
                *outptr = *ptr * s + bias;

                ptr++;
                outptr++;
            }
        }
    }
    else
    {
        const float* scale_ptr = scale_data;
        #pragma omp parallel for
        for (int q=0; q<channels; q++)
        {
            const float* ptr = bottom_blob.channel(q);
            float* outptr = top_blob.channel(q);

            float s = scale_ptr[q];

#if __ARM_NEON
            int nn = size >> 2;
            int remain = size - (nn << 2);
#else
            int remain = size;
#endif // __ARM_NEON

#if __ARM_NEON
            float32x4_t _s = vdupq_n_f32(s);
            for (; nn>0; nn--)
            {
                float32x4_t _p = vld1q_f32(ptr);
                _p = vmulq_f32(_p, _s);
                vst1q_f32(outptr, _p);

                ptr += 4;
                outptr += 4;
            }
#endif // __ARM_NEON

            for (; remain>0; remain--)
            {
                *outptr = *ptr * s;

                ptr++;
                outptr++;
            }
        }
    }

    return 0;
}
Example #2
int Dropout::forward_inplace(Mat& bottom_top_blob) const
{
    if (scale == 1.f)
    {
        return 0;
    }

    int w = bottom_top_blob.w;
    int h = bottom_top_blob.h;
    int channels = bottom_top_blob.c;
    int size = w * h;

    #pragma omp parallel for
    for (int q=0; q<channels; q++)
    {
        float* ptr = bottom_top_blob.channel(q);

        for (int i=0; i<size; i++)
        {
            ptr[i] = ptr[i] * scale;
        }
    }

    return 0;
}
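The Dropout layer above simply rescales every element in place when scale differs from 1. As a sanity check, here is a minimal standalone sketch of the same element-wise multiply (plain C++ with no ncnn dependency; the 2x2 buffer and the 0.5f scale are made-up values):

#include <cstdio>

int main()
{
    // hypothetical 2x2 single-channel blob and a dropout scale of 0.5f
    float data[4] = {1.f, 2.f, 3.f, 4.f};
    const float scale = 0.5f;

    // same element-wise multiply the layer performs when scale != 1.f
    for (int i = 0; i < 4; i++)
        data[i] *= scale;

    for (int i = 0; i < 4; i++)
        printf("%f\n", data[i]);   // 0.5 1.0 1.5 2.0

    return 0;
}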
Example #3
int Log::forward_inplace(Mat& bottom_top_blob) const
{
    int w = bottom_top_blob.w;
    int h = bottom_top_blob.h;
    int channels = bottom_top_blob.c;
    int size = w * h;

    if (base == -1.f)
    {
        #pragma omp parallel for
        for (int q=0; q<channels; q++)
        {
            float* ptr = bottom_top_blob.channel(q);

            for (int i=0; i<size; i++)
            {
                ptr[i] = log(shift + ptr[i] * scale);
            }
        }
    }
    else
    {
        float log_base_inv = 1.f / log(base);

        #pragma omp parallel for
        for (int q=0; q<channels; q++)
        {
            float* ptr = bottom_top_blob.channel(q);

            for (int i=0; i<size; i++)
            {
                ptr[i] = log(shift + ptr[i] * scale) * log_base_inv;
            }
        }
    }

    return 0;
}
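The log_base_inv factor in the second branch is just the change-of-base identity log_base(x) = log(x) / log(base). A quick standalone check (base and x chosen arbitrarily):

#include <cmath>
#include <cstdio>

int main()
{
    const float base = 10.f;
    const float x = 123.f;

    float direct = log10f(x);                     // library log base 10
    float via_ln = logf(x) * (1.f / logf(base));  // same trick as Log::forward_inplace

    printf("%f %f\n", direct, via_ln);            // both ~2.089905
    return 0;
}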
Example #4
int Bias_arm::forward_inplace(Mat& bottom_top_blob) const
{
    int w = bottom_top_blob.w;
    int h = bottom_top_blob.h;
    int channels = bottom_top_blob.c;
    int size = w * h;

    const float* bias_ptr = bias_data;
    #pragma omp parallel for
    for (int q=0; q<channels; q++)
    {
        float* ptr = bottom_top_blob.channel(q);

        float bias = bias_ptr[q];

#if __ARM_NEON
        int nn = size >> 2;
        int remain = size - (nn << 2);
#else
        int remain = size;
#endif // __ARM_NEON

#if __ARM_NEON
        float32x4_t _bias = vdupq_n_f32(bias);
        for (; nn>0; nn--)
        {
            float32x4_t _p = vld1q_f32(ptr);
            float32x4_t _outp = vaddq_f32(_p, _bias);
            vst1q_f32(ptr, _outp);

            ptr += 4;
        }
#endif // __ARM_NEON

        for (; remain>0; remain--)
        {
            *ptr = *ptr + bias;

            ptr++;
        }
    }

    return 0;
}
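The nn/remain split used in these ARM kernels handles the bulk of each plane four floats at a time and finishes the tail with scalar code. A plain-C++ sketch of the same control flow, without intrinsics (the ten-element buffer and bias value are illustrative):

#include <cstdio>

int main()
{
    float ptr[10] = {0,1,2,3,4,5,6,7,8,9};
    const float bias = 1.f;
    int size = 10;

    int nn = size >> 2;            // 2 full groups of 4
    int remain = size - (nn << 2); // 2 leftover elements

    float* p = ptr;
    for (; nn > 0; nn--)
    {
        // a NEON build would load/add/store 4 lanes here with vld1q_f32/vaddq_f32/vst1q_f32
        for (int k = 0; k < 4; k++)
            p[k] += bias;
        p += 4;
    }
    for (; remain > 0; remain--)
    {
        *p += bias;
        p++;
    }

    for (int i = 0; i < size; i++)
        printf("%g ", ptr[i]);     // 1 2 3 ... 10
    return 0;
}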
Example #5
int LRN::forward(const Mat& bottom_blob, Mat& top_blob) const
{
    int w = bottom_blob.w;
    int h = bottom_blob.h;
    int channels = bottom_blob.c;
    int size = w * h;

    top_blob.create(w, h, channels);
    if (top_blob.empty())
        return -100;

    // squared values with local_size padding
    Mat square_blob;
    square_blob.create(w, h, channels);
    if (square_blob.empty())
        return -100;

    #pragma omp parallel for
    for (int q=0; q<channels; q++)
    {
        const float* ptr = bottom_blob.channel(q);
        float* outptr = square_blob.channel(q);

        for (int i=0; i<size; i++)
        {
            outptr[i] = ptr[i] * ptr[i];
        }
    }

    if (region_type == NormRegion_ACROSS_CHANNELS)
    {
        top_blob.fill(0.f);

        const float alpha_div_size = alpha / local_size;

        #pragma omp parallel for
        for (int q=0; q<channels; q++)
        {
            // square sum
            float* outptr = top_blob.channel(q);
            for (int p=q - local_size / 2; p<=q + local_size / 2; p++)
            {
                if (p < 0 || p >= channels)
                    continue;

                const float* sptr = square_blob.channel(p);
                for (int i=0; i<size; i++)
                {
                    outptr[i] += sptr[i];
                }
            }

            const float* ptr = bottom_blob.channel(q);
            for (int i=0; i<size; i++)
            {
                outptr[i] = ptr[i] * pow(1.f + alpha_div_size * outptr[i], -beta);
            }
        }
    }
    else if (region_type == NormRegion_WITHIN_CHANNEL)
    {
        int outw = w;
        int outh = h;

        Mat square_blob_bordered = square_blob;
        int pad = local_size / 2;
        if (pad > 0)
        {
            copy_make_border(square_blob, square_blob_bordered, pad, local_size - pad - 1, pad, local_size - pad - 1, BORDER_CONSTANT, 0.f);
            if (square_blob_bordered.empty())
                return -100;

            w = square_blob_bordered.w;
            h = square_blob_bordered.h;
        }

        const int maxk = local_size * local_size;

        const float alpha_div_size = alpha / maxk;

        // norm window offsets
        std::vector<int> _space_ofs(maxk);
        int* space_ofs = &_space_ofs[0];
        {
            int p1 = 0;
            int p2 = 0;
            int gap = w - local_size;
            for (int i = 0; i < local_size; i++)
            {
                for (int j = 0; j < local_size; j++)
                {
                    space_ofs[p1] = p2;
                    p1++;
                    p2++;
                }
                p2 += gap;
            }
        }

        #pragma omp parallel for
        for (int q=0; q<channels; q++)
        {
            const float* ptr = bottom_blob.channel(q);
            const Mat m = square_blob_bordered.channel(q);
            float* outptr = top_blob.channel(q);

            for (int i = 0; i < outh; i++)
            {
                for (int j = 0; j < outw; j++)
                {
                    const float* sptr = m.row(i) + j;

                    float ss = 0.f;

                    for (int k = 0; k < maxk; k++)
                    {
                        float val = sptr[ space_ofs[k] ];
                        ss += val;
                    }

                    outptr[j] = ptr[j] * pow(1.f + alpha_div_size * ss, -beta);
                }

                ptr += outw;
                outptr += outw;
            }
        }
    }

    return 0;
}
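Spelled out, the ACROSS_CHANNELS branch computes out = in * (1 + alpha/local_size * sum_sq)^(-beta), where sum_sq is the sum of squares over the neighbouring channels at the same pixel. A one-pixel numeric sketch (all values made up):

#include <cmath>
#include <cstdio>

int main()
{
    // three neighbouring channels at one pixel, local_size = 3
    const float in[3] = {1.f, 2.f, 3.f};
    const float alpha = 1.f, beta = 0.75f;
    const int local_size = 3;

    float sum_sq = 0.f;
    for (int p = 0; p < 3; p++)
        sum_sq += in[p] * in[p];               // 1 + 4 + 9 = 14

    const float alpha_div_size = alpha / local_size;
    float out = in[1] * powf(1.f + alpha_div_size * sum_sq, -beta);

    printf("%f\n", out);                        // 2 * (1 + 14/3)^-0.75
    return 0;
}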
Example #6
int DeconvolutionDepthWise::forward(const Mat& bottom_blob, Mat& top_blob) const
{
    // deconvolve with NxN kernel
    // value = value + bias

    int w = bottom_blob.w;
    int h = bottom_blob.h;
    int channels = bottom_blob.c;

    if (channels % group != 0 || num_output % group != 0)
    {
        // reject invalid group
        return -100;
    }

    const int kernel_extent_w = dilation_w * (kernel_w - 1) + 1;
    const int kernel_extent_h = dilation_h * (kernel_h - 1) + 1;

    int outw = (w - 1) * stride_w + kernel_extent_w;
    int outh = (h - 1) * stride_h + kernel_extent_h;

    Mat top_blob_bordered = top_blob;
    top_blob_bordered.create(outw, outh, num_output);
    if (top_blob_bordered.empty())
        return -100;

    const int maxk = kernel_w * kernel_h;

    // kernel offsets
    std::vector<int> _space_ofs(maxk);
    int* space_ofs = &_space_ofs[0];
    {
        int p1 = 0;
        int p2 = 0;
        int gap = outw * dilation_h - kernel_w * dilation_w;
        for (int i = 0; i < kernel_h; i++)
        {
            for (int j = 0; j < kernel_w; j++)
            {
                space_ofs[p1] = p2;
                p1++;
                p2 += dilation_w;
            }
            p2 += gap;
        }
    }

    // depth-wise
    if (channels == group && group == num_output)
    {
        #pragma omp parallel for
        for (int g=0; g<group; g++)
        {
            const float* inptr = bottom_blob.channel(g);
            const float* kptr = (const float*)weight_data + maxk * g;
            Mat m = top_blob_bordered.channel(g);

            const float bias = bias_term ? bias_data[g] : 0.f;

            m.fill(bias);

            for (int i = 0; i < h; i++)
            {
                for (int j = 0; j < w; j++)
                {
                    float* outptr = m.row(i*stride_h) + j*stride_w;

                    for (int k = 0; k < maxk; k++)
                    {
                        float val = inptr[i*w + j];
                        float wt = kptr[k];
                        outptr[ space_ofs[k] ] += val * wt;
                    }
                }
            }
        }
    }
    else
    {
        // num_output
        const int channels_g = channels / group;
        const int num_output_g = num_output / group;

        #pragma omp parallel for
        for (int g = 0; g < group; g++)
        {
            const float* weight_data_ptr = (const float*)weight_data + maxk * channels_g * num_output_g * g;
            for (int p = 0; p < num_output_g; p++)
            {
                Mat out = top_blob_bordered.channel(g * num_output_g + p);

                const float bias = bias_term ? bias_data[g * num_output_g + p] : 0.f;

                out.fill(bias);

                for (int i = 0; i < h; i++)
                {
                    for (int j = 0; j < w; j++)
                    {
                        float* outptr = out.row(i*stride_h) + j*stride_w;

                        const float* kptr = weight_data_ptr + maxk * channels_g * p;

                        // channels_g
                        for (int q = 0; q < channels_g; q++)
                        {
                            const Mat m = bottom_blob.channel(channels_g * g + q);
                            float val = *(m.row(i) + j);

                            for (int k = 0; k < maxk; k++)
                            {
                                outptr[ space_ofs[k] ] += val * kptr[k];
                            }

                            kptr += maxk;
                        }
                    }
                }
            }
        }
    }

    top_blob = top_blob_bordered;

    if (pad_w > 0 || pad_h > 0)
    {
        copy_cut_border(top_blob_bordered, top_blob, pad_h, pad_h, pad_w, pad_w);
        if (top_blob.empty())
            return -100;

        outw = top_blob.w;
        outh = top_blob.h;
    }

    return 0;
}
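The bordered output size follows the usual transposed-convolution arithmetic: outw = (w - 1) * stride_w + dilation_w * (kernel_w - 1) + 1, and the result is later cropped by pad_w on each side. A small worked example with illustrative parameters:

#include <cstdio>

int main()
{
    // illustrative parameters, not taken from any real model
    int w = 4, kernel_w = 3, stride_w = 2, dilation_w = 1, pad_w = 1;

    int kernel_extent_w = dilation_w * (kernel_w - 1) + 1;    // 3
    int outw_bordered = (w - 1) * stride_w + kernel_extent_w; // 3*2 + 3 = 9
    int outw = outw_bordered - 2 * pad_w;                     // cropped to 7

    printf("bordered=%d cropped=%d\n", outw_bordered, outw);
    return 0;
}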
Example #7
int LRN_arm::forward_inplace(Mat& bottom_top_blob) const
{
    int w = bottom_top_blob.w;
    int h = bottom_top_blob.h;
    int channels = bottom_top_blob.c;
    int size = w * h;

    // squared values with local_size padding
    Mat square_blob;
    square_blob.create(w, h, channels);
    if (square_blob.empty())
        return -100;

    #pragma omp parallel for
    for (int q=0; q<channels; q++)
    {
        const float* ptr = bottom_top_blob.channel(q);
        float* outptr = square_blob.channel(q);

#if __ARM_NEON
        int nn = size >> 2;
        int remain = size - (nn << 2);
#else
        int remain = size;
#endif // __ARM_NEON

#if __ARM_NEON
        for (; nn>0; nn--)
        {
            float32x4_t _p = vld1q_f32(ptr);
            float32x4_t _outp = vmulq_f32(_p, _p);
            vst1q_f32(outptr, _outp);

            ptr += 4;
            outptr += 4;
        }
#endif // __ARM_NEON
        for (; remain>0; remain--)
        {
            *outptr = *ptr * *ptr;

            ptr++;
            outptr++;
        }
    }

    if (region_type == NormRegion_ACROSS_CHANNELS)
    {
        Mat square_sum;
        square_sum.create(w, h, channels);
        if (square_sum.empty())
            return -100;
        square_sum.fill(0.f);

        const float alpha_div_size = alpha / local_size;

        #pragma omp parallel for
        for (int q=0; q<channels; q++)
        {
            // square sum
            for (int p=q - local_size / 2; p<=q + local_size / 2; p++)
            {
                if (p < 0 || p >= channels)
                    continue;

                const float* sptr = square_blob.channel(p);
                float* ssptr = square_sum.channel(q);

#if __ARM_NEON
                int nn = size >> 2;
                int remain = size - (nn << 2);
#else
                int remain = size;
#endif // __ARM_NEON

#if __ARM_NEON
                for (; nn>0; nn--)
                {
                    float32x4_t _sp = vld1q_f32(sptr);
                    float32x4_t _ssp = vld1q_f32(ssptr);
                    _ssp = vaddq_f32(_ssp, _sp);
                    vst1q_f32(ssptr, _ssp);

                    sptr += 4;
                    ssptr += 4;
                }
#endif // __ARM_NEON
                for (; remain>0; remain--)
                {
                    *ssptr += *sptr;
                    sptr++;
                    ssptr++;
                }
            }

            float* ptr = bottom_top_blob.channel(q);
            float* ssptr = square_sum.channel(q);

#if __ARM_NEON
            int nn = size >> 2;
            int remain = size - (nn << 2);
#else
            int remain = size;
#endif // __ARM_NEON

#if __ARM_NEON
            float32x4_t _bias = vdupq_n_f32(bias);
            float32x4_t _ads = vdupq_n_f32(alpha_div_size);
            float32x4_t _mb = vdupq_n_f32(-beta);
            for (; nn>0; nn--)
            {
                float32x4_t _p = vld1q_f32(ptr);
                float32x4_t _ssp = vld1q_f32(ssptr);
                _ssp = vmulq_f32(_ssp, _ads);
                _ssp = vaddq_f32(_ssp, _bias);
                _ssp = pow_ps(_ssp, _mb);
                _p = vmulq_f32(_p, _ssp);
                vst1q_f32(ptr, _p);

                ssptr += 4;
                ptr += 4;
            }
#endif // __ARM_NEON
            for (; remain>0; remain--)
            {
                *ptr = *ptr * pow(bias + alpha_div_size * *ssptr, -beta);

                ssptr++;
                ptr++;
            }
        }
    }
Example #8
int Permute::forward(const Mat& bottom_blob, Mat& top_blob) const
{
    int w = bottom_blob.w;
    int h = bottom_blob.h;
    int channels = bottom_blob.c;

    // order_type
    // 0 = w h c
    // 1 = h w c
    // 2 = w c h
    // 3 = c w h
    // 4 = h c w
    // 5 = c h w

    if (order_type == 0)
    {
        top_blob = bottom_blob;
    }
    else if (order_type == 1)
    {
        top_blob.create(h, w, channels);
        if (top_blob.empty())
            return -100;

        #pragma omp parallel for
        for (int q=0; q<channels; q++)
        {
            const float* ptr = bottom_blob.channel(q);
            float* outptr = top_blob.channel(q);

            for (int i = 0; i < w; i++)
            {
                for (int j = 0; j < h; j++)
                {
                    outptr[i*h + j] = ptr[j*w + i];
                }
            }
        }
    }
    else if (order_type == 2)
    {
        top_blob.create(w, channels, h);
        if (top_blob.empty())
            return -100;

        #pragma omp parallel for
        for (int q=0; q<h; q++)
        {
            float* outptr = top_blob.channel(q);

            for (int i = 0; i < channels; i++)
            {
                const float* ptr = bottom_blob.channel(i).row(q);

                for (int j = 0; j < w; j++)
                {
                    outptr[i*w + j] = ptr[j];
                }
            }
        }
    }
    else if (order_type == 3)
    {
        top_blob.create(channels, w, h);
        if (top_blob.empty())
            return -100;

        #pragma omp parallel for
        for (int q=0; q<h; q++)
        {
            float* outptr = top_blob.channel(q);

            for (int i = 0; i < w; i++)
            {
                for (int j = 0; j < channels; j++)
                {
                    const float* ptr = bottom_blob.channel(j).row(q);

                    outptr[i*channels + j] = ptr[i];
                }
            }
        }
    }
    else if (order_type == 4)
    {
        top_blob.create(h, channels, w);
        if (top_blob.empty())
            return -100;

        #pragma omp parallel for
        for (int q=0; q<w; q++)
        {
            float* outptr = top_blob.channel(q);

            for (int i = 0; i < channels; i++)
            {
                const float* ptr = bottom_blob.channel(i);

                for (int j = 0; j < h; j++)
                {
                    outptr[i*h + j] = ptr[j*w + q];
                }
            }
        }
    }
    else if (order_type == 5)
    {
        top_blob.create(channels, h, w);
        if (top_blob.empty())
            return -100;

        #pragma omp parallel for
        for (int q=0; q<w; q++)
        {
            float* outptr = top_blob.channel(q);

            for (int i = 0; i < h; i++)
            {
                for (int j = 0; j < channels; j++)
                {
                    const float* ptr = bottom_blob.channel(j);

                    outptr[i*channels + j] = ptr[i*w + q];
                }
            }
        }
    }

    return 0;
}
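Each order_type is an index remapping; for order_type 3, for instance, the element at input (channel c, row y, column x) lands at output (channel y, row x, column c). A standalone check of that mapping on a tiny buffer laid out channel-major like ncnn::Mat (the dimensions are chosen arbitrarily):

#include <cstdio>

int main()
{
    // input: c=2 channels of h=2 x w=3, stored channel-major
    const int w = 3, h = 2, channels = 2;
    float in[channels * h * w];
    for (int i = 0; i < channels * h * w; i++)
        in[i] = (float)i;

    // order_type 3 output: create(channels, w, h) -> width=channels, height=w, c=h
    float out[h * w * channels];

    for (int q = 0; q < h; q++)                    // output channel index
        for (int i = 0; i < w; i++)                // output row
            for (int j = 0; j < channels; j++)     // output column
                out[q * (w * channels) + i * channels + j] = in[j * (h * w) + q * w + i];

    // input element (channel 1, row 0, col 2) should land at output (channel 0, row 2, col 1)
    printf("%g %g\n", in[1 * h * w + 0 * w + 2], out[0 * w * channels + 2 * channels + 1]);
    return 0;
}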
Example #9
int Normalize::forward(const Mat& bottom_blob, Mat& top_blob) const
{
    int w = bottom_blob.w;
    int h = bottom_blob.h;
    int channels = bottom_blob.c;
    int size = w * h;

    top_blob.create(w, h, channels);
    if (top_blob.empty())
        return -100;

    if (across_spatial && across_channel)
    {
        // square
        Mat square_sum_blob;
        square_sum_blob.create(channels);
        if (square_sum_blob.empty())
            return -100;

        #pragma omp parallel for
        for (int q=0; q<channels; q++)
        {
            const float* ptr = bottom_blob.channel(q);

            float ssum = 0.f;
            for (int i=0; i<size; i++)
            {
                ssum += ptr[i] * ptr[i];
            }

            square_sum_blob[q] = ssum;
        }

        // sum + eps
        float ssum = eps;
        for (int q=0; q<channels; q++)
        {
            ssum += square_sum_blob[q];
        }

        // 1 / sqrt(ssum)
        float a = 1.f / sqrt(ssum);

        if (channel_shared)
        {
            float scale = a * scale_data[0];

            #pragma omp parallel for
            for (int q=0; q<channels; q++)
            {
                const float* ptr = bottom_blob.channel(q);
                float* outptr = top_blob.channel(q);

                for (int i=0; i<size; i++)
                {
                    outptr[i] = ptr[i] * scale;
                }
            }
        }
        else
        {
            #pragma omp parallel for
            for (int q=0; q<channels; q++)
            {
                const float* ptr = bottom_blob.channel(q);
                float* outptr = top_blob.channel(q);
                float scale = a * scale_data[q];

                for (int i=0; i<size; i++)
                {
                    outptr[i] = ptr[i] * scale;
                }
            }
        }

        return 0;
    }

    if (across_spatial && !across_channel)
    {
        #pragma omp parallel for
        for (int q=0; q<channels; q++)
        {
            const float* ptr = bottom_blob.channel(q);
            float* outptr = top_blob.channel(q);

            float ssum = eps;
            for (int i=0; i<size; i++)
            {
                ssum += ptr[i] * ptr[i];
            }

            float a = 1.f / sqrt(ssum);
            float scale = a * (channel_shared ? scale_data[0] : scale_data[q]);

            for (int i=0; i<size; i++)
            {
                outptr[i] = ptr[i] * scale;
            }
        }

        return 0;
    }

    if (!across_spatial && across_channel)
    {
        // square sum, 1 / sqrt(ssum)
        Mat square_sum_blob;
        square_sum_blob.create(size);
        if (square_sum_blob.empty())
            return -100;

        if (channel_shared)
        {
            float scale = scale_data[0];

            #pragma omp parallel for
            for (int i=0; i<size; i++)
            {
                float ssum = eps;
                for (int q=0; q<channels; q++)
                {
                    const float* ptr = bottom_blob.channel(q);
                    ssum += ptr[i] * ptr[i];
                }

                square_sum_blob[i] = 1.f / sqrt(ssum) * scale;
            }

            #pragma omp parallel for
            for (int q=0; q<channels; q++)
            {
                const float* ptr = bottom_blob.channel(q);
                float* outptr = top_blob.channel(q);

                for (int i=0; i<size; i++)
                {
                    outptr[i] = ptr[i] * square_sum_blob[i];
                }
            }
        }
        else
        {
            #pragma omp parallel for
            for (int i=0; i<size; i++)
            {
                float ssum = eps;
                for (int q=0; q<channels; q++)
                {
                    const float* ptr = bottom_blob.channel(q);
                    ssum += ptr[i] * ptr[i];
                }

                square_sum_blob[i] = 1.f / sqrt(ssum);
            }

            #pragma omp parallel for
            for (int q=0; q<channels; q++)
            {
                const float* ptr = bottom_blob.channel(q);
                float* outptr = top_blob.channel(q);
                float scale = scale_data[q];

                for (int i=0; i<size; i++)
                {
                    outptr[i] = ptr[i] * square_sum_blob[i] * scale;
                }
            }
        }

        return 0;
    }

    return 0;
}
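All three branches implement the same L2 normalisation, out = in * scale / sqrt(eps + sum(in^2)); they only differ in which elements the squared sum runs over (the whole blob, one channel, or one pixel across channels). A one-pixel sketch of the across-channel case (the input values and eps are illustrative):

#include <cmath>
#include <cstdio>

int main()
{
    // one spatial position across 3 channels
    const float v[3] = {3.f, 4.f, 0.f};
    const float eps = 1e-10f;
    const float scale = 1.f;

    float ssum = eps;
    for (int q = 0; q < 3; q++)
        ssum += v[q] * v[q];                 // 9 + 16 + 0 = 25

    float a = 1.f / sqrtf(ssum);             // 1/5
    for (int q = 0; q < 3; q++)
        printf("%g ", v[q] * a * scale);     // 0.6 0.8 0
    return 0;
}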