// Per-channel affine scaling (NEON-accelerated): out = in * scale[q] + bias[q].
// Returns 0 on success, -100 on allocation failure.
// NOTE(review): this function appears truncated in this view — the trailing
// `else` (the bias_term == false path) has no visible body; confirm against
// the full source before relying on this listing.
int Scale_arm::forward(const Mat& bottom_blob, Mat& top_blob) const
{
    int w = bottom_blob.w;
    int h = bottom_blob.h;
    int channels = bottom_blob.c;
    int size = w * h;

    // Output has the same shape as the input.
    top_blob.create(w, h, channels);
    if (top_blob.empty())
        return -100;

    if (bias_term)
    {
        const float* scale_ptr = scale_data;
        const float* bias_ptr = bias_data;
        #pragma omp parallel for
        for (int q=0; q<channels; q++)
        {
            const float* ptr = bottom_blob.channel(q);
            float* outptr = top_blob.channel(q);

            // One scale and one bias value per channel.
            float s = scale_ptr[q];
            float bias = bias_ptr[q];

#if __ARM_NEON
            // Process 4 floats per NEON iteration; 'remain' covers the tail.
            int nn = size >> 2;
            int remain = size - (nn << 2);
#else
            int remain = size;
#endif // __ARM_NEON

#if __ARM_NEON
            float32x4_t _s = vdupq_n_f32(s);
            float32x4_t _bias = vdupq_n_f32(bias);
            for (; nn>0; nn--)
            {
                float32x4_t _p = vld1q_f32(ptr);
                // vmlaq_f32(a, b, c) computes a + b * c, i.e. bias + p * s.
                _p = vmlaq_f32(_bias, _p, _s);
                vst1q_f32(outptr, _p);

                ptr += 4;
                outptr += 4;
            }
#endif // __ARM_NEON

            // Scalar tail (or full scalar path when NEON is unavailable).
            for (; remain>0; remain--)
            {
                *outptr = *ptr * s + bias;

                ptr++;
                outptr++;
            }
        }
    }
    else
    {
// Inference-time dropout: multiply every element by the member `scale`,
// in place. A scale of exactly 1.f is a no-op and is skipped entirely.
// Returns 0 on success.
int Dropout::forward_inplace(Mat& bottom_top_blob) const
{
    // Identity transform — nothing to do.
    if (scale == 1.f)
        return 0;

    const int channels = bottom_top_blob.c;
    const int size = bottom_top_blob.w * bottom_top_blob.h;

    #pragma omp parallel for
    for (int q=0; q<channels; q++)
    {
        float* p = bottom_top_blob.channel(q);

        for (int i=0; i<size; i++)
            p[i] *= scale;
    }

    return 0;
}
// Element-wise logarithm, in place: y = log_base(shift + x * scale).
// A base of -1.f selects the natural logarithm.
// Returns 0 on success.
int Log::forward_inplace(Mat& bottom_top_blob) const
{
    const int channels = bottom_top_blob.c;
    const int size = bottom_top_blob.w * bottom_top_blob.h;

    if (base == -1.f)
    {
        // Natural log: y = ln(shift + x * scale)
        #pragma omp parallel for
        for (int q=0; q<channels; q++)
        {
            float* p = bottom_top_blob.channel(q);

            for (int i=0; i<size; i++)
                p[i] = log(shift + p[i] * scale);
        }

        return 0;
    }

    // Arbitrary base: log_base(v) = ln(v) / ln(base); hoist the divisor.
    const float log_base_inv = 1.f / log(base);

    #pragma omp parallel for
    for (int q=0; q<channels; q++)
    {
        float* p = bottom_top_blob.channel(q);

        for (int i=0; i<size; i++)
            p[i] = log(shift + p[i] * scale) * log_base_inv;
    }

    return 0;
}
// Per-channel bias addition (NEON-accelerated), in place:
// every element of channel q gets bias_data[q] added to it.
// Returns 0 on success.
int Bias_arm::forward_inplace(Mat& bottom_top_blob) const
{
    const int channels = bottom_top_blob.c;
    const int size = bottom_top_blob.w * bottom_top_blob.h;

    const float* bias_ptr = bias_data;
    #pragma omp parallel for
    for (int q=0; q<channels; q++)
    {
        float* p = bottom_top_blob.channel(q);
        const float bias = bias_ptr[q];

#if __ARM_NEON
        // 4-wide vector body plus a scalar tail of size % 4 elements.
        int nn = size >> 2;
        int remain = size & 3;
#else
        int remain = size;
#endif // __ARM_NEON

#if __ARM_NEON
        float32x4_t _bias = vdupq_n_f32(bias);
        while (nn-- > 0)
        {
            float32x4_t _v = vld1q_f32(p);
            vst1q_f32(p, vaddq_f32(_v, _bias));
            p += 4;
        }
#endif // __ARM_NEON

        // Scalar tail (or full scalar path when NEON is unavailable).
        while (remain-- > 0)
        {
            *p += bias;
            p++;
        }
    }

    return 0;
}
// Local Response Normalization:
//   out = in * (1 + alpha/n * sum_of_squares_in_window)^(-beta)
// where the window runs either across neighboring channels
// (NormRegion_ACROSS_CHANNELS) or over a local_size x local_size spatial
// window within each channel (NormRegion_WITHIN_CHANNEL).
// Returns 0 on success, -100 on allocation failure.
int LRN::forward(const Mat& bottom_blob, Mat& top_blob) const
{
    int w = bottom_blob.w;
    int h = bottom_blob.h;
    int channels = bottom_blob.c;
    int size = w * h;

    top_blob.create(w, h, channels);
    if (top_blob.empty())
        return -100;

    // squared values with local_size padding
    // Precompute x^2 once for the whole blob; both region types consume it.
    Mat square_blob;
    square_blob.create(w, h, channels);
    if (square_blob.empty())
        return -100;

    #pragma omp parallel for
    for (int q=0; q<channels; q++)
    {
        const float* ptr = bottom_blob.channel(q);
        float* outptr = square_blob.channel(q);

        for (int i=0; i<size; i++)
        {
            outptr[i] = ptr[i] * ptr[i];
        }
    }

    if (region_type == NormRegion_ACROSS_CHANNELS)
    {
        // top_blob is used as the square-sum accumulator first, then
        // overwritten in place with the final normalized values.
        top_blob.fill(0.f);

        const float alpha_div_size = alpha / local_size;

        #pragma omp parallel for
        for (int q=0; q<channels; q++)
        {
            // square sum
            // Accumulate x^2 over the channel window [q - n/2, q + n/2],
            // clamped to valid channel indices.
            float* outptr = top_blob.channel(q);
            for (int p=q - local_size / 2; p<=q + local_size / 2; p++)
            {
                if (p < 0 || p >= channels)
                    continue;

                const float* sptr = square_blob.channel(p);
                for (int i=0; i<size; i++)
                {
                    outptr[i] += sptr[i];
                }
            }

            // Apply the normalization factor element-wise.
            const float* ptr = bottom_blob.channel(q);
            for (int i=0; i<size; i++)
            {
                outptr[i] = ptr[i] * pow(1.f + alpha_div_size * outptr[i], -beta);
            }
        }
    }
    else if (region_type == NormRegion_WITHIN_CHANNEL)
    {
        int outw = w;
        int outh = h;

        // Zero-pad the squared blob so the spatial window is always full;
        // asymmetric padding (pad, local_size - pad - 1) keeps output size
        // equal to input size.
        Mat square_blob_bordered = square_blob;
        int pad = local_size / 2;
        if (pad > 0)
        {
            copy_make_border(square_blob, square_blob_bordered, pad, local_size - pad - 1, pad, local_size - pad - 1, BORDER_CONSTANT, 0.f);
            if (square_blob_bordered.empty())
                return -100;

            // w/h now refer to the padded dimensions.
            w = square_blob_bordered.w;
            h = square_blob_bordered.h;
        }

        const int maxk = local_size * local_size;

        const float alpha_div_size = alpha / maxk;

        // norm window offsets
        // Flat offsets (relative to the window's top-left element) into the
        // padded plane for each of the maxk window positions.
        std::vector<int> _space_ofs(maxk);
        int* space_ofs = &_space_ofs[0];
        {
            int p1 = 0;
            int p2 = 0;
            int gap = w - local_size;  // jump from end of one window row to the next
            for (int i = 0; i < local_size; i++)
            {
                for (int j = 0; j < local_size; j++)
                {
                    space_ofs[p1] = p2;
                    p1++;
                    p2++;
                }
                p2 += gap;
            }
        }

        #pragma omp parallel for
        for (int q=0; q<channels; q++)
        {
            const float* ptr = bottom_blob.channel(q);
            const Mat m = square_blob_bordered.channel(q);
            float* outptr = top_blob.channel(q);

            for (int i = 0; i < outh; i++)
            {
                for (int j = 0; j < outw; j++)
                {
                    // Window anchored at (i, j) in the padded squared plane.
                    const float* sptr = m.row(i) + j;

                    float ss = 0.f;
                    for (int k = 0; k < maxk; k++)
                    {
                        float val = sptr[ space_ofs[k] ];
                        ss += val;
                    }

                    outptr[j] = ptr[j] * pow(1.f + alpha_div_size * ss, -beta);
                }
                // Advance by the unpadded row width.
                ptr += outw;
                outptr += outw;
            }
        }
    }

    return 0;
}
// Grouped / depth-wise deconvolution (transposed convolution).
// Each input position scatter-adds its value, weighted by the kernel, into
// a stride-spaced window of the output; the result is then cropped by the
// pad amounts. Returns 0 on success, -100 on invalid group or allocation
// failure.
int DeconvolutionDepthWise::forward(const Mat& bottom_blob, Mat& top_blob) const
{
    // deconvolv with NxN kernel
    // value = value + bias

    int w = bottom_blob.w;
    int h = bottom_blob.h;
    int channels = bottom_blob.c;

    if (channels % group != 0 || num_output % group != 0)
    {
        // reject invalid group
        return -100;
    }

    // Effective kernel span once dilation is applied.
    const int kernel_extent_w = dilation_w * (kernel_w - 1) + 1;
    const int kernel_extent_h = dilation_h * (kernel_h - 1) + 1;

    // Un-cropped (bordered) output size of the transposed convolution.
    int outw = (w - 1) * stride_w + kernel_extent_w;
    int outh = (h - 1) * stride_h + kernel_extent_h;

    Mat top_blob_bordered = top_blob;
    top_blob_bordered.create(outw, outh, num_output);
    if (top_blob_bordered.empty())
        return -100;

    const int maxk = kernel_w * kernel_h;

    // kernel offsets
    // Flat offsets into the bordered output plane for each kernel tap,
    // accounting for dilation in both directions.
    std::vector<int> _space_ofs(maxk);
    int* space_ofs = &_space_ofs[0];
    {
        int p1 = 0;
        int p2 = 0;
        int gap = outw * dilation_h - kernel_w * dilation_w;
        for (int i = 0; i < kernel_h; i++)
        {
            for (int j = 0; j < kernel_w; j++)
            {
                space_ofs[p1] = p2;
                p1++;
                p2 += dilation_w;
            }
            p2 += gap;
        }
    }

    // depth-wise
    // Fast path: one input channel -> one output channel per group.
    if (channels == group && group == num_output)
    {
        #pragma omp parallel for
        for (int g=0; g<group; g++)
        {
            const float* inptr = bottom_blob.channel(g);
            const float* kptr = (const float*)weight_data + maxk * g;
            Mat m = top_blob_bordered.channel(g);

            const float bias = bias_term ? bias_data[g] : 0.f;

            // Initialize the output plane with the bias, then scatter-add.
            m.fill(bias);

            for (int i = 0; i < h; i++)
            {
                for (int j = 0; j < w; j++)
                {
                    float* outptr = m.row(i*stride_h) + j*stride_w;

                    for (int k = 0; k < maxk; k++)
                    {
                        float val = inptr[i*w + j];
                        // NOTE(review): local `w` shadows the input width
                        // variable; harmless here but easy to misread.
                        float w = kptr[k];
                        outptr[ space_ofs[k] ] += val * w;
                    }
                }
            }
        }
    }
    else
    {
        // num_output
        // General grouped path: channels_g inputs feed num_output_g outputs
        // within each group.
        const int channels_g = channels / group;
        const int num_output_g = num_output / group;

        #pragma omp parallel for
        for (int g = 0; g < group; g++)
        {
            // Weights for this group: num_output_g blocks of
            // (channels_g * maxk) floats.
            const float* weight_data_ptr = (const float*)weight_data + maxk * channels_g * num_output_g * g;

            for (int p = 0; p < num_output_g; p++)
            {
                Mat out = top_blob_bordered.channel(g * num_output_g + p);

                const float bias = bias_term ? bias_data[g * num_output_g + p] : 0.f;

                out.fill(bias);

                for (int i = 0; i < h; i++)
                {
                    for (int j = 0; j < w; j++)
                    {
                        float* outptr = out.row(i*stride_h) + j*stride_w;

                        const float* kptr = weight_data_ptr + maxk * channels_g * p;

                        // channels_g
                        // Accumulate contributions from every input channel
                        // of this group.
                        for (int q = 0; q < channels_g; q++)
                        {
                            const Mat m = bottom_blob.channel(channels_g * g + q);
                            float val = *(m.row(i) + j);

                            for (int k = 0; k < maxk; k++)
                            {
                                outptr[ space_ofs[k] ] += val * kptr[k];
                            }

                            kptr += maxk;
                        }
                    }
                }
            }
        }
    }

    // Publish the bordered result, then crop it when padding was requested.
    top_blob = top_blob_bordered;

    if (pad_w > 0 || pad_h > 0)
    {
        copy_cut_border(top_blob_bordered, top_blob, pad_h, pad_h, pad_w, pad_w);
        if (top_blob.empty())
            return -100;

        outw = top_blob.w;
        outh = top_blob.h;
    }

    return 0;
}
// NEON-accelerated in-place Local Response Normalization (across-channels
// region only in this view):
//   x = x * (bias + alpha/local_size * sum_of_squares)^(-beta)
// NOTE(review): this function appears truncated here — there is no visible
// return statement or NormRegion_WITHIN_CHANNEL branch; confirm against the
// full source. Also note it uses a member `bias` in the normalization term
// where the generic LRN::forward uses the constant 1.f — verify the member
// exists and the two implementations are meant to agree.
int LRN_arm::forward_inplace(Mat& bottom_top_blob) const
{
    int w = bottom_top_blob.w;
    int h = bottom_top_blob.h;
    int channels = bottom_top_blob.c;
    int size = w * h;

    // squared values with local_size padding
    // Precompute x^2 for every element.
    Mat square_blob;
    square_blob.create(w, h, channels);
    if (square_blob.empty())
        return -100;

    #pragma omp parallel for
    for (int q=0; q<channels; q++)
    {
        const float* ptr = bottom_top_blob.channel(q);
        float* outptr = square_blob.channel(q);

#if __ARM_NEON
        // 4-wide vector body plus scalar tail.
        int nn = size >> 2;
        int remain = size - (nn << 2);
#else
        int remain = size;
#endif // __ARM_NEON

#if __ARM_NEON
        for (; nn>0; nn--)
        {
            float32x4_t _p = vld1q_f32(ptr);
            float32x4_t _outp = vmulq_f32(_p, _p);
            vst1q_f32(outptr, _outp);

            ptr += 4;
            outptr += 4;
        }
#endif // __ARM_NEON
        for (; remain>0; remain--)
        {
            *outptr = *ptr * *ptr;

            ptr++;
            outptr++;
        }
    }

    if (region_type == NormRegion_ACROSS_CHANNELS)
    {
        // Per-channel accumulator for the windowed sum of squares.
        Mat square_sum;
        square_sum.create(w, h, channels);
        if (square_sum.empty())
            return -100;
        square_sum.fill(0.f);

        const float alpha_div_size = alpha / local_size;

        #pragma omp parallel for
        for (int q=0; q<channels; q++)
        {
            // square sum
            // Accumulate x^2 over the channel window [q - n/2, q + n/2],
            // skipping out-of-range channels.
            for (int p=q - local_size / 2; p<=q + local_size / 2; p++)
            {
                if (p < 0 || p >= channels)
                    continue;

                const float* sptr = square_blob.channel(p);
                float* ssptr = square_sum.channel(q);

#if __ARM_NEON
                int nn = size >> 2;
                int remain = size - (nn << 2);
#else
                int remain = size;
#endif // __ARM_NEON

#if __ARM_NEON
                for (; nn>0; nn--)
                {
                    float32x4_t _sp = vld1q_f32(sptr);
                    float32x4_t _ssp = vld1q_f32(ssptr);
                    _ssp = vaddq_f32(_ssp, _sp);
                    vst1q_f32(ssptr, _ssp);

                    sptr += 4;
                    ssptr += 4;
                }
#endif // __ARM_NEON
                for (; remain>0; remain--)
                {
                    *ssptr += *sptr;

                    sptr++;
                    ssptr++;
                }
            }

            // Apply x *= (bias + alpha_div_size * ssum)^(-beta) in place.
            float* ptr = bottom_top_blob.channel(q);
            float* ssptr = square_sum.channel(q);

#if __ARM_NEON
            int nn = size >> 2;
            int remain = size - (nn << 2);
#else
            int remain = size;
#endif // __ARM_NEON

#if __ARM_NEON
            float32x4_t _bias = vdupq_n_f32(bias);
            float32x4_t _ads = vdupq_n_f32(alpha_div_size);
            float32x4_t _mb = vdupq_n_f32(-beta);
            for (; nn>0; nn--)
            {
                float32x4_t _p = vld1q_f32(ptr);
                float32x4_t _ssp = vld1q_f32(ssptr);
                _ssp = vmulq_f32(_ssp, _ads);
                _ssp = vaddq_f32(_ssp, _bias);
                // pow_ps: presumably a vectorized powf approximation — TODO confirm.
                _ssp = pow_ps(_ssp, _mb);
                _p = vmulq_f32(_p, _ssp);
                vst1q_f32(ptr, _p);

                ssptr += 4;
                ptr += 4;
            }
#endif // __ARM_NEON
            for (; remain>0; remain--)
            {
                *ptr = *ptr * pow(bias + alpha_div_size * *ssptr, -beta);

                ssptr++;
                ptr++;
            }
        }
    }
int Permute::forward(const Mat& bottom_blob, Mat& top_blob) const { int w = bottom_blob.w; int h = bottom_blob.h; int channels = bottom_blob.c; // order_type // 0 = w h c // 1 = h w c // 2 = w c h // 3 = c w h // 4 = h c w // 5 = c h w if (order_type == 0) { top_blob = bottom_blob; } else if (order_type == 1) { top_blob.create(h, w, channels); if (top_blob.empty()) return -100; #pragma omp parallel for for (int q=0; q<channels; q++) { const float* ptr = bottom_blob.channel(q); float* outptr = top_blob.channel(q); for (int i = 0; i < w; i++) { for (int j = 0; j < h; j++) { outptr[i*h + j] = ptr[j*w + i]; } } } } else if (order_type == 2) { top_blob.create(w, channels, h); if (top_blob.empty()) return -100; #pragma omp parallel for for (int q=0; q<h; q++) { float* outptr = top_blob.channel(q); for (int i = 0; i < channels; i++) { const float* ptr = bottom_blob.channel(i).row(q); for (int j = 0; j < w; j++) { outptr[i*w + j] = ptr[j]; } } } } else if (order_type == 3) { top_blob.create(channels, w, h); if (top_blob.empty()) return -100; #pragma omp parallel for for (int q=0; q<h; q++) { float* outptr = top_blob.channel(q); for (int i = 0; i < w; i++) { for (int j = 0; j < channels; j++) { const float* ptr = bottom_blob.channel(j).row(q); outptr[i*channels + j] = ptr[i]; } } } } else if (order_type == 4) { top_blob.create(h, channels, w); if (top_blob.empty()) return -100; #pragma omp parallel for for (int q=0; q<w; q++) { float* outptr = top_blob.channel(q); for (int i = 0; i < channels; i++) { const float* ptr = bottom_blob.channel(i); for (int j = 0; j < h; j++) { outptr[i*channels + j] = ptr[j*w + q]; } } } } else if (order_type == 5) { top_blob.create(channels, h, w); if (top_blob.empty()) return -100; #pragma omp parallel for for (int q=0; q<w; q++) { float* outptr = top_blob.channel(q); for (int i = 0; i < h; i++) { for (int j = 0; j < channels; j++) { const float* ptr = bottom_blob.channel(j); outptr[i*channels + j] = ptr[i*w + q]; } } } } return 0; }
// L2 normalization with learned per-channel (or shared) scale:
//   out = in / sqrt(sum(in^2) + eps) * scale
// where the sum runs over the region selected by across_spatial /
// across_channel. Returns 0 on success, -100 on allocation failure.
// NOTE(review): the (!across_spatial && !across_channel) combination falls
// through to the final return with top_blob created but never written —
// confirm callers never request that mode.
int Normalize::forward(const Mat& bottom_blob, Mat& top_blob) const
{
    int w = bottom_blob.w;
    int h = bottom_blob.h;
    int channels = bottom_blob.c;
    int size = w * h;

    top_blob.create(w, h, channels);
    if (top_blob.empty())
        return -100;

    // Normalize over the entire blob (one scalar norm for everything).
    if (across_spatial && across_channel)
    {
        // square
        // Per-channel partial sums of squares, computed in parallel.
        Mat square_sum_blob;
        square_sum_blob.create(channels);
        if (square_sum_blob.empty())
            return -100;

        #pragma omp parallel for
        for (int q=0; q<channels; q++)
        {
            const float* ptr = bottom_blob.channel(q);

            float ssum = 0.f;
            for (int i=0; i<size; i++)
            {
                ssum += ptr[i] * ptr[i];
            }

            square_sum_blob[q] = ssum;
        }

        // sum + eps
        // Serial reduction of the per-channel partials.
        float ssum = eps;
        for (int q=0; q<channels; q++)
        {
            ssum += square_sum_blob[q];
        }

        // 1 / sqrt(ssum)
        float a = 1.f / sqrt(ssum);

        if (channel_shared)
        {
            // One scale value for all channels.
            float scale = a * scale_data[0];

            #pragma omp parallel for
            for (int q=0; q<channels; q++)
            {
                const float* ptr = bottom_blob.channel(q);
                float* outptr = top_blob.channel(q);

                for (int i=0; i<size; i++)
                {
                    outptr[i] = ptr[i] * scale;
                }
            }
        }
        else
        {
            #pragma omp parallel for
            for (int q=0; q<channels; q++)
            {
                const float* ptr = bottom_blob.channel(q);
                float* outptr = top_blob.channel(q);

                // Per-channel scale.
                float scale = a * scale_data[q];

                for (int i=0; i<size; i++)
                {
                    outptr[i] = ptr[i] * scale;
                }
            }
        }

        return 0;
    }

    // Normalize each channel independently over its spatial extent.
    if (across_spatial && !across_channel)
    {
        #pragma omp parallel for
        for (int q=0; q<channels; q++)
        {
            const float* ptr = bottom_blob.channel(q);
            float* outptr = top_blob.channel(q);

            float ssum = eps;
            for (int i=0; i<size; i++)
            {
                ssum += ptr[i] * ptr[i];
            }

            float a = 1.f / sqrt(ssum);
            float scale = a * (channel_shared ? scale_data[0] : scale_data[q]);

            for (int i=0; i<size; i++)
            {
                outptr[i] = ptr[i] * scale;
            }
        }

        return 0;
    }

    // Normalize each spatial position independently across channels.
    if (!across_spatial && across_channel)
    {
        // square sum, 1 / sqrt(ssum)
        // One reciprocal norm per spatial position.
        Mat square_sum_blob;
        square_sum_blob.create(size);
        if (square_sum_blob.empty())
            return -100;

        if (channel_shared)
        {
            float scale = scale_data[0];

            // Fold the shared scale into the per-position factor.
            #pragma omp parallel for
            for (int i=0; i<size; i++)
            {
                float ssum = eps;
                for (int q=0; q<channels; q++)
                {
                    const float* ptr = bottom_blob.channel(q);
                    ssum += ptr[i] * ptr[i];
                }

                square_sum_blob[i] = 1.f / sqrt(ssum) * scale;
            }

            #pragma omp parallel for
            for (int q=0; q<channels; q++)
            {
                const float* ptr = bottom_blob.channel(q);
                float* outptr = top_blob.channel(q);

                for (int i=0; i<size; i++)
                {
                    outptr[i] = ptr[i] * square_sum_blob[i];
                }
            }
        }
        else
        {
            #pragma omp parallel for
            for (int i=0; i<size; i++)
            {
                float ssum = eps;
                for (int q=0; q<channels; q++)
                {
                    const float* ptr = bottom_blob.channel(q);
                    ssum += ptr[i] * ptr[i];
                }

                square_sum_blob[i] = 1.f / sqrt(ssum);
            }

            // Apply per-channel scale on top of the per-position factor.
            #pragma omp parallel for
            for (int q=0; q<channels; q++)
            {
                const float* ptr = bottom_blob.channel(q);
                float* outptr = top_blob.channel(q);

                float scale = scale_data[q];

                for (int i=0; i<size; i++)
                {
                    outptr[i] = ptr[i] * square_sum_blob[i] * scale;
                }
            }
        }

        return 0;
    }

    return 0;
}