// Dense-output implementation of "sparse matrix + scalar".
// Every element of the result starts at the scalar operand; the stored
// non-zero entries of the sparse input are then accumulated on top.
inline void op_sp_plus::apply(Mat<typename T1::elem_type>& out, const SpToDOp<T1,op_sp_plus>& in)
  {
  arma_extra_debug_sigprint();

  // T1 is a sparse type, so access it through SpProxy.
  const SpProxy<T1> P(in.m);

  // Baseline: a dense matrix where every entry holds the scalar.
  out.set_size(P.get_n_rows(), P.get_n_cols());
  out.fill(in.aux);

  // Walk only the stored (non-zero) elements and add them in place.
  typename SpProxy<T1>::const_iterator_type cur = P.begin();
  typename SpProxy<T1>::const_iterator_type cur_end = P.end();

  while(cur != cur_end)
    {
    out.at(cur.row(), cur.col()) += (*cur);
    ++cur;
    }
  }
int LRN::forward(const Mat& bottom_blob, Mat& top_blob) const { int w = bottom_blob.w; int h = bottom_blob.h; int channels = bottom_blob.c; int size = w * h; top_blob.create(w, h, channels); if (top_blob.empty()) return -100; // squared values with local_size padding Mat square_blob; square_blob.create(w, h, channels); if (square_blob.empty()) return -100; #pragma omp parallel for for (int q=0; q<channels; q++) { const float* ptr = bottom_blob.channel(q); float* outptr = square_blob.channel(q); for (int i=0; i<size; i++) { outptr[i] = ptr[i] * ptr[i]; } } if (region_type == NormRegion_ACROSS_CHANNELS) { top_blob.fill(0.f); const float alpha_div_size = alpha / local_size; #pragma omp parallel for for (int q=0; q<channels; q++) { // square sum float* outptr = top_blob.channel(q); for (int p=q - local_size / 2; p<=q + local_size / 2; p++) { if (p < 0 || p >= channels) continue; const float* sptr = square_blob.channel(p); for (int i=0; i<size; i++) { outptr[i] += sptr[i]; } } const float* ptr = bottom_blob.channel(q); for (int i=0; i<size; i++) { outptr[i] = ptr[i] * pow(1.f + alpha_div_size * outptr[i], -beta); } } } else if (region_type == NormRegion_WITHIN_CHANNEL) { int outw = w; int outh = h; Mat square_blob_bordered = square_blob; int pad = local_size / 2; if (pad > 0) { copy_make_border(square_blob, square_blob_bordered, pad, local_size - pad - 1, pad, local_size - pad - 1, BORDER_CONSTANT, 0.f); if (square_blob_bordered.empty()) return -100; w = square_blob_bordered.w; h = square_blob_bordered.h; } const int maxk = local_size * local_size; const float alpha_div_size = alpha / maxk; // norm window offsets std::vector<int> _space_ofs(maxk); int* space_ofs = &_space_ofs[0]; { int p1 = 0; int p2 = 0; int gap = w - local_size; for (int i = 0; i < local_size; i++) { for (int j = 0; j < local_size; j++) { space_ofs[p1] = p2; p1++; p2++; } p2 += gap; } } #pragma omp parallel for for (int q=0; q<channels; q++) { const float* ptr = bottom_blob.channel(q); const Mat m = 
square_blob_bordered.channel(q); float* outptr = top_blob.channel(q); for (int i = 0; i < outh; i++) { for (int j = 0; j < outw; j++) { const float* sptr = m.row(i) + j; float ss = 0.f; for (int k = 0; k < maxk; k++) { float val = sptr[ space_ofs[k] ]; ss += val; } outptr[j] = ptr[j] * pow(1.f + alpha_div_size * ss, -beta); } ptr += outw; outptr += outw; } } } return 0; }
int DeconvolutionDepthWise::forward(const Mat& bottom_blob, Mat& top_blob) const { // deconvolv with NxN kernel // value = value + bias int w = bottom_blob.w; int h = bottom_blob.h; int channels = bottom_blob.c; if (channels % group != 0 || num_output % group != 0) { // reject invalid group return -100; } const int kernel_extent_w = dilation_w * (kernel_w - 1) + 1; const int kernel_extent_h = dilation_h * (kernel_h - 1) + 1; int outw = (w - 1) * stride_w + kernel_extent_w; int outh = (h - 1) * stride_h + kernel_extent_h; Mat top_blob_bordered = top_blob; top_blob_bordered.create(outw, outh, num_output); if (top_blob_bordered.empty()) return -100; const int maxk = kernel_w * kernel_h; // kernel offsets std::vector<int> _space_ofs(maxk); int* space_ofs = &_space_ofs[0]; { int p1 = 0; int p2 = 0; int gap = outw * dilation_h - kernel_w * dilation_w; for (int i = 0; i < kernel_h; i++) { for (int j = 0; j < kernel_w; j++) { space_ofs[p1] = p2; p1++; p2 += dilation_w; } p2 += gap; } } // depth-wise if (channels == group && group == num_output) { #pragma omp parallel for for (int g=0; g<group; g++) { const float* inptr = bottom_blob.channel(g); const float* kptr = (const float*)weight_data + maxk * g; Mat m = top_blob_bordered.channel(g); const float bias = bias_term ? bias_data[g] : 0.f; m.fill(bias); for (int i = 0; i < h; i++) { for (int j = 0; j < w; j++) { float* outptr = m.row(i*stride_h) + j*stride_w; for (int k = 0; k < maxk; k++) { float val = inptr[i*w + j]; float w = kptr[k]; outptr[ space_ofs[k] ] += val * w; } } } } } else { // num_output const int channels_g = channels / group; const int num_output_g = num_output / group; #pragma omp parallel for for (int g = 0; g < group; g++) { const float* weight_data_ptr = (const float*)weight_data + maxk * channels_g * num_output_g * g; for (int p = 0; p < num_output_g; p++) { Mat out = top_blob_bordered.channel(g * num_output_g + p); const float bias = bias_term ? 
bias_data[g * num_output_g + p] : 0.f; out.fill(bias); for (int i = 0; i < h; i++) { for (int j = 0; j < w; j++) { float* outptr = out.row(i*stride_h) + j*stride_w; const float* kptr = weight_data_ptr + maxk * channels_g * p; // channels_g for (int q = 0; q < channels_g; q++) { const Mat m = bottom_blob.channel(channels_g * g + q); float val = *(m.row(i) + j); for (int k = 0; k < maxk; k++) { outptr[ space_ofs[k] ] += val * kptr[k]; } kptr += maxk; } } } } } } top_blob = top_blob_bordered; if (pad_w > 0 || pad_h > 0) { copy_cut_border(top_blob_bordered, top_blob, pad_h, pad_h, pad_w, pad_w); if (top_blob.empty()) return -100; outw = top_blob.w; outh = top_blob.h; } return 0; }