/*
 * Small demo of the "test zero" sign-bit intrinsics for packed doubles.
 *
 * _mm_testz_pd / _mm256_testz_pd return 1 when the sign bit of every lane of
 * (a AND b) is clear and 0 otherwise, so the four lines printed are expected
 * to be 1, 0, 1, 0 (positive operands, then negative operands, for the
 * 128-bit and then the 256-bit form).
 */
int main(void)
{
    // 128-bit: both operands positive.
    const __m128d pos128_a = _mm_setr_pd(1, 2);
    const __m128d pos128_b = _mm_setr_pd(1, 2);
    int zf = _mm_testz_pd(pos128_a, pos128_b);
    printf("%d\n", zf);

    // 128-bit: both operands negative.
    const __m128d neg128_a = _mm_setr_pd(-1, -2);
    const __m128d neg128_b = _mm_setr_pd(-1, -2);
    zf = _mm_testz_pd(neg128_a, neg128_b);
    printf("%d\n", zf);

    // 256-bit: both operands positive.
    const __m256d pos256_a = _mm256_setr_pd(1, 2, 3, 4);
    const __m256d pos256_b = _mm256_setr_pd(1, 2, 3, 4);
    zf = _mm256_testz_pd(pos256_a, pos256_b);
    printf("%d\n", zf);

    // 256-bit: both operands negative.
    const __m256d neg256_a = _mm256_setr_pd(-1, -2, -3, -4);
    const __m256d neg256_b = _mm256_setr_pd(-1, -2, -3, -4);
    zf = _mm256_testz_pd(neg256_a, neg256_b);
    printf("%d\n", zf);

    return 0;
}
/// Returns a copy of `ymm` with the sign of lanes 1, 2 and 3 flipped and
/// lane 0 left unchanged.
/// NOTE(review): presumably a conjugate for a (w, x, y, z)-style layout --
/// confirm the lane meaning against the callers.
inline float64x4_t conjugate(const float64x4_t ymm)
{
    // XOR with -0.0 toggles only the sign bit of a double; XOR with +0.0
    // (all bits zero) leaves the lane untouched.
    static const float64x4_t lane_sign_bits = _mm256_setr_pd(+0.0, -0.0, -0.0, -0.0);
    return _mm256_xor_pd(ymm, lane_sign_bits);
}
// Prints `label` followed by `count` doubles from `lanes`, each formatted as
// "%5.1f ". No trailing newline is written; the caller decides how many to emit.
static void print_lanes(const char* label, const double* lanes, int count)
{
    printf("%s", label);
    for (int i = 0; i < count; i++)
        printf("%5.1f ", lanes[i]);
}

/*
 * Demo of the in-lane permute intrinsics for packed doubles.
 *
 * Lanes are read back with unaligned stores into a plain array instead of the
 * MSVC-only `.m256d_f64` / `.m128d_f64` union members, so the program builds
 * with any compiler that ships <immintrin.h>. The printed text is unchanged.
 */
int main(void)
{
    double lanes[4];

    //_mm256_permute_pd
    const __m256d da = _mm256_setr_pd(1, 2, 3, 4);
    _mm256_storeu_pd(lanes, da);
    print_lanes("da: ", lanes, 4);
    printf("\n");

    // Control 0x02 selects (a0, a1) in the low 128-bit half and (a2, a2) in
    // the high half, i.e. 1 2 3 3 for the input above.
    const __m256d dc = _mm256_permute_pd(da, 0x02);
    _mm256_storeu_pd(lanes, dc);
    print_lanes("dc: ", lanes, 4);
    printf("\n\n");

    //_mm_permute_pd
    const __m128d fa = _mm_setr_pd(1, 2);
    _mm_storeu_pd(lanes, fa);
    print_lanes("fa: ", lanes, 2);
    printf("\n");

    // Control 0x01 swaps the two lanes, i.e. 2 1 for the input above.
    const __m128d fc = _mm_permute_pd(fa, 0x01);
    _mm_storeu_pd(lanes, fc);
    print_lanes("fc: ", lanes, 2);
    printf("\n");

    return 0;
}
/// Constructs a vector4d from four scalars.
/// `_mm256_setr_pd` takes its arguments in memory order, so d0 ends up in the
/// lowest lane and d3 in the highest lane of `m_value`.
inline vector4d::vector4d(double d0, double d1, double d2, double d3)
    : m_value(_mm256_setr_pd(d0, d1, d2, d3))
{
}
// AVX2/FMA3 evaluation of the three-component noise at EPoint.
//
// result : receives the three components in result[X], result[Y], result[Z].
//          result[X] and result[Y] are written with one 128-bit store, so the
//          two must be adjacent doubles in memory.
// EPoint : evaluation point; its X, Y and Z components are read.
//
// The eight corners of the lattice cell containing EPoint are visited one by
// one; for each corner a table entry (AVX2RTable) is combined with a weight
// lane and the offset vector and accumulated into x, y and z.
// When CHECK_FUNCTIONAL is enabled the result is compared with
// PortableDNoise() and an exception is thrown on a mismatch >= EPSILON.
void AVX2FMA3DNoise(Vector3d& result, const Vector3d& EPoint) {
#if CHECK_FUNCTIONAL
    // Private copy of the input for the reference computation at the end.
    Vector3d param(EPoint);
#endif
    AVX2TABLETYPE *mp;

    // TODO FIXME - global statistics reference
    // Stats[Calls_To_DNoise]++;

    const __m256d ONE_PD = _mm256_set1_pd(1.0);
    // NOTE(review): short_si128 is not referenced by name in this function;
    // confirm that none of the macros used below rely on it before removing it.
    const __m128i short_si128 = _mm_set1_epi32(0xffff);

    // Lanes: (x, y, z, 0).
    const __m256d xyzn = _mm256_setr_pd(EPoint[X], EPoint[Y], EPoint[Z], 0);

    // Integer cell coordinates. The conversion truncates toward zero, so for
    // lanes whose sign bit is set (blendv selects on the sign bit of xyzn)
    // (1 - EPSILON) is subtracted first, which rounds negative values down.
    const __m256d epsy = _mm256_set1_pd(1.0 - EPSILON);
    const __m256d xyzn_e = _mm256_sub_pd(xyzn, epsy);
    const __m128i tmp_xyzn = _mm256_cvttpd_epi32(_mm256_blendv_pd(xyzn, xyzn_e, xyzn));

    const __m128i noise_min_xyzn = _mm_setr_epi32(NOISE_MINX, NOISE_MINY, NOISE_MINZ, 0);

    // Offset of the point from the lower cell corner, and from the upper
    // corner (lower offset minus one).
    const __m256d xyz_ixyzn = _mm256_sub_pd(xyzn, _mm256_cvtepi32_pd(tmp_xyzn));
    const __m256d xyz_jxyzn = _mm256_sub_pd(xyz_ixyzn, ONE_PD);

    // Cell coordinates shifted by NOISE_MIN* and reduced to 12 bits.
    const __m128i i_xyzn = _mm_and_si128(_mm_sub_epi32(tmp_xyzn, noise_min_xyzn),
        _mm_set1_epi32(0xfff));

    // Per-axis weights: s = f*f*(3 - 2f) with f the lower-corner offset, t = 1 - s.
    const __m256d s_xyzn = _mm256_mul_pd(xyz_ixyzn,
        _mm256_mul_pd(xyz_ixyzn,
            _mm256_sub_pd(_mm256_set1_pd(3.0),
                _mm256_add_pd(xyz_ixyzn, xyz_ixyzn))));
    const __m256d t_xyzn = _mm256_sub_pd(ONE_PD, s_xyzn);

    // Low halves of t and s side by side: (tx, ty, sx, sy).
    const __m256d txtysxsy = _mm256_permute2f128_pd(t_xyzn, s_xyzn, 0x20);

    // Assuming PERMUTE4x64 selects source lanes like _mm256_permute4x64_pd
    // (TODO confirm), the names below spell out the resulting lane contents
    // and the product holds (tx*ty, sx*ty, tx*sy, sx*sy).
    const __m256d txsxtxsx = PERMUTE4x64(txtysxsy, _MM_SHUFFLE(2, 0, 2, 0));
    const __m256d tytysysy = PERMUTE4x64(txtysxsy, _MM_SHUFFLE(3, 3, 1, 1));
    const __m256d txtysxtytxsysxsy = _mm256_mul_pd(txsxtxsx, tytysysy);

    // The four x/y weight products multiplied by the z weight:
    // s1 uses tz (corners at iz), s2 uses sz (corners at iz + 1).
    const __m256d incrsump_s1 = _mm256_mul_pd(txtysxtytxsysxsy, PERMUTE4x64(t_xyzn, _MM_SHUFFLE(2, 2, 2, 2)));
    const __m256d incrsump_s2 = _mm256_mul_pd(txtysxtytxsysxsy, PERMUTE4x64(s_xyzn, _MM_SHUFFLE(2, 2, 2, 2)));

    // Spill the integer cell coordinates so they can be used as scalar hash inputs.
    int ints[4];
    _mm_storeu_si128((__m128i*)(ints), i_xyzn);

    // 2D hashes of the four (x, y) corner combinations; "i" is the lower and
    // "j" the upper (+1) coordinate along that axis.
    const int ixiy_hash = Hash2d(ints[0], ints[1]);
    const int jxiy_hash = Hash2d(ints[0] + 1, ints[1]);
    const int ixjy_hash = Hash2d(ints[0], ints[1] + 1);
    const int jxjy_hash = Hash2d(ints[0] + 1, ints[1] + 1);

    const int iz = ints[2];

    // Offset vectors with lane 0 forced to 0.5; under the same PERMUTE4x64
    // assumption the lanes are (0.5, x, y, z) for the lower (iii) and upper
    // (jjj) corner.
    const __m256d iii = _mm256_blend_pd(PERMUTE4x64(xyz_ixyzn, _MM_SHUFFLE(2, 1, 0, 0)), _mm256_set_pd(0, 0, 0, 0.5), 0x1);
    const __m256d jjj = _mm256_blend_pd(PERMUTE4x64(xyz_jxyzn, _MM_SHUFFLE(2, 1, 0, 0)), _mm256_set_pd(0, 0, 0, 0.5), 0x1);

    __m256d ss;
    __m256d blend;

    // Accumulators for the three output components.
    // NOTE(review): x, y and z are never assigned directly below, so
    // INCSUMAVX_VECTOR presumably updates them by name -- do not rename them
    // without checking the macro.
    __m256d x = _mm256_setzero_pd(), y = _mm256_setzero_pd(), z = _mm256_setzero_pd();

    // For every corner: pick its table entry, broadcast its weight lane into
    // ss, and build the offset vector by taking lanes from jjj where the blend
    // mask has a bit set (bit 1 = x, bit 2 = y, bit 3 = z) and from iii otherwise.

    // Corner (ix, iy, iz): all offsets from iii.
    mp = &AVX2RTable[Hash1dRTableIndexAVX(ixiy_hash, iz)];
    ss = PERMUTE4x64(incrsump_s1, _MM_SHUFFLE(0, 0, 0, 0));
    // blend = _mm256_blend_pd(iii, jjj, 0);
    INCSUMAVX_VECTOR(mp, ss, iii);

    // Corner (jx, iy, iz).
    mp = &AVX2RTable[Hash1dRTableIndexAVX(jxiy_hash, iz)];
    ss = PERMUTE4x64(incrsump_s1, _MM_SHUFFLE(1, 1, 1, 1));
    blend = _mm256_blend_pd(iii, jjj, 2);
    INCSUMAVX_VECTOR(mp, ss, blend);

    // Corner (jx, jy, iz).
    mp = &AVX2RTable[Hash1dRTableIndexAVX(jxjy_hash, iz)];
    ss = PERMUTE4x64(incrsump_s1, _MM_SHUFFLE(3, 3, 3, 3));
    blend = _mm256_blend_pd(iii, jjj, 6);
    INCSUMAVX_VECTOR(mp, ss, blend);

    // Corner (ix, jy, iz).
    mp = &AVX2RTable[Hash1dRTableIndexAVX(ixjy_hash, iz)];
    ss = PERMUTE4x64(incrsump_s1, _MM_SHUFFLE(2, 2, 2, 2));
    blend = _mm256_blend_pd(iii, jjj, 4);
    INCSUMAVX_VECTOR(mp, ss, blend);

    // Corner (ix, jy, iz + 1).
    mp = &AVX2RTable[Hash1dRTableIndexAVX(ixjy_hash, iz + 1)];
    ss = PERMUTE4x64(incrsump_s2, _MM_SHUFFLE(2, 2, 2, 2));
    blend = _mm256_blend_pd(iii, jjj, 12);
    INCSUMAVX_VECTOR(mp, ss, blend);

    // Corner (jx, jy, iz + 1): all offsets from jjj (lane 0 is 0.5 in both
    // vectors, so jjj equals the mask-14 blend).
    mp = &AVX2RTable[Hash1dRTableIndexAVX(jxjy_hash, iz + 1)];
    ss = PERMUTE4x64(incrsump_s2, _MM_SHUFFLE(3, 3, 3, 3));
    // blend = _mm256_blend_pd(iii, jjj, 14);
    INCSUMAVX_VECTOR(mp, ss, jjj);

    // Corner (jx, iy, iz + 1).
    mp = &AVX2RTable[Hash1dRTableIndexAVX(jxiy_hash, iz + 1)];
    ss = PERMUTE4x64(incrsump_s2, _MM_SHUFFLE(1, 1, 1, 1));
    blend = _mm256_blend_pd(iii, jjj, 10);
    INCSUMAVX_VECTOR(mp, ss, blend);

    // Corner (ix, iy, iz + 1).
    mp = &AVX2RTable[Hash1dRTableIndexAVX(ixiy_hash, iz + 1)];
    ss = PERMUTE4x64(incrsump_s2, _MM_SHUFFLE(0, 0, 0, 0));
    blend = _mm256_blend_pd(iii, jjj, 8);
    INCSUMAVX_VECTOR(mp, ss, blend);

    // Horizontal reduction. hadd(x, y) yields (x0+x1, y0+y1, x2+x3, y2+y3);
    // adding the upper 128-bit half to the lower one leaves (sum(x), sum(y)).
    __m256d xy = _mm256_hadd_pd(x,y);
    __m128d xy_up = _mm256_extractf128_pd(xy,1);
    xy_up = _mm_add_pd(_mm256_castpd256_pd128(xy),xy_up);
    // Writes result[X] and the double that follows it in memory.
    _mm_storeu_pd(&result[X],xy_up);

    // Same reduction for z; the final hadd folds the remaining two lanes.
    __m128d z_up = _mm256_extractf128_pd(z,1);
    z_up = _mm_add_pd(_mm256_castpd256_pd128(z),z_up);
    z_up = _mm_hadd_pd(z_up,z_up);
    result[Z] = _mm_cvtsd_f64(z_up);

#if CHECK_FUNCTIONAL
    {
        // Cross-check each component against the portable implementation.
        // Note that a throw here leaves the function before _mm256_zeroupper().
        Vector3d portable_res;
        PortableDNoise(portable_res , param);
        if (fabs(portable_res[X] - result[X]) >= EPSILON)
        {
            throw POV_EXCEPTION_STRING("DNoise X error");
        }
        if (fabs(portable_res[Y] - result[Y]) >= EPSILON)
        {
            throw POV_EXCEPTION_STRING("DNoise Y error");
        }
        if (fabs(portable_res[Z] - result[Z]) >= EPSILON)
        {
            throw POV_EXCEPTION_STRING("DNoise Z error");
        }
    }
#endif

    // Clear the upper halves of the YMM registers before returning to code
    // that may execute legacy SSE instructions.
    _mm256_zeroupper();
    return;
}
// AVX2/FMA3 evaluation of the scalar noise at EPoint.
//
// EPoint          : evaluation point; its X, Y and Z components are read.
// noise_generator : kNoiseGen_Perlin delegates to SolidNoise();
//                   kNoiseGen_RangeCorrected applies the offset/scale
//                   correction below; any other value just adds 0.5.
// Returns the noise value clamped to [0, 1].
//
// When CHECK_FUNCTIONAL is enabled the non-Perlin result is compared with
// PortableNoise() and an exception is thrown on a mismatch >= EPSILON.
DBL AVX2FMA3Noise(const Vector3d& EPoint, int noise_generator) {
    AVX2TABLETYPE *mp;
    DBL sum = 0.0;

    // TODO FIXME - global statistics reference
    // Stats[Calls_To_Noise]++;

    if (noise_generator == kNoiseGen_Perlin)
    {
        // The 1.59 and 0.985 are to correct for some biasing problems with
        // the random # generator used to create the noise tables.  Final
        // range of values is about 5.0e-4 below 0.0 and above 1.0.  Mean
        // value is 0.49 (ideally it would be 0.5).
        sum = 0.5 * (1.59 * SolidNoise(EPoint) + 0.985);

        // Clamp final value to 0-1 range
        if (sum < 0.0) sum = 0.0;
        if (sum > 1.0) sum = 1.0;

        // Returns before any AVX instruction in this function has run.
        return sum;
    }

    const __m256d ONE_PD = _mm256_set1_pd(1);
    // NOTE(review): short_si128 is not referenced by name in this function;
    // confirm that none of the macros used below rely on it before removing it.
    const __m128i short_si128 = _mm_set1_epi32(0xffff);

    // Lanes: (x, y, z, 0).
    const __m256d xyzn = _mm256_setr_pd(EPoint[X], EPoint[Y], EPoint[Z], 0);

    // Integer cell coordinates. The conversion truncates toward zero, so for
    // lanes whose sign bit is set (blendv selects on the sign bit of xyzn)
    // (1 - EPSILON) is subtracted first, which rounds negative values down.
    const __m256d epsy = _mm256_set1_pd(1.0 - EPSILON);
    const __m256d xyzn_e = _mm256_sub_pd(xyzn, epsy);
    const __m128i tmp_xyzn = _mm256_cvttpd_epi32(_mm256_blendv_pd(xyzn, xyzn_e, xyzn));

    const __m128i noise_min_xyzn = _mm_setr_epi32(NOISE_MINX, NOISE_MINY, NOISE_MINZ, 0);

    // Offset of the point from the lower cell corner, and from the upper
    // corner (lower offset minus one).
    const __m256d xyz_ixyzn = _mm256_sub_pd(xyzn, _mm256_cvtepi32_pd(tmp_xyzn));
    const __m256d xyz_jxyzn = _mm256_sub_pd(xyz_ixyzn, ONE_PD);

    // Cell coordinates shifted by NOISE_MIN* and reduced to 12 bits.
    const __m128i i_xyzn = _mm_and_si128(_mm_sub_epi32(tmp_xyzn, noise_min_xyzn),
        _mm_set1_epi32(0xfff));

    // Per-axis weights: s = f*f*(3 - 2f) with f the lower-corner offset, t = 1 - s.
    const __m256d s_xyzn = _mm256_mul_pd(xyz_ixyzn,
        _mm256_mul_pd(xyz_ixyzn,
            _mm256_sub_pd(_mm256_set1_pd(3.0),
                _mm256_add_pd(xyz_ixyzn, xyz_ixyzn))));
    const __m256d t_xyzn = _mm256_sub_pd(ONE_PD, s_xyzn);

    // Low halves of t and s side by side: (tx, ty, sx, sy).
    const __m256d txtysxsy = _mm256_permute2f128_pd(t_xyzn, s_xyzn, 0x20);

    // Assuming PERMUTE4x64 selects source lanes like _mm256_permute4x64_pd
    // (TODO confirm), the names below spell out the resulting lane contents
    // and the product holds (tx*ty, sx*ty, tx*sy, sx*sy).
    const __m256d txsxtxsx = PERMUTE4x64(txtysxsy, _MM_SHUFFLE(2, 0, 2, 0));
    const __m256d tytysysy = PERMUTE4x64(txtysxsy, _MM_SHUFFLE(3, 3, 1, 1));
    const __m256d txtysxtytxsysxsy = _mm256_mul_pd(txsxtxsx, tytysysy);

    // The four x/y weight products multiplied by the z weight:
    // s1 uses tz (corners at iz), s2 uses sz (corners at iz + 1).
    const __m256d incrsump_s1 = _mm256_mul_pd(txtysxtytxsysxsy, PERMUTE4x64(t_xyzn, _MM_SHUFFLE(2, 2, 2, 2)));
    const __m256d incrsump_s2 = _mm256_mul_pd(txtysxtytxsysxsy, PERMUTE4x64(s_xyzn, _MM_SHUFFLE(2, 2, 2, 2)));

    // Spill the integer cell coordinates so they can be used as scalar hash inputs.
    int ints[4];
    _mm_storeu_si128((__m128i*)(ints), i_xyzn);

    // 2D hashes of the four (x, y) corner combinations; "i" is the lower and
    // "j" the upper (+1) coordinate along that axis.
    const int ixiy_hash = Hash2d(ints[0], ints[1]);
    const int jxiy_hash = Hash2d(ints[0] + 1, ints[1]);
    const int ixjy_hash = Hash2d(ints[0], ints[1] + 1);
    const int jxjy_hash = Hash2d(ints[0] + 1, ints[1] + 1);

    const int iz = ints[2];

    // Offset vectors with lane 0 forced to 0.5; under the same PERMUTE4x64
    // assumption the lanes are (0.5, x, y, z) for the lower (iii) and upper
    // (jjj) corner.
    const __m256d iii = _mm256_blend_pd(PERMUTE4x64(xyz_ixyzn, _MM_SHUFFLE(2, 1, 0, 0)), _mm256_set_pd(0, 0, 0, 0.5), 0x1);
    const __m256d jjj = _mm256_blend_pd(PERMUTE4x64(xyz_jxyzn, _MM_SHUFFLE(2, 1, 0, 0)), _mm256_set_pd(0, 0, 0, 0.5), 0x1);

    // Two accumulators that receive the corner contributions alternately;
    // they are added together after the last corner.
    __m256d sumr = _mm256_setzero_pd();
    __m256d sumr1 = _mm256_setzero_pd();

    // One macro invocation per cell corner. The last INCSUMAVX argument is a
    // blend mask choosing lanes from jjj (bit set) or iii (bit clear):
    // bit 1 = x, bit 2 = y, bit 3 = z. The first and last corners use iii and
    // jjj unblended.
    // NOTE(review): exact arithmetic is defined by the INCSUMAVX* macros,
    // which are not visible here.

    // Corners at iz.
    mp = &AVX2RTable[Hash1dRTableIndexAVX(ixiy_hash, iz)];
    INCSUMAVX_NOBLEND(sumr, mp, PERMUTE4x64(incrsump_s1, _MM_SHUFFLE(0, 0, 0, 0)), iii);

    mp = &AVX2RTable[Hash1dRTableIndexAVX(jxiy_hash, iz)];
    INCSUMAVX(sumr1, mp, PERMUTE4x64(incrsump_s1, _MM_SHUFFLE(1, 1, 1, 1)), iii, jjj, 2);

    mp = &AVX2RTable[Hash1dRTableIndexAVX(ixjy_hash, iz)];
    INCSUMAVX(sumr, mp, PERMUTE4x64(incrsump_s1, _MM_SHUFFLE(2, 2, 2, 2)), iii, jjj, 4);

    mp = &AVX2RTable[Hash1dRTableIndexAVX(jxjy_hash, iz)];
    INCSUMAVX(sumr1, mp, PERMUTE4x64(incrsump_s1, _MM_SHUFFLE(3, 3, 3, 3)), iii, jjj, 6);

    // Corners at iz + 1.
    mp = &AVX2RTable[Hash1dRTableIndexAVX(ixiy_hash, iz + 1)];
    INCSUMAVX(sumr, mp, PERMUTE4x64(incrsump_s2, _MM_SHUFFLE(0, 0, 0, 0)), iii, jjj, 8);

    mp = &AVX2RTable[Hash1dRTableIndexAVX(jxiy_hash, iz + 1)];
    INCSUMAVX(sumr1, mp, PERMUTE4x64(incrsump_s2, _MM_SHUFFLE(1, 1, 1, 1)), iii, jjj, 10);

    mp = &AVX2RTable[Hash1dRTableIndexAVX(ixjy_hash, iz + 1)];
    INCSUMAVX(sumr, mp, PERMUTE4x64(incrsump_s2, _MM_SHUFFLE(2, 2, 2, 2)), iii, jjj, 12);

    mp = &AVX2RTable[Hash1dRTableIndexAVX(jxjy_hash, iz + 1)];
    INCSUMAVX_NOBLEND(sumr1, mp, PERMUTE4x64(incrsump_s2, _MM_SHUFFLE(3, 3, 3, 3)), jjj);

    {
        // Horizontal sum of all four lanes of both accumulators.
        sumr = _mm256_add_pd(sumr, sumr1);

        __m128d sumr_up = _mm256_extractf128_pd(sumr,1);
        sumr_up = _mm_add_pd(_mm256_castpd256_pd128(sumr),sumr_up);
        sumr_up = _mm_hadd_pd(sumr_up,sumr_up);
        sum = _mm_cvtsd_f64(sumr_up);
    }

    if (noise_generator == kNoiseGen_RangeCorrected)
    {
        /* details of range here:
        Min, max: -1.05242, 0.988997
        Mean: -0.0191481, Median: -0.535493, Std Dev: 0.256828

        We want to change it to as close to [0,1] as possible.
        */
        // Shift by the observed minimum, then scale by 1 / (max - min).
        sum += 1.05242;
        sum *= 0.48985582;
        /*sum *= 0.5;
        sum += 0.5;*/

        // Clamp to [0, 1].
        if (sum < 0.0) sum = 0.0;
        if (sum > 1.0) sum = 1.0;
    }
    else
    {
        sum = sum + 0.5;                     /* range at this point -0.5 - 0.5... */

        // Clamp to [0, 1].
        if (sum < 0.0) sum = 0.0;
        if (sum > 1.0) sum = 1.0;
    }

#if CHECK_FUNCTIONAL
    {
        // Cross-check against the portable implementation.
        // Note that a throw here leaves the function before _mm256_zeroupper().
        DBL orig_sum = PortableNoise(EPoint, noise_generator);
        if (fabs(orig_sum - sum) >= EPSILON)
        {
            throw POV_EXCEPTION_STRING("Noise error");
        }
    }
#endif

    // Clear the upper halves of the YMM registers before returning to code
    // that may execute legacy SSE instructions.
    _mm256_zeroupper();
    return (sum);
}
void Projector::rotate3D(MultidimArray<Complex > &f3d, Matrix2D<DOUBLE> &A, bool inv) { DOUBLE fx, fy, fz, xp, yp, zp; int x0, x1, y0, y1, z0, z1, y, z, y2, z2, r2; bool is_neg_x; Complex d000, d010, d100, d110, d001, d011, d101, d111, dx00, dx10, dxy0, dx01, dx11, dxy1; Matrix2D<DOUBLE> Ainv; // f3d should already be in the right size (ori_size,orihalfdim) // AND the points outside max_r should already be zero... // f3d.initZeros(); // Use the inverse matrix if (inv) Ainv = A; else Ainv = A.transpose(); // The f3d image may be smaller than r_max, in that case also make sure not to fill the corners! int my_r_max = XMIPP_MIN(r_max, XSIZE(f3d) - 1); // Go from the 3D rotated coordinates to the original map coordinates Ainv *= (DOUBLE)padding_factor; // take scaling into account directly int max_r2 = my_r_max * my_r_max; int min_r2_nn = r_min_nn * r_min_nn; #ifdef DEBUG std::cerr << " XSIZE(f3d)= "<< XSIZE(f3d) << std::endl; std::cerr << " YSIZE(f3d)= "<< YSIZE(f3d) << std::endl; std::cerr << " XSIZE(data)= "<< XSIZE(data) << std::endl; std::cerr << " YSIZE(data)= "<< YSIZE(data) << std::endl; std::cerr << " STARTINGX(data)= "<< STARTINGX(data) << std::endl; std::cerr << " STARTINGY(data)= "<< STARTINGY(data) << std::endl; std::cerr << " STARTINGZ(data)= "<< STARTINGZ(data) << std::endl; std::cerr << " max_r= "<< r_max << std::endl; std::cerr << " Ainv= " << Ainv << std::endl; #endif for (int k=0; k < ZSIZE(f3d); k++) { // Don't search beyond square with side max_r if (k <= my_r_max) { z = k; } else if (k >= ZSIZE(f3d) - my_r_max) { z = k - ZSIZE(f3d); } else continue; z2 = z * z; for (int i=0; i < YSIZE(f3d); i++) { // Don't search beyond square with side max_r if (i <= my_r_max) { y = i; } else if (i >= YSIZE(f3d) - my_r_max) { y = i - YSIZE(f3d); } else continue; y2 = y * y; for (int x=0; x <= my_r_max; x++) { // Only include points with radius < max_r (exclude points outside circle in square) r2 = x * x + y2 + z2; if (r2 > max_r2) continue; // Get logical 
coordinates in the 3D map xp = Ainv(0,0) * x + Ainv(0,1) * y + Ainv(0,2) * z; yp = Ainv(1,0) * x + Ainv(1,1) * y + Ainv(1,2) * z; zp = Ainv(2,0) * x + Ainv(2,1) * y + Ainv(2,2) * z; if (interpolator == TRILINEAR || r2 < min_r2_nn) { // Only asymmetric half is stored if (xp < 0) { // Get complex conjugated hermitian symmetry pair xp = -xp; yp = -yp; zp = -zp; is_neg_x = true; } else { is_neg_x = false; } // Trilinear interpolation (with physical coords) // Subtract STARTINGY to accelerate access to data (STARTINGX=0) // In that way use DIRECT_A3D_ELEM, rather than A3D_ELEM x0 = FLOOR(xp); fx = xp - x0; x1 = x0 + 1; y0 = FLOOR(yp); fy = yp - y0; y0 -= STARTINGY(data); y1 = y0 + 1; z0 = FLOOR(zp); fz = zp - z0; z0 -= STARTINGZ(data); z1 = z0 + 1; // Matrix access can be accelerated through pre-calculation of z0*xydim etc. d000 = DIRECT_A3D_ELEM(data, z0, y0, x0); d001 = DIRECT_A3D_ELEM(data, z0, y0, x1); d010 = DIRECT_A3D_ELEM(data, z0, y1, x0); d011 = DIRECT_A3D_ELEM(data, z0, y1, x1); d100 = DIRECT_A3D_ELEM(data, z1, y0, x0); d101 = DIRECT_A3D_ELEM(data, z1, y0, x1); d110 = DIRECT_A3D_ELEM(data, z1, y1, x0); d111 = DIRECT_A3D_ELEM(data, z1, y1, x1); // Set the interpolated value in the 2D output array // interpolate in x #ifndef FLOAT_PRECISION __m256d __fx = _mm256_set1_pd(fx); __m256d __interpx1 = LIN_INTERP_AVX(_mm256_setr_pd(d000.real, d000.imag, d100.real, d100.imag), _mm256_setr_pd(d001.real, d001.imag, d101.real, d101.imag), __fx); __m256d __interpx2 = LIN_INTERP_AVX(_mm256_setr_pd(d010.real, d010.imag, d110.real, d110.imag), _mm256_setr_pd(d011.real, d011.imag, d111.real, d111.imag), __fx); // interpolate in y __m256d __fy = _mm256_set1_pd(fy); __m256d __interpy = LIN_INTERP_AVX(__interpx1, __interpx2, __fy); #else __m128 __fx = _mm_set1_ps(fx); __m128 __interpx1 = LIN_INTERP_AVX(_mm_setr_ps(d000.real, d000.imag, d100.real, d100.imag), _mm_setr_ps(d001.real, d001.imag, d101.real, d101.imag), __fx); __m128 __interpx2 = LIN_INTERP_AVX(_mm_setr_ps(d010.real, 
d010.imag, d110.real, d110.imag), _mm_setr_ps(d011.real, d011.imag, d111.real, d111.imag), __fx); // interpolate in y __m128 __fy = _mm_set1_ps(fy); __m128 __interpy = LIN_INTERP_AVX(__interpx1, __interpx2, __fy); #endif Complex* interpy = (Complex*)&__interpy; //interpolate in z DIRECT_A3D_ELEM(f3d, k, i, x) = LIN_INTERP(fz, interpy[0], interpy[1]); // Take complex conjugated for half with negative x if (is_neg_x) DIRECT_A3D_ELEM(f3d, k, i, x) = conj(DIRECT_A3D_ELEM(f3d, k, i, x)); } // endif TRILINEAR else if (interpolator == NEAREST_NEIGHBOUR ) { x0 = ROUND(xp); y0 = ROUND(yp); z0 = ROUND(zp); if (x0 < 0) DIRECT_A3D_ELEM(f3d, k, i, x) = conj(A3D_ELEM(data, -z0, -y0, -x0)); else DIRECT_A3D_ELEM(f3d, k, i, x) = A3D_ELEM(data, z0, y0, x0); } // endif NEAREST_NEIGHBOUR else REPORT_ERROR("Unrecognized interpolator in Projector::project"); } // endif x-loop } // endif y-loop } // endif z-loop }
/*
 * Computes dst for every pixel of the N x N tile whose top-left pixel is
 * (row ii, column jj) inside an image of r rows and c columns.
 *
 * im  : input image; pixel p is addressed at offset 3*p.
 *       NOTE(review): presumably three interleaved doubles (R, G, B) per
 *       pixel -- confirm against single_pixel.
 * dst : output, one double per pixel, indexed by row*c + column.
 *
 * Every pixel is produced by single_pixel(), which receives the offsets of
 * the centre pixel and of its top, left, right and bottom neighbours. Where a
 * neighbour would fall outside the image, the centre pixel is passed in its
 * place. Tile borders that coincide with the image border are handled
 * first (corners, then edges); the remaining interior pixels are handled last
 * in a loop without special cases.
 *
 * NOTE(review): the border code assumes the image has at least two rows and
 * two columns.
 */
FORCE_INLINE void convolve_block(double* im, int ii, int jj, int N, uint32_t r, uint32_t c, double* dst){
    const double rw = 0.2989;
    const double gw = 0.5870;
    const double bw = 0.1140;
    const __m256d rgb0W = _mm256_setr_pd(rw, gw, bw, 0); //rgb0
    const __m256d onehalf = _mm256_set1_pd(0.5);
    const __m256d minustwelvehalf = _mm256_set1_pd(-12.5);

    /* Lane mask with only the top bit set in lanes 0..2. The shift is done on
     * an unsigned value: shifting a signed 1 into the sign bit is undefined
     * behaviour in C and in C++ before C++14. */
    const int64_t hi_bit_set = (int64_t)(((uint64_t)1) << 63);
    const __m256i mask1110 = _mm256_setr_epi64x(hi_bit_set, hi_bit_set, hi_bit_set, 0);

    /* Which sides of the tile lie on the image border. */
    const int at_top    = (ii == 0);
    const int at_bottom = (ii + N == r);
    const int at_left   = (jj == 0);
    const int at_right  = (jj + N == c);

    /* Half-open row/column ranges of the pixels that have all four neighbours. */
    int i_start = ii;
    int i_end = ii + N;
    int j_start = jj;
    int j_end = jj + N;
    if (at_top) {
        i_start = 1;
    }
    if (at_bottom) {
        i_end = r - 1;
    }
    if (at_left) {
        j_start = 1;
    }
    if (at_right) {
        j_end = c - 1;
    }

    if (at_top) {
        const int i = 0;
        if (at_left) {
            /* top-left corner: no top, no left */
            const int j = 0;
            const int center = i*c + j;
            const int bottom = (i + 1)*c + j;
            const int right = i*c + j + 1;
            dst[center] = single_pixel(im, 3*center, 3*center, 3*center, 3*right, 3*bottom,
                                       mask1110, rgb0W, onehalf, minustwelvehalf);
            COST_INC_LOAD(1);
            COST_INC_STORE(1);
        }
        if (at_right) {
            /* top-right corner: no top, no right */
            const int j = c - 1;
            const int center = i*c + j;
            const int bottom = (i + 1)*c + j;
            const int left = i*c + j - 1;
            dst[center] = single_pixel(im, 3*center, 3*center, 3*left, 3*center, 3*bottom,
                                       mask1110, rgb0W, onehalf, minustwelvehalf);
            COST_INC_LOAD(1);
            COST_INC_STORE(1);
        }
        /* rest of the top row: no top */
        for (int j = j_start; j < j_end; j++) {
            const int center = i*c + j;
            const int bottom = (i + 1)*c + j;
            const int left = i*c + j - 1;
            const int right = i*c + j + 1;
            dst[center] = single_pixel(im, 3*center, 3*center, 3*left, 3*right, 3*bottom,
                                       mask1110, rgb0W, onehalf, minustwelvehalf);
            COST_INC_LOAD(1);
            COST_INC_STORE(1);
        }
    }

    if (at_bottom) {
        const int i = r - 1;
        if (at_left) {
            /* bottom-left corner: no left, no bottom */
            const int j = 0;
            const int center = i*c + j;
            const int top = (i - 1)*c + j;
            const int right = i*c + j + 1;
            dst[center] = single_pixel(im, 3*center, 3*top, 3*center, 3*right, 3*center,
                                       mask1110, rgb0W, onehalf, minustwelvehalf);
            COST_INC_LOAD(1);
            COST_INC_STORE(1);
        }
        if (at_right) {
            /* bottom-right corner: no right, no bottom */
            const int j = c - 1;
            const int center = i*c + j;
            const int top = (i - 1)*c + j;
            const int left = i*c + j - 1;
            dst[center] = single_pixel(im, 3*center, 3*top, 3*left, 3*center, 3*center,
                                       mask1110, rgb0W, onehalf, minustwelvehalf);
            COST_INC_LOAD(1);
            COST_INC_STORE(1);
        }
        /* rest of the bottom row: no bottom */
        for (int j = j_start; j < j_end; j++) {
            const int center = i*c + j;
            const int top = (i - 1)*c + j;
            const int left = i*c + j - 1;
            const int right = i*c + j + 1;
            dst[center] = single_pixel(im, 3*center, 3*top, 3*left, 3*right, 3*center,
                                       mask1110, rgb0W, onehalf, minustwelvehalf);
            COST_INC_LOAD(1);
            COST_INC_STORE(1);
        }
    }

    if (at_left) {
        /* left edge without the corners: no left */
        const int j = 0;
        for (int i = i_start; i < i_end; i++) {
            const int center = i*c + j;
            const int top = (i - 1)*c + j;
            const int bottom = (i + 1)*c + j;
            const int right = i*c + j + 1;
            dst[center] = single_pixel(im, 3*center, 3*top, 3*center, 3*right, 3*bottom,
                                       mask1110, rgb0W, onehalf, minustwelvehalf);
            COST_INC_LOAD(1);
            COST_INC_STORE(1);
        }
    }

    if (at_right) {
        /* right edge without the corners: no right */
        const int j = c - 1;
        for (int i = i_start; i < i_end; i++) {
            const int center = i*c + j;
            const int top = (i - 1)*c + j;
            const int bottom = (i + 1)*c + j;
            const int left = i*c + j - 1;
            dst[center] = single_pixel(im, 3*center, 3*top, 3*left, 3*center, 3*bottom,
                                       mask1110, rgb0W, onehalf, minustwelvehalf);
            COST_INC_LOAD(1);
            COST_INC_STORE(1);
        }
    }

    /* Interior: every pixel in [i_start, i_end) x [j_start, j_end) has all
     * four neighbours inside the image, so no substitution is needed. */
    for (int i = i_start; i < i_end; i++) {
        for (int j = j_start; j < j_end; j++) {
            const int center = i*c + j;
            const int top = (i - 1)*c + j;
            const int bottom = (i + 1)*c + j;
            const int left = i*c + j - 1;
            const int right = i*c + j + 1;
            dst[center] = single_pixel(im, 3*center, 3*top, 3*left, 3*right, 3*bottom,
                                       mask1110, rgb0W, onehalf, minustwelvehalf);
            COST_INC_LOAD(1);
            COST_INC_STORE(1);
        }
    }
}