void HighPassFilter::applyLowPass(const Image2DPtr &image) { // Guassian convolution can be separated in two 1D convolution // because of properties of the 2D Gaussian function. Image2DPtr temp = Image2D::CreateZeroImagePtr(image->Width(), image->Height()); size_t hKernelMid = _hWindowSize/2; for(size_t i=0; i<_hWindowSize; ++i) { const num_t kernelValue = _hKernel[i]; const size_t xStart = (i >= hKernelMid) ? 0 : (hKernelMid-i), xEnd = (i <= hKernelMid) ? image->Width() : image->Width()-i+hKernelMid; for(unsigned y=0;y<image->Height();++y) { for(unsigned x=xStart;x<xEnd;++x) temp->AddValue(x, y, image->Value(x+i-hKernelMid, y)*kernelValue); } } image->SetAll(0.0); size_t vKernelMid = _vWindowSize/2; for(size_t i=0; i<_vWindowSize; ++i) { const num_t kernelValue = _vKernel[i]; const size_t yStart = (i >= vKernelMid) ? 0 : (vKernelMid-i), yEnd = (i <= vKernelMid) ? image->Height() : image->Height()-i+vKernelMid; for(unsigned y=yStart;y<yEnd;++y) { for(unsigned x=0;x<image->Width();++x) image->AddValue(x, y, temp->Value(x, y+i-vKernelMid)*kernelValue); } } }
void HighPassFilter::applyLowPassSSE(const Image2DPtr &image) { Image2DPtr temp = Image2D::CreateZeroImagePtr(image->Width(), image->Height()); unsigned hKernelMid = _hWindowSize/2; for(unsigned i=0; i<_hWindowSize; ++i) { const num_t k = _hKernel[i]; const __m128 k4 = _mm_set_ps(k, k, k, k); unsigned /* xStart is the first column to start writing to. Note that it might be larger * than the width. */ xStart = (i >= hKernelMid) ? 0 : (hKernelMid-i), xEnd = (i <= hKernelMid) ? image->Width() : (image->Width()+hKernelMid > i ? (image->Width()-i+hKernelMid) : 0); for(unsigned y=0;y<image->Height();++y) { float *tempPtr = temp->ValuePtr(xStart, y); const float *imagePtr = image->ValuePtr(xStart+i-hKernelMid, y); unsigned x = xStart; for(;x+4<xEnd;x+=4) { const __m128 imageVal = _mm_loadu_ps(imagePtr), tempVal = _mm_loadu_ps(tempPtr); // *tempPtr += k * (*imagePtr); _mm_storeu_ps(tempPtr, _mm_add_ps(tempVal, _mm_mul_ps(imageVal, k4))); tempPtr += 4; imagePtr += 4; } for(;x<xEnd;++x) { *tempPtr += k * (*imagePtr); ++tempPtr; ++imagePtr; } } } image->SetAll(0.0); unsigned vKernelMid = _vWindowSize/2; for(unsigned i=0; i<_vWindowSize; ++i) { const num_t k = _vKernel[i]; const __m128 k4 = _mm_set_ps(k, k, k, k); const unsigned yStart = (i >= vKernelMid) ? 0 : (vKernelMid-i), yEnd = (i <= vKernelMid) ? image->Height() : ((image->Height()+vKernelMid>i) ? (image->Height()-i+vKernelMid) : 0); for(unsigned y=yStart;y<yEnd;++y) { const float *tempPtr = temp->ValuePtr(0, y+i-vKernelMid); float *imagePtr = image->ValuePtr(0, y); unsigned x=0; for(;x+4<image->Width();x += 4) { const __m128 imageVal = _mm_load_ps(imagePtr), tempVal = _mm_load_ps(tempPtr); // *imagePtr += k * (*tempPtr); _mm_store_ps(imagePtr, _mm_add_ps(imageVal, _mm_mul_ps(tempVal, k4))); tempPtr += 4; imagePtr += 4; } for(;x<image->Width();++x) { *imagePtr += k * (*tempPtr); ++tempPtr; ++imagePtr; } } } }