/*
 * Computes a depth map from a left/right image pair by block matching, with
 * the per-box sum vectorized with SSE and the image rows split across threads
 * by OpenMP.
 *
 * depth, left and right are indexed as [y * imageWidth + x].
 *
 * For every pixel whose feature box (+/- featureWidth columns, +/- featureHeight
 * rows) fits inside the image, every displacement (dx, dy) within
 * +/- maximumDisplacement whose displaced box also fits inside the image is
 * scored by the sum of squared differences between the left box and the
 * displaced right box. The lowest score wins; equal scores are resolved in
 * favour of the smaller displacementNaive(dx, dy). The pixel's depth is
 * displacementNaive() of the winning displacement.
 *
 * Depth is 0 for pixels whose feature box does not fit, when no displacement
 * was scored, and whenever maximumDisplacement == 0.
 */
void calcDepthOptimized(float *depth, float *left, float *right, int imageWidth, int imageHeight, int featureWidth, int featureHeight, int maximumDisplacement)
{
	/*
	 * No up-front clearing of depth: the loop nest below stores to every
	 * element exactly once (edge pixels get 0, all others get their result).
	 */

	/* Box width in pixels, and the leading part of it that is a multiple of 4. */
	const int boxWidth = 2 * featureWidth + 1;
	const int vectorCols = boxWidth / 4 * 4;

	#pragma omp parallel for
	for (int y = 0; y < imageHeight; y++)
	{
		for (int x = 0; x < imageWidth; x++)
		{
			/* Edge of the image: a feature box cannot fit around this pixel. */
			if ((y < featureHeight) || (y >= imageHeight - featureHeight) || (x < featureWidth) || (x >= imageWidth - featureWidth))
			{
				depth[y * imageWidth + x] = 0;
				continue;
			}

			/*
			 * Clamp the search window so the displaced box always stays inside
			 * the image; this replaces a per-candidate bounds test.
			 */
			const int dxMin = (-maximumDisplacement > featureWidth - x) ? -maximumDisplacement : featureWidth - x;
			const int dxMax = (maximumDisplacement < imageWidth - featureWidth - x - 1) ? maximumDisplacement : imageWidth - featureWidth - x - 1;
			const int dyMin = (-maximumDisplacement > featureHeight - y) ? -maximumDisplacement : featureHeight - y;
			const int dyMax = (maximumDisplacement < imageHeight - featureHeight - y - 1) ? maximumDisplacement : imageHeight - featureHeight - y - 1;

			/* -1 marks "no candidate scored yet"; real scores are never negative. */
			float minimumSquaredDifference = -1;
			int minimumDy = 0;
			int minimumDx = 0;

			for (int dy = dyMin; dy <= dyMax; dy++)
			{
				for (int dx = dxMin; dx <= dxMax; dx++)
				{
					float squaredDifference = 0;
					__m128 vectorSum = _mm_setzero_ps();

					/* Columns 0 .. vectorCols-1 of the box, four at a time. */
					for (int boxX = 0; boxX < vectorCols; boxX += 4)
					{
						for (int boxY = -featureHeight; boxY <= featureHeight; boxY++)
						{
							const int leftI = (x + boxX - featureWidth) + imageWidth * (y + boxY);
							const int rightI = (x + dx + boxX - featureWidth) + imageWidth * (y + dy + boxY);
							const __m128 diffV = _mm_sub_ps(_mm_loadu_ps(left + leftI), _mm_loadu_ps(right + rightI));
							vectorSum = _mm_add_ps(_mm_mul_ps(diffV, diffV), vectorSum);
						}
					}

					/* Remaining columns (boxWidth is odd, so 1 or 3 of them), one at a time. */
					for (int boxX = vectorCols; boxX < boxWidth; boxX++)
					{
						for (int boxY = -featureHeight; boxY <= featureHeight; boxY++)
						{
							const int leftI = (y + boxY) * imageWidth + (x + boxX - featureWidth);
							const int rightI = (y + dy + boxY) * imageWidth + (x + dx + boxX - featureWidth);
							const float difference = left[leftI] - right[rightI];
							squaredDifference += difference * difference;
						}
					}

					/* Fold the four vector lanes into the scalar total. */
					float lanes[4];
					_mm_storeu_ps(lanes, vectorSum);
					squaredDifference += lanes[0] + lanes[1] + lanes[2] + lanes[3];

					/*
					 * Take this candidate when it is the first one, when it
					 * ties the best score with a smaller displacement, or when
					 * it has a strictly lower score.
					 */
					if ((minimumSquaredDifference == -1) || ((minimumSquaredDifference == squaredDifference) && (displacementNaive(dx, dy) < displacementNaive(minimumDx, minimumDy))) || (minimumSquaredDifference > squaredDifference))
					{
						minimumSquaredDifference = squaredDifference;
						minimumDx = dx;
						minimumDy = dy;
					}
				}
			}

			/* A zero search radius always yields depth 0, as does an empty search. */
			if ((minimumSquaredDifference != -1) && (maximumDisplacement != 0))
			{
				depth[y * imageWidth + x] = displacementNaive(minimumDx, minimumDy);
			}
			else
			{
				depth[y * imageWidth + x] = 0;
			}
		}
	}
}
/*
 * Computes a depth map from a left/right image pair by block matching, with
 * the per-box sum of squared differences vectorized with SSE.
 *
 * depth, left and right are indexed as [y * imageWidth + x].
 *
 * For every pixel whose feature box (+/- featureWidth columns, +/- featureHeight
 * rows) fits inside the image, every displacement (dx, dy) within
 * +/- maximumDisplacement whose displaced box also fits is scored by the sum
 * of squared differences between the left box and the displaced right box.
 * The lowest score wins; equal scores are resolved in favour of the smaller
 * displacementNaive(dx, dy). The pixel's depth is displacementNaive() of the
 * winning displacement, or 0 for edge pixels, when nothing was scored, and
 * when maximumDisplacement == 0.
 */
void calcDepthOptimized(float *depth, float *left, float *right, int imageWidth, int imageHeight, int featureWidth, int featureHeight, int maximumDisplacement)
{
	/*
	 * For odd featureWidth the last three box columns are read with a single
	 * 4-wide load that starts one column to their left. That extra column is
	 * inside the box only when featureWidth >= 3. For featureWidth == 1 it
	 * lies left of the box, and for x == 1 on image row 0 it lies before the
	 * start of the buffer, so that case must not use the wide load.
	 */
	const int tailLoadInBox = (featureWidth >= 3);

	/*
	 * Rows are the outer loop so consecutive pixels are adjacent in memory.
	 * Each pixel is computed independently, so the traversal order does not
	 * affect the output.
	 */
	for (int y = 0; y < imageHeight; y++)
	{
		for (int x = 0; x < imageWidth; x++)
		{
			/* Edge of the image: a feature box cannot fit around this pixel. */
			if ((y < featureHeight) || (y >= imageHeight - featureHeight) || (x < featureWidth) || (x >= imageWidth - featureWidth))
			{
				depth[y * imageWidth + x] = 0;
				continue;
			}

			/* -1 marks "no candidate scored yet"; real scores are never negative. */
			float minimumSquaredDifference = -1;
			int minimumDy = 0;
			int minimumDx = 0;

			for (int dx = -maximumDisplacement; dx <= maximumDisplacement; dx++)
			{
				for (int dy = -maximumDisplacement; dy <= maximumDisplacement; dy++)
				{
					/* Skip displacements whose box leaves the right image. */
					if (y + dy - featureHeight < 0 || y + dy + featureHeight >= imageHeight || x + dx - featureWidth < 0 || x + dx + featureWidth >= imageWidth)
					{
						continue;
					}

					float squaredDifference = 0;
					float lanes[4];
					__m128 sum = _mm_setzero_ps();

					/* All complete groups of four columns, starting at the box's left edge. */
					for (int boxX = -featureWidth; boxX + 4 <= featureWidth; boxX += 4)
					{
						for (int boxY = -featureHeight; boxY <= featureHeight; boxY++)
						{
							const int leftI = (y + boxY) * imageWidth + (x + boxX);
							const int rightI = (y + dy + boxY) * imageWidth + (x + dx + boxX);
							__m128 sqdiff = _mm_sub_ps(_mm_loadu_ps(left + leftI), _mm_loadu_ps(right + rightI));
							sqdiff = _mm_mul_ps(sqdiff, sqdiff);
							sum = _mm_add_ps(sqdiff, sum);
						}
					}
					_mm_storeu_ps(lanes, sum);
					squaredDifference += lanes[0] + lanes[1] + lanes[2] + lanes[3];

					/*
					 * The remaining columns can only add to the total, so a
					 * partial sum already above the best score cannot win.
					 */
					if (squaredDifference > minimumSquaredDifference && minimumSquaredDifference != -1)
					{
						continue;
					}

					if (featureWidth % 2 == 0)
					{
						/* Even featureWidth: exactly one column is left, at +featureWidth. */
						for (int boxY = -featureHeight; boxY <= featureHeight; boxY++)
						{
							const int leftI = (y + boxY) * imageWidth + (x + featureWidth);
							const int rightI = (y + dy + boxY) * imageWidth + (x + dx + featureWidth);
							const float difference = left[leftI] - right[rightI];
							squaredDifference += difference * difference;
						}
					}
					else
					{
						/*
						 * Odd featureWidth: three columns are left, at
						 * featureWidth-2 .. featureWidth. They are accumulated
						 * in lanes 1..3; lane 0 is never added to the total.
						 */
						__m128 tailSum = _mm_setzero_ps();
						for (int boxY = -featureHeight; boxY <= featureHeight; boxY++)
						{
							const float *leftTail = left + ((y + boxY) * imageWidth + (x + featureWidth - 2));
							const float *rightTail = right + ((y + dy + boxY) * imageWidth + (x + dx + featureWidth - 2));
							__m128 leftV;
							__m128 rightV;
							if (tailLoadInBox)
							{
								leftV = _mm_loadu_ps(leftTail - 1);
								rightV = _mm_loadu_ps(rightTail - 1);
							}
							else
							{
								/* Read only the three in-box floats; lane 0 is a dummy. */
								leftV = _mm_set_ps(leftTail[2], leftTail[1], leftTail[0], 0.0f);
								rightV = _mm_set_ps(rightTail[2], rightTail[1], rightTail[0], 0.0f);
							}
							__m128 sqdiff = _mm_sub_ps(leftV, rightV);
							sqdiff = _mm_mul_ps(sqdiff, sqdiff);
							tailSum = _mm_add_ps(sqdiff, tailSum);
						}
						_mm_storeu_ps(lanes, tailSum);
						squaredDifference += lanes[1] + lanes[2] + lanes[3];
					}

					/*
					 * Take this candidate when it is the first one, when it
					 * ties the best score with a smaller displacement, or when
					 * it has a strictly lower score.
					 */
					if ((minimumSquaredDifference == -1) || ((minimumSquaredDifference == squaredDifference) && (displacementNaive(dx, dy) < displacementNaive(minimumDx, minimumDy))) || (minimumSquaredDifference > squaredDifference))
					{
						minimumSquaredDifference = squaredDifference;
						minimumDx = dx;
						minimumDy = dy;
					}
				}
			}

			/* A zero search radius always yields depth 0, as does an empty search. */
			if (minimumSquaredDifference != -1 && maximumDisplacement != 0)
			{
				depth[y * imageWidth + x] = displacementNaive(minimumDx, minimumDy);
			}
			else
			{
				depth[y * imageWidth + x] = 0;
			}
		}
	}
}
/*
 * Scalar reference implementation of the block-matching depth map.
 *
 * depth, left and right are indexed as [y * imageWidth + x].
 *
 * For every pixel whose feature box (+/- featureWidth columns, +/- featureHeight
 * rows) fits inside the image, every displacement (dx, dy) within
 * +/- maximumDisplacement whose displaced box also fits is scored by the sum
 * of squared differences between the left box and the displaced right box.
 * The lowest score wins; equal scores are resolved in favour of the smaller
 * displacementNaive(dx, dy). The pixel's depth is displacementNaive() of the
 * winning displacement, or 0 for edge pixels, when nothing was scored, and
 * when maximumDisplacement == 0.
 *
 * floatOps may be NULL. Otherwise it is reset to 0 and then increased by 3
 * for every box element compared (one subtraction, one multiplication and
 * one addition).
 */
void calcDepth(float *depth, float *left, float *right, int imageWidth, int imageHeight, int featureWidth, int featureHeight, int maximumDisplacement, size_t* floatOps)
{
	const int countOps = (floatOps != NULL);

	if (countOps)
	{
		*floatOps = 0;
	}

	for (int row = 0; row < imageHeight; row++)
	{
		/* True when a feature box centred on this row fits vertically. */
		const int rowFits = (row >= featureHeight) && (row < imageHeight - featureHeight);

		for (int col = 0; col < imageWidth; col++)
		{
			float *out = depth + (row * imageWidth + col);
			const int colFits = (col >= featureWidth) && (col < imageWidth - featureWidth);

			/* Edge of the image: no feature box fits here. */
			if (!rowFits || !colFits)
			{
				*out = 0;
				continue;
			}

			/* -1 marks "no candidate scored yet"; real scores are never negative. */
			float bestScore = -1;
			int bestDx = 0;
			int bestDy = 0;

			for (int dy = -maximumDisplacement; dy <= maximumDisplacement; dy++)
			{
				const int shiftedRow = row + dy;

				/* The displaced box leaves the image vertically for every dx. */
				if (shiftedRow - featureHeight < 0 || shiftedRow + featureHeight >= imageHeight)
				{
					continue;
				}

				for (int dx = -maximumDisplacement; dx <= maximumDisplacement; dx++)
				{
					const int shiftedCol = col + dx;

					/* The displaced box leaves the image horizontally. */
					if (shiftedCol - featureWidth < 0 || shiftedCol + featureWidth >= imageWidth)
					{
						continue;
					}

					/* Sum of squared differences over the box, row by row. */
					float score = 0;
					for (int offY = -featureHeight; offY <= featureHeight; offY++)
					{
						const float *leftCentre = left + ((row + offY) * imageWidth + col);
						const float *rightCentre = right + ((shiftedRow + offY) * imageWidth + shiftedCol);

						for (int offX = -featureWidth; offX <= featureWidth; offX++)
						{
							const float delta = leftCentre[offX] - rightCentre[offX];
							score += delta * delta;

							if (countOps)
							{
								*floatOps += 3;
							}
						}
					}

					/*
					 * Accept the first candidate, any strictly lower score, or
					 * an equal score reached with a smaller displacement.
					 */
					int accept;
					if (bestScore == -1)
					{
						accept = 1;
					}
					else if (bestScore > score)
					{
						accept = 1;
					}
					else
					{
						accept = (bestScore == score) && (displacementNaive(dx, dy) < displacementNaive(bestDx, bestDy));
					}

					if (accept)
					{
						bestScore = score;
						bestDx = dx;
						bestDy = dy;
					}
				}
			}

			/* A zero search radius always yields depth 0, as does an empty search. */
			if (bestScore == -1 || maximumDisplacement == 0)
			{
				*out = 0;
			}
			else
			{
				*out = displacementNaive(bestDx, bestDy);
			}
		}
	}
}