AbstractBuffer<int32_t> ADCensus::constructDisparityMap(const AbstractBuffer<pixel> *leftImage, const AbstractBuffer<pixel> *rightImage, const AbstractBuffer<grayPixel> *leftGrayImage, const AbstractBuffer<grayPixel> *rightGrayImage) { // Initialization int width = leftImage->w; int height = leftImage->h; BaseTimeStatisticsCollector collector; Statistics outerStats; outerStats.setValue("H", height); outerStats.setValue("W", width); AbstractBuffer<int32_t> bestDisparities = AbstractBuffer<int32_t>(height, width); AbstractBuffer<COST_TYPE> minCosts = AbstractBuffer<COST_TYPE>(height, width); minCosts.fillWith(-1); // Disparity computation outerStats.startInterval(); AbstractBuffer<int64_t> leftCensus = AbstractBuffer<int64_t>(height, width); AbstractBuffer<int64_t> rightCensus = AbstractBuffer<int64_t>(height, width); makeCensus(leftGrayImage, leftCensus); makeCensus(rightGrayImage, rightCensus); outerStats.resetInterval("Making census"); makeAggregationCrosses(leftImage); outerStats.resetInterval("Making aggregation crosses"); for (uint i = 0; i < CORE_COUNT_OF(table1); i++) { table1[i] = robust(i, lambdaCT); table2[i] = robust(i, lambdaAD); } bool parallelDisp = true; parallelable_for(0, width / 3, [this, &minCosts, &bestDisparities, &leftImage, &rightImage, &leftCensus, &rightCensus, &collector, height, width, parallelDisp](const BlockedRange<int> &r) { for (int d = r.begin(); d != r.end(); ++d) { Statistics stats; stats.startInterval(); AbstractBuffer<COST_TYPE> costs = AbstractBuffer<COST_TYPE>(height, width); stats.resetInterval("Matrix construction"); parallelable_for(windowHh, height - windowHh, [this, &costs, &leftImage, &rightImage, &leftCensus, &rightCensus, d, width](const BlockedRange<int> &r) { for (int y = r.begin(); y != r.end(); ++y) { auto *im1 = &leftImage->element(y, windowWh + d); auto *im2 = &rightImage->element(y, windowWh); int64_t *cen1 = &leftCensus.element(y, windowWh + d); int64_t *cen2 = &rightCensus.element(y, windowWh); int x = windowWh + d; #ifdef WITH_SSE for (; x < width - windowWh; x += 8) { FixedVector<Int16x8, 4> c1 = SSEReader8BBBB_DDDD::read((uint32_t *)im1); FixedVector<Int16x8, 4> c2 = SSEReader8BBBB_DDDD::read((uint32_t *)im2); UInt16x8 dr = SSEMath::difference(UInt16x8(c1[RGBColor::FIELD_R]), UInt16x8(c2[RGBColor::FIELD_R])); UInt16x8 dg = SSEMath::difference(UInt16x8(c1[RGBColor::FIELD_G]), UInt16x8(c2[RGBColor::FIELD_G])); UInt16x8 db = SSEMath::difference(UInt16x8(c1[RGBColor::FIELD_B]), UInt16x8(c2[RGBColor::FIELD_B])); UInt16x8 ad = (dr + dg + db) >> 2; Int16x8 cost_ad = Int16x8(robustLUTAD(ad[0]), robustLUTAD(ad[1]), robustLUTAD(ad[2]), robustLUTAD(ad[3]), robustLUTAD(ad[4]), robustLUTAD(ad[5]), robustLUTAD(ad[6]), robustLUTAD(ad[7])); Int64x2 cen10(&cen1[0]); Int64x2 cen12(&cen1[2]); Int64x2 cen14(&cen1[4]); Int64x2 cen16(&cen1[6]); Int64x2 cen20(&cen2[0]); Int64x2 cen22(&cen2[2]); Int64x2 cen24(&cen2[4]); Int64x2 cen26(&cen2[6]); Int64x2 diff0 = cen10 ^ cen20; Int64x2 diff2 = cen12 ^ cen22; Int64x2 diff4 = cen14 ^ cen24; Int64x2 diff6 = cen16 ^ cen26; Int16x8 cost_ct(robustLUTCen(_mm_popcnt_u64(diff0.getInt(0))), robustLUTCen(_mm_popcnt_u64(diff0.getInt(1))), robustLUTCen(_mm_popcnt_u64(diff2.getInt(0))), robustLUTCen(_mm_popcnt_u64(diff2.getInt(1))), robustLUTCen(_mm_popcnt_u64(diff4.getInt(0))), robustLUTCen(_mm_popcnt_u64(diff4.getInt(1))), robustLUTCen(_mm_popcnt_u64(diff6.getInt(0))), robustLUTCen(_mm_popcnt_u64(diff6.getInt(1)))); Int16x8 cost_total = cost_ad + cost_ct; for (int i = 0; i < 8; ++i) { costs.element(y, x + i) = cost_total[i]; } im1 += 8; im2 += 8; cen1+= 8; cen2+= 8; } #else for (; x < width - windowWh; ++x) { uint8_t c_ad = costAD(*im1, *im2); uint8_t c_census = hammingDist(*cen1, *cen2); costs.element(y, x) = robustLUTCen(c_census) + robustLUTAD(c_ad); im1 ++; im2 ++; cen1++; cen2++; } #endif } }, !parallelDisp ); stats.resetInterval("Cost computation"); aggregateCosts(&costs, windowWh + d, windowHh, width - windowWh, height - windowHh); stats.resetInterval("Cost aggregation"); for (int x = windowWh + d; x < width - windowWh; ++x) { for (int y = windowHh; y < height - windowHh; ++y) { tbb::mutex::scoped_lock(bestDisparitiesMutex); if(costs.element(y, x) < minCosts.element(y, x)) { minCosts.element(y, x) = costs.element(y, x); bestDisparities.element(y, x) = d; //result.element(y,x) = (bestDisparities.element(y, x) / (double)width * 255 * 3); } } } //BMPLoader().save("../../result.bmp", result); stats.endInterval("Comparing with previous minimum"); collector.addStatistics(stats); } }, parallelDisp);
void RGB24Buffer::fillWithYUYV (uint8_t *yuyv) { for (int i = 0; i < h; i++) { int j = 0; #ifdef WITH_SSE const int span = SSEReader8BBBBBBBB_DDDDDDDD::BYTE_STEP / sizeof(RGBColor); /* Checking that we have a full span to read */ for (; j + span <= w ; j += span) { FixedVector<Int16x8,4> r = SSEReader8BBBB_DDDD::read((uint32_t *)yuyv); Int16x8 cy1 = r[0] - Int16x8((uint16_t) 16); Int16x8 cu = r[1] - Int16x8((uint16_t)128); Int16x8 cy2 = r[2] - Int16x8((uint16_t) 16); Int16x8 cv = r[3] - Int16x8((uint16_t)128); Int16x8 con0 ((int16_t)0); Int16x8 con255((int16_t)0xFF); /* coefficients */ /* This is a hack to fit into 16bit register */ Int16x8 con100((uint16_t)(100 / 4)); Int16x8 con128((uint16_t)(128 / 4)); Int16x8 con208((uint16_t)(208 / 4)); Int16x8 con298((uint16_t)(298 / 4)); Int16x8 con516((uint16_t)(516 / 4)); Int16x8 con409((uint16_t)(409 / 4)); FixedVector<Int16x8, 8> result; enum { B1, G1, R1, ZERO1, B2, G2, R2, ZERO2 }; Int16x8 dr = con128 + con409 * cv; Int16x8 dg = con128 - con100 * cu - con208 * cv; Int16x8 db = con128 + con516 * cu ; Int16x8 dy1 = con298 * cy1; Int16x8 dy2 = con298 * cy2; result[R1] = (dy1 + dr) >> 6; result[G1] = (dy1 + dg) >> 6; result[B1] = (dy1 + db) >> 6; result[ZERO1] = Int16x8((int16_t)0); result[R2] = (dy2 + dr) >> 6; result[G2] = (dy2 + dg) >> 6; result[B2] = (dy2 + db) >> 6; result[ZERO2] = Int16x8((int16_t)0); #ifdef USE_NONUNROLLED_LOOP /* TODO: Use saturated arithmetics instead probably*/ for (int k = B1; k < ZERO1; k++) { result[k] = SSEMath::selector(result[k] > con255, con255 , result[k]); result[k] = SSEMath::selector(result[k] > con0 , result[k], con0 ); int k1 = k + B2; result[k1] = SSEMath::selector(result[k1] > con255, con255 , result[k1]); result[k1] = SSEMath::selector(result[k1] > con0 , result[k1], con0 ); } #else result[R1] = SSEMath::selector(result[R1] > con255, con255 , result[R1]); result[R1] = SSEMath::selector(result[R1] > con0 , result[R1], con0 ); result[G1] = SSEMath::selector(result[G1] > con255, con255 , result[G1]); result[G1] = SSEMath::selector(result[G1] > con0 , result[G1], con0 ); result[B1] = SSEMath::selector(result[B1] > con255, con255 , result[B1]); result[B1] = SSEMath::selector(result[B1] > con0 , result[B1], con0 ); result[R2] = SSEMath::selector(result[R2] > con255, con255 , result[R2]); result[R2] = SSEMath::selector(result[R2] > con0 , result[R2], con0 ); result[G2] = SSEMath::selector(result[G2] > con255, con255 , result[G2]); result[G2] = SSEMath::selector(result[G2] > con0 , result[G2], con0 ); result[B2] = SSEMath::selector(result[B2] > con255, con255 , result[B2]); result[B2] = SSEMath::selector(result[B2] > con0 , result[B2], con0 ); #endif SSEReader8BBBBBBBB_DDDDDDDD::write(result, (uint32_t *)&element(i,j)); yuyv += SSEReader8BBBB_DDDD::BYTE_STEP; } #endif for (; j + 2 <= w; j+=2) { int y1 = yuyv[0]; int u = yuyv[1]; int y2 = yuyv[2]; int v = yuyv[3]; int cy1 = y1 - 16; int cu = u - 128; int cy2 = y2 - 16; int cv = v - 128; int r1 = ((298 * cy1 + 409 * cv + 128) >> 8); int g1 = ((298 * cy1 - 100 * cu - 208 * cv + 128) >> 8); int b1 = ((298 * cy1 + 516 * cu + 128) >> 8); if (r1 > 255) r1 = 255; if (r1 < 0) r1 = 0; if (g1 > 255) g1 = 255; if (g1 < 0) g1 = 0; if (b1 > 255) b1 = 255; if (b1 < 0) b1 = 0; int r2 = ((298 * cy2 + 409 * cv + 128) >> 8); int g2 = ((298 * cy2 - 100 * cu - 208 * cv + 128) >> 8); int b2 = ((298 * cy2 + 516 * cu + 128) >> 8); if (r2 > 255) r2 = 255; if (r2 < 0) r2 = 0; if (g2 > 255) g2 = 255; if (g2 < 0) g2 = 0; if (b2 > 255) b2 = 255; if (b2 < 0) b2 = 0; element(i,j) = RGBColor(r1,g1,b1); element(i,j + 1) = RGBColor(r2,g2,b2); yuyv += 4; } } }