/**
 * Builds a disparity map for a stereo pair.
 *
 * Visible stages:
 *   1. census descriptors are computed for both gray images (makeCensus);
 *   2. aggregation crosses are built from the left color image (makeAggregationCrosses);
 *   3. table1 / table2 are filled with robust(i, lambdaCT) / robust(i, lambdaAD);
 *   4. for every disparity d in [0, width / 3) a cost buffer is computed as
 *      robustLUTCen(census distance) + robustLUTAD(color difference), aggregated with
 *      aggregateCosts(), and merged into the running per-pixel minimum
 *      (minCosts / bestDisparities).
 *
 * @param leftImage       left color image; its w / h define the working size
 * @param rightImage      right color image
 * @param leftGrayImage   left gray image, input of the left census transform
 * @param rightGrayImage  right gray image, input of the right census transform
 */
AbstractBuffer<int32_t> ADCensus::constructDisparityMap(const AbstractBuffer<pixel> *leftImage, const AbstractBuffer<pixel> *rightImage,
                                                        const AbstractBuffer<grayPixel> *leftGrayImage, const AbstractBuffer<grayPixel> *rightGrayImage) {
    // Initialization

    int width = leftImage->w;
    int height = leftImage->h;

    BaseTimeStatisticsCollector collector;
    Statistics outerStats;
    outerStats.setValue("H", height);
    outerStats.setValue("W", width);

    AbstractBuffer<int32_t> bestDisparities = AbstractBuffer<int32_t>(height, width);
    AbstractBuffer<COST_TYPE> minCosts = AbstractBuffer<COST_TYPE>(height, width);
    // NOTE(review): -1 only works as a "worse than anything" sentinel if COST_TYPE is unsigned
    // (it then wraps to the maximum value) - confirm against the COST_TYPE definition.
    minCosts.fillWith(-1);

    // Disparity computation
    outerStats.startInterval();

    AbstractBuffer<int64_t> leftCensus  = AbstractBuffer<int64_t>(height, width);
    AbstractBuffer<int64_t> rightCensus = AbstractBuffer<int64_t>(height, width);
    makeCensus(leftGrayImage, leftCensus);
    makeCensus(rightGrayImage, rightCensus);
    outerStats.resetInterval("Making census");

    makeAggregationCrosses(leftImage);
    outerStats.resetInterval("Making aggregation crosses");

    // Precompute robust() for every table index
    // (presumably the tables behind robustLUTCen / robustLUTAD - TODO confirm)
    for (uint i = 0; i < CORE_COUNT_OF(table1); i++)
    {
        table1[i] = robust(i, lambdaCT);
        table2[i] = robust(i, lambdaAD);
    }

    // Passed as the last argument of the outer parallelable_for and negated for the inner one,
    // presumably so that only one of the two nesting levels runs in parallel - TODO confirm
    bool parallelDisp = true;

    parallelable_for(0, width / 3,
                     [this, &minCosts, &bestDisparities, &leftImage, &rightImage,
                     &leftCensus, &rightCensus, &collector, height, width, parallelDisp](const BlockedRange<int> &r)
    {
        for (int d = r.begin(); d != r.end(); ++d) {
            Statistics stats;
            stats.startInterval();
            AbstractBuffer<COST_TYPE> costs = AbstractBuffer<COST_TYPE>(height, width);
            stats.resetInterval("Matrix construction");

            parallelable_for(windowHh, height - windowHh,
                             [this, &costs, &leftImage, &rightImage, &leftCensus, &rightCensus, d, width](const BlockedRange<int> &r)
            {
                for (int y = r.begin(); y != r.end(); ++y) {
                    // Left-image column x is matched against right-image column x - d
                    auto *im1 = &leftImage->element(y, windowWh + d);
                    auto *im2 = &rightImage->element(y, windowWh);

                    int64_t *cen1 = &leftCensus.element(y, windowWh + d);
                    int64_t *cen2 = &rightCensus.element(y, windowWh);

                    int x = windowWh + d;

#ifdef WITH_SSE
                    // NOTE(review): this loop advances 8 pixels per step and has no scalar tail.
                    // When the span [windowWh + d, width - windowWh) is not a multiple of 8 the
                    // last step reads and writes past width - windowWh - confirm that the
                    // buffers are padded accordingly.
                    for (; x < width - windowWh; x += 8) {
                        FixedVector<Int16x8, 4> c1 = SSEReader8BBBB_DDDD::read((uint32_t *)im1);
                        FixedVector<Int16x8, 4> c2 = SSEReader8BBBB_DDDD::read((uint32_t *)im2);

                        UInt16x8 dr = SSEMath::difference(UInt16x8(c1[RGBColor::FIELD_R]), UInt16x8(c2[RGBColor::FIELD_R]));
                        UInt16x8 dg = SSEMath::difference(UInt16x8(c1[RGBColor::FIELD_G]), UInt16x8(c2[RGBColor::FIELD_G]));
                        UInt16x8 db = SSEMath::difference(UInt16x8(c1[RGBColor::FIELD_B]), UInt16x8(c2[RGBColor::FIELD_B]));

                        // Sum of the three channel differences, divided by 4
                        UInt16x8 ad = (dr + dg + db) >> 2;
                        Int16x8 cost_ad = Int16x8(robustLUTAD(ad[0]),
                                                  robustLUTAD(ad[1]),
                                                  robustLUTAD(ad[2]),
                                                  robustLUTAD(ad[3]),
                                                  robustLUTAD(ad[4]),
                                                  robustLUTAD(ad[5]),
                                                  robustLUTAD(ad[6]),
                                                  robustLUTAD(ad[7]));

                        Int64x2 cen10(&cen1[0]);
                        Int64x2 cen12(&cen1[2]);
                        Int64x2 cen14(&cen1[4]);
                        Int64x2 cen16(&cen1[6]);

                        Int64x2 cen20(&cen2[0]);
                        Int64x2 cen22(&cen2[2]);
                        Int64x2 cen24(&cen2[4]);
                        Int64x2 cen26(&cen2[6]);

                        // XOR + popcount gives the number of differing census bits
                        Int64x2 diff0 = cen10 ^ cen20;
                        Int64x2 diff2 = cen12 ^ cen22;
                        Int64x2 diff4 = cen14 ^ cen24;
                        Int64x2 diff6 = cen16 ^ cen26;

                        Int16x8 cost_ct(robustLUTCen(_mm_popcnt_u64(diff0.getInt(0))), robustLUTCen(_mm_popcnt_u64(diff0.getInt(1))),
                                        robustLUTCen(_mm_popcnt_u64(diff2.getInt(0))), robustLUTCen(_mm_popcnt_u64(diff2.getInt(1))),
                                        robustLUTCen(_mm_popcnt_u64(diff4.getInt(0))), robustLUTCen(_mm_popcnt_u64(diff4.getInt(1))),
                                        robustLUTCen(_mm_popcnt_u64(diff6.getInt(0))), robustLUTCen(_mm_popcnt_u64(diff6.getInt(1))));

                        Int16x8 cost_total = cost_ad + cost_ct;
                        for (int i = 0; i < 8; ++i) {
                            costs.element(y, x + i) = cost_total[i];
                        }

                        im1 += 8;
                        im2 += 8;
                        cen1+= 8;
                        cen2+= 8;
                    }
#else
                    for (; x < width - windowWh; ++x) {
                        uint8_t c_ad = costAD(*im1, *im2);
                        uint8_t c_census = hammingDist(*cen1, *cen2);

                        costs.element(y, x) = robustLUTCen(c_census) + robustLUTAD(c_ad);

                        im1 ++;
                        im2 ++;
                        cen1++;
                        cen2++;
                    }
#endif
                }
            }, !parallelDisp
            );

            stats.resetInterval("Cost computation");

            aggregateCosts(&costs, windowWh + d, windowHh, width - windowWh, height - windowHh);

            stats.resetInterval("Cost aggregation");

            {
                // minCosts / bestDisparities are shared between all disparity workers, so the
                // merge has to be serialized. The lock object must be *named*: the statement
                // `tbb::mutex::scoped_lock(bestDisparitiesMutex);` declares a default-constructed
                // local lock called bestDisparitiesMutex and acquires nothing.
                // The lock is taken once per disparity instead of once per pixel.
                tbb::mutex::scoped_lock lock(bestDisparitiesMutex);

                for (int x = windowWh + d; x < width - windowWh; ++x) {
                    for (int y = windowHh; y < height - windowHh; ++y) {
                        if(costs.element(y, x) < minCosts.element(y, x)) {
                            minCosts.element(y, x) = costs.element(y, x);
                            bestDisparities.element(y, x) = d;

                            //result.element(y,x) = (bestDisparities.element(y, x) / (double)width * 255 * 3);
                        }
                    }
                }
            }
            //BMPLoader().save("../../result.bmp", result);

            stats.endInterval("Comparing with previous minimum");
            collector.addStatistics(stats);

        }
    }, parallelDisp);
/**
 * Fills the buffer from packed YUYV data.
 *
 * The input is consumed sequentially, four bytes (Y1, U, Y2, V) per two output pixels,
 * row after row with no gap between rows. Every row is converted with the integer
 * formulas
 *     R = (298 * (Y - 16)                   + 409 * (V - 128) + 128) >> 8
 *     G = (298 * (Y - 16) - 100 * (U - 128) - 208 * (V - 128) + 128) >> 8
 *     B = (298 * (Y - 16) + 516 * (U - 128)                   + 128) >> 8
 * followed by clamping to [0, 255].
 *
 * With WITH_SSE defined, full vector spans of a row go through a 16-bit SIMD variant
 * that uses the same coefficients divided by 4 and a shift by 6; the remainder of the
 * row is handled by the scalar loop. A trailing odd pixel of a row is left untouched.
 *
 * @param yuyv pointer to the packed YUYV input
 */
void RGB24Buffer::fillWithYUYV (uint8_t *yuyv)
{
    /* Saturates a scalar channel value to the 0..255 range */
    auto clampChannel = [](int value) -> int
    {
        if (value > 255) return 255;
        if (value < 0)   return 0;
        return value;
    };

    for (int row = 0; row < h; row++)
    {
        int col = 0;

#ifdef WITH_SSE
        /* Number of output pixels produced by one vector step */
        const int pixelsPerStep = SSEReader8BBBBBBBB_DDDDDDDD::BYTE_STEP / sizeof(RGBColor);

        /* Only full vector steps are processed here, the scalar loop below picks up the rest */
        for (; col + pixelsPerStep <= w ; col += pixelsPerStep)
        {
            enum {
                B1,
                G1,
                R1,
                ZERO1,
                B2,
                G2,
                R2,
                ZERO2
            };

            FixedVector<Int16x8,4> packed = SSEReader8BBBB_DDDD::read((uint32_t *)yuyv);

            Int16x8 cy1 = packed[0] - Int16x8((uint16_t) 16);
            Int16x8 cu  = packed[1] - Int16x8((uint16_t)128);
            Int16x8 cy2 = packed[2] - Int16x8((uint16_t) 16);
            Int16x8 cv  = packed[3] - Int16x8((uint16_t)128);

            /* Clamping bounds */
            Int16x8 con0  ((int16_t)0);
            Int16x8 con255((int16_t)0xFF);

            /* Coefficients are pre-divided by 4 so that the products fit into 16-bit lanes */
            Int16x8 con298((uint16_t)(298 / 4));
            Int16x8 con409((uint16_t)(409 / 4));
            Int16x8 con100((uint16_t)(100 / 4));
            Int16x8 con208((uint16_t)(208 / 4));
            Int16x8 con516((uint16_t)(516 / 4));
            Int16x8 con128((uint16_t)(128 / 4));

            /* Saturates every lane to the 0..255 range */
            auto saturate = [&con0, &con255](Int16x8 value) -> Int16x8
            {
                value = SSEMath::selector(value > con255, con255, value);
                return  SSEMath::selector(value > con0  , value , con0 );
            };

            /* Chroma terms are shared by both pixels of a pair */
            Int16x8 dr = con128               + con409 * cv;
            Int16x8 dg = con128 - con100 * cu - con208 * cv;
            Int16x8 db = con128 + con516 * cu              ;

            /* Luma terms, one per pixel of a pair */
            Int16x8 dy1 = con298 * cy1;
            Int16x8 dy2 = con298 * cy2;

            FixedVector<Int16x8, 8> result;

            result[B1]    = (dy1 + db) >> 6;
            result[G1]    = (dy1 + dg) >> 6;
            result[R1]    = (dy1 + dr) >> 6;
            result[ZERO1] = Int16x8((int16_t)0);

            result[B2]    = (dy2 + db) >> 6;
            result[G2]    = (dy2 + dg) >> 6;
            result[R2]    = (dy2 + dr) >> 6;
            result[ZERO2] = Int16x8((int16_t)0);

#ifdef USE_NONUNROLLED_LOOP
            /* TODO: Use saturated arithmetics instead probably*/
            for (int k = B1; k < ZERO1; k++)
            {
                result[k]      = saturate(result[k]);
                result[k + B2] = saturate(result[k + B2]);
            }
#else
            result[R1] = saturate(result[R1]);
            result[G1] = saturate(result[G1]);
            result[B1] = saturate(result[B1]);

            result[R2] = saturate(result[R2]);
            result[G2] = saturate(result[G2]);
            result[B2] = saturate(result[B2]);
#endif

            SSEReader8BBBBBBBB_DDDDDDDD::write(result, (uint32_t *)&element(row, col));
            yuyv += SSEReader8BBBB_DDDD::BYTE_STEP;
        }
#endif

        /* Scalar path: one Y1 U Y2 V quad yields two neighbouring pixels */
        for (; col + 2 <= w; col += 2)
        {
            const int cy1 = yuyv[0] -  16;
            const int cu  = yuyv[1] - 128;
            const int cy2 = yuyv[2] -  16;
            const int cv  = yuyv[3] - 128;

            /* Chroma contribution including the rounding term, common to both pixels */
            const int chromaR =             409 * cv + 128;
            const int chromaG = - 100 * cu - 208 * cv + 128;
            const int chromaB =   516 * cu            + 128;

            const int luma1 = 298 * cy1;
            const int luma2 = 298 * cy2;

            const int r1 = clampChannel((luma1 + chromaR) >> 8);
            const int g1 = clampChannel((luma1 + chromaG) >> 8);
            const int b1 = clampChannel((luma1 + chromaB) >> 8);

            const int r2 = clampChannel((luma2 + chromaR) >> 8);
            const int g2 = clampChannel((luma2 + chromaG) >> 8);
            const int b2 = clampChannel((luma2 + chromaB) >> 8);

            element(row, col)     = RGBColor(r1, g1, b1);
            element(row, col + 1) = RGBColor(r2, g2, b2);
            yuyv += 4;
        }
    }
}