Example #1
/**
 * Adds 2 pixels (in a 2x1 grid) and multiplies them by 4, yielding a more
 * precise version of box-filter 4:2:2 pixel subsampling in Q3.
 *
 * The CfL prediction buffer is always of size CFL_BUF_SQUARE. However, the
 * active area is specified using width and height.
 *
 * Note: We don't need to worry about going over the active area, as long as we
 * stay inside the CfL prediction buffer.
 */
static INLINE void cfl_luma_subsampling_422_hbd_ssse3(const uint16_t *input,
                                                      int input_stride,
                                                      uint16_t *pred_buf_q3,
                                                      int width, int height) {
  __m128i *pred_buf_m128i = (__m128i *)pred_buf_q3;
  const __m128i *end = pred_buf_m128i + height * CFL_BUF_LINE_I128;
  do {
    if (width == 4) {
      const __m128i top = _mm_loadl_epi64((__m128i *)input);
      const __m128i sum = _mm_slli_epi16(_mm_hadd_epi16(top, top), 2);
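      // _mm_storeh_epi32() is not a standard intrinsic; in libaom it is a
      // small local helper that stores the low 32 bits of the vector (here,
      // the two Q3 outputs) to memory.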
      _mm_storeh_epi32(pred_buf_m128i, sum);
    } else {
      const __m128i top = _mm_loadu_si128((__m128i *)input);
      if (width == 8) {
        const __m128i sum = _mm_slli_epi16(_mm_hadd_epi16(top, top), 2);
        _mm_storel_epi64(pred_buf_m128i, sum);
      } else {
        const __m128i top_1 = _mm_loadu_si128(((__m128i *)input) + 1);
        const __m128i sum = _mm_slli_epi16(_mm_hadd_epi16(top, top_1), 2);
        _mm_storeu_si128(pred_buf_m128i, sum);
        if (width == 32) {
          const __m128i top_2 = _mm_loadu_si128(((__m128i *)input) + 2);
          const __m128i top_3 = _mm_loadu_si128(((__m128i *)input) + 3);
          const __m128i sum_1 = _mm_slli_epi16(_mm_hadd_epi16(top_2, top_3), 2);
          _mm_storeu_si128(pred_buf_m128i + 1, sum_1);
        }
      }
    }
    pred_buf_m128i += CFL_BUF_LINE_I128;
    input += input_stride;
  } while (pred_buf_m128i < end);
}
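
For comparison, here is a scalar sketch of what the kernel above computes; it mirrors libaom's C fallback, and CFL_BUF_LINE (the output row stride in samples) is assumed from the same CfL code. Each 2x1 pair of luma samples becomes one Q3 output: (a + b) << 2 is exactly the pair's average shifted into Q3, with no rounding loss.

#include <stdint.h>

static void cfl_luma_subsampling_422_hbd_sketch(const uint16_t *input,
                                                int input_stride,
                                                uint16_t *pred_buf_q3,
                                                int width, int height) {
  for (int j = 0; j < height; j++) {
    for (int i = 0; i < width; i += 2) {
      // (a + b) << 2 == ((a + b) / 2) << 3: the 2x1 average in Q3.
      pred_buf_q3[i >> 1] = (input[i] + input[i + 1]) << 2;
    }
    input += input_stride;
    pred_buf_q3 += CFL_BUF_LINE;  // assumed from libaom's CfL code
  }
}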
Example #2
static void hor_add_sub_avx2(__m128i *row0, __m128i *row1) {
  __m128i a = _mm_hadd_epi16(*row0, *row1);
  __m128i b = _mm_hsub_epi16(*row0, *row1);

  __m128i c = _mm_hadd_epi16(a, b);
  __m128i d = _mm_hsub_epi16(a, b);

  *row0 = _mm_hadd_epi16(c, d);
  *row1 = _mm_hsub_epi16(c, d);
}
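
The three hadd/hsub levels above implement the butterfly stages of an 8-point Hadamard transform on each input row, with the outputs permuted and interleaved between *row0 and *row1; the permutation is harmless because SATD only sums absolute values afterwards. A scalar in-place Walsh-Hadamard butterfly, as a sketch of the same math:

#include <stdint.h>

// In-place 8-point fast Walsh-Hadamard transform; equivalent to the
// hadd/hsub network above up to output ordering.
static void fwht8(int16_t v[8]) {
  for (int len = 1; len < 8; len <<= 1) {  // butterfly strides 1, 2, 4
    for (int i = 0; i < 8; i += len << 1) {
      for (int j = i; j < i + len; j++) {
        const int16_t u = v[j], w = v[j + len];
        v[j] = u + w;        // sum lane
        v[j + len] = u - w;  // difference lane
      }
    }
  }
}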
Example #3
void kvz_eight_tap_filter_x8_and_flip(__m128i *data01, __m128i *data23, __m128i *data45, __m128i *data67, __m128i *filter, __m128i *dst)
{
  __m128i a, b, c, d;
  __m128i fir = _mm_broadcastq_epi64(_mm_loadl_epi64(filter));

  a = _mm_maddubs_epi16(*data01, fir);
  b = _mm_maddubs_epi16(*data23, fir);
  a = _mm_hadd_epi16(a, b);

  c = _mm_maddubs_epi16(*data45, fir);
  d = _mm_maddubs_epi16(*data67, fir);
  c = _mm_hadd_epi16(c, d);

  a = _mm_hadd_epi16(a, c);

  _mm_storeu_si128(dst, a);
}
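
Functionally, this evaluates eight 8-tap FIR dot products at once: each input register supplies the samples for two output pixels (8 bytes per pixel), _mm_maddubs_epi16 forms four 16-bit partial products per pixel, and the two hadd levels fold them into eight complete sums. A hypothetical scalar equivalent (ignoring the intermediate saturation maddubs can apply):

#include <stdint.h>

// Eight 8-tap dot products, one per output pixel. The layout is
// illustrative: px[i] holds the 8 source samples feeding output i.
static void eight_tap_x8_sketch(const uint8_t px[8][8],
                                const int8_t coeff[8], int16_t out[8]) {
  for (int i = 0; i < 8; i++) {
    int32_t sum = 0;
    for (int k = 0; k < 8; k++) {
      sum += px[i][k] * coeff[k];  // unsigned sample times signed tap
    }
    out[i] = (int16_t)sum;  // rounding and shifting are the caller's job
  }
}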
Example #4
/**
 * Adds 4 pixels (in a 2x2 grid) and multiplies them by 2, yielding a more
 * precise version of box-filter 4:2:0 pixel subsampling in Q3.
 *
 * The CfL prediction buffer is always of size CFL_BUF_SQUARE. However, the
 * active area is specified using width and height.
 *
 * Note: We don't need to worry about going over the active area, as long as we
 * stay inside the CfL prediction buffer.
 */
static INLINE void cfl_luma_subsampling_420_hbd_ssse3(const uint16_t *input,
                                                      int input_stride,
                                                      uint16_t *pred_buf_q3,
                                                      int width, int height) {
  const uint16_t *end = pred_buf_q3 + (height >> 1) * CFL_BUF_LINE;
  const int luma_stride = input_stride << 1;
  do {
    if (width == 4) {
      const __m128i top = _mm_loadl_epi64((__m128i *)input);
      const __m128i bot = _mm_loadl_epi64((__m128i *)(input + input_stride));
      __m128i sum = _mm_add_epi16(top, bot);
      sum = _mm_hadd_epi16(sum, sum);
      *((int *)pred_buf_q3) = _mm_cvtsi128_si32(_mm_add_epi16(sum, sum));
    } else {
      const __m128i top = _mm_loadu_si128((__m128i *)input);
      const __m128i bot = _mm_loadu_si128((__m128i *)(input + input_stride));
      __m128i sum = _mm_add_epi16(top, bot);
      if (width == 8) {
        sum = _mm_hadd_epi16(sum, sum);
        _mm_storel_epi64((__m128i *)pred_buf_q3, _mm_add_epi16(sum, sum));
      } else {
        const __m128i top_1 = _mm_loadu_si128(((__m128i *)input) + 1);
        const __m128i bot_1 =
            _mm_loadu_si128(((__m128i *)(input + input_stride)) + 1);
        sum = _mm_hadd_epi16(sum, _mm_add_epi16(top_1, bot_1));
        _mm_storeu_si128((__m128i *)pred_buf_q3, _mm_add_epi16(sum, sum));
        if (width == 32) {
          const __m128i top_2 = _mm_loadu_si128(((__m128i *)input) + 2);
          const __m128i bot_2 =
              _mm_loadu_si128(((__m128i *)(input + input_stride)) + 2);
          const __m128i top_3 = _mm_loadu_si128(((__m128i *)input) + 3);
          const __m128i bot_3 =
              _mm_loadu_si128(((__m128i *)(input + input_stride)) + 3);
          const __m128i sum_2 = _mm_add_epi16(top_2, bot_2);
          const __m128i sum_3 = _mm_add_epi16(top_3, bot_3);
          __m128i next_sum = _mm_hadd_epi16(sum_2, sum_3);
          _mm_storeu_si128(((__m128i *)pred_buf_q3) + 1,
                           _mm_add_epi16(next_sum, next_sum));
        }
      }
    }
    input += luma_stride;
  } while ((pred_buf_q3 += CFL_BUF_LINE) < end);
}
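
Again a scalar sketch for comparison, mirroring libaom's C fallback: each 2x2 block of luma samples becomes one Q3 output, since (a + b + c + d) << 1 is exactly the block's average shifted into Q3. The SIMD code realizes the << 1 as _mm_add_epi16(sum, sum).

#include <stdint.h>

static void cfl_luma_subsampling_420_hbd_sketch(const uint16_t *input,
                                                int input_stride,
                                                uint16_t *pred_buf_q3,
                                                int width, int height) {
  for (int j = 0; j < height; j += 2) {
    for (int i = 0; i < width; i += 2) {
      const int bot = i + input_stride;
      // (a+b+c+d) << 1 == ((a+b+c+d) / 4) << 3: the 2x2 average in Q3.
      pred_buf_q3[i >> 1] =
          (input[i] + input[i + 1] + input[bot] + input[bot + 1]) << 1;
    }
    input += input_stride << 1;
    pred_buf_q3 += CFL_BUF_LINE;  // assumed from libaom's CfL code
  }
}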
Example #5
static unsigned satd_8bit_4x4_avx2(const kvz_pixel *org, const kvz_pixel *cur)
{
  __m128i original = _mm_cvtepu8_epi16(_mm_loadl_epi64((__m128i*)org));
  __m128i current = _mm_cvtepu8_epi16(_mm_loadl_epi64((__m128i*)cur));

  __m128i diff_lo = _mm_sub_epi16(current, original);

  original = _mm_cvtepu8_epi16(_mm_loadl_epi64((__m128i*)(org + 8)));
  current = _mm_cvtepu8_epi16(_mm_loadl_epi64((__m128i*)(cur + 8)));

  __m128i diff_hi = _mm_sub_epi16(current, original);

  //Hor
  __m128i row0 = _mm_hadd_epi16(diff_lo, diff_hi);
  __m128i row1 = _mm_hsub_epi16(diff_lo, diff_hi);

  __m128i row2 = _mm_hadd_epi16(row0, row1);
  __m128i row3 = _mm_hsub_epi16(row0, row1);

  //Ver
  row0 = _mm_hadd_epi16(row2, row3);
  row1 = _mm_hsub_epi16(row2, row3);

  row2 = _mm_hadd_epi16(row0, row1);
  row3 = _mm_hsub_epi16(row0, row1);

  //Abs and sum
  row2 = _mm_abs_epi16(row2);
  row3 = _mm_abs_epi16(row3);

  row3 = _mm_add_epi16(row2, row3);

  row3 = _mm_add_epi16(row3, _mm_shuffle_epi32(row3, KVZ_PERMUTE(2, 3, 0, 1) ));
  row3 = _mm_add_epi16(row3, _mm_shuffle_epi32(row3, KVZ_PERMUTE(1, 0, 1, 0) ));
  row3 = _mm_add_epi16(row3, _mm_shufflelo_epi16(row3, KVZ_PERMUTE(1, 0, 1, 0) ));

  unsigned sum = _mm_extract_epi16(row3, 0);
  unsigned satd = (sum + 1) >> 1;

  return satd;
}
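
Spelled out in scalar form, the routine computes a standard 4x4 SATD: apply an unnormalized 4-point Hadamard transform to the residual horizontally and vertically, sum the absolute coefficients, and halve with rounding, matching the (sum + 1) >> 1 above. The hadd/hsub network yields the same coefficients up to a permutation, which the absolute-value sum cannot see. A sketch, assuming dense 4x4 blocks (stride 4) as the 8-byte loads above imply:

#include <stdint.h>
#include <stdlib.h>  // abs()

static unsigned satd_4x4_sketch(const uint8_t *org, const uint8_t *cur) {
  int d[4][4], t[4][4];
  for (int i = 0; i < 4; i++)
    for (int j = 0; j < 4; j++)
      d[i][j] = cur[i * 4 + j] - org[i * 4 + j];
  for (int i = 0; i < 4; i++) {  // horizontal 4-point Hadamard per row
    const int a = d[i][0] + d[i][1], b = d[i][0] - d[i][1];
    const int c = d[i][2] + d[i][3], e = d[i][2] - d[i][3];
    t[i][0] = a + c; t[i][1] = b + e; t[i][2] = a - c; t[i][3] = b - e;
  }
  unsigned sum = 0;
  for (int j = 0; j < 4; j++) {  // vertical pass, accumulating |coeff|
    const int a = t[0][j] + t[1][j], b = t[0][j] - t[1][j];
    const int c = t[2][j] + t[3][j], e = t[2][j] - t[3][j];
    sum += abs(a + c) + abs(b + e) + abs(a - c) + abs(b - e);
  }
  return (sum + 1) >> 1;  // same normalization as the SIMD version
}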
Example #6
static INLINE void ver_add_sub_avx2(__m128i (*temp_hor)[8], __m128i (*temp_ver)[8]) {
  // First stage
  for (int i = 0; i < 8; i += 2) {
    (*temp_ver)[i+0] = _mm_hadd_epi16((*temp_hor)[i + 0], (*temp_hor)[i + 1]);
    (*temp_ver)[i+1] = _mm_hsub_epi16((*temp_hor)[i + 0], (*temp_hor)[i + 1]);
  }

  // Second stage
  for (int i = 0; i < 8; i += 4) {
    (*temp_hor)[i + 0] = _mm_add_epi16((*temp_ver)[i + 0], (*temp_ver)[i + 2]);
    (*temp_hor)[i + 1] = _mm_add_epi16((*temp_ver)[i + 1], (*temp_ver)[i + 3]);
    (*temp_hor)[i + 2] = _mm_sub_epi16((*temp_ver)[i + 0], (*temp_ver)[i + 2]);
    (*temp_hor)[i + 3] = _mm_sub_epi16((*temp_ver)[i + 1], (*temp_ver)[i + 3]);
  }

  // Third stage
  for (int i = 0; i < 4; ++i) {
    (*temp_ver)[i + 0] = _mm_add_epi16((*temp_hor)[0 + i], (*temp_hor)[4 + i]);
    (*temp_ver)[i + 4] = _mm_sub_epi16((*temp_hor)[0 + i], (*temp_hor)[4 + i]);
  }
}
Example #7
__m128i test_mm_hadd_epi16(__m128i a, __m128i b) {
  // CHECK-LABEL: test_mm_hadd_epi16
  // CHECK: call <8 x i16> @llvm.x86.ssse3.phadd.w.128(<8 x i16> %{{.*}}, <8 x i16> %{{.*}})
  return _mm_hadd_epi16(a, b);
}
Example #8
    void matrixCalc(void *inputs, void *outputs, long in_bitRate)
    {
        //Validate the input
        TML::Matrix i(inputs,0);
        if (i.dims() != 2)
            throw "Input should be a 2D matrix";

        if (!i.isChar())
            throw "Input should have character data";

        if (i.planes() != 4)
            throw "Input needs 4 planes";

        if (i.dim(0) % 64 != 0)
            throw "Width needs to be a multiple of 64";

        if (i.dim(1) % 2 != 0)
            throw "Height needs to be a multiple of 2";

        if (i.dim(0) != m_cfg.g_w || i.dim(1) != m_cfg.g_h || m_cfg.rc_target_bitrate != in_bitRate)
        {
            vpx_img_free(&m_raw);
            vpx_img_alloc(&m_raw, VPX_IMG_FMT_I420, i.dim(0), i.dim(1), 1);

            vpx_codec_destroy(&m_codec);

            vpx_codec_enc_config_default(vpx_codec_vp8_cx(), &m_cfg, 0);

            m_cfg.rc_target_bitrate = in_bitRate;
            m_cfg.g_w = i.dim(0);
            m_cfg.g_h = i.dim(1);

            vpx_codec_enc_init(&m_codec, vpx_codec_vp8_cx(), &m_cfg, 0);
        }


        //ARGB -> YYYY U V
        int x;
        const int N = 32;

        const int Uoffset = i.dim(0)*i.dim(1);
        const int Voffset = Uoffset + Uoffset/4;
        const int w = i.dim(0);
        const int h = i.dim(1);
        const int sy = i.stride(1);
        unsigned char *data = (unsigned char*)i.data();
        int y;

        unsigned char *buffer = m_raw.planes[0];

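        //Note: the r/g/b and _a.._d shuffle masks below are never used in
        //the conversion loop (it relies on unpack operations instead); they
        //look like leftovers from an earlier byte-shuffle approach.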
        //RRRR
        __v16qi rShuffle = {1, -1,-1,-1,
                            5, -1,-1,-1,
                            9, -1,-1,-1,
                            13, -1,-1,-1
                           };

        __v16qi gShuffle = {2, -1,-1,-1,
                            6, -1,-1,-1,
                            10,-1,-1,-1,
                            14,-1,-1,-1
                           };

        __v16qi bShuffle = {3,-1,-1,-1,
                            7,-1,-1,-1,
                            11,-1,-1,-1,
                            15,-1,-1,-1
                           };

        //Shuffle so elements are moved to front/back
        __v16qi _aShuffle = {
            0, 4, 8, 12,
            -1, -1, -1, -1,
            -1, -1, -1, -1,
            -1, -1, -1, -1
        };

        __v16qi _bShuffle = {
            -1, -1, -1, -1,
            0, 4, 8, 12,
            -1, -1, -1, -1,
            -1, -1, -1, -1
        };

        __v16qi _cShuffle = {
            -1, -1, -1, -1,
            -1, -1, -1, -1,
            0, 4, 8, 12,
            -1, -1, -1, -1
        };

        __v16qi _dShuffle = {
            -1, -1, -1, -1,
            -1, -1, -1, -1,
            -1, -1, -1, -1,
            0, 4, 8, 12
        };

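        //Q7 fixed-point color-conversion weights; the luma row (27, 91, 9)
        //sums to 127, approximating BT.709-style coefficients scaled by 128.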
        __v8hi R2Y = {  27,  27,  27,  27,  27,  27,  27,  27 };
        __v8hi G2Y = {  91,  91,  91,  91,  91,  91,  91,  91 };
        __v8hi B2Y = {   9,   9,   9,   9,   9,   9,   9,   9 };

        __v8hi R2U = { -12, -12, -12, -12, -12, -12, -12, -12 };
        __v8hi G2U = { -43, -43, -43, -43, -43, -43, -43, -43 };
        __v8hi B2U = {  55,  55,  55,  55,  55,  55,  55,  55 };

        __v8hi R2V = {  78,  78,  78,  78,  78,  78,  78,  78 };
        __v8hi G2V = { -71, -71, -71, -71, -71, -71, -71, -71 };
        __v8hi B2V = {  -7,  -7,  -7,  -7,  -7,  -7,  -7,  -7 };

        __v8hi m127 = { 127, 127, 127, 127, 127, 127, 127, 127 };

        __v8hi zero = {   0,   0,   0,   0,   0,   0,   0,   0 };
        __v8hi two55 = { 255, 255, 255, 255, 255, 255, 255, 255 };

        for (y=0; y<h; y+=2)
        {
            for (x=0; x<w; x+=N)
            {
                __v8hi tY[N/8];
                __v8hi bY[N/8];

                __v8hi tU[N/8];
                __v8hi bU[N/8];

                __v8hi tV[N/8];
                __v8hi bV[N/8];

                //Step 1: Convert to YUV
                int n;
                for (n=0; n<N; n+=8)	//Read 8x per lane
                {
                    __v16qi tARGBx4_l = _mm_load_si128((__m128i*)(data + y*w*4 + x*4 + n*4));
                    __v16qi tARGBx4_r = _mm_load_si128((__m128i*)(data + y*w*4 + x*4 + n*4 + 16));
                    __v16qi bARGBx4_l = _mm_load_si128((__m128i*)(data + y*w*4 + x*4 + n*4 + sy));
                    __v16qi bARGBx4_r = _mm_load_si128((__m128i*)(data + y*w*4 + x*4 + n*4 + sy + 16));

                    // ARGB(1) ARGB(2) ARGB(3) ARGB(4) | ARGB(5) ARGB(6) ARGB(7) ARGB(8)
                    // => AARRGGBB(1,5) AARRGGBB(2,6) | AARRGGBB(3,7) AARRGGBB(4,8)
                    __v16qi tARGBx2_15 = _mm_unpacklo_epi8(tARGBx4_l, tARGBx4_r);
                    __v16qi tARGBx2_26 = _mm_unpackhi_epi8(tARGBx4_l, tARGBx4_r);

                    __v16qi bARGBx2_15 = _mm_unpacklo_epi8(bARGBx4_l, bARGBx4_r);
                    __v16qi bARGBx2_26 = _mm_unpackhi_epi8(bARGBx4_l, bARGBx4_r);

                    // AARRGGBB(1,5) AARRGGBB(2,6) | AARRGGBB(3,7) AARRGGBB(4,8)
                    // => AAAARRRRGGGGBBBB(1,3,5,7) | AAAARRRRGGGGBBBB(2,4,6,8)
                    __v16qi tARGB_1357 = _mm_unpacklo_epi8(tARGBx2_15, tARGBx2_26);
                    __v16qi tARGB_2468 = _mm_unpackhi_epi8(tARGBx2_15, tARGBx2_26);

                    __v16qi bARGB_1357 = _mm_unpacklo_epi8(bARGBx2_15, bARGBx2_26);
                    __v16qi bARGB_2468 = _mm_unpackhi_epi8(bARGBx2_15, bARGBx2_26);

                    //AAAARRRRGGGGBBBB(1,3,5,7) | AAAARRRRGGGGBBBB(2,4,6,8)
                    // => AAAAAAAARRRRRRRR | GGGGGGGGBBBBBBBB
                    __v16qi tAARR = _mm_unpacklo_epi8(tARGB_1357, tARGB_2468);
                    __v16qi tGGBB = _mm_unpackhi_epi8(tARGB_1357, tARGB_2468);

                    __v16qi bAARR = _mm_unpacklo_epi8(bARGB_1357, bARGB_2468);
                    __v16qi bGGBB = _mm_unpackhi_epi8(bARGB_1357, bARGB_2468);

                    //Unpack to 8 R's, 8 G's, and 8 B's.
                    __v8hi tRRRR = _mm_unpackhi_epi8(tAARR, zero);
                    __v8hi tGGGG = _mm_unpacklo_epi8(tGGBB, zero);
                    __v8hi tBBBB = _mm_unpackhi_epi8(tGGBB, zero);

                    __v8hi bRRRR = _mm_unpackhi_epi8(bAARR, zero);
                    __v8hi bGGGG = _mm_unpacklo_epi8(bGGBB, zero);
                    __v8hi bBBBB = _mm_unpackhi_epi8(bGGBB, zero);

                    //Convert to YUV (8x parallel)
                    __v8hi tYYYY = _mm_add_epi16(_mm_mullo_epi16(tRRRR, R2Y), _mm_add_epi16(_mm_mullo_epi16(tGGGG, G2Y), _mm_mullo_epi16(tBBBB, B2Y)));
                    __v8hi tUUUU = _mm_add_epi16(_mm_mullo_epi16(tRRRR, R2U), _mm_add_epi16(_mm_mullo_epi16(tGGGG, G2U), _mm_mullo_epi16(tBBBB, B2U)));
                    __v8hi tVVVV = _mm_add_epi16(_mm_mullo_epi16(tRRRR, R2V), _mm_add_epi16(_mm_mullo_epi16(tGGGG, G2V), _mm_mullo_epi16(tBBBB, B2V)));

                    __v8hi bYYYY = _mm_add_epi16(_mm_mullo_epi16(bRRRR, R2Y), _mm_add_epi16(_mm_mullo_epi16(bGGGG, G2Y), _mm_mullo_epi16(bBBBB, B2Y)));
                    __v8hi bUUUU = _mm_add_epi16(_mm_mullo_epi16(bRRRR, R2U), _mm_add_epi16(_mm_mullo_epi16(bGGGG, G2U), _mm_mullo_epi16(bBBBB, B2U)));
                    __v8hi bVVVV = _mm_add_epi16(_mm_mullo_epi16(bRRRR, R2V), _mm_add_epi16(_mm_mullo_epi16(bGGGG, G2V), _mm_mullo_epi16(bBBBB, B2V)));

                    tUUUU = _mm_add_epi16(_mm_srai_epi16(tUUUU, 7), m127);
                    tVVVV = _mm_add_epi16(_mm_srai_epi16(tVVVV, 7), m127);

                    bUUUU = _mm_add_epi16(_mm_srai_epi16(bUUUU, 7), m127);
                    bVVVV = _mm_add_epi16(_mm_srai_epi16(bVVVV, 7), m127);

                    //Remove the fractional portion and clamp in 0...255
                    tY[n/8] = _mm_min_epi16(_mm_srai_epi16(_mm_max_epi16(tYYYY,zero), 7), two55);
                    tU[n/8] = _mm_min_epi16(_mm_max_epi16(tUUUU,zero), two55);
                    tV[n/8] = _mm_min_epi16(_mm_max_epi16(tVVVV,zero), two55);

                    bY[n/8] = _mm_min_epi16(_mm_srai_epi16(_mm_max_epi16(bYYYY,zero), 7), two55);
                    bU[n/8] = _mm_min_epi16(_mm_max_epi16(bUUUU,zero), two55);
                    bV[n/8] = _mm_min_epi16(_mm_max_epi16(bVVVV,zero), two55);
                }


                // Step 2 - Write out Luma (part 1)
                for (n=0; n<N; n+=16)
                {
                    __v8hi A = tY[n/8];
                    __v8hi B = tY[n/8+1];

                    __m128i Y = _mm_packus_epi16(A,B);

                    _mm_storeu_si128((__m128i*)(buffer+y*w+x+n), Y);
                }

                for (n=0; n<N; n+=16)
                {
                    __v8hi A = bY[n/8];
                    __v8hi B = bY[n/8+1];

                    __m128i Y = _mm_packus_epi16(A,B);

                    _mm_storeu_si128((__m128i*)(buffer+y*w+x+n+w), Y);
                }

                //Step 3 -- U and V data...
                for (n=0; n<N; n+=32)
                {
                    __m128i U16a = _mm_add_epi16(tU[n/8], bU[n/8]);
                    __m128i U16b = _mm_add_epi16(tU[n/8+1], bU[n/8+1]);
                    __m128i U16c = _mm_add_epi16(tU[n/8+2], bU[n/8+2]);
                    __m128i U16d = _mm_add_epi16(tU[n/8+3], bU[n/8+3]);

                    U16a = _mm_srli_epi16(_mm_hadd_epi16(U16a, U16b),2);
                    U16c = _mm_srli_epi16(_mm_hadd_epi16(U16c, U16d),2);

                    __m128i U = _mm_packus_epi16(U16a, U16c);

                    _mm_storeu_si128((__m128i*)(buffer+Uoffset+y/2*w/2 + x/2+n/2), U);
                }

                for (n=0; n<N; n+=32)
                {
                    __m128i V16a = _mm_add_epi16(tV[n/8], bV[n/8]);
                    __m128i V16b = _mm_add_epi16(tV[n/8+1], bV[n/8+1]);
                    __m128i V16c = _mm_add_epi16(tV[n/8+2], bV[n/8+2]);
                    __m128i V16d = _mm_add_epi16(tV[n/8+3], bV[n/8+3]);

                    V16a = _mm_srli_epi16(_mm_hadd_epi16(V16a, V16b),2);
                    V16c = _mm_srli_epi16(_mm_hadd_epi16(V16c, V16d),2);

                    __m128i V = _mm_packus_epi16(V16a, V16c);

                    _mm_storeu_si128((__m128i*)(buffer+Voffset+y/2*w/2 + x/2+n/2), V);
                }
            }
        }

        m_frameCnt++;
        vpx_codec_encode(&m_codec, &m_raw, m_frameCnt, 1, 0, VPX_DL_REALTIME);
        vpx_codec_iter_t iter = NULL;

        const vpx_codec_cx_pkt_t *pkt;

        while ((pkt = vpx_codec_get_cx_data(&m_codec, &iter)))
        {
            if (pkt->kind == VPX_CODEC_CX_FRAME_PKT)
            {

                //Generate output
                TML::Matrix o(outputs,0);

                _jit_matrix_info m;
                memset(&m, 0, sizeof(m));

                m.dimcount = 2;
                m.dim[0] = pkt->data.frame.sz;
                m.dim[1] = 1;
                m.dimstride[0] = pkt->data.frame.sz;
                m.dimstride[1] = 1;
                m.planecount = 1;
                m.type = _jit_sym_char;

                o.resizeTo(&m);

                memcpy(o.data(), pkt->data.frame.buf, pkt->data.frame.sz);

                break;
            }
        }

    }
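
A note on the fixed-point math in the conversion loop above: the weights are Q7 values (the Y row 27, 91, 9 sums to 127, roughly BT.709-style luma scaled by 128), so every product fits a signed 16-bit lane (127 * 255 = 32385 < 32767), which is what makes the _mm_mullo_epi16 accumulation safe. A scalar sketch of the per-pixel luma path:

// One pixel of the Q7 luma conversion used above: accumulate in Q7,
// shift the fraction away, then clamp to 0..255.
static inline unsigned char rgb_to_y_q7(unsigned char r, unsigned char g,
                                        unsigned char b) {
  int y = 27 * r + 91 * g + 9 * b;  // Q7; max 127 * 255 = 32385, fits int16
  y >>= 7;                          // drop the fractional bits
  return (unsigned char)(y > 255 ? 255 : (y < 0 ? 0 : y));
}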
Example #9
void ThumbnailProvider::shrinkGrayscale4x4SSE(const Image& srcImage, Thumbnail::ThumbnailImageGrayscale& destImage)
{
  union
  {
    __m128i a;
    long long b[2];
  } splitter;

  const int scaleFactor = 4;
  const int width = srcImage.width;
  const int height = srcImage.height;

  ASSERT(width % scaleFactor == 0);
  ASSERT(height % scaleFactor == 0);

  destImage.setResolution(width / scaleFactor, height / scaleFactor);

  const unsigned char offset = offsetof(Image::Pixel, y);
  unsigned char mask[16] =
  {
    0xFF, 0xFF, 0xFF, 0xFF,
    0xFF, 0xFF, 0xFF, 0xFF,
    0xFF, 0xFF, 0xFF, 0xFF,
    0xFF, 0xFF, 0xFF, 0xFF
  };
  mask[0] = offset;
  mask[1] = offset + 4;
  mask[2] = offset + 8;
  mask[3] = offset + 12;
  const __m128i mMask = _mm_loadu_si128(reinterpret_cast<__m128i*>(mask));

  const __m128i zero = _mm_setzero_si128();
  const int summsSize = destImage.width * 16;
  __m128i* summs = reinterpret_cast<__m128i*>(SystemCall::alignedMalloc(summsSize, 16));
  memset(summs, 0, summsSize);

  const Image::Pixel* pSrc;
  Thumbnail::ThumbnailImageGrayscale::PixelType* pDest;
  __m128i* pSumms;

  __m128i p0;
  __m128i p1;
  __m128i p2;
  __m128i p3;

  for(int y = 0; y < height; ++y)
  {
    if(y % scaleFactor == 0)
    {
      pDest = destImage[y / scaleFactor];
    }
    pSrc = srcImage[y];
    pSumms = summs;
    for(int x = 0; x < width; x += 8, pSrc += 8, ++pSumms)
    {
      p0 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(pSrc));
      p1 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(pSrc + 4));

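      // SHUFFLE is presumably a project macro wrapping _mm_shuffle_epi8
      // (SSSE3); mMask gathers the y channel of four interleaved pixels
      // into the low 4 bytes and zeroes the rest.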
      p0 = SHUFFLE(p0, mMask); // y0 y1 y2 y3 0 0 0 0 0 0 0 0 0 0 0 0
      p1 = SHUFFLE(p1, mMask); // y4 y5 y6 y7 0 0 0 0 0 0 0 0 0 0 0 0

      p0 = _mm_unpacklo_epi32(p0, p1); // y0 y1 y2 y3 y4 y5 y6 y7 0 0 0 0 0 0 0 0
      p0 = _mm_unpacklo_epi8(p0, zero); // y0 y1 y2 y3 y4 y5 y6 y7
      *pSumms = _mm_add_epi16(*pSumms, p0);
    }

    if(y % scaleFactor == scaleFactor - 1)
    {
      pSumms = summs;
      for (int i = 0; i < destImage.width; i += 8, pSumms += 4, pDest += 8)
      {
        p0 = *pSumms;
        p1 = *(pSumms + 1);
        p2 = *(pSumms + 2);
        p3 = *(pSumms + 3);

        p0 = _mm_hadd_epi16(p0, p1);
        p1 = _mm_hadd_epi16(p2, p3);
        p0 = _mm_hadd_epi16(p0, p1);
        p0 = _mm_srli_epi16(p0, 4);
        p0 = _mm_packus_epi16(p0, zero);
        splitter.a = p0;
        *reinterpret_cast<long long*>(pDest) = splitter.b[0];
      }
      memset(summs, 0, summsSize);
    }
  }
  SystemCall::alignedFree(summs);
}
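
For reference, a scalar version of the same 4x4 box downscale, assuming a plain packed Y plane rather than the interleaved Image::Pixel the SIMD path reads through its shuffle mask: sum each 4x4 block's 16 samples and shift right by 4, just as the SIMD path accumulates rows into summs and folds columns with _mm_hadd_epi16 before the >> 4.

static void shrink_grayscale_4x4_sketch(const unsigned char *y_plane,
                                        int width, int height,
                                        unsigned char *dest) {
  for (int y = 0; y < height; y += 4) {
    for (int x = 0; x < width; x += 4) {
      unsigned sum = 0;
      for (int dy = 0; dy < 4; dy++)        // 4x4 box of source samples
        for (int dx = 0; dx < 4; dx++)
          sum += y_plane[(y + dy) * width + x + dx];
      *dest++ = (unsigned char)(sum >> 4);  // divide by 16
    }
  }
}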