/**
 * Adds 2 pixels (in a 2x1 grid) and multiplies them by 4, resulting in a more
 * precise version of a box filter 4:2:2 pixel subsampling in Q3.
 *
 * The CfL prediction buffer is always of size CFL_BUF_SQUARE. However, the
 * active area is specified using width and height.
 *
 * Note: We don't need to worry about going over the active area, as long as we
 * stay inside the CfL prediction buffer.
 *
 * Note: _mm_storeh_epi32() is not an Intel intrinsic; libaom defines it as a
 * small local helper that stores the low 32 bits of the vector.
 */
static INLINE void cfl_luma_subsampling_422_hbd_ssse3(const uint16_t *input,
                                                      int input_stride,
                                                      uint16_t *pred_buf_q3,
                                                      int width, int height) {
  __m128i *pred_buf_m128i = (__m128i *)pred_buf_q3;
  const __m128i *end = pred_buf_m128i + height * CFL_BUF_LINE_I128;
  do {
    if (width == 4) {
      const __m128i top = _mm_loadl_epi64((__m128i *)input);
      const __m128i sum = _mm_slli_epi16(_mm_hadd_epi16(top, top), 2);
      _mm_storeh_epi32(pred_buf_m128i, sum);
    } else {
      const __m128i top = _mm_loadu_si128((__m128i *)input);
      if (width == 8) {
        const __m128i sum = _mm_slli_epi16(_mm_hadd_epi16(top, top), 2);
        _mm_storel_epi64(pred_buf_m128i, sum);
      } else {
        const __m128i top_1 = _mm_loadu_si128(((__m128i *)input) + 1);
        const __m128i sum = _mm_slli_epi16(_mm_hadd_epi16(top, top_1), 2);
        _mm_storeu_si128(pred_buf_m128i, sum);
        if (width == 32) {
          const __m128i top_2 = _mm_loadu_si128(((__m128i *)input) + 2);
          const __m128i top_3 = _mm_loadu_si128(((__m128i *)input) + 3);
          const __m128i sum_1 = _mm_slli_epi16(_mm_hadd_epi16(top_2, top_3), 2);
          _mm_storeu_si128(pred_buf_m128i + 1, sum_1);
        }
      }
    }
    pred_buf_m128i += CFL_BUF_LINE_I128;
    input += input_stride;
  } while (pred_buf_m128i < end);
}
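/*
 * For reference: a Q3 value is the average scaled by 8. With 4:2:2 each chroma
 * position covers two luma pixels, so avg * 8 == (sum / 2) * 8 == sum * 4, which
 * is why the routine above shifts the horizontal pair-sum left by 2. A minimal
 * scalar sketch of one output row (illustrative only, not part of libaom; the
 * helper name is made up, and <stdint.h> is assumed for uint16_t):
 */
static void subsample_422_q3_row_ref(const uint16_t *luma, uint16_t *out_q3,
                                     int out_width) {
  for (int i = 0; i < out_width; ++i)
    out_q3[i] = (uint16_t)((luma[2 * i] + luma[2 * i + 1]) << 2);
}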
static void hor_add_sub_avx2(__m128i *row0, __m128i *row1)
{
  __m128i a = _mm_hadd_epi16(*row0, *row1);
  __m128i b = _mm_hsub_epi16(*row0, *row1);

  __m128i c = _mm_hadd_epi16(a, b);
  __m128i d = _mm_hsub_epi16(a, b);

  *row0 = _mm_hadd_epi16(c, d);
  *row1 = _mm_hsub_epi16(c, d);
}
void kvz_eight_tap_filter_x8_and_flip(__m128i *data01, __m128i *data23,
                                      __m128i *data45, __m128i *data67,
                                      __m128i *filter, __m128i *dst)
{
  __m128i a, b, c, d;
  __m128i fir = _mm_broadcastq_epi64(_mm_loadl_epi64(filter));

  a = _mm_maddubs_epi16(*data01, fir);
  b = _mm_maddubs_epi16(*data23, fir);
  a = _mm_hadd_epi16(a, b);

  c = _mm_maddubs_epi16(*data45, fir);
  d = _mm_maddubs_epi16(*data67, fir);
  c = _mm_hadd_epi16(c, d);

  a = _mm_hadd_epi16(a, c);

  _mm_storeu_si128(dst, a);
}
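/*
 * The function above computes eight 8-tap dot products at once:
 * _mm_maddubs_epi16 multiplies sample bytes by tap bytes and adds adjacent
 * pairs, and the _mm_hadd_epi16 levels add up the remaining partial sums. A
 * scalar sketch of the per-output-sample sum it parallelises (illustrative; the
 * helper name and rounding policy are assumptions, not kvazaar's code):
 */
static int eight_tap_sample_ref(const unsigned char *src, const signed char *fir) {
  int sum = 0;
  for (int k = 0; k < 8; ++k)
    sum += src[k] * fir[k];  /* caller shifts/rounds and clamps afterwards */
  return sum;
}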
/**
 * Adds 4 pixels (in a 2x2 grid) and multiplies them by 2, resulting in a more
 * precise version of a box filter 4:2:0 pixel subsampling in Q3.
 *
 * The CfL prediction buffer is always of size CFL_BUF_SQUARE. However, the
 * active area is specified using width and height.
 *
 * Note: We don't need to worry about going over the active area, as long as we
 * stay inside the CfL prediction buffer.
 */
static INLINE void cfl_luma_subsampling_420_hbd_ssse3(const uint16_t *input,
                                                      int input_stride,
                                                      uint16_t *pred_buf_q3,
                                                      int width, int height) {
  const uint16_t *end = pred_buf_q3 + (height >> 1) * CFL_BUF_LINE;
  const int luma_stride = input_stride << 1;
  do {
    if (width == 4) {
      const __m128i top = _mm_loadl_epi64((__m128i *)input);
      const __m128i bot = _mm_loadl_epi64((__m128i *)(input + input_stride));
      __m128i sum = _mm_add_epi16(top, bot);
      sum = _mm_hadd_epi16(sum, sum);
      *((int *)pred_buf_q3) = _mm_cvtsi128_si32(_mm_add_epi16(sum, sum));
    } else {
      const __m128i top = _mm_loadu_si128((__m128i *)input);
      const __m128i bot = _mm_loadu_si128((__m128i *)(input + input_stride));
      __m128i sum = _mm_add_epi16(top, bot);
      if (width == 8) {
        sum = _mm_hadd_epi16(sum, sum);
        _mm_storel_epi64((__m128i *)pred_buf_q3, _mm_add_epi16(sum, sum));
      } else {
        const __m128i top_1 = _mm_loadu_si128(((__m128i *)input) + 1);
        const __m128i bot_1 =
            _mm_loadu_si128(((__m128i *)(input + input_stride)) + 1);
        sum = _mm_hadd_epi16(sum, _mm_add_epi16(top_1, bot_1));
        _mm_storeu_si128((__m128i *)pred_buf_q3, _mm_add_epi16(sum, sum));
        if (width == 32) {
          const __m128i top_2 = _mm_loadu_si128(((__m128i *)input) + 2);
          const __m128i bot_2 =
              _mm_loadu_si128(((__m128i *)(input + input_stride)) + 2);
          const __m128i top_3 = _mm_loadu_si128(((__m128i *)input) + 3);
          const __m128i bot_3 =
              _mm_loadu_si128(((__m128i *)(input + input_stride)) + 3);
          const __m128i sum_2 = _mm_add_epi16(top_2, bot_2);
          const __m128i sum_3 = _mm_add_epi16(top_3, bot_3);
          __m128i next_sum = _mm_hadd_epi16(sum_2, sum_3);
          _mm_storeu_si128(((__m128i *)pred_buf_q3) + 1,
                           _mm_add_epi16(next_sum, next_sum));
        }
      }
    }
    input += luma_stride;
  } while ((pred_buf_q3 += CFL_BUF_LINE) < end);
}
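/*
 * Same Q3 bookkeeping as the 4:2:2 case, but each chroma position now covers a
 * 2x2 block of luma: avg * 8 == (sum / 4) * 8 == sum * 2, hence the final
 * _mm_add_epi16(sum, sum) doubling instead of a shift by 2. Scalar sketch of one
 * output row (illustrative only; names are made up):
 */
static void subsample_420_q3_row_ref(const uint16_t *luma, int luma_stride,
                                     uint16_t *out_q3, int out_width) {
  for (int i = 0; i < out_width; ++i) {
    const int sum = luma[2 * i] + luma[2 * i + 1] +
                    luma[2 * i + luma_stride] + luma[2 * i + 1 + luma_stride];
    out_q3[i] = (uint16_t)(sum << 1);
  }
}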
static unsigned satd_8bit_4x4_avx2(const kvz_pixel *org, const kvz_pixel *cur)
{
  __m128i original = _mm_cvtepu8_epi16(_mm_loadl_epi64((__m128i*)org));
  __m128i current = _mm_cvtepu8_epi16(_mm_loadl_epi64((__m128i*)cur));

  __m128i diff_lo = _mm_sub_epi16(current, original);

  original = _mm_cvtepu8_epi16(_mm_loadl_epi64((__m128i*)(org + 8)));
  current = _mm_cvtepu8_epi16(_mm_loadl_epi64((__m128i*)(cur + 8)));

  __m128i diff_hi = _mm_sub_epi16(current, original);

  //Hor
  __m128i row0 = _mm_hadd_epi16(diff_lo, diff_hi);
  __m128i row1 = _mm_hsub_epi16(diff_lo, diff_hi);
  __m128i row2 = _mm_hadd_epi16(row0, row1);
  __m128i row3 = _mm_hsub_epi16(row0, row1);

  //Ver
  row0 = _mm_hadd_epi16(row2, row3);
  row1 = _mm_hsub_epi16(row2, row3);
  row2 = _mm_hadd_epi16(row0, row1);
  row3 = _mm_hsub_epi16(row0, row1);

  //Abs and sum
  row2 = _mm_abs_epi16(row2);
  row3 = _mm_abs_epi16(row3);

  row3 = _mm_add_epi16(row2, row3);
  row3 = _mm_add_epi16(row3, _mm_shuffle_epi32(row3, KVZ_PERMUTE(2, 3, 0, 1)));
  row3 = _mm_add_epi16(row3, _mm_shuffle_epi32(row3, KVZ_PERMUTE(1, 0, 1, 0)));
  row3 = _mm_add_epi16(row3, _mm_shufflelo_epi16(row3, KVZ_PERMUTE(1, 0, 1, 0)));

  unsigned sum = _mm_extract_epi16(row3, 0);
  unsigned satd = (sum + 1) >> 1;
  return satd;
}
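/*
 * SATD here is the sum of absolute values of the 2-D Hadamard transform of the
 * pixel differences, halved with rounding ((sum + 1) >> 1). A plain-C cross-check
 * for the 4x4 case (illustrative; assumes <stdlib.h> for abs(), and is not
 * kvazaar's reference implementation):
 */
static unsigned satd_4x4_ref(const unsigned char *org, const unsigned char *cur)
{
  int d[4][4], t[4][4];
  unsigned sum = 0;
  for (int i = 0; i < 4; ++i)
    for (int j = 0; j < 4; ++j)
      d[i][j] = cur[4 * i + j] - org[4 * i + j];
  for (int i = 0; i < 4; ++i) {  // horizontal 4-point Hadamard per row
    const int a = d[i][0] + d[i][1], b = d[i][0] - d[i][1];
    const int c = d[i][2] + d[i][3], e = d[i][2] - d[i][3];
    t[i][0] = a + c; t[i][1] = b + e; t[i][2] = a - c; t[i][3] = b - e;
  }
  for (int j = 0; j < 4; ++j) {  // vertical pass, then absolute sum
    const int a = t[0][j] + t[1][j], b = t[0][j] - t[1][j];
    const int c = t[2][j] + t[3][j], e = t[2][j] - t[3][j];
    sum += abs(a + c) + abs(b + e) + abs(a - c) + abs(b - e);
  }
  return (sum + 1) >> 1;
}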
static INLINE void ver_add_sub_avx2(__m128i (*temp_hor)[8], __m128i (*temp_ver)[8])
{
  // First stage
  for (int i = 0; i < 8; i += 2) {
    (*temp_ver)[i + 0] = _mm_hadd_epi16((*temp_hor)[i + 0], (*temp_hor)[i + 1]);
    (*temp_ver)[i + 1] = _mm_hsub_epi16((*temp_hor)[i + 0], (*temp_hor)[i + 1]);
  }

  // Second stage
  for (int i = 0; i < 8; i += 4) {
    (*temp_hor)[i + 0] = _mm_add_epi16((*temp_ver)[i + 0], (*temp_ver)[i + 2]);
    (*temp_hor)[i + 1] = _mm_add_epi16((*temp_ver)[i + 1], (*temp_ver)[i + 3]);
    (*temp_hor)[i + 2] = _mm_sub_epi16((*temp_ver)[i + 0], (*temp_ver)[i + 2]);
    (*temp_hor)[i + 3] = _mm_sub_epi16((*temp_ver)[i + 1], (*temp_ver)[i + 3]);
  }

  // Third stage
  for (int i = 0; i < 4; ++i) {
    (*temp_ver)[i + 0] = _mm_add_epi16((*temp_hor)[0 + i], (*temp_hor)[4 + i]);
    (*temp_ver)[i + 4] = _mm_sub_epi16((*temp_hor)[0 + i], (*temp_hor)[4 + i]);
  }
}
__m128i test_mm_hadd_epi16(__m128i a, __m128i b) {
  // CHECK-LABEL: test_mm_hadd_epi16
  // CHECK: call <8 x i16> @llvm.x86.ssse3.phadd.w.128(<8 x i16> %{{.*}}, <8 x i16> %{{.*}})
  return _mm_hadd_epi16(a, b);
}
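/*
 * All of the snippets in this collection rely on the same intrinsic semantics:
 * _mm_hadd_epi16(a, b) adds adjacent 16-bit pairs, placing the four pair-sums of
 * a in the low half of the result and the four pair-sums of b in the high half,
 * with wraparound on overflow (the saturating variant is _mm_hadds_epi16). A
 * scalar model of that behaviour (illustrative only):
 */
static void hadd_epi16_ref(const short a[8], const short b[8], short r[8]) {
  for (int i = 0; i < 4; ++i) {
    r[i]     = (short)(a[2 * i] + a[2 * i + 1]);
    r[i + 4] = (short)(b[2 * i] + b[2 * i + 1]);
  }
}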
void matrixCalc(void *inputs, void *outputs, long in_bitRate)
{
  //Validate the input
  TML::Matrix i(inputs,0);
  if (i.dims() != 2) throw "Input should be a 2D matrix";
  if (!i.isChar()) throw "Input should have character data";
  if (i.planes() != 4) throw "Input needs 4 planes";
  if (i.dim(0) % 64 != 0) throw "Width needs to be a multiple of 64";
  if (i.dim(1) % 2 != 0) throw "Height needs to be a multiple of 2";

  if (i.dim(0) != m_cfg.g_w || i.dim(1) != m_cfg.g_h ||
      m_cfg.rc_target_bitrate != in_bitRate)
  {
    vpx_img_free(&m_raw);
    vpx_img_alloc(&m_raw, VPX_IMG_FMT_I420, i.dim(0), i.dim(1), 1);

    vpx_codec_destroy(&m_codec);
    vpx_codec_enc_config_default(vpx_codec_vp8_cx(), &m_cfg, 0);
    m_cfg.rc_target_bitrate = in_bitRate;
    m_cfg.g_w = i.dim(0);
    m_cfg.g_h = i.dim(1);
    vpx_codec_enc_init(&m_codec, vpx_codec_vp8_cx(), &m_cfg, 0);
  }

  //ARGB -> YYYY U V
  int x;
  const int N = 32;
  const int Uoffset = i.dim(0)*i.dim(1);
  const int Voffset = Uoffset + Uoffset/4;
  const int w = i.dim(0);
  const int h = i.dim(1);
  const int sy = i.stride(1);
  unsigned char *data = (unsigned char*)i.data();
  int y;
  unsigned char *buffer = m_raw.planes[0];

  //RRRR / GGGG / BBBB byte-selection masks
  __v16qi rShuffle = {  1, -1, -1, -1,  5, -1, -1, -1,  9, -1, -1, -1, 13, -1, -1, -1 };
  __v16qi gShuffle = {  2, -1, -1, -1,  6, -1, -1, -1, 10, -1, -1, -1, 14, -1, -1, -1 };
  __v16qi bShuffle = {  3, -1, -1, -1,  7, -1, -1, -1, 11, -1, -1, -1, 15, -1, -1, -1 };

  //Shuffle so elements are moved to front/back
  __v16qi _aShuffle = {  0,  4,  8, 12, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 };
  __v16qi _bShuffle = { -1, -1, -1, -1,  0,  4,  8, 12, -1, -1, -1, -1, -1, -1, -1, -1 };
  __v16qi _cShuffle = { -1, -1, -1, -1, -1, -1, -1, -1,  0,  4,  8, 12, -1, -1, -1, -1 };
  __v16qi _dShuffle = { -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,  0,  4,  8, 12 };

  //Fixed-point (Q7) RGB -> YUV coefficients
  __v8hi R2Y = {  27,  27,  27,  27,  27,  27,  27,  27 };
  __v8hi G2Y = {  91,  91,  91,  91,  91,  91,  91,  91 };
  __v8hi B2Y = {   9,   9,   9,   9,   9,   9,   9,   9 };
  __v8hi R2U = { -12, -12, -12, -12, -12, -12, -12, -12 };
  __v8hi G2U = { -43, -43, -43, -43, -43, -43, -43, -43 };
  __v8hi B2U = {  55,  55,  55,  55,  55,  55,  55,  55 };
  __v8hi R2V = {  78,  78,  78,  78,  78,  78,  78,  78 };
  __v8hi G2V = { -71, -71, -71, -71, -71, -71, -71, -71 };
  __v8hi B2V = {  -7,  -7,  -7,  -7,  -7,  -7,  -7,  -7 };
  __v8hi m127  = { 127, 127, 127, 127, 127, 127, 127, 127 };
  __v8hi zero  = {   0,   0,   0,   0,   0,   0,   0,   0 };
  __v8hi two55 = { 255, 255, 255, 255, 255, 255, 255, 255 };

  for (y=0; y<h; y+=2)
  {
    for (x=0; x<w; x+=N)
    {
      __v8hi tY[N/8];
      __v8hi bY[N/8];
      __v8hi tU[N/8];
      __v8hi bU[N/8];
      __v8hi tV[N/8];
      __v8hi bV[N/8];

      //Step 1: Convert to YUV
      int n;
      for (n=0; n<N; n+=8) //Read 8x per lane
      {
        __v16qi tARGBx4_l = _mm_load_si128((__m128i*)(data + y*w*4 + x*4 + n*4));
        __v16qi tARGBx4_r = _mm_load_si128((__m128i*)(data + y*w*4 + x*4 + n*4 + 16));
        __v16qi bARGBx4_l = _mm_load_si128((__m128i*)(data + y*w*4 + x*4 + n*4 + sy));
        __v16qi bARGBx4_r = _mm_load_si128((__m128i*)(data + y*w*4 + x*4 + n*4 + sy + 16));

        // ARGB(1) ARGB(2) ARGB(3) ARGB(4) | ARGB(5) ARGB(6) ARGB(7) ARGB(8)
        //   => AARRGGBB(1,5) AARRGGBB(2,6) | AARRGGBB(3,7) AARRGGBB(4,8)
        __v16qi tARGBx2_15 = _mm_unpacklo_epi8(tARGBx4_l, tARGBx4_r);
        __v16qi tARGBx2_26 = _mm_unpackhi_epi8(tARGBx4_l, tARGBx4_r);
        __v16qi bARGBx2_15 = _mm_unpacklo_epi8(bARGBx4_l, bARGBx4_r);
        __v16qi bARGBx2_26 = _mm_unpackhi_epi8(bARGBx4_l, bARGBx4_r);

        // AARRGGBB(1,5) AARRGGBB(2,6) | AARRGGBB(3,7) AARRGGBB(4,8)
        //   => AAAARRRRGGGGBBBB(1,3,5,7) | AAAARRRRGGGGBBBB(2,4,6,8)
        __v16qi tARGB_1357 = _mm_unpacklo_epi8(tARGBx2_15, tARGBx2_26);
        __v16qi tARGB_2468 = _mm_unpackhi_epi8(tARGBx2_15, tARGBx2_26);
        __v16qi bARGB_1357 = _mm_unpacklo_epi8(bARGBx2_15, bARGBx2_26);
        __v16qi bARGB_2468 =
            _mm_unpackhi_epi8(bARGBx2_15, bARGBx2_26);

        // AAAARRRRGGGGBBBB(1,3,5,7) | AAAARRRRGGGGBBBB(2,4,6,8)
        //   => AAAAAAAARRRRRRRR | GGGGGGGGBBBBBBBB
        __v16qi tAARR = _mm_unpacklo_epi8(tARGB_1357, tARGB_2468);
        __v16qi tGGBB = _mm_unpackhi_epi8(tARGB_1357, tARGB_2468);
        __v16qi bAARR = _mm_unpacklo_epi8(bARGB_1357, bARGB_2468);
        __v16qi bGGBB = _mm_unpackhi_epi8(bARGB_1357, bARGB_2468);

        //Unpack to 8 R's, 8 G's, and 8 B's.
        __v8hi tRRRR = _mm_unpackhi_epi8(tAARR, zero);
        __v8hi tGGGG = _mm_unpacklo_epi8(tGGBB, zero);
        __v8hi tBBBB = _mm_unpackhi_epi8(tGGBB, zero);
        __v8hi bRRRR = _mm_unpackhi_epi8(bAARR, zero);
        __v8hi bGGGG = _mm_unpacklo_epi8(bGGBB, zero);
        __v8hi bBBBB = _mm_unpackhi_epi8(bGGBB, zero);

        //Convert to YUV (8x parallel)
        __v8hi tYYYY = _mm_add_epi16(_mm_mullo_epi16(tRRRR, R2Y),
                       _mm_add_epi16(_mm_mullo_epi16(tGGGG, G2Y),
                                     _mm_mullo_epi16(tBBBB, B2Y)));
        __v8hi tUUUU = _mm_add_epi16(_mm_mullo_epi16(tRRRR, R2U),
                       _mm_add_epi16(_mm_mullo_epi16(tGGGG, G2U),
                                     _mm_mullo_epi16(tBBBB, B2U)));
        __v8hi tVVVV = _mm_add_epi16(_mm_mullo_epi16(tRRRR, R2V),
                       _mm_add_epi16(_mm_mullo_epi16(tGGGG, G2V),
                                     _mm_mullo_epi16(tBBBB, B2V)));
        __v8hi bYYYY = _mm_add_epi16(_mm_mullo_epi16(bRRRR, R2Y),
                       _mm_add_epi16(_mm_mullo_epi16(bGGGG, G2Y),
                                     _mm_mullo_epi16(bBBBB, B2Y)));
        __v8hi bUUUU = _mm_add_epi16(_mm_mullo_epi16(bRRRR, R2U),
                       _mm_add_epi16(_mm_mullo_epi16(bGGGG, G2U),
                                     _mm_mullo_epi16(bBBBB, B2U)));
        __v8hi bVVVV = _mm_add_epi16(_mm_mullo_epi16(bRRRR, R2V),
                       _mm_add_epi16(_mm_mullo_epi16(bGGGG, G2V),
                                     _mm_mullo_epi16(bBBBB, B2V)));

        //Drop the Q7 fraction and re-center U/V around 127
        tUUUU = _mm_add_epi16(_mm_srai_epi16(tUUUU, 7), m127);
        tVVVV = _mm_add_epi16(_mm_srai_epi16(tVVVV, 7), m127);
        bUUUU = _mm_add_epi16(_mm_srai_epi16(bUUUU, 7), m127);
        bVVVV = _mm_add_epi16(_mm_srai_epi16(bVVVV, 7), m127);

        //Remove the fractional portion and clamp to 0...255
        tY[n/8] = _mm_min_epi16(_mm_srai_epi16(_mm_max_epi16(tYYYY, zero), 7), two55);
        tU[n/8] = _mm_min_epi16(_mm_max_epi16(tUUUU, zero), two55);
        tV[n/8] = _mm_min_epi16(_mm_max_epi16(tVVVV, zero), two55);
        bY[n/8] = _mm_min_epi16(_mm_srai_epi16(_mm_max_epi16(bYYYY, zero), 7), two55);
        bU[n/8] = _mm_min_epi16(_mm_max_epi16(bUUUU, zero), two55);
        bV[n/8] = _mm_min_epi16(_mm_max_epi16(bVVVV, zero), two55);
      }

      // Step 2 - Write out Luma (part 1)
      for (n=0; n<N; n+=16)
      {
        __v8hi A = tY[n/8];
        __v8hi B = tY[n/8+1];
        __m128i Y = _mm_packus_epi16(A, B);
        _mm_storeu_si128((__m128i*)(buffer+y*w+x+n), Y);
      }
      for (n=0; n<N; n+=16)
      {
        __v8hi A = bY[n/8];
        __v8hi B = bY[n/8+1];
        __m128i Y = _mm_packus_epi16(A, B);
        _mm_storeu_si128((__m128i*)(buffer+y*w+x+n+w), Y);
      }

      //Step 3 -- U and V data...
      for (n=0; n<N; n+=32)
      {
        __m128i U16a = _mm_add_epi16(tU[n/8],   bU[n/8]);
        __m128i U16b = _mm_add_epi16(tU[n/8+1], bU[n/8+1]);
        __m128i U16c = _mm_add_epi16(tU[n/8+2], bU[n/8+2]);
        __m128i U16d = _mm_add_epi16(tU[n/8+3], bU[n/8+3]);

        U16a = _mm_srli_epi16(_mm_hadd_epi16(U16a, U16b), 2);
        U16c = _mm_srli_epi16(_mm_hadd_epi16(U16c, U16d), 2);

        __m128i U = _mm_packus_epi16(U16a, U16c);
        _mm_storeu_si128((__m128i*)(buffer+Uoffset+y/2*w/2 + x/2+n/2), U);
      }
      for (n=0; n<N; n+=32)
      {
        __m128i U16a = _mm_add_epi16(tV[n/8],   bV[n/8]);
        __m128i U16b = _mm_add_epi16(tV[n/8+1], bV[n/8+1]);
        __m128i U16c = _mm_add_epi16(tV[n/8+2], bV[n/8+2]);
        __m128i U16d = _mm_add_epi16(tV[n/8+3], bV[n/8+3]);

        U16a = _mm_srli_epi16(_mm_hadd_epi16(U16a, U16b), 2);
        U16c = _mm_srli_epi16(_mm_hadd_epi16(U16c, U16d), 2);

        __m128i U = _mm_packus_epi16(U16a, U16c);
        _mm_storeu_si128((__m128i*)(buffer+Voffset+y/2*w/2 + x/2+n/2), U);
      }
    }
  }

  m_frameCnt++;
  vpx_codec_encode(&m_codec, &m_raw, m_frameCnt, 1, 0, VPX_DL_REALTIME);

  vpx_codec_iter_t iter = NULL;
  const vpx_codec_cx_pkt_t *pkt;
  while ((pkt = vpx_codec_get_cx_data(&m_codec, &iter)))
  {
    if (pkt->kind == VPX_CODEC_CX_FRAME_PKT)
    {
      //Generate output
      TML::Matrix o(outputs,0);
      _jit_matrix_info m;
      memset(&m, 0, sizeof(m));
      m.dimcount = 2;
      m.dim[0] = pkt->data.frame.sz;
      m.dim[1] = 1;
      m.dimstride[0] = pkt->data.frame.sz;
      m.dimstride[1] = 1;
      m.planecount = 1;
      m.type = _jit_sym_char;
      o.resizeTo(&m);
      memcpy(o.data(), pkt->data.frame.buf, pkt->data.frame.sz);
      break;
    }
  }
}
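/*
 * The coefficient vectors in matrixCalc are Q7 fixed-point: the luma weights
 * 27/91/9 are roughly 128 times the Rec. 709 factors 0.2126/0.7152/0.0722, the
 * chroma results are shifted back down by 7 and re-centered at 127, and the two
 * loops above then average U and V over 2x2 pixel blocks for the I420 layout. A
 * scalar sketch of the per-pixel conversion the loop vectorises (illustrative;
 * the function name is made up):
 */
static void rgb_to_yuv_q7_ref(int r, int g, int b,
                              unsigned char *py, unsigned char *pu, unsigned char *pv)
{
  int y = ( 27 * r + 91 * g +  9 * b) >> 7;
  int u = ((-12 * r - 43 * g + 55 * b) >> 7) + 127;
  int v = (( 78 * r - 71 * g -  7 * b) >> 7) + 127;
  if (y < 0) y = 0; if (y > 255) y = 255;
  if (u < 0) u = 0; if (u > 255) u = 255;
  if (v < 0) v = 0; if (v > 255) v = 255;
  *py = (unsigned char)y;
  *pu = (unsigned char)u;
  *pv = (unsigned char)v;
}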
void ThumbnailProvider::shrinkGrayscale4x4SSE(const Image& srcImage, Thumbnail::ThumbnailImageGrayscale& destImage)
{
  union
  {
    __m128i a;
    long long b[2];
  } splitter;

  const int scaleFactor = 4;
  const int width = srcImage.width;
  const int height = srcImage.height;
  ASSERT(width % scaleFactor == 0);
  ASSERT(height % scaleFactor == 0);
  destImage.setResolution(width / scaleFactor, height / scaleFactor);

  const unsigned char offset = offsetof(Image::Pixel, y);
  unsigned char mask[16] =
  {
    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF
  };
  mask[0] = offset;
  mask[1] = offset + 4;
  mask[2] = offset + 8;
  mask[3] = offset + 12;
  const __m128i mMask = _mm_loadu_si128(reinterpret_cast<__m128i*>(mask));
  const __m128i zero = _mm_setzero_si128();

  const int summsSize = destImage.width * 16;
  __m128i* summs = reinterpret_cast<__m128i*>(SystemCall::alignedMalloc(summsSize, 16));
  memset(summs, 0, summsSize);

  const Image::Pixel* pSrc;
  Thumbnail::ThumbnailImageGrayscale::PixelType* pDest;
  __m128i* pSumms;
  __m128i p0;
  __m128i p1;
  __m128i p2;
  __m128i p3;
  for(int y = 0; y < height; ++y)
  {
    if(y % scaleFactor == 0)
    {
      pDest = destImage[y / scaleFactor];
    }
    pSrc = srcImage[y];
    pSumms = summs;
    for(int x = 0; x < width; x += 8, pSrc += 8, ++pSumms)
    {
      p0 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(pSrc));
      p1 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(pSrc + 4));

      p0 = SHUFFLE(p0, mMask);          // y0 y1 y2 y3 0 0 0 0 0 0 0 0 0 0 0 0
      p1 = SHUFFLE(p1, mMask);          // y4 y5 y6 y7 0 0 0 0 0 0 0 0 0 0 0 0
      p0 = _mm_unpacklo_epi32(p0, p1);  // y0 y1 y2 y3 y4 y5 y6 y7 0 0 0 0 0 0 0 0
      p0 = _mm_unpacklo_epi8(p0, zero); // y0 y1 y2 y3 y4 y5 y6 y7

      *pSumms = _mm_add_epi16(*pSumms, p0);
    }

    if(y % scaleFactor == scaleFactor - 1)
    {
      pSumms = summs;
      for(int i = 0; i < destImage.width; i += 8, pSumms += 4, pDest += 8)
      {
        p0 = *pSumms;
        p1 = *(pSumms + 1);
        p2 = *(pSumms + 2);
        p3 = *(pSumms + 3);

        p0 = _mm_hadd_epi16(p0, p1);
        p1 = _mm_hadd_epi16(p2, p3);
        p0 = _mm_hadd_epi16(p0, p1);
        p0 = _mm_srli_epi16(p0, 4);
        p0 = _mm_packus_epi16(p0, zero);

        splitter.a = p0;
        *reinterpret_cast<long long*>(pDest) = splitter.b[0];
      }
      memset(summs, 0, summsSize);
    }
  }
  SystemCall::alignedFree(summs);
}
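/*
 * The three _mm_hadd_epi16 calls plus the shift by 4 implement a plain 4x4 box
 * average: 16 luma samples are summed and divided by 16. A scalar cross-check
 * (illustrative, not part of the B-Human code; assumes an 8-bit luma plane with
 * the given row stride):
 */
static void shrink_grayscale_4x4_ref(const unsigned char *src, int width, int height,
                                     int stride, unsigned char *dst)
{
  for (int y = 0; y < height; y += 4)
    for (int x = 0; x < width; x += 4)
    {
      int sum = 0;
      for (int dy = 0; dy < 4; ++dy)
        for (int dx = 0; dx < 4; ++dx)
          sum += src[(y + dy) * stride + x + dx];
      dst[(y / 4) * (width / 4) + x / 4] = (unsigned char)(sum >> 4);
    }
}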