void FAST_t(InputArray _img, std::vector<KeyPoint>& keypoints, int threshold, bool nonmax_suppression) { Mat img = _img.getMat(); const int K = patternSize/2, N = patternSize + K + 1; #if CV_SSE2 const int quarterPatternSize = patternSize/4; (void)quarterPatternSize; #endif int i, j, k, pixel[25]; makeOffsets(pixel, (int)img.step, patternSize); keypoints.clear(); threshold = std::min(std::max(threshold, 0), 255); #if CV_SSE2 __m128i delta = _mm_set1_epi8(-128), t = _mm_set1_epi8((char)threshold), K16 = _mm_set1_epi8((char)K); (void)K16; (void)delta; (void)t; #endif uchar threshold_tab[512]; for( i = -255; i <= 255; i++ ) threshold_tab[i+255] = (uchar)(i < -threshold ? 1 : i > threshold ? 2 : 0); AutoBuffer<uchar> _buf((img.cols+16)*3*(sizeof(int) + sizeof(uchar)) + 128); uchar* buf[3]; buf[0] = _buf; buf[1] = buf[0] + img.cols; buf[2] = buf[1] + img.cols; int* cpbuf[3]; cpbuf[0] = (int*)alignPtr(buf[2] + img.cols, sizeof(int)) + 1; cpbuf[1] = cpbuf[0] + img.cols + 1; cpbuf[2] = cpbuf[1] + img.cols + 1; memset(buf[0], 0, img.cols*3); for(i = 3; i < img.rows-2; i++) { const uchar* ptr = img.ptr<uchar>(i) + 3; uchar* curr = buf[(i - 3)%3]; int* cornerpos = cpbuf[(i - 3)%3]; memset(curr, 0, img.cols); int ncorners = 0; if( i < img.rows - 3 ) { j = 3; #if CV_SSE2 if( patternSize == 16 ) { for(; j < img.cols - 16 - 3; j += 16, ptr += 16) { __m128i m0, m1; __m128i v0 = _mm_loadu_si128((const __m128i*)ptr); __m128i v1 = _mm_xor_si128(_mm_subs_epu8(v0, t), delta); v0 = _mm_xor_si128(_mm_adds_epu8(v0, t), delta); __m128i x0 = _mm_sub_epi8(_mm_loadu_si128((const __m128i*)(ptr + pixel[0])), delta); __m128i x1 = _mm_sub_epi8(_mm_loadu_si128((const __m128i*)(ptr + pixel[quarterPatternSize])), delta); __m128i x2 = _mm_sub_epi8(_mm_loadu_si128((const __m128i*)(ptr + pixel[2*quarterPatternSize])), delta); __m128i x3 = _mm_sub_epi8(_mm_loadu_si128((const __m128i*)(ptr + pixel[3*quarterPatternSize])), delta); m0 = _mm_and_si128(_mm_cmpgt_epi8(x0, v0), _mm_cmpgt_epi8(x1, v0)); m1 = _mm_and_si128(_mm_cmpgt_epi8(v1, x0), _mm_cmpgt_epi8(v1, x1)); m0 = _mm_or_si128(m0, _mm_and_si128(_mm_cmpgt_epi8(x1, v0), _mm_cmpgt_epi8(x2, v0))); m1 = _mm_or_si128(m1, _mm_and_si128(_mm_cmpgt_epi8(v1, x1), _mm_cmpgt_epi8(v1, x2))); m0 = _mm_or_si128(m0, _mm_and_si128(_mm_cmpgt_epi8(x2, v0), _mm_cmpgt_epi8(x3, v0))); m1 = _mm_or_si128(m1, _mm_and_si128(_mm_cmpgt_epi8(v1, x2), _mm_cmpgt_epi8(v1, x3))); m0 = _mm_or_si128(m0, _mm_and_si128(_mm_cmpgt_epi8(x3, v0), _mm_cmpgt_epi8(x0, v0))); m1 = _mm_or_si128(m1, _mm_and_si128(_mm_cmpgt_epi8(v1, x3), _mm_cmpgt_epi8(v1, x0))); m0 = _mm_or_si128(m0, m1); int mask = _mm_movemask_epi8(m0); if( mask == 0 ) continue; if( (mask & 255) == 0 ) { j -= 8; ptr -= 8; continue; } __m128i c0 = _mm_setzero_si128(), c1 = c0, max0 = c0, max1 = c0; for( k = 0; k < N; k++ ) { __m128i x = _mm_xor_si128(_mm_loadu_si128((const __m128i*)(ptr + pixel[k])), delta); m0 = _mm_cmpgt_epi8(x, v0); m1 = _mm_cmpgt_epi8(v1, x); c0 = _mm_and_si128(_mm_sub_epi8(c0, m0), m0); c1 = _mm_and_si128(_mm_sub_epi8(c1, m1), m1); max0 = _mm_max_epu8(max0, c0); max1 = _mm_max_epu8(max1, c1); } max0 = _mm_max_epu8(max0, max1); int m = _mm_movemask_epi8(_mm_cmpgt_epi8(max0, K16)); for( k = 0; m > 0 && k < 16; k++, m >>= 1 ) if(m & 1) { cornerpos[ncorners++] = j+k; if(nonmax_suppression) curr[j+k] = (uchar)cornerScore<patternSize>(ptr+k, pixel, threshold); } } } #endif for( ; j < img.cols - 3; j++, ptr++ ) { int v = ptr[0]; const uchar* tab = &threshold_tab[0] - v + 255; int d = tab[ptr[pixel[0]]] | tab[ptr[pixel[8]]]; if( d == 0 ) 
continue; d &= tab[ptr[pixel[2]]] | tab[ptr[pixel[10]]]; d &= tab[ptr[pixel[4]]] | tab[ptr[pixel[12]]]; d &= tab[ptr[pixel[6]]] | tab[ptr[pixel[14]]]; if( d == 0 ) continue; d &= tab[ptr[pixel[1]]] | tab[ptr[pixel[9]]]; d &= tab[ptr[pixel[3]]] | tab[ptr[pixel[11]]]; d &= tab[ptr[pixel[5]]] | tab[ptr[pixel[13]]]; d &= tab[ptr[pixel[7]]] | tab[ptr[pixel[15]]]; if( d & 1 ) { int vt = v - threshold, count = 0; for( k = 0; k < N; k++ ) { int x = ptr[pixel[k]]; if(x < vt) { if( ++count > K ) { cornerpos[ncorners++] = j; if(nonmax_suppression) curr[j] = (uchar)cornerScore<patternSize>(ptr, pixel, threshold); break; } } else count = 0; } } if( d & 2 ) { int vt = v + threshold, count = 0; for( k = 0; k < N; k++ ) { int x = ptr[pixel[k]]; if(x > vt) { if( ++count > K ) { cornerpos[ncorners++] = j; if(nonmax_suppression) curr[j] = (uchar)cornerScore<patternSize>(ptr, pixel, threshold); break; } } else count = 0; } } } } cornerpos[-1] = ncorners; if( i == 3 ) continue; const uchar* prev = buf[(i - 4 + 3)%3]; const uchar* pprev = buf[(i - 5 + 3)%3]; cornerpos = cpbuf[(i - 4 + 3)%3]; ncorners = cornerpos[-1]; for( k = 0; k < ncorners; k++ ) { j = cornerpos[k]; int score = prev[j]; if( !nonmax_suppression || (score > prev[j+1] && score > prev[j-1] && score > pprev[j-1] && score > pprev[j] && score > pprev[j+1] && score > curr[j-1] && score > curr[j] && score > curr[j+1]) ) { keypoints.push_back(KeyPoint((float)j, (float)(i-1), 7.f, -1, (float)score)); } } }
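// A minimal scalar sketch (not part of the OpenCV function above) of the
// threshold-table trick used in FAST_t: each circle pixel is classified with a
// single lookup as darker (1), brighter (2) or similar (0) relative to the
// centre value, so the early-reject test reduces to a few ORs and ANDs.
// The 512-entry table and the +255 bias come from the code above; the helper
// names are hypothetical.
static unsigned char fast_threshold_tab[512];

static void fast_init_threshold_tab(int threshold) {
    for (int i = -255; i <= 255; i++)
        fast_threshold_tab[i + 255] = (unsigned char)(i < -threshold ? 1 : i > threshold ? 2 : 0);
}

// p is a circle pixel, v the centre pixel; equivalent to tab[ptr[pixel[k]]]
// with "const uchar* tab = &threshold_tab[0] - v + 255;" in the code above.
static unsigned char fast_classify_pixel(int p, int v) {
    return fast_threshold_tab[p - v + 255];
}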
static uint64_t aom_sum_squares_2d_i16_nxn_sse2(const int16_t *src, int stride, int size) {
  int r, c;
  // Mask that keeps only the low 32 bits of each 64-bit lane, used when
  // widening the 32-bit partial sums into the 64-bit accumulator.
  const __m128i v_zext_mask_q = _mm_set_epi32(0, 0xffffffff, 0, 0xffffffff);
  __m128i v_acc_q = _mm_setzero_si128();

  for (r = 0; r < size; r += 8) {
    // 32-bit accumulator for one 8-row strip.
    __m128i v_acc_d = _mm_setzero_si128();

    for (c = 0; c < size; c += 8) {
      const int16_t *b = src + c;

      const __m128i v_val_0_w = _mm_load_si128((const __m128i *)(b + 0 * stride));
      const __m128i v_val_1_w = _mm_load_si128((const __m128i *)(b + 1 * stride));
      const __m128i v_val_2_w = _mm_load_si128((const __m128i *)(b + 2 * stride));
      const __m128i v_val_3_w = _mm_load_si128((const __m128i *)(b + 3 * stride));
      const __m128i v_val_4_w = _mm_load_si128((const __m128i *)(b + 4 * stride));
      const __m128i v_val_5_w = _mm_load_si128((const __m128i *)(b + 5 * stride));
      const __m128i v_val_6_w = _mm_load_si128((const __m128i *)(b + 6 * stride));
      const __m128i v_val_7_w = _mm_load_si128((const __m128i *)(b + 7 * stride));

      // madd squares pairs of int16 values and sums them into int32 lanes.
      const __m128i v_sq_0_d = _mm_madd_epi16(v_val_0_w, v_val_0_w);
      const __m128i v_sq_1_d = _mm_madd_epi16(v_val_1_w, v_val_1_w);
      const __m128i v_sq_2_d = _mm_madd_epi16(v_val_2_w, v_val_2_w);
      const __m128i v_sq_3_d = _mm_madd_epi16(v_val_3_w, v_val_3_w);
      const __m128i v_sq_4_d = _mm_madd_epi16(v_val_4_w, v_val_4_w);
      const __m128i v_sq_5_d = _mm_madd_epi16(v_val_5_w, v_val_5_w);
      const __m128i v_sq_6_d = _mm_madd_epi16(v_val_6_w, v_val_6_w);
      const __m128i v_sq_7_d = _mm_madd_epi16(v_val_7_w, v_val_7_w);

      const __m128i v_sum_01_d = _mm_add_epi32(v_sq_0_d, v_sq_1_d);
      const __m128i v_sum_23_d = _mm_add_epi32(v_sq_2_d, v_sq_3_d);
      const __m128i v_sum_45_d = _mm_add_epi32(v_sq_4_d, v_sq_5_d);
      const __m128i v_sum_67_d = _mm_add_epi32(v_sq_6_d, v_sq_7_d);

      const __m128i v_sum_0123_d = _mm_add_epi32(v_sum_01_d, v_sum_23_d);
      const __m128i v_sum_4567_d = _mm_add_epi32(v_sum_45_d, v_sum_67_d);

      v_acc_d = _mm_add_epi32(v_acc_d, v_sum_0123_d);
      v_acc_d = _mm_add_epi32(v_acc_d, v_sum_4567_d);
    }

    // Widen the strip sums to 64 bits before they can overflow.
    v_acc_q = _mm_add_epi64(v_acc_q, _mm_and_si128(v_acc_d, v_zext_mask_q));
    v_acc_q = _mm_add_epi64(v_acc_q, _mm_srli_epi64(v_acc_d, 32));

    src += 8 * stride;
  }

  v_acc_q = _mm_add_epi64(v_acc_q, _mm_srli_si128(v_acc_q, 8));

#if ARCH_X86_64
  return (uint64_t)_mm_cvtsi128_si64(v_acc_q);
#else
  {
    uint64_t tmp;
    _mm_storel_epi64((__m128i *)&tmp, v_acc_q);
    return tmp;
  }
#endif
}
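// Plain-C reference for the routine above, handy as a correctness check in a
// unit test: it computes the same sum of squared int16 samples over a
// size x size block. Sketch only; the name is hypothetical, not a libaom API.
static uint64_t sum_squares_2d_i16_ref(const int16_t *src, int stride, int size) {
    uint64_t sum = 0;
    for (int r = 0; r < size; r++) {
        for (int c = 0; c < size; c++) {
            const int v = src[r * stride + c];
            sum += (uint64_t)(v * v);
        }
    }
    return sum;
}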
int bit_vec_filter_m128_sse11(uint8_t *read_vec, uint8_t *ref_vec, int length, int max_error) { const __m128i zero_mask = _mm_set1_epi8(0x00); const __m128i one_mask = _mm_set1_epi8(0xff); int total_byte = (length - 1) / BYTE_BASE_NUM11 + 1; int total_difference = 0; //Start iteration int i, j; //read data __m128i prev_read_XMM = _mm_set1_epi8(0x0); __m128i curr_read_XMM = *((__m128i *) (read_vec)); //ref data __m128i prev_ref_XMM = _mm_set1_epi8(0x0); __m128i curr_ref_XMM = *((__m128i *) (ref_vec)); __m128i read_XMM; __m128i ref_XMM; __m128i temp_diff_XMM; __m128i diff_XMM; __m128i mask; for (i = 0; i < total_byte; i += SSE_BYTE_NUM) { curr_read_XMM = *((__m128i *) (read_vec + i)); curr_ref_XMM = *((__m128i *) (ref_vec + i)); diff_XMM = _mm_xor_si128(curr_read_XMM, curr_ref_XMM); diff_XMM = xor11complement_sse(diff_XMM); if (i + SSE_BYTE_NUM >= total_byte) { if (length % SSE_BASE_NUM11) { mask = _mm_load_si128( (__m128i *) (MASK_SSE_END11 + (length % SSE_BASE_NUM11) * SSE_BYTE_NUM)); diff_XMM = _mm_and_si128(mask, diff_XMM); } } for (j = 1; j <= max_error; j++) { //Right shift read read_XMM = shift_right_sse11(prev_read_XMM, curr_read_XMM, j); temp_diff_XMM = _mm_xor_si128(read_XMM, curr_ref_XMM); temp_diff_XMM = xor11complement_sse(temp_diff_XMM); if (i == 0) { mask = _mm_load_si128( (__m128i *) (MASK_SSE_BEG11 + (j - 1) * SSE_BYTE_NUM)); temp_diff_XMM = _mm_and_si128(mask, temp_diff_XMM); } if (i + SSE_BYTE_NUM >= total_byte) { if (length % SSE_BASE_NUM11) { mask = _mm_load_si128( (__m128i *) (MASK_SSE_END11 + (length % SSE_BASE_NUM11) * SSE_BYTE_NUM)); temp_diff_XMM = _mm_and_si128(mask, temp_diff_XMM); } } diff_XMM = _mm_and_si128(diff_XMM, temp_diff_XMM); //Right shift ref ref_XMM = shift_right_sse11(prev_ref_XMM, curr_ref_XMM, j); temp_diff_XMM = _mm_xor_si128(curr_read_XMM, ref_XMM); temp_diff_XMM = xor11complement_sse(temp_diff_XMM); if (i == 0) { mask = _mm_load_si128( (__m128i *) (MASK_SSE_BEG11 + (j - 1) * SSE_BYTE_NUM)); temp_diff_XMM = _mm_and_si128(mask, temp_diff_XMM); } if (i + SSE_BYTE_NUM >= total_byte) { if (length % SSE_BASE_NUM11) { mask = _mm_load_si128( (__m128i *) (MASK_SSE_END11 + (length % SSE_BASE_NUM11) * SSE_BYTE_NUM)); temp_diff_XMM = _mm_and_si128(mask, temp_diff_XMM); } } diff_XMM = _mm_and_si128(diff_XMM, temp_diff_XMM); } total_difference += popcount11_m128i_sse(diff_XMM); prev_read_XMM = curr_read_XMM; prev_ref_XMM = curr_ref_XMM; if (total_difference > max_error) return 0; } return 1; }
//------------------------------------------------------------------------------- // For each tile go through all the bins and process all the triangles in it. // Rasterize each triangle to the CPU depth buffer. //------------------------------------------------------------------------------- void DepthBufferRasterizerSSEST::RasterizeBinnedTrianglesToDepthBuffer(UINT tileId, UINT idx) { // Set DAZ and FZ MXCSR bits to flush denormals to zero (i.e., make it faster) _mm_setcsr( _mm_getcsr() | 0x8040 ); __m128i colOffset = _mm_setr_epi32(0, 1, 0, 1); __m128i rowOffset = _mm_setr_epi32(0, 0, 1, 1); __m128i fxptZero = _mm_setzero_si128(); float* pDepthBuffer = (float*)mpRenderTargetPixels[idx]; // Based on TaskId determine which tile to process UINT screenWidthInTiles = SCREENW/TILE_WIDTH_IN_PIXELS; UINT tileX = tileId % screenWidthInTiles; UINT tileY = tileId / screenWidthInTiles; int tileStartX = tileX * TILE_WIDTH_IN_PIXELS; int tileEndX = tileStartX + TILE_WIDTH_IN_PIXELS - 1; int tileStartY = tileY * TILE_HEIGHT_IN_PIXELS; int tileEndY = tileStartY + TILE_HEIGHT_IN_PIXELS - 1; ClearDepthTile(tileStartX, tileStartY, tileEndX+1, tileEndY+1, idx); UINT bin = 0; UINT binIndex = 0; UINT offset1 = YOFFSET1_ST * tileY + XOFFSET1_ST * tileX; UINT offset2 = YOFFSET2_ST * tileY + XOFFSET2_ST * tileX; UINT numTrisInBin = mpNumTrisInBin[idx][offset1 + bin]; __m128 gatherBuf[4][3]; bool done = false; bool allBinsEmpty = true; mNumRasterizedTris[idx][tileId] = numTrisInBin; while(!done) { // Loop through all the bins and process 4 binned traingles at a time UINT ii; int numSimdTris = 0; for(ii = 0; ii < SSE; ii++) { while(numTrisInBin <= 0) { // This bin is empty. Move to next bin. if(++bin >= 1) { break; } numTrisInBin = mpNumTrisInBin[idx][offset1 + bin]; mNumRasterizedTris[idx][tileId] += numTrisInBin; binIndex = 0; } if(!numTrisInBin) { break; // No more tris in the bins } USHORT modelId = mpBinModel[idx][offset2 + bin * MAX_TRIS_IN_BIN_ST + binIndex]; USHORT meshId = mpBinMesh[idx][offset2 + bin * MAX_TRIS_IN_BIN_ST + binIndex]; UINT triIdx = mpBin[idx][offset2 + bin * MAX_TRIS_IN_BIN_ST + binIndex]; mpTransformedModels1[modelId].Gather(gatherBuf[ii], meshId, triIdx, idx); allBinsEmpty = false; numSimdTris++; ++binIndex; --numTrisInBin; } done = bin >= NUM_XFORMVERTS_TASKS; if(allBinsEmpty) { return; } // use fixed-point only for X and Y. Avoid work for Z and W. 
__m128i fxPtX[3], fxPtY[3]; __m128 Z[3]; for(int i = 0; i < 3; i++) { __m128 v0 = gatherBuf[0][i]; __m128 v1 = gatherBuf[1][i]; __m128 v2 = gatherBuf[2][i]; __m128 v3 = gatherBuf[3][i]; // transpose into SoA layout _MM_TRANSPOSE4_PS(v0, v1, v2, v3); fxPtX[i] = _mm_cvtps_epi32(v0); fxPtY[i] = _mm_cvtps_epi32(v1); Z[i] = v2; } // Fab(x, y) = Ax + By + C = 0 // Fab(x, y) = (ya - yb)x + (xb - xa)y + (xa * yb - xb * ya) = 0 // Compute A = (ya - yb) for the 3 line segments that make up each triangle __m128i A0 = _mm_sub_epi32(fxPtY[1], fxPtY[2]); __m128i A1 = _mm_sub_epi32(fxPtY[2], fxPtY[0]); __m128i A2 = _mm_sub_epi32(fxPtY[0], fxPtY[1]); // Compute B = (xb - xa) for the 3 line segments that make up each triangle __m128i B0 = _mm_sub_epi32(fxPtX[2], fxPtX[1]); __m128i B1 = _mm_sub_epi32(fxPtX[0], fxPtX[2]); __m128i B2 = _mm_sub_epi32(fxPtX[1], fxPtX[0]); // Compute C = (xa * yb - xb * ya) for the 3 line segments that make up each triangle __m128i C0 = _mm_sub_epi32(_mm_mullo_epi32(fxPtX[1], fxPtY[2]), _mm_mullo_epi32(fxPtX[2], fxPtY[1])); __m128i C1 = _mm_sub_epi32(_mm_mullo_epi32(fxPtX[2], fxPtY[0]), _mm_mullo_epi32(fxPtX[0], fxPtY[2])); __m128i C2 = _mm_sub_epi32(_mm_mullo_epi32(fxPtX[0], fxPtY[1]), _mm_mullo_epi32(fxPtX[1], fxPtY[0])); // Compute triangle area __m128i triArea = _mm_mullo_epi32(B2, A1); triArea = _mm_sub_epi32(triArea, _mm_mullo_epi32(B1, A2)); __m128 oneOverTriArea = _mm_div_ps(_mm_set1_ps(1.0f), _mm_cvtepi32_ps(triArea)); Z[1] = _mm_mul_ps(_mm_sub_ps(Z[1], Z[0]), oneOverTriArea); Z[2] = _mm_mul_ps(_mm_sub_ps(Z[2], Z[0]), oneOverTriArea); // Use bounding box traversal strategy to determine which pixels to rasterize __m128i startX = _mm_and_si128(Max(Min(Min(fxPtX[0], fxPtX[1]), fxPtX[2]), _mm_set1_epi32(tileStartX)), _mm_set1_epi32(0xFFFFFFFE)); __m128i endX = Min(_mm_add_epi32(Max(Max(fxPtX[0], fxPtX[1]), fxPtX[2]), _mm_set1_epi32(1)), _mm_set1_epi32(tileEndX)); __m128i startY = _mm_and_si128(Max(Min(Min(fxPtY[0], fxPtY[1]), fxPtY[2]), _mm_set1_epi32(tileStartY)), _mm_set1_epi32(0xFFFFFFFE)); __m128i endY = Min(_mm_add_epi32(Max(Max(fxPtY[0], fxPtY[1]), fxPtY[2]), _mm_set1_epi32(1)), _mm_set1_epi32(tileEndY)); // Now we have 4 triangles set up. Rasterize them each individually. 
for(int lane=0; lane < numSimdTris; lane++) { // Extract this triangle's properties from the SIMD versions __m128 zz[3]; for(int vv = 0; vv < 3; vv++) { zz[vv] = _mm_set1_ps(Z[vv].m128_f32[lane]); } int startXx = startX.m128i_i32[lane]; int endXx = endX.m128i_i32[lane]; int startYy = startY.m128i_i32[lane]; int endYy = endY.m128i_i32[lane]; __m128i aa0 = _mm_set1_epi32(A0.m128i_i32[lane]); __m128i aa1 = _mm_set1_epi32(A1.m128i_i32[lane]); __m128i aa2 = _mm_set1_epi32(A2.m128i_i32[lane]); __m128i bb0 = _mm_set1_epi32(B0.m128i_i32[lane]); __m128i bb1 = _mm_set1_epi32(B1.m128i_i32[lane]); __m128i bb2 = _mm_set1_epi32(B2.m128i_i32[lane]); __m128i aa0Inc = _mm_slli_epi32(aa0, 1); __m128i aa1Inc = _mm_slli_epi32(aa1, 1); __m128i aa2Inc = _mm_slli_epi32(aa2, 1); __m128i row, col; // Tranverse pixels in 2x2 blocks and store 2x2 pixel quad depths contiguously in memory ==> 2*X // This method provides better perfromance int rowIdx = (startYy * SCREENW + 2 * startXx); col = _mm_add_epi32(colOffset, _mm_set1_epi32(startXx)); __m128i aa0Col = _mm_mullo_epi32(aa0, col); __m128i aa1Col = _mm_mullo_epi32(aa1, col); __m128i aa2Col = _mm_mullo_epi32(aa2, col); row = _mm_add_epi32(rowOffset, _mm_set1_epi32(startYy)); __m128i bb0Row = _mm_add_epi32(_mm_mullo_epi32(bb0, row), _mm_set1_epi32(C0.m128i_i32[lane])); __m128i bb1Row = _mm_add_epi32(_mm_mullo_epi32(bb1, row), _mm_set1_epi32(C1.m128i_i32[lane])); __m128i bb2Row = _mm_add_epi32(_mm_mullo_epi32(bb2, row), _mm_set1_epi32(C2.m128i_i32[lane])); __m128i sum0Row = _mm_add_epi32(aa0Col, bb0Row); __m128i sum1Row = _mm_add_epi32(aa1Col, bb1Row); __m128i sum2Row = _mm_add_epi32(aa2Col, bb2Row); __m128i bb0Inc = _mm_slli_epi32(bb0, 1); __m128i bb1Inc = _mm_slli_epi32(bb1, 1); __m128i bb2Inc = _mm_slli_epi32(bb2, 1); __m128 zx = _mm_mul_ps(_mm_cvtepi32_ps(aa1Inc), zz[1]); zx = _mm_add_ps(zx, _mm_mul_ps(_mm_cvtepi32_ps(aa2Inc), zz[2])); // Incrementally compute Fab(x, y) for all the pixels inside the bounding box formed by (startX, endX) and (startY, endY) for(int r = startYy; r < endYy; r += 2, rowIdx += 2 * SCREENW, sum0Row = _mm_add_epi32(sum0Row, bb0Inc), sum1Row = _mm_add_epi32(sum1Row, bb1Inc), sum2Row = _mm_add_epi32(sum2Row, bb2Inc)) { // Compute barycentric coordinates int index = rowIdx; __m128i alpha = sum0Row; __m128i beta = sum1Row; __m128i gama = sum2Row; //Compute barycentric-interpolated depth __m128 depth = zz[0]; depth = _mm_add_ps(depth, _mm_mul_ps(_mm_cvtepi32_ps(beta), zz[1])); depth = _mm_add_ps(depth, _mm_mul_ps(_mm_cvtepi32_ps(gama), zz[2])); for(int c = startXx; c < endXx; c += 2, index += 4, alpha = _mm_add_epi32(alpha, aa0Inc), beta = _mm_add_epi32(beta, aa1Inc), gama = _mm_add_epi32(gama, aa2Inc), depth = _mm_add_ps(depth, zx)) { //Test Pixel inside triangle __m128i mask = _mm_or_si128(_mm_or_si128(alpha, beta), gama); __m128 previousDepthValue = _mm_load_ps(&pDepthBuffer[index]); __m128 mergedDepth = _mm_max_ps(depth, previousDepthValue); __m128 finalDepth = _mm_blendv_ps(mergedDepth, previousDepthValue, _mm_castsi128_ps(mask)); _mm_store_ps(&pDepthBuffer[index], finalDepth); }//for each column }// for each row }// for each triangle }// for each set of SIMD# triangles }
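// Scalar sketch of the per-pixel test in the innermost loop above: a pixel is
// kept when the three edge functions E(x, y) = A*x + B*y + C are all
// non-negative, and its depth is z0 + e1*z1 + e2*z2 (Z[1] and Z[2] were
// pre-multiplied by 1/triArea earlier). Names are illustrative only and not
// part of the DepthBufferRasterizerSSEST class.
static inline bool PixelCoveredDepth(int x, int y,
                                     const int A[3], const int B[3], const int C[3],
                                     const float z[3], float *outDepth)
{
    const int e0 = A[0] * x + B[0] * y + C[0];
    const int e1 = A[1] * x + B[1] * y + C[1];
    const int e2 = A[2] * x + B[2] * y + C[2];
    // The SIMD code ORs the three edge values and discards the pixel when the
    // sign bit of the result is set; this is the same test.
    if ((e0 | e1 | e2) < 0) return false;
    *outDepth = z[0] + (float)e1 * z[1] + (float)e2 * z[2];
    return true;
}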
void flip_false_zero(__m128i& vec) {
    //For not crossing bits
    __m128i *boundary = (__m128i *) MASK_7F;
    __m128i shift = _mm_and_si128(*boundary, vec);
    __m128i *mask = (__m128i *) MASK_0TO1;
    shift = _mm_shuffle_epi8(*mask, shift);
    vec = _mm_or_si128(vec, shift);

    int i;
    for (i = 1; i < 4; i++) {
        shift = _mm_srli_epi16(vec, i);
        shift = _mm_and_si128(*boundary, shift);
        shift = _mm_shuffle_epi8(*mask, shift);
        shift = _mm_slli_epi16(shift, i);
        vec = _mm_or_si128(vec, shift);
    }

    //For the crossing bits
    __m128i shifted_vec = shift_right_sse1(vec, 4);
    shift = _mm_and_si128(*boundary, shifted_vec);
    shift = _mm_shuffle_epi8(*mask, shift);
    shifted_vec = _mm_or_si128(shifted_vec, shift);

    for (i = 1; i < 4; i++) {
        shift = _mm_srli_epi16(shifted_vec, i);
        shift = _mm_and_si128(*boundary, shift);
        shift = _mm_shuffle_epi8(*mask, shift);
        shift = _mm_slli_epi16(shift, i);
        shifted_vec = _mm_or_si128(shifted_vec, shift);
    }

    shifted_vec = shift_left_sse1(shifted_vec, 4);
    vec = _mm_or_si128(shifted_vec, vec);
}
int bit_vec_filter_no_flipping_m128_sse1(uint8_t *read_vec0, uint8_t *read_vec1,
        uint8_t *ref_vec0, uint8_t *ref_vec1, __m128i mask, int max_error) {

    int total_difference = 0;

    //Start iteration
    int j;

    //read data
    __m128i read_XMM0 = *((__m128i *) (read_vec0));
    __m128i read_XMM1 = *((__m128i *) (read_vec1));

    //ref data
    __m128i ref_XMM0 = *((__m128i *) (ref_vec0));
    __m128i ref_XMM1 = *((__m128i *) (ref_vec1));

    __m128i shift_XMM;
    __m128i diff_XMM;
    __m128i temp_diff_XMM;
    __m128i temp_shift_XMM;
    __m128i temp_mask;

    diff_XMM = _mm_xor_si128(read_XMM0, ref_XMM0);
    temp_diff_XMM = _mm_xor_si128(read_XMM1, ref_XMM1);
    diff_XMM = _mm_or_si128(diff_XMM, temp_diff_XMM);

    for (j = 1; j <= max_error; j++) {
        temp_mask = _mm_load_si128(
                (__m128i *) (MASK_SSE_BEG1 + (j - 1) * SSE_BYTE_NUM));
        temp_mask = _mm_and_si128(temp_mask, mask);

        //Right shift read
        shift_XMM = shift_right_sse1(read_XMM0, j);
        temp_diff_XMM = _mm_xor_si128(shift_XMM, ref_XMM0);
        shift_XMM = shift_right_sse1(read_XMM1, j);
        temp_shift_XMM = _mm_xor_si128(shift_XMM, ref_XMM1);
        temp_diff_XMM = _mm_or_si128(temp_shift_XMM, temp_diff_XMM);
        temp_diff_XMM = _mm_and_si128(temp_diff_XMM, temp_mask);
        // flip_false_zero(temp_diff_XMM); //No flipping
        diff_XMM = _mm_and_si128(diff_XMM, temp_diff_XMM);

        //Right shift ref
        shift_XMM = shift_right_sse1(ref_XMM0, j);
        temp_diff_XMM = _mm_xor_si128(shift_XMM, read_XMM0);
        shift_XMM = shift_right_sse1(ref_XMM1, j);
        temp_shift_XMM = _mm_xor_si128(shift_XMM, read_XMM1);
        temp_diff_XMM = _mm_or_si128(temp_shift_XMM, temp_diff_XMM);
        temp_diff_XMM = _mm_and_si128(temp_diff_XMM, temp_mask);
        // flip_false_zero(temp_diff_XMM); //No flipping
        diff_XMM = _mm_and_si128(diff_XMM, temp_diff_XMM);
    }

    total_difference = popcount1_m128i_sse(diff_XMM);

    if (total_difference > max_error)
        return 0;
    else
        return 1;
}
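// Scalar sketch of the shifted-Hamming-mask idea behind the bit-vector filters
// above (bit_vec_filter_m128_sse11 and bit_vec_filter_no_flipping_m128_sse1):
// a bit position only counts as an error if it still differs under every
// comparison in which one sequence is shifted by 1..max_error positions, which
// forgives small insertions and deletions. uint64_t stand-in for the 128-bit
// vectors; names hypothetical; __builtin_popcountll assumes GCC/Clang.
#include <stdint.h>

static int shd_filter64(uint64_t read_bits, uint64_t ref_bits, int max_error) {
    uint64_t diff = read_bits ^ ref_bits;
    for (int j = 1; j <= max_error; j++) {
        diff &= (read_bits >> j) ^ ref_bits;  // read shifted against ref
        diff &= read_bits ^ (ref_bits >> j);  // ref shifted against read
    }
    return __builtin_popcountll(diff) <= max_error;  // 1 = pass filter, 0 = reject
}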
void InvSubBytes_sse(BYTE state[][4]) {
    // NOTE: this 4x4 table holds placeholder characters; a real AES InvSubBytes
    // needs the full 16x16 (256-entry) inverse S-box, and the row index
    // state[r][c] >> 4 can reach 15, which would overrun a 4x4 table.
    BYTE aes_invsbox[4][4];
    aes_invsbox[0][0] = 'a'; aes_invsbox[0][1] = 'b'; aes_invsbox[0][2] = 'c'; aes_invsbox[0][3] = 'd';
    aes_invsbox[1][0] = 'e'; aes_invsbox[1][1] = 'f'; aes_invsbox[1][2] = 'g'; aes_invsbox[1][3] = 'h';
    aes_invsbox[2][0] = 'i'; aes_invsbox[2][1] = 'j'; aes_invsbox[2][2] = 'k'; aes_invsbox[2][3] = 'l';
    aes_invsbox[3][0] = 'm'; aes_invsbox[3][1] = 'n'; aes_invsbox[3][2] = 'o'; aes_invsbox[3][3] = 'p';

    // Pack the 4x4 state into one vector and strip the high nibble of every
    // byte with a single AND; the low nibbles become the S-box column indices.
    __m128i stateSse = _mm_set_epi8(state[3][3], state[3][2], state[3][1], state[3][0],
                                    state[2][3], state[2][2], state[2][1], state[2][0],
                                    state[1][3], state[1][2], state[1][1], state[1][0],
                                    state[0][3], state[0][2], state[0][1], state[0][0]);
    __m128i andFSse = _mm_set_epi8(0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F,
                                   0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F);
    __m128i stateAndFSse = _mm_and_si128(stateSse, andFSse);

    BYTE stateAndF[4][4];
    _mm_storeu_si128((__m128i *)stateAndF, stateAndFSse);

    state[0][0] = aes_invsbox[state[0][0] >> 4][stateAndF[0][0]];
    state[0][1] = aes_invsbox[state[0][1] >> 4][stateAndF[0][1]];
    state[0][2] = aes_invsbox[state[0][2] >> 4][stateAndF[0][2]];
    state[0][3] = aes_invsbox[state[0][3] >> 4][stateAndF[0][3]];
    state[1][0] = aes_invsbox[state[1][0] >> 4][stateAndF[1][0]];
    state[1][1] = aes_invsbox[state[1][1] >> 4][stateAndF[1][1]];
    state[1][2] = aes_invsbox[state[1][2] >> 4][stateAndF[1][2]];
    state[1][3] = aes_invsbox[state[1][3] >> 4][stateAndF[1][3]];
    state[2][0] = aes_invsbox[state[2][0] >> 4][stateAndF[2][0]];
    state[2][1] = aes_invsbox[state[2][1] >> 4][stateAndF[2][1]];
    state[2][2] = aes_invsbox[state[2][2] >> 4][stateAndF[2][2]];
    state[2][3] = aes_invsbox[state[2][3] >> 4][stateAndF[2][3]];
    state[3][0] = aes_invsbox[state[3][0] >> 4][stateAndF[3][0]];
    state[3][1] = aes_invsbox[state[3][1] >> 4][stateAndF[3][1]];
    state[3][2] = aes_invsbox[state[3][2] >> 4][stateAndF[3][2]];
    state[3][3] = aes_invsbox[state[3][3] >> 4][stateAndF[3][3]];
}
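// For comparison, a scalar InvSubBytes performs exactly the nibble split that
// the SSE code above computes with the 0x0F mask: the high nibble selects the
// row and the low nibble the column of the 16x16 inverse S-box. Sketch only;
// the caller must supply the real 256-entry AES inverse S-box.
static void InvSubBytes_ref(BYTE state[4][4], const BYTE inv_sbox[16][16]) {
    for (int r = 0; r < 4; r++)
        for (int c = 0; c < 4; c++)
            state[r][c] = inv_sbox[state[r][c] >> 4][state[r][c] & 0x0F];
}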
static int HafCpu_Histogram16Bins_DATA_U8 ( vx_uint32 * dstHist, vx_uint8 distOffset, vx_uint8 distWindow, vx_uint32 srcWidth, vx_uint32 srcHeight, vx_uint8 * pSrcImage, vx_uint32 srcImageStrideInBytes ) { // offset: to convert the range from 0..255 to -128..127, because SSE does not have compare instructions for unsigned bytes // thresh: source threshold in -128..127 range __m128i offset = _mm_set1_epi8((char)0x80); __m128i T0 = _mm_set1_epi8((char)(((distOffset ? distOffset : distWindow) - 1) ^ 0x80)); __m128i dT = _mm_set1_epi8((char)distWindow); __m128i onemask = _mm_set1_epi8((char)1); // process one pixel row at a time that counts "pixel < srcThreshold" vx_uint32 count[16] = { 0 }; vx_uint8 * srcRow = pSrcImage; vx_uint32 width = (srcWidth + 15) >> 4; for (unsigned int y = 0; y < srcHeight; y++) { __m128i * src = (__m128i *)srcRow; __m128i count0 = _mm_set1_epi8((char)0); __m128i count1 = _mm_set1_epi8((char)0); __m128i count2 = _mm_set1_epi8((char)0); __m128i count3 = _mm_set1_epi8((char)0); for (unsigned int x = 0; x < width; x++) { __m128i pixels = _mm_load_si128(src++); pixels = _mm_xor_si128(pixels, offset); __m128i cmpout, Tnext = T0; // 0..3 cmpout = _mm_cmpgt_epi8(pixels, Tnext); cmpout = _mm_and_si128(cmpout, onemask); cmpout = _mm_sad_epu8(cmpout, onemask); count0 = _mm_add_epi32(count0, cmpout); Tnext = _mm_add_epi8(Tnext, dT); cmpout = _mm_cmpgt_epi8(pixels, Tnext); cmpout = _mm_and_si128(cmpout, onemask); cmpout = _mm_sad_epu8(cmpout, onemask); cmpout = _mm_slli_epi64(cmpout, 16); count0 = _mm_add_epi32(count0, cmpout); Tnext = _mm_add_epi8(Tnext, dT); cmpout = _mm_cmpgt_epi8(pixels, Tnext); cmpout = _mm_and_si128(cmpout, onemask); cmpout = _mm_sad_epu8(cmpout, onemask); cmpout = _mm_slli_epi64(cmpout, 32); count0 = _mm_add_epi32(count0, cmpout); Tnext = _mm_add_epi8(Tnext, dT); cmpout = _mm_cmpgt_epi8(pixels, Tnext); cmpout = _mm_and_si128(cmpout, onemask); cmpout = _mm_sad_epu8(cmpout, onemask); cmpout = _mm_slli_epi64(cmpout, 48); count0 = _mm_add_epi32(count0, cmpout); // 4..7 Tnext = _mm_add_epi8(Tnext, dT); cmpout = _mm_cmpgt_epi8(pixels, Tnext); cmpout = _mm_and_si128(cmpout, onemask); cmpout = _mm_sad_epu8(cmpout, onemask); count1 = _mm_add_epi32(count1, cmpout); Tnext = _mm_add_epi8(Tnext, dT); cmpout = _mm_cmpgt_epi8(pixels, Tnext); cmpout = _mm_and_si128(cmpout, onemask); cmpout = _mm_sad_epu8(cmpout, onemask); cmpout = _mm_slli_epi64(cmpout, 16); count1 = _mm_add_epi32(count1, cmpout); Tnext = _mm_add_epi8(Tnext, dT); cmpout = _mm_cmpgt_epi8(pixels, Tnext); cmpout = _mm_and_si128(cmpout, onemask); cmpout = _mm_sad_epu8(cmpout, onemask); cmpout = _mm_slli_epi64(cmpout, 32); count1 = _mm_add_epi32(count1, cmpout); Tnext = _mm_add_epi8(Tnext, dT); cmpout = _mm_cmpgt_epi8(pixels, Tnext); cmpout = _mm_and_si128(cmpout, onemask); cmpout = _mm_sad_epu8(cmpout, onemask); cmpout = _mm_slli_epi64(cmpout, 48); count1 = _mm_add_epi32(count1, cmpout); // 8..11 Tnext = _mm_add_epi8(Tnext, dT); cmpout = _mm_cmpgt_epi8(pixels, Tnext); cmpout = _mm_and_si128(cmpout, onemask); cmpout = _mm_sad_epu8(cmpout, onemask); count2 = _mm_add_epi32(count2, cmpout); Tnext = _mm_add_epi8(Tnext, dT); cmpout = _mm_cmpgt_epi8(pixels, Tnext); cmpout = _mm_and_si128(cmpout, onemask); cmpout = _mm_sad_epu8(cmpout, onemask); cmpout = _mm_slli_epi64(cmpout, 16); count2 = _mm_add_epi32(count2, cmpout); Tnext = _mm_add_epi8(Tnext, dT); cmpout = _mm_cmpgt_epi8(pixels, Tnext); cmpout = _mm_and_si128(cmpout, onemask); cmpout = _mm_sad_epu8(cmpout, onemask); cmpout = _mm_slli_epi64(cmpout, 32); count2 
= _mm_add_epi32(count2, cmpout); Tnext = _mm_add_epi8(Tnext, dT); cmpout = _mm_cmpgt_epi8(pixels, Tnext); cmpout = _mm_and_si128(cmpout, onemask); cmpout = _mm_sad_epu8(cmpout, onemask); cmpout = _mm_slli_epi64(cmpout, 48); count2 = _mm_add_epi32(count2, cmpout); // 12..15 Tnext = _mm_add_epi8(Tnext, dT); cmpout = _mm_cmpgt_epi8(pixels, Tnext); cmpout = _mm_and_si128(cmpout, onemask); cmpout = _mm_sad_epu8(cmpout, onemask); count3 = _mm_add_epi32(count3, cmpout); Tnext = _mm_add_epi8(Tnext, dT); cmpout = _mm_cmpgt_epi8(pixels, Tnext); cmpout = _mm_and_si128(cmpout, onemask); cmpout = _mm_sad_epu8(cmpout, onemask); cmpout = _mm_slli_epi64(cmpout, 16); count3 = _mm_add_epi32(count3, cmpout); Tnext = _mm_add_epi8(Tnext, dT); cmpout = _mm_cmpgt_epi8(pixels, Tnext); cmpout = _mm_and_si128(cmpout, onemask); cmpout = _mm_sad_epu8(cmpout, onemask); cmpout = _mm_slli_epi64(cmpout, 32); count3 = _mm_add_epi32(count3, cmpout); Tnext = _mm_add_epi8(Tnext, dT); cmpout = _mm_cmpgt_epi8(pixels, Tnext); cmpout = _mm_and_si128(cmpout, onemask); cmpout = _mm_sad_epu8(cmpout, onemask); cmpout = _mm_slli_epi64(cmpout, 48); count3 = _mm_add_epi32(count3, cmpout); } srcRow += srcImageStrideInBytes; // move counts from count0..2 into count[] for (int i = 0; i < 4; i++) { count[ 0 + i] += M128I(count0).m128i_u16[i] + M128I(count0).m128i_u16[4 + i]; count[ 4 + i] += M128I(count1).m128i_u16[i] + M128I(count1).m128i_u16[4 + i]; count[ 8 + i] += M128I(count2).m128i_u16[i] + M128I(count2).m128i_u16[4 + i]; count[12 + i] += M128I(count3).m128i_u16[i] + M128I(count3).m128i_u16[4 + i]; } } // extract histogram from count if (distOffset == 0) { vx_uint32 last = (distWindow >= 16) ? srcWidth * srcHeight : count[15]; for (int i = 14; i >= 0; i--) { count[i] = last - count[i]; last -= count[i]; } dstHist[0] = last; for (int i = 1; i < 16; i++) dstHist[i] = count[i - 1]; } else { vx_uint32 last = srcWidth * srcHeight; for (int i = 15; i >= 0; i--) { count[i] = last - count[i]; last -= count[i]; dstHist[i] = count[i]; } } return AGO_SUCCESS; }
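// Isolated sketch of the counting trick used by the histogram kernel above:
// compare 16 bytes against a threshold, mask the 0xFF results down to 1, and
// let _mm_sad_epu8 against a vector of ones sum them into the two 64-bit
// halves. Bytes greater than the threshold contribute 0 and the rest
// contribute 1, so the result is the count of pixels NOT above the threshold,
// which is why the code above rebuilds the per-bin histogram from cumulative
// counts. Helper name is hypothetical.
#include <emmintrin.h>

static inline int count_not_above_threshold_16(__m128i pixels_signed, __m128i thresh_signed) {
    const __m128i ones = _mm_set1_epi8(1);
    __m128i gt = _mm_cmpgt_epi8(pixels_signed, thresh_signed);  // 0xFF where pixel > T
    gt = _mm_and_si128(gt, ones);                               // 1 where pixel > T, else 0
    const __m128i sad = _mm_sad_epu8(gt, ones);                 // per half: #bytes not above T
    return _mm_cvtsi128_si32(sad) + _mm_cvtsi128_si32(_mm_srli_si128(sad, 8));
}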
int vp8_denoiser_filter_sse2(unsigned char *mc_running_avg_y, int mc_avg_y_stride, unsigned char *running_avg_y, int avg_y_stride, unsigned char *sig, int sig_stride, unsigned int motion_magnitude, int increase_denoising) { unsigned char *running_avg_y_start = running_avg_y; unsigned char *sig_start = sig; unsigned int sum_diff_thresh; int r; int shift_inc = (increase_denoising && motion_magnitude <= MOTION_MAGNITUDE_THRESHOLD) ? 1 : 0; __m128i acc_diff = _mm_setzero_si128(); const __m128i k_0 = _mm_setzero_si128(); const __m128i k_4 = _mm_set1_epi8(4 + shift_inc); const __m128i k_8 = _mm_set1_epi8(8); const __m128i k_16 = _mm_set1_epi8(16); /* Modify each level's adjustment according to motion_magnitude. */ const __m128i l3 = _mm_set1_epi8( (motion_magnitude <= MOTION_MAGNITUDE_THRESHOLD) ? 7 + shift_inc : 6); /* Difference between level 3 and level 2 is 2. */ const __m128i l32 = _mm_set1_epi8(2); /* Difference between level 2 and level 1 is 1. */ const __m128i l21 = _mm_set1_epi8(1); for (r = 0; r < 16; ++r) { /* Calculate differences */ const __m128i v_sig = _mm_loadu_si128((__m128i *)(&sig[0])); const __m128i v_mc_running_avg_y = _mm_loadu_si128((__m128i *)(&mc_running_avg_y[0])); __m128i v_running_avg_y; const __m128i pdiff = _mm_subs_epu8(v_mc_running_avg_y, v_sig); const __m128i ndiff = _mm_subs_epu8(v_sig, v_mc_running_avg_y); /* Obtain the sign. FF if diff is negative. */ const __m128i diff_sign = _mm_cmpeq_epi8(pdiff, k_0); /* Clamp absolute difference to 16 to be used to get mask. Doing this * allows us to use _mm_cmpgt_epi8, which operates on signed byte. */ const __m128i clamped_absdiff = _mm_min_epu8(_mm_or_si128(pdiff, ndiff), k_16); /* Get masks for l2 l1 and l0 adjustments */ const __m128i mask2 = _mm_cmpgt_epi8(k_16, clamped_absdiff); const __m128i mask1 = _mm_cmpgt_epi8(k_8, clamped_absdiff); const __m128i mask0 = _mm_cmpgt_epi8(k_4, clamped_absdiff); /* Get adjustments for l2, l1, and l0 */ __m128i adj2 = _mm_and_si128(mask2, l32); const __m128i adj1 = _mm_and_si128(mask1, l21); const __m128i adj0 = _mm_and_si128(mask0, clamped_absdiff); __m128i adj, padj, nadj; /* Combine the adjustments and get absolute adjustments. */ adj2 = _mm_add_epi8(adj2, adj1); adj = _mm_sub_epi8(l3, adj2); adj = _mm_andnot_si128(mask0, adj); adj = _mm_or_si128(adj, adj0); /* Restore the sign and get positive and negative adjustments. */ padj = _mm_andnot_si128(diff_sign, adj); nadj = _mm_and_si128(diff_sign, adj); /* Calculate filtered value. */ v_running_avg_y = _mm_adds_epu8(v_sig, padj); v_running_avg_y = _mm_subs_epu8(v_running_avg_y, nadj); _mm_storeu_si128((__m128i *)running_avg_y, v_running_avg_y); /* Adjustments <=7, and each element in acc_diff can fit in signed * char. */ acc_diff = _mm_adds_epi8(acc_diff, padj); acc_diff = _mm_subs_epi8(acc_diff, nadj); /* Update pointers for next iteration. */ sig += sig_stride; mc_running_avg_y += mc_avg_y_stride; running_avg_y += avg_y_stride; } { /* Compute the sum of all pixel differences of this MB. */ unsigned int abs_sum_diff = abs_sum_diff_16x1(acc_diff); sum_diff_thresh = SUM_DIFF_THRESHOLD; if (increase_denoising) sum_diff_thresh = SUM_DIFF_THRESHOLD_HIGH; if (abs_sum_diff > sum_diff_thresh) { // Before returning to copy the block (i.e., apply no denoising), // check if we can still apply some (weaker) temporal filtering to // this block, that would otherwise not be denoised at all. Simplest // is to apply an additional adjustment to running_avg_y to bring it // closer to sig. 
      // The adjustment is capped by a maximum delta, and
      // chosen such that in most cases the resulting sum_diff will be
      // within the acceptable range given by sum_diff_thresh.
      // The delta is set by the excess of absolute pixel diff over the
      // threshold.
      int delta = ((abs_sum_diff - sum_diff_thresh) >> 8) + 1;
      // Only apply the adjustment for max delta up to 3.
      if (delta < 4) {
        const __m128i k_delta = _mm_set1_epi8(delta);
        sig -= sig_stride * 16;
        mc_running_avg_y -= mc_avg_y_stride * 16;
        running_avg_y -= avg_y_stride * 16;
        for (r = 0; r < 16; ++r) {
          __m128i v_running_avg_y = _mm_loadu_si128((__m128i *)(&running_avg_y[0]));
          // Calculate differences.
          const __m128i v_sig = _mm_loadu_si128((__m128i *)(&sig[0]));
          const __m128i v_mc_running_avg_y = _mm_loadu_si128((__m128i *)(&mc_running_avg_y[0]));
          const __m128i pdiff = _mm_subs_epu8(v_mc_running_avg_y, v_sig);
          const __m128i ndiff = _mm_subs_epu8(v_sig, v_mc_running_avg_y);
          // Obtain the sign. FF if diff is negative.
          const __m128i diff_sign = _mm_cmpeq_epi8(pdiff, k_0);
          // Clamp absolute difference to delta to get the adjustment.
          const __m128i adj = _mm_min_epu8(_mm_or_si128(pdiff, ndiff), k_delta);
          // Restore the sign and get positive and negative adjustments.
          __m128i padj, nadj;
          padj = _mm_andnot_si128(diff_sign, adj);
          nadj = _mm_and_si128(diff_sign, adj);
          // Calculate filtered value.
          v_running_avg_y = _mm_subs_epu8(v_running_avg_y, padj);
          v_running_avg_y = _mm_adds_epu8(v_running_avg_y, nadj);
          _mm_storeu_si128((__m128i *)running_avg_y, v_running_avg_y);
          // Accumulate the adjustments.
          acc_diff = _mm_subs_epi8(acc_diff, padj);
          acc_diff = _mm_adds_epi8(acc_diff, nadj);
          // Update pointers for next iteration.
          sig += sig_stride;
          mc_running_avg_y += mc_avg_y_stride;
          running_avg_y += avg_y_stride;
        }
        abs_sum_diff = abs_sum_diff_16x1(acc_diff);
        if (abs_sum_diff > sum_diff_thresh) {
          return COPY_BLOCK;
        }
      } else {
        return COPY_BLOCK;
      }
    }
  }

  // The block was accepted: copy the filtered result back over the source and
  // report that denoising was applied.
  vp8_copy_mem16x16(running_avg_y_start, avg_y_stride, sig_start, sig_stride);
  return FILTER_BLOCK;
}
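// Scalar view of the per-pixel adjustment chosen by the first loop of
// vp8_denoiser_filter_sse2, reconstructed from the l3/l32/l21 and k_4..k_16
// constants above: tiny differences are followed exactly, larger ones get a
// capped, tiered correction toward the motion-compensated average. Sketch for
// clarity only; the name is hypothetical.
static int denoiser_adjustment_ref(int abs_diff, int l3, int shift_inc) {
    if (abs_diff < 4 + shift_inc) return abs_diff;  // level 0: follow the difference
    if (abs_diff < 8)  return l3 - 3;               // level 1
    if (abs_diff < 16) return l3 - 2;               // level 2
    return l3;                                      // level 3: maximum adjustment
}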
__m128i operator&(sse_vector a, sse_vector b) { return _mm_and_si128(a.v, b.v); }
static inline __m128i _mm_blendv_epi8_rpl(__m128i a, __m128i b, __m128i mask) { a = _mm_andnot_si128(mask, a); a = _mm_or_si128(a, _mm_and_si128(mask, b)); return a; }
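// Usage sketch: _mm_blendv_epi8_rpl picks bytes from b where the mask byte is
// all-ones and from a elsewhere, which matches SSE4.1 _mm_blendv_epi8 whenever
// the mask comes from a byte compare (0x00 or 0xFF per lane). Example: clamp
// signed bytes to an upper limit.
#include <emmintrin.h>

static inline __m128i clamp_epi8_to_limit(__m128i v, __m128i limit) {
    const __m128i over = _mm_cmpgt_epi8(v, limit);   // 0xFF where v > limit
    return _mm_blendv_epi8_rpl(v, limit, over);      // keep v, or cap at limit
}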
/* since sin256_ps and cos256_ps are almost identical, sincos256_ps could replace both of them.. it is almost as fast, and gives you a free cosine with your sine */ void sincos256_ps(v8sf x, v8sf *s, v8sf *c) { v8sf xmm1, xmm2, xmm3 = _mm256_setzero_ps(), sign_bit_sin, y; v8si imm0, imm2, imm4; #ifndef __AVX2__ v4si imm0_1, imm0_2; v4si imm2_1, imm2_2; v4si imm4_1, imm4_2; #endif sign_bit_sin = x; /* take the absolute value */ x = _mm256_and_ps(x, *(v8sf*)_ps256_inv_sign_mask); /* extract the sign bit (upper one) */ sign_bit_sin = _mm256_and_ps(sign_bit_sin, *(v8sf*)_ps256_sign_mask); /* scale by 4/Pi */ y = _mm256_mul_ps(x, *(v8sf*)_ps256_cephes_FOPI); #ifdef __AVX2__ /* store the integer part of y in imm2 */ imm2 = _mm256_cvttps_epi32(y); /* j=(j+1) & (~1) (see the cephes sources) */ imm2 = _mm256_add_epi32(imm2, *(v8si*)_pi32_256_1); imm2 = _mm256_and_si128(imm2, *(v8si*)_pi32_256_inv1); y = _mm256_cvtepi32_ps(imm2); imm4 = imm2; /* get the swap sign flag for the sine */ imm0 = _mm256_and_si128(imm2, *(v8si*)_pi32_256_4); imm0 = _mm256_slli_epi32(imm0, 29); //v8sf swap_sign_bit_sin = _mm256_castsi256_ps(imm0); /* get the polynom selection mask for the sine*/ imm2 = _mm256_and_si128(imm2, *(v8si*)_pi32_256_2); imm2 = _mm256_cmpeq_epi32(imm2, *(v8si*)_pi32_256_0); //v8sf poly_mask = _mm256_castsi256_ps(imm2); #else /* we use SSE2 routines to perform the integer ops */ COPY_IMM_TO_XMM(_mm256_cvttps_epi32(y),imm2_1,imm2_2); imm2_1 = _mm_add_epi32(imm2_1, *(v4si*)_pi32avx_1); imm2_2 = _mm_add_epi32(imm2_2, *(v4si*)_pi32avx_1); imm2_1 = _mm_and_si128(imm2_1, *(v4si*)_pi32avx_inv1); imm2_2 = _mm_and_si128(imm2_2, *(v4si*)_pi32avx_inv1); COPY_XMM_TO_IMM(imm2_1,imm2_2,imm2); y = _mm256_cvtepi32_ps(imm2); imm4_1 = imm2_1; imm4_2 = imm2_2; imm0_1 = _mm_and_si128(imm2_1, *(v4si*)_pi32avx_4); imm0_2 = _mm_and_si128(imm2_2, *(v4si*)_pi32avx_4); imm0_1 = _mm_slli_epi32(imm0_1, 29); imm0_2 = _mm_slli_epi32(imm0_2, 29); COPY_XMM_TO_IMM(imm0_1, imm0_2, imm0); imm2_1 = _mm_and_si128(imm2_1, *(v4si*)_pi32avx_2); imm2_2 = _mm_and_si128(imm2_2, *(v4si*)_pi32avx_2); imm2_1 = _mm_cmpeq_epi32(imm2_1, _mm_setzero_si128()); imm2_2 = _mm_cmpeq_epi32(imm2_2, _mm_setzero_si128()); COPY_XMM_TO_IMM(imm2_1, imm2_2, imm2); #endif v8sf swap_sign_bit_sin = _mm256_castsi256_ps(imm0); v8sf poly_mask = _mm256_castsi256_ps(imm2); /* The magic pass: "******" x = ((x - y * DP1) - y * DP2) - y * DP3; */ xmm1 = *(v8sf*)_ps256_minus_cephes_DP1; xmm2 = *(v8sf*)_ps256_minus_cephes_DP2; xmm3 = *(v8sf*)_ps256_minus_cephes_DP3; xmm1 = _mm256_mul_ps(y, xmm1); xmm2 = _mm256_mul_ps(y, xmm2); xmm3 = _mm256_mul_ps(y, xmm3); x = _mm256_add_ps(x, xmm1); x = _mm256_add_ps(x, xmm2); x = _mm256_add_ps(x, xmm3); #ifdef __AVX2__ imm4 = _mm256_sub_epi32(imm4, *(v8si*)_pi32_256_2); imm4 = _mm256_andnot_si128(imm4, *(v8si*)_pi32_256_4); imm4 = _mm256_slli_epi32(imm4, 29); #else imm4_1 = _mm_sub_epi32(imm4_1, *(v4si*)_pi32avx_2); imm4_2 = _mm_sub_epi32(imm4_2, *(v4si*)_pi32avx_2); imm4_1 = _mm_andnot_si128(imm4_1, *(v4si*)_pi32avx_4); imm4_2 = _mm_andnot_si128(imm4_2, *(v4si*)_pi32avx_4); imm4_1 = _mm_slli_epi32(imm4_1, 29); imm4_2 = _mm_slli_epi32(imm4_2, 29); COPY_XMM_TO_IMM(imm4_1, imm4_2, imm4); #endif v8sf sign_bit_cos = _mm256_castsi256_ps(imm4); sign_bit_sin = _mm256_xor_ps(sign_bit_sin, swap_sign_bit_sin); /* Evaluate the first polynom (0 <= x <= Pi/4) */ v8sf z = _mm256_mul_ps(x,x); y = *(v8sf*)_ps256_coscof_p0; y = _mm256_mul_ps(y, z); y = _mm256_add_ps(y, *(v8sf*)_ps256_coscof_p1); y = _mm256_mul_ps(y, z); y = _mm256_add_ps(y, 
*(v8sf*)_ps256_coscof_p2); y = _mm256_mul_ps(y, z); y = _mm256_mul_ps(y, z); v8sf tmp = _mm256_mul_ps(z, *(v8sf*)_ps256_0p5); y = _mm256_sub_ps(y, tmp); y = _mm256_add_ps(y, *(v8sf*)_ps256_1); /* Evaluate the second polynom (Pi/4 <= x <= 0) */ v8sf y2 = *(v8sf*)_ps256_sincof_p0; y2 = _mm256_mul_ps(y2, z); y2 = _mm256_add_ps(y2, *(v8sf*)_ps256_sincof_p1); y2 = _mm256_mul_ps(y2, z); y2 = _mm256_add_ps(y2, *(v8sf*)_ps256_sincof_p2); y2 = _mm256_mul_ps(y2, z); y2 = _mm256_mul_ps(y2, x); y2 = _mm256_add_ps(y2, x); /* select the correct result from the two polynoms */ xmm3 = poly_mask; v8sf ysin2 = _mm256_and_ps(xmm3, y2); v8sf ysin1 = _mm256_andnot_ps(xmm3, y); y2 = _mm256_sub_ps(y2,ysin2); y = _mm256_sub_ps(y, ysin1); xmm1 = _mm256_add_ps(ysin1,ysin2); xmm2 = _mm256_add_ps(y,y2); /* update the sign */ *s = _mm256_xor_ps(xmm1, sign_bit_sin); *c = _mm256_xor_ps(xmm2, sign_bit_cos); }
/* evaluation of 8 sines at onces using AVX intrisics The code is the exact rewriting of the cephes sinf function. Precision is excellent as long as x < 8192 (I did not bother to take into account the special handling they have for greater values -- it does not return garbage for arguments over 8192, though, but the extra precision is missing). Note that it is such that sinf((float)M_PI) = 8.74e-8, which is the surprising but correct result. */ v8sf sin256_ps(v8sf x) { // any x v8sf xmm1, xmm2 = _mm256_setzero_ps(), xmm3, sign_bit, y; v8si imm0, imm2; #ifndef __AVX2__ v4si imm0_1, imm0_2; v4si imm2_1, imm2_2; #endif sign_bit = x; /* take the absolute value */ x = _mm256_and_ps(x, *(v8sf*)_ps256_inv_sign_mask); /* extract the sign bit (upper one) */ sign_bit = _mm256_and_ps(sign_bit, *(v8sf*)_ps256_sign_mask); /* scale by 4/Pi */ y = _mm256_mul_ps(x, *(v8sf*)_ps256_cephes_FOPI); /* Here we start a series of integer operations, which are in the realm of AVX2. If we don't have AVX, let's perform them using SSE2 directives */ #ifdef __AVX2__ /* store the integer part of y in mm0 */ imm2 = _mm256_cvttps_epi32(y); /* j=(j+1) & (~1) (see the cephes sources) */ // another two AVX2 instruction imm2 = _mm256_add_epi32(imm2, *(v8si*)_pi32_256_1); imm2 = _mm256_and_si128(imm2, *(v8si*)_pi32_256_inv1); y = _mm256_cvtepi32_ps(imm2); /* get the swap sign flag */ imm0 = _mm256_and_si128(imm2, *(v8si*)_pi32_256_4); imm0 = _mm256_slli_epi32(imm0, 29); /* get the polynom selection mask there is one polynom for 0 <= x <= Pi/4 and another one for Pi/4<x<=Pi/2 Both branches will be computed. */ imm2 = _mm256_and_si128(imm2, *(v8si*)_pi32_256_2); imm2 = _mm256_cmpeq_epi32(imm2,*(v8si*)_pi32_256_0); #else /* we use SSE2 routines to perform the integer ops */ COPY_IMM_TO_XMM(_mm256_cvttps_epi32(y),imm2_1,imm2_2); imm2_1 = _mm_add_epi32(imm2_1, *(v4si*)_pi32avx_1); imm2_2 = _mm_add_epi32(imm2_2, *(v4si*)_pi32avx_1); imm2_1 = _mm_and_si128(imm2_1, *(v4si*)_pi32avx_inv1); imm2_2 = _mm_and_si128(imm2_2, *(v4si*)_pi32avx_inv1); COPY_XMM_TO_IMM(imm2_1,imm2_2,imm2); y = _mm256_cvtepi32_ps(imm2); imm0_1 = _mm_and_si128(imm2_1, *(v4si*)_pi32avx_4); imm0_2 = _mm_and_si128(imm2_2, *(v4si*)_pi32avx_4); imm0_1 = _mm_slli_epi32(imm0_1, 29); imm0_2 = _mm_slli_epi32(imm0_2, 29); COPY_XMM_TO_IMM(imm0_1, imm0_2, imm0); imm2_1 = _mm_and_si128(imm2_1, *(v4si*)_pi32avx_2); imm2_2 = _mm_and_si128(imm2_2, *(v4si*)_pi32avx_2); imm2_1 = _mm_cmpeq_epi32(imm2_1, _mm_setzero_si128()); imm2_2 = _mm_cmpeq_epi32(imm2_2, _mm_setzero_si128()); COPY_XMM_TO_IMM(imm2_1, imm2_2, imm2); #endif v8sf swap_sign_bit = _mm256_castsi256_ps(imm0); v8sf poly_mask = _mm256_castsi256_ps(imm2); sign_bit = _mm256_xor_ps(sign_bit, swap_sign_bit); /* The magic pass: "******" x = ((x - y * DP1) - y * DP2) - y * DP3; */ xmm1 = *(v8sf*)_ps256_minus_cephes_DP1; xmm2 = *(v8sf*)_ps256_minus_cephes_DP2; xmm3 = *(v8sf*)_ps256_minus_cephes_DP3; xmm1 = _mm256_mul_ps(y, xmm1); xmm2 = _mm256_mul_ps(y, xmm2); xmm3 = _mm256_mul_ps(y, xmm3); x = _mm256_add_ps(x, xmm1); x = _mm256_add_ps(x, xmm2); x = _mm256_add_ps(x, xmm3); /* Evaluate the first polynom (0 <= x <= Pi/4) */ y = *(v8sf*)_ps256_coscof_p0; v8sf z = _mm256_mul_ps(x,x); y = _mm256_mul_ps(y, z); y = _mm256_add_ps(y, *(v8sf*)_ps256_coscof_p1); y = _mm256_mul_ps(y, z); y = _mm256_add_ps(y, *(v8sf*)_ps256_coscof_p2); y = _mm256_mul_ps(y, z); y = _mm256_mul_ps(y, z); v8sf tmp = _mm256_mul_ps(z, *(v8sf*)_ps256_0p5); y = _mm256_sub_ps(y, tmp); y = _mm256_add_ps(y, *(v8sf*)_ps256_1); /* Evaluate the second polynom (Pi/4 <= x 
<= 0) */ v8sf y2 = *(v8sf*)_ps256_sincof_p0; y2 = _mm256_mul_ps(y2, z); y2 = _mm256_add_ps(y2, *(v8sf*)_ps256_sincof_p1); y2 = _mm256_mul_ps(y2, z); y2 = _mm256_add_ps(y2, *(v8sf*)_ps256_sincof_p2); y2 = _mm256_mul_ps(y2, z); y2 = _mm256_mul_ps(y2, x); y2 = _mm256_add_ps(y2, x); /* select the correct result from the two polynoms */ xmm3 = poly_mask; y2 = _mm256_and_ps(xmm3, y2); //, xmm3); y = _mm256_andnot_ps(xmm3, y); y = _mm256_add_ps(y,y2); /* update the sign */ y = _mm256_xor_ps(y, sign_bit); return y; }
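/* Usage sketch for sin256_ps (and sincos256_ps above), assuming the
   surrounding avx_mathfun-style header that defines v8sf as __m256: evaluate
   eight sines with one call. Unaligned loads/stores are used so no alignment
   attributes are required. */
#include <stdio.h>
#include <immintrin.h>

void sin256_ps_demo(void) {
  float in[8] = { 0.0f, 0.5f, 1.0f, 1.5f, 2.0f, 2.5f, 3.0f, 3.14159265f };
  float out[8];
  v8sf x = _mm256_loadu_ps(in);
  v8sf s = sin256_ps(x);
  _mm256_storeu_ps(out, s);
  for (int i = 0; i < 8; i++) printf("sin(%.6f) ~= %.7f\n", in[i], out[i]);
}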
static inline __m128i softlight_byte_SSE2(const __m128i& sc, const __m128i& dc, const __m128i& sa, const __m128i& da) { __m128i tmp1, tmp2, tmp3; // int m = da ? dc * 256 / da : 0; __m128i cmp = _mm_cmpeq_epi32(da, _mm_setzero_si128()); __m128i m = _mm_slli_epi32(dc, 8); __m128 x = _mm_cvtepi32_ps(m); __m128 y = _mm_cvtepi32_ps(da); m = _mm_cvttps_epi32(_mm_div_ps(x, y)); m = _mm_andnot_si128(cmp, m); // if (2 * sc <= sa) tmp1 = _mm_slli_epi32(sc, 1); // 2 * sc __m128i cmp1 = _mm_cmpgt_epi32(tmp1, sa); tmp1 = _mm_sub_epi32(tmp1, sa); // 2 * sc - sa tmp2 = _mm_sub_epi32(_mm_set1_epi32(256), m); // 256 - m tmp1 = Multiply32_SSE2(tmp1, tmp2); tmp1 = _mm_srai_epi32(tmp1, 8); tmp1 = _mm_add_epi32(sa, tmp1); tmp1 = Multiply32_SSE2(dc, tmp1); __m128i rc1 = _mm_andnot_si128(cmp1, tmp1); // else if (4 * dc <= da) tmp2 = _mm_slli_epi32(dc, 2); // dc * 4 __m128i cmp2 = _mm_cmpgt_epi32(tmp2, da); __m128i i = _mm_slli_epi32(m, 2); // 4 * m __m128i j = _mm_add_epi32(i, _mm_set1_epi32(256)); // 4 * m + 256 __m128i k = Multiply32_SSE2(i, j); // 4 * m * (4 * m + 256) __m128i t = _mm_sub_epi32(m, _mm_set1_epi32(256)); // m - 256 i = Multiply32_SSE2(k, t); // 4 * m * (4 * m + 256) * (m - 256) i = _mm_srai_epi32(i, 16); // >> 16 j = Multiply32_SSE2(_mm_set1_epi32(7), m); // 7 * m tmp2 = _mm_add_epi32(i, j); i = Multiply32_SSE2(dc, sa); // dc * sa j = _mm_slli_epi32(sc, 1); // 2 * sc j = _mm_sub_epi32(j, sa); // 2 * sc - sa j = Multiply32_SSE2(da, j); // da * (2 * sc - sa) tmp2 = Multiply32_SSE2(j, tmp2); // * tmp tmp2 = _mm_srai_epi32(tmp2, 8); // >> 8 tmp2 = _mm_add_epi32(i, tmp2); cmp = _mm_andnot_si128(cmp2, cmp1); __m128i rc2 = _mm_and_si128(cmp, tmp2); __m128i rc = _mm_or_si128(rc1, rc2); // else tmp3 = sqrt_unit_byte_SSE2(m); tmp3 = _mm_sub_epi32(tmp3, m); tmp3 = Multiply32_SSE2(j, tmp3); // j = da * (2 * sc - sa) tmp3 = _mm_srai_epi32(tmp3, 8); tmp3 = _mm_add_epi32(i, tmp3); // i = dc * sa cmp = _mm_and_si128(cmp1, cmp2); __m128i rc3 = _mm_and_si128(cmp, tmp3); rc = _mm_or_si128(rc, rc3); tmp1 = _mm_sub_epi32(_mm_set1_epi32(255), da); // 255 - da tmp1 = _mm_mullo_epi16(sc, tmp1); tmp2 = _mm_sub_epi32(_mm_set1_epi32(255), sa); // 255 - sa tmp2 = _mm_mullo_epi16(dc, tmp2); rc = _mm_add_epi32(rc, tmp1); rc = _mm_add_epi32(rc, tmp2); return clamp_div255round_SSE2(rc); }
MEMALIGN(16, __m128i mod_mask); MEMALIGN(16, __m128 res); MEMALIGN(16, static const unsigned int mult [4]) = {214013, 17405, 214013, 69069}; MEMALIGN(16, static const unsigned int gadd [4]) = {2531011, 10395331, 13737667, 1}; MEMALIGN(16, static const unsigned int mask [4]) = {0xFFFFFFFF, 0, 0xFFFFFFFF, 0}; adder = _mm_load_si128((__m128i*)gadd); multiplier = _mm_load_si128((__m128i*)mult); mod_mask = _mm_load_si128((__m128i*)mask); cur_seed_split = _mm_shuffle_epi32(m_sseSeed, _MM_SHUFFLE(2, 3, 0, 1)); m_sseSeed = _mm_mul_epu32(m_sseSeed, multiplier); multiplier = _mm_shuffle_epi32(multiplier, _MM_SHUFFLE(2, 3, 0, 1)); cur_seed_split = _mm_mul_epu32(cur_seed_split, multiplier); m_sseSeed = _mm_and_si128(m_sseSeed, mod_mask); cur_seed_split = _mm_and_si128(cur_seed_split, mod_mask); cur_seed_split = _mm_shuffle_epi32(cur_seed_split, _MM_SHUFFLE(2, 3, 0, 1)); m_sseSeed = _mm_or_si128(m_sseSeed, cur_seed_split); m_sseSeed = _mm_add_epi32(m_sseSeed, adder); /* adjust the value to the range requested */ res = _mm_cvtepi32_ps(m_sseSeed); if (sseresult) *sseresult = _mm_mul_ps(res, f); else { res = _mm_mul_ps(res, f); _mm_storeu_ps(result, res); /* returning a float array, so cleanup */
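// Per-lane scalar equivalent of the vectorized generator fragment above: each
// 32-bit lane advances with a linear congruential step seed*mult + gadd (the
// classic 214013 / 2531011 constants for lane 0); the shuffle/mask dance with
// _mm_mul_epu32 above emulates this per-lane 32-bit multiply. Sketch only; the
// surrounding state (m_sseSeed, f, result, sseresult) is not reproduced here.
#include <stdint.h>

static inline uint32_t lcg_lane_step(uint32_t seed, uint32_t mul, uint32_t add) {
    return seed * mul + add;  // wraps modulo 2^32
}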
static inline __m128i SkMin32_SSE2(const __m128i& a, const __m128i& b) { __m128i cmp = _mm_cmplt_epi32(a, b); return _mm_or_si128(_mm_and_si128(cmp, a), _mm_andnot_si128(cmp, b)); }
// Simple quantization static int QuantizeBlockSSE2(int16_t in[16], int16_t out[16], int n, const VP8Matrix* const mtx) { const __m128i max_coeff_2047 = _mm_set1_epi16(2047); const __m128i zero = _mm_set1_epi16(0); __m128i sign0, sign8; __m128i coeff0, coeff8; __m128i out0, out8; __m128i packed_out; // Load all inputs. // TODO(cduvivier): Make variable declarations and allocations aligned so that // we can use _mm_load_si128 instead of _mm_loadu_si128. __m128i in0 = _mm_loadu_si128((__m128i*)&in[0]); __m128i in8 = _mm_loadu_si128((__m128i*)&in[8]); const __m128i sharpen0 = _mm_loadu_si128((__m128i*)&mtx->sharpen_[0]); const __m128i sharpen8 = _mm_loadu_si128((__m128i*)&mtx->sharpen_[8]); const __m128i iq0 = _mm_loadu_si128((__m128i*)&mtx->iq_[0]); const __m128i iq8 = _mm_loadu_si128((__m128i*)&mtx->iq_[8]); const __m128i bias0 = _mm_loadu_si128((__m128i*)&mtx->bias_[0]); const __m128i bias8 = _mm_loadu_si128((__m128i*)&mtx->bias_[8]); const __m128i q0 = _mm_loadu_si128((__m128i*)&mtx->q_[0]); const __m128i q8 = _mm_loadu_si128((__m128i*)&mtx->q_[8]); const __m128i zthresh0 = _mm_loadu_si128((__m128i*)&mtx->zthresh_[0]); const __m128i zthresh8 = _mm_loadu_si128((__m128i*)&mtx->zthresh_[8]); // sign(in) = in >> 15 (0x0000 if positive, 0xffff if negative) sign0 = _mm_srai_epi16(in0, 15); sign8 = _mm_srai_epi16(in8, 15); // coeff = abs(in) = (in ^ sign) - sign coeff0 = _mm_xor_si128(in0, sign0); coeff8 = _mm_xor_si128(in8, sign8); coeff0 = _mm_sub_epi16(coeff0, sign0); coeff8 = _mm_sub_epi16(coeff8, sign8); // coeff = abs(in) + sharpen coeff0 = _mm_add_epi16(coeff0, sharpen0); coeff8 = _mm_add_epi16(coeff8, sharpen8); // if (coeff > 2047) coeff = 2047 coeff0 = _mm_min_epi16(coeff0, max_coeff_2047); coeff8 = _mm_min_epi16(coeff8, max_coeff_2047); // out = (coeff * iQ + B) >> QFIX; { // doing calculations with 32b precision (QFIX=17) // out = (coeff * iQ) __m128i coeff_iQ0H = _mm_mulhi_epu16(coeff0, iq0); __m128i coeff_iQ0L = _mm_mullo_epi16(coeff0, iq0); __m128i coeff_iQ8H = _mm_mulhi_epu16(coeff8, iq8); __m128i coeff_iQ8L = _mm_mullo_epi16(coeff8, iq8); __m128i out_00 = _mm_unpacklo_epi16(coeff_iQ0L, coeff_iQ0H); __m128i out_04 = _mm_unpackhi_epi16(coeff_iQ0L, coeff_iQ0H); __m128i out_08 = _mm_unpacklo_epi16(coeff_iQ8L, coeff_iQ8H); __m128i out_12 = _mm_unpackhi_epi16(coeff_iQ8L, coeff_iQ8H); // expand bias from 16b to 32b __m128i bias_00 = _mm_unpacklo_epi16(bias0, zero); __m128i bias_04 = _mm_unpackhi_epi16(bias0, zero); __m128i bias_08 = _mm_unpacklo_epi16(bias8, zero); __m128i bias_12 = _mm_unpackhi_epi16(bias8, zero); // out = (coeff * iQ + B) out_00 = _mm_add_epi32(out_00, bias_00); out_04 = _mm_add_epi32(out_04, bias_04); out_08 = _mm_add_epi32(out_08, bias_08); out_12 = _mm_add_epi32(out_12, bias_12); // out = (coeff * iQ + B) >> QFIX; out_00 = _mm_srai_epi32(out_00, QFIX); out_04 = _mm_srai_epi32(out_04, QFIX); out_08 = _mm_srai_epi32(out_08, QFIX); out_12 = _mm_srai_epi32(out_12, QFIX); // pack result as 16b out0 = _mm_packs_epi32(out_00, out_04); out8 = _mm_packs_epi32(out_08, out_12); } // get sign back (if (sign[j]) out_n = -out_n) out0 = _mm_xor_si128(out0, sign0); out8 = _mm_xor_si128(out8, sign8); out0 = _mm_sub_epi16(out0, sign0); out8 = _mm_sub_epi16(out8, sign8); // in = out * Q in0 = _mm_mullo_epi16(out0, q0); in8 = _mm_mullo_epi16(out8, q8); // if (coeff <= mtx->zthresh_) {in=0; out=0;} { __m128i cmp0 = _mm_cmpgt_epi16(coeff0, zthresh0); __m128i cmp8 = _mm_cmpgt_epi16(coeff8, zthresh8); in0 = _mm_and_si128(in0, cmp0); in8 = _mm_and_si128(in8, cmp8); 
_mm_storeu_si128((__m128i*)&in[0], in0); _mm_storeu_si128((__m128i*)&in[8], in8); out0 = _mm_and_si128(out0, cmp0); out8 = _mm_and_si128(out8, cmp8); } // zigzag the output before storing it. // // The zigzag pattern can almost be reproduced with a small sequence of // shuffles. After it, we only need to swap the 7th (ending up in third // position instead of twelfth) and 8th values. { __m128i outZ0, outZ8; outZ0 = _mm_shufflehi_epi16(out0, _MM_SHUFFLE(2, 1, 3, 0)); outZ0 = _mm_shuffle_epi32 (outZ0, _MM_SHUFFLE(3, 1, 2, 0)); outZ0 = _mm_shufflehi_epi16(outZ0, _MM_SHUFFLE(3, 1, 0, 2)); outZ8 = _mm_shufflelo_epi16(out8, _MM_SHUFFLE(3, 0, 2, 1)); outZ8 = _mm_shuffle_epi32 (outZ8, _MM_SHUFFLE(3, 1, 2, 0)); outZ8 = _mm_shufflelo_epi16(outZ8, _MM_SHUFFLE(1, 3, 2, 0)); _mm_storeu_si128((__m128i*)&out[0], outZ0); _mm_storeu_si128((__m128i*)&out[8], outZ8); packed_out = _mm_packs_epi16(outZ0, outZ8); } { const int16_t outZ_12 = out[12]; const int16_t outZ_3 = out[3]; out[3] = outZ_12; out[12] = outZ_3; } // detect if all 'out' values are zeroes or not { int32_t tmp[4]; _mm_storeu_si128((__m128i*)tmp, packed_out); if (n) { tmp[0] &= ~0xff; } return (tmp[3] || tmp[2] || tmp[1] || tmp[0]); } }
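// Scalar per-coefficient reference for the quantization above (ignoring the
// zigzag reordering and the 16-bit packing): strip the sign, add the sharpen
// term, clamp to 2047, scale by iQ with bias B and shift by QFIX, restore the
// sign, and zero coefficients at or below zthresh; the dequantized value is
// out * Q, as in "in = out * Q" above. Sketch only; the name is hypothetical
// and QFIX is the same constant the SIMD code uses.
#include <stdint.h>
#include <stdlib.h>

static int quantize_coeff_ref(int in, int sharpen, int iq, int bias, int q,
                              int zthresh, int16_t *dequant) {
  const int sign = (in < 0) ? -1 : 1;
  int coeff = abs(in) + sharpen;
  if (coeff > 2047) coeff = 2047;
  int out = (coeff * iq + bias) >> QFIX;
  out *= sign;
  if (coeff <= zthresh) out = 0;
  *dequant = (int16_t)(out * q);  // value written back to in[] above
  return out;
}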