void LoadDefaultSIMDState() { _mm_setcsr(default_sse_state); }
void sc_SetDenormalFlags() { _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON); _mm_setcsr(_mm_getcsr() | 0x40); /* DAZ */ }
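// Illustrative sketch (not from the original sources): MXCSR bit 15 is FTZ (flush-to-zero)
// and bit 6 is DAZ (denormals-are-zero), so OR-ing 0x8040 into the register enables both
// at once, which is what sc_SetDenormalFlags() achieves in two steps. Assumes <xmmintrin.h>.
#include <xmmintrin.h>

static inline unsigned EnableFlushDenormalsToZero()
{
    unsigned old = _mm_getcsr();
    _mm_setcsr(old | 0x8040);   // set FTZ (bit 15) and DAZ (bit 6)
    return old;                 // caller can restore the previous MXCSR later
}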
void LoadSIMDState() { _mm_setcsr(saved_sse_state); }
void LoadDefaultSSEState()
{
#ifdef USE_SSE
    _mm_setcsr(default_sse_state);
#endif
}

void LoadSSEState()
{
#ifdef USE_SSE
    _mm_setcsr(saved_sse_state);
#endif
}
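// Hypothetical sketch of the save side these helpers imply (the originals only show the
// restore side; default_sse_state and saved_sse_state are assumed to be unsigned globals
// defined elsewhere). Helper names are illustrative.
#ifdef USE_SSE
void SaveDefaultSSEState() { default_sse_state = _mm_getcsr(); } // capture the startup MXCSR once
void SaveSSEState()        { saved_sse_state   = _mm_getcsr(); } // snapshot before temporary changes
#endif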
//------------------------------------------------------------------------------- // For each tile go through all the bins and process all the triangles in it. // Rasterize each triangle to the CPU depth buffer. //------------------------------------------------------------------------------- void DepthBufferRasterizerSSEST::RasterizeBinnedTrianglesToDepthBuffer(UINT tileId, UINT idx) { // Set DAZ and FZ MXCSR bits to flush denormals to zero (i.e., make it faster) _mm_setcsr( _mm_getcsr() | 0x8040 ); __m128i colOffset = _mm_setr_epi32(0, 1, 0, 1); __m128i rowOffset = _mm_setr_epi32(0, 0, 1, 1); __m128i fxptZero = _mm_setzero_si128(); float* pDepthBuffer = (float*)mpRenderTargetPixels[idx]; // Based on TaskId determine which tile to process UINT screenWidthInTiles = SCREENW/TILE_WIDTH_IN_PIXELS; UINT tileX = tileId % screenWidthInTiles; UINT tileY = tileId / screenWidthInTiles; int tileStartX = tileX * TILE_WIDTH_IN_PIXELS; int tileEndX = tileStartX + TILE_WIDTH_IN_PIXELS - 1; int tileStartY = tileY * TILE_HEIGHT_IN_PIXELS; int tileEndY = tileStartY + TILE_HEIGHT_IN_PIXELS - 1; ClearDepthTile(tileStartX, tileStartY, tileEndX+1, tileEndY+1, idx); UINT bin = 0; UINT binIndex = 0; UINT offset1 = YOFFSET1_ST * tileY + XOFFSET1_ST * tileX; UINT offset2 = YOFFSET2_ST * tileY + XOFFSET2_ST * tileX; UINT numTrisInBin = mpNumTrisInBin[idx][offset1 + bin]; __m128 gatherBuf[4][3]; bool done = false; bool allBinsEmpty = true; mNumRasterizedTris[idx][tileId] = numTrisInBin; while(!done) { // Loop through all the bins and process 4 binned triangles at a time UINT ii; int numSimdTris = 0; for(ii = 0; ii < SSE; ii++) { while(numTrisInBin <= 0) { // This bin is empty. Move to next bin. if(++bin >= 1) { break; } numTrisInBin = mpNumTrisInBin[idx][offset1 + bin]; mNumRasterizedTris[idx][tileId] += numTrisInBin; binIndex = 0; } if(!numTrisInBin) { break; // No more tris in the bins } USHORT modelId = mpBinModel[idx][offset2 + bin * MAX_TRIS_IN_BIN_ST + binIndex]; USHORT meshId = mpBinMesh[idx][offset2 + bin * MAX_TRIS_IN_BIN_ST + binIndex]; UINT triIdx = mpBin[idx][offset2 + bin * MAX_TRIS_IN_BIN_ST + binIndex]; mpTransformedModels1[modelId].Gather(gatherBuf[ii], meshId, triIdx, idx); allBinsEmpty = false; numSimdTris++; ++binIndex; --numTrisInBin; } done = bin >= NUM_XFORMVERTS_TASKS; if(allBinsEmpty) { return; } // use fixed-point only for X and Y. Avoid work for Z and W.
__m128i fxPtX[3], fxPtY[3]; __m128 Z[3]; for(int i = 0; i < 3; i++) { __m128 v0 = gatherBuf[0][i]; __m128 v1 = gatherBuf[1][i]; __m128 v2 = gatherBuf[2][i]; __m128 v3 = gatherBuf[3][i]; // transpose into SoA layout _MM_TRANSPOSE4_PS(v0, v1, v2, v3); fxPtX[i] = _mm_cvtps_epi32(v0); fxPtY[i] = _mm_cvtps_epi32(v1); Z[i] = v2; } // Fab(x, y) = Ax + By + C = 0 // Fab(x, y) = (ya - yb)x + (xb - xa)y + (xa * yb - xb * ya) = 0 // Compute A = (ya - yb) for the 3 line segments that make up each triangle __m128i A0 = _mm_sub_epi32(fxPtY[1], fxPtY[2]); __m128i A1 = _mm_sub_epi32(fxPtY[2], fxPtY[0]); __m128i A2 = _mm_sub_epi32(fxPtY[0], fxPtY[1]); // Compute B = (xb - xa) for the 3 line segments that make up each triangle __m128i B0 = _mm_sub_epi32(fxPtX[2], fxPtX[1]); __m128i B1 = _mm_sub_epi32(fxPtX[0], fxPtX[2]); __m128i B2 = _mm_sub_epi32(fxPtX[1], fxPtX[0]); // Compute C = (xa * yb - xb * ya) for the 3 line segments that make up each triangle __m128i C0 = _mm_sub_epi32(_mm_mullo_epi32(fxPtX[1], fxPtY[2]), _mm_mullo_epi32(fxPtX[2], fxPtY[1])); __m128i C1 = _mm_sub_epi32(_mm_mullo_epi32(fxPtX[2], fxPtY[0]), _mm_mullo_epi32(fxPtX[0], fxPtY[2])); __m128i C2 = _mm_sub_epi32(_mm_mullo_epi32(fxPtX[0], fxPtY[1]), _mm_mullo_epi32(fxPtX[1], fxPtY[0])); // Compute triangle area __m128i triArea = _mm_mullo_epi32(B2, A1); triArea = _mm_sub_epi32(triArea, _mm_mullo_epi32(B1, A2)); __m128 oneOverTriArea = _mm_div_ps(_mm_set1_ps(1.0f), _mm_cvtepi32_ps(triArea)); Z[1] = _mm_mul_ps(_mm_sub_ps(Z[1], Z[0]), oneOverTriArea); Z[2] = _mm_mul_ps(_mm_sub_ps(Z[2], Z[0]), oneOverTriArea); // Use bounding box traversal strategy to determine which pixels to rasterize __m128i startX = _mm_and_si128(Max(Min(Min(fxPtX[0], fxPtX[1]), fxPtX[2]), _mm_set1_epi32(tileStartX)), _mm_set1_epi32(0xFFFFFFFE)); __m128i endX = Min(_mm_add_epi32(Max(Max(fxPtX[0], fxPtX[1]), fxPtX[2]), _mm_set1_epi32(1)), _mm_set1_epi32(tileEndX)); __m128i startY = _mm_and_si128(Max(Min(Min(fxPtY[0], fxPtY[1]), fxPtY[2]), _mm_set1_epi32(tileStartY)), _mm_set1_epi32(0xFFFFFFFE)); __m128i endY = Min(_mm_add_epi32(Max(Max(fxPtY[0], fxPtY[1]), fxPtY[2]), _mm_set1_epi32(1)), _mm_set1_epi32(tileEndY)); // Now we have 4 triangles set up. Rasterize them each individually. 
for(int lane=0; lane < numSimdTris; lane++) { // Extract this triangle's properties from the SIMD versions __m128 zz[3]; for(int vv = 0; vv < 3; vv++) { zz[vv] = _mm_set1_ps(Z[vv].m128_f32[lane]); } int startXx = startX.m128i_i32[lane]; int endXx = endX.m128i_i32[lane]; int startYy = startY.m128i_i32[lane]; int endYy = endY.m128i_i32[lane]; __m128i aa0 = _mm_set1_epi32(A0.m128i_i32[lane]); __m128i aa1 = _mm_set1_epi32(A1.m128i_i32[lane]); __m128i aa2 = _mm_set1_epi32(A2.m128i_i32[lane]); __m128i bb0 = _mm_set1_epi32(B0.m128i_i32[lane]); __m128i bb1 = _mm_set1_epi32(B1.m128i_i32[lane]); __m128i bb2 = _mm_set1_epi32(B2.m128i_i32[lane]); __m128i aa0Inc = _mm_slli_epi32(aa0, 1); __m128i aa1Inc = _mm_slli_epi32(aa1, 1); __m128i aa2Inc = _mm_slli_epi32(aa2, 1); __m128i row, col; // Traverse pixels in 2x2 blocks and store 2x2 pixel quad depths contiguously in memory ==> 2*X // This method provides better performance int rowIdx = (startYy * SCREENW + 2 * startXx); col = _mm_add_epi32(colOffset, _mm_set1_epi32(startXx)); __m128i aa0Col = _mm_mullo_epi32(aa0, col); __m128i aa1Col = _mm_mullo_epi32(aa1, col); __m128i aa2Col = _mm_mullo_epi32(aa2, col); row = _mm_add_epi32(rowOffset, _mm_set1_epi32(startYy)); __m128i bb0Row = _mm_add_epi32(_mm_mullo_epi32(bb0, row), _mm_set1_epi32(C0.m128i_i32[lane])); __m128i bb1Row = _mm_add_epi32(_mm_mullo_epi32(bb1, row), _mm_set1_epi32(C1.m128i_i32[lane])); __m128i bb2Row = _mm_add_epi32(_mm_mullo_epi32(bb2, row), _mm_set1_epi32(C2.m128i_i32[lane])); __m128i sum0Row = _mm_add_epi32(aa0Col, bb0Row); __m128i sum1Row = _mm_add_epi32(aa1Col, bb1Row); __m128i sum2Row = _mm_add_epi32(aa2Col, bb2Row); __m128i bb0Inc = _mm_slli_epi32(bb0, 1); __m128i bb1Inc = _mm_slli_epi32(bb1, 1); __m128i bb2Inc = _mm_slli_epi32(bb2, 1); __m128 zx = _mm_mul_ps(_mm_cvtepi32_ps(aa1Inc), zz[1]); zx = _mm_add_ps(zx, _mm_mul_ps(_mm_cvtepi32_ps(aa2Inc), zz[2])); // Incrementally compute Fab(x, y) for all the pixels inside the bounding box formed by (startX, endX) and (startY, endY) for(int r = startYy; r < endYy; r += 2, rowIdx += 2 * SCREENW, sum0Row = _mm_add_epi32(sum0Row, bb0Inc), sum1Row = _mm_add_epi32(sum1Row, bb1Inc), sum2Row = _mm_add_epi32(sum2Row, bb2Inc)) { // Compute barycentric coordinates int index = rowIdx; __m128i alpha = sum0Row; __m128i beta = sum1Row; __m128i gama = sum2Row; //Compute barycentric-interpolated depth __m128 depth = zz[0]; depth = _mm_add_ps(depth, _mm_mul_ps(_mm_cvtepi32_ps(beta), zz[1])); depth = _mm_add_ps(depth, _mm_mul_ps(_mm_cvtepi32_ps(gama), zz[2])); for(int c = startXx; c < endXx; c += 2, index += 4, alpha = _mm_add_epi32(alpha, aa0Inc), beta = _mm_add_epi32(beta, aa1Inc), gama = _mm_add_epi32(gama, aa2Inc), depth = _mm_add_ps(depth, zx)) { //Test Pixel inside triangle __m128i mask = _mm_or_si128(_mm_or_si128(alpha, beta), gama); __m128 previousDepthValue = _mm_load_ps(&pDepthBuffer[index]); __m128 mergedDepth = _mm_max_ps(depth, previousDepthValue); __m128 finalDepth = _mm_blendv_ps(mergedDepth, previousDepthValue, _mm_castsi128_ps(mask)); _mm_store_ps(&pDepthBuffer[index], finalDepth); }//for each column }// for each row }// for each triangle }// for each set of SIMD# triangles }
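// Scalar reference of the SIMD inner loop above (illustrative only; names and the plain
// linear depth-buffer layout are assumptions, the original uses a swizzled 2x2 quad layout).
// Per pixel it evaluates the three fixed-point edge functions, treats the pixel as covered
// when no edge value is negative, interpolates depth from the pre-scaled Z gradients, and
// keeps the larger of the interpolated and stored depth, matching the _mm_max_ps merge above.
static void RasterizeTriangleScalar(float* depthBuffer, int pitch,
                                    int startX, int endX, int startY, int endY,
                                    const int A[3], const int B[3], const int C[3],
                                    float z0, float z1OverArea, float z2OverArea)
{
    for(int y = startY; y <= endY; y++)
    {
        for(int x = startX; x <= endX; x++)
        {
            int alpha = A[0] * x + B[0] * y + C[0];
            int beta  = A[1] * x + B[1] * y + C[1];
            int gama  = A[2] * x + B[2] * y + C[2];
            if((alpha | beta | gama) >= 0) // inside the triangle when no edge function is negative
            {
                float depth = z0 + (float)beta * z1OverArea + (float)gama * z2OverArea;
                float& dst = depthBuffer[y * pitch + x];
                if(depth > dst) dst = depth; // keep the larger depth, as _mm_max_ps does above
            }
        }
    }
}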
void SetControlWord(unsigned control) { _mm_setcsr(LastKnownControlWord = control); }
void SetControlWordAddative(unsigned control) { _mm_setcsr(LastKnownControlWord |= control); }
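// Hypothetical RAII guard (not part of the original code): saves MXCSR on entry and restores
// it on scope exit, so changes made through SetControlWord()/SetControlWordAddative() cannot
// leak into unrelated code. Assumes <xmmintrin.h> is already included.
struct ScopedControlWord
{
    unsigned saved;
    ScopedControlWord()  : saved(_mm_getcsr()) {}
    ~ScopedControlWord() { _mm_setcsr(saved); }
};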
//Rasterize 4 pixels at once void DepthBuffer::rasterizeTile2x2(int32 x,int32 y,uint32 pass) { auto tileIndex = x + y*tileCount_.x; auto count = tileTriangleCount_[tileIndex]; tileTriangleCount_[tileIndex] = 0; auto faces = triangleBins_ + x*kMaxTrianglesPerTile + y*tileCount_.x*kMaxTrianglesPerTile; vec2i tilePos(x*tileSize_.x,y*tileSize_.y); vec2i tileEnd(tilePos + tileSize_); #ifdef ARPHEG_ARCH_X86 enum { kNumLanes = 4 }; //Flush denormals to zero _mm_setcsr( _mm_getcsr() | 0x8040 ); VecS32 colOffset(0, 1, 0, 1); VecS32 rowOffset(0, 0, 1, 1); //Process the 4 binned triangles at a time VecS32 vertexX[3]; VecS32 vertexY[3]; VecF32 vertexZ[4]; VecS32 tileMinXSimd(tilePos.x); VecS32 tileMaxXSimd(tilePos.x+tileSize_.x-2); VecS32 tileMinYSimd(tilePos.y); VecS32 tileMaxYSimd(tilePos.y+tileSize_.y-2); for(uint32 i = 0;i<count;i += kNumLanes){ uint32 numSimdTris = std::min(uint32(kNumLanes),count-i); auto f = faces+i; for(uint32 ii = 0;ii< numSimdTris;++ii){ vertexX[0].lane[ii] = f[ii].v[0].x; vertexY[0].lane[ii] = f[ii].v[0].y; vertexX[1].lane[ii] = f[ii].v[1].x; vertexY[1].lane[ii] = f[ii].v[1].y; vertexX[2].lane[ii] = f[ii].v[2].x; vertexY[2].lane[ii] = f[ii].v[2].y; vertexZ[ii] = VecF32(f[ii].z[0],f[ii].z[1],f[ii].z[2],0.0f); } // Fab(x, y) = Ax + By + C = 0 // Fab(x, y) = (ya - yb)x + (xb - xa)y + (xa * yb - xb * ya) = 0 // Compute A = (ya - yb) for the 3 line segments that make up each triangle VecS32 A0 = vertexY[1] - vertexY[2]; VecS32 A1 = vertexY[2] - vertexY[0]; VecS32 A2 = vertexY[0] - vertexY[1]; // Compute B = (xb - xa) for the 3 line segments that make up each triangle VecS32 B0 = vertexX[2] - vertexX[1]; VecS32 B1 = vertexX[0] - vertexX[2]; VecS32 B2 = vertexX[1] - vertexX[0]; // Compute C = (xa * yb - xb * ya) for the 3 line segments that make up each triangle VecS32 C0 = vertexX[1] * vertexY[2] - vertexX[2] * vertexY[1]; VecS32 C1 = vertexX[2] * vertexY[0] - vertexX[0] * vertexY[2]; VecS32 C2 = vertexX[0] * vertexY[1] - vertexX[1] * vertexY[0]; // Use bounding box traversal strategy to determine which pixels to rasterize VecS32 minX = vmax(vmin(vmin(vertexX[0], vertexX[1]), vertexX[2]), tileMinXSimd) & VecS32(~1); VecS32 maxX = vmin(vmax(vmax(vertexX[0], vertexX[1]), vertexX[2]), tileMaxXSimd); VecS32 minY = vmax(vmin(vmin(vertexY[0], vertexY[1]), vertexY[2]), tileMinYSimd) & VecS32(~1); VecS32 maxY = vmin(vmax(vmax(vertexY[0], vertexY[1]), vertexY[2]), tileMaxYSimd); //Rasterize each triangle individually for(uint32 lane = 0;lane < numSimdTris;++lane){ //Rasterize in 2x2 quads. VecF32 zz[3]; zz[0] = VecF32(vertexZ[lane].lane[0]); zz[1] = VecF32(vertexZ[lane].lane[1]); zz[2] = VecF32(vertexZ[lane].lane[2]); VecS32 a0(A0.lane[lane]); VecS32 a1(A1.lane[lane]); VecS32 a2(A2.lane[lane]); VecS32 b0(B0.lane[lane]); VecS32 b1(B1.lane[lane]); VecS32 b2(B2.lane[lane]); int32 minx = minX.lane[lane]; int32 maxx = maxX.lane[lane]; int32 miny = minY.lane[lane]; int32 maxy = maxY.lane[lane]; VecS32 col = VecS32(minx) + colOffset; VecS32 row = VecS32(miny) + rowOffset; auto rowIdx = miny*size_.x + 2 * minx; VecS32 w0_row = a0 * col + b0 * row + VecS32(C0.lane[lane]); VecS32 w1_row = a1 * col + b1 * row + VecS32(C1.lane[lane]); VecS32 w2_row = a2 * col + b2 * row + VecS32(C2.lane[lane]); //Multiply each weight by two(rasterize 2x2 quad at once). 
a0 = shiftl<1>(a0); a1 = shiftl<1>(a1); a2 = shiftl<1>(a2); b0 = shiftl<1>(b0); b1 = shiftl<1>(b1); b2 = shiftl<1>(b2); VecF32 zInc = itof(a1)*zz[1] + itof(a2)*zz[2]; for(int32 y = miny;y<=maxy;y+=2,rowIdx += 2 * size_.x){ auto w0 = w0_row; auto w1 = w1_row; auto w2 = w2_row; VecF32 depth = zz[0] + itof(w1)*zz[1] + itof(w2)*zz[2]; auto idx = rowIdx; for(int32 x = minx;x<=maxx;x+=2,idx+=4){ auto mask = w0|w1|w2; VecF32 previousDepth = VecF32::load(data_+idx); VecF32 mergedDepth = vmin(depth,previousDepth); previousDepth = select(mergedDepth,previousDepth,mask); previousDepth.store(data_+idx); w0+=a0; w1+=a1; w2+=a2; depth+=zInc; } w0_row += b0; w1_row += b1; w2_row += b2; } } } #endif }
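// Hedged sketch of how the VecF32/VecS32 helpers used in rasterizeTile2x2() might map to SSE
// (an assumption about the wrapper library, not its actual implementation): select() picks
// from the first operand where the mask lane's sign bit is clear and from the second where it
// is set, mirroring _mm_blendv_ps (SSE4.1, <smmintrin.h>).
static inline __m128 select_sse(__m128 a, __m128 b, __m128i mask)
{
    return _mm_blendv_ps(a, b, _mm_castsi128_ps(mask));
}

static inline __m128 itof_sse(__m128i v)          { return _mm_cvtepi32_ps(v); } // itof()
static inline __m128 vmin_sse(__m128 a, __m128 b) { return _mm_min_ps(a, b); }   // vmin()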
void Permutohedral::init ( const MatrixXf & feature ) { // Compute the lattice coordinates for each feature (there is going to be a lot of magic here) N_ = feature.cols(); d_ = feature.rows(); HashTable hash_table( d_, N_/**(d_+1)*/ ); const int blocksize = sizeof(__m128) / sizeof(float); const __m128 invdplus1 = _mm_set1_ps( 1.0f / (d_+1) ); const __m128 dplus1 = _mm_set1_ps( d_+1 ); const __m128 Zero = _mm_set1_ps( 0 ); const __m128 One = _mm_set1_ps( 1 ); // Allocate the class memory offset_.resize( (d_+1)*(N_+16) ); std::fill( offset_.begin(), offset_.end(), 0 ); barycentric_.resize( (d_+1)*(N_+16) ); std::fill( barycentric_.begin(), barycentric_.end(), 0 ); rank_.resize( (d_+1)*(N_+16) ); // Allocate the local memory __m128 * scale_factor = (__m128*) _mm_malloc( (d_ )*sizeof(__m128) , 16 ); __m128 * f = (__m128*) _mm_malloc( (d_ )*sizeof(__m128) , 16 ); __m128 * elevated = (__m128*) _mm_malloc( (d_+1)*sizeof(__m128) , 16 ); __m128 * rem0 = (__m128*) _mm_malloc( (d_+1)*sizeof(__m128) , 16 ); __m128 * rank = (__m128*) _mm_malloc( (d_+1)*sizeof(__m128), 16 ); float * barycentric = new float[(d_+2)*blocksize]; short * canonical = new short[(d_+1)*(d_+1)]; short * key = new short[d_+1]; // Compute the canonical simplex for( int i=0; i<=d_; i++ ){ for( int j=0; j<=d_-i; j++ ) canonical[i*(d_+1)+j] = i; for( int j=d_-i+1; j<=d_; j++ ) canonical[i*(d_+1)+j] = i - (d_+1); } // Expected standard deviation of our filter (p.6 in [Adams et al. 2010]) float inv_std_dev = sqrt(2.0 / 3.0)*(d_+1); // Compute the diagonal part of E (p.5 in [Adams et al. 2010]) for( int i=0; i<d_; i++ ) scale_factor[i] = _mm_set1_ps( 1.0 / sqrt( (i+2)*(i+1) ) * inv_std_dev ); // Setup the SSE rounding #ifndef __SSE4_1__ const unsigned int old_rounding = _mm_getcsr(); _mm_setcsr( (old_rounding&~_MM_ROUND_MASK) | _MM_ROUND_NEAREST ); #endif // Compute the simplex each feature lies in for( int k=0; k<N_; k+=blocksize ){ // Load the feature from memory float * ff = (float*)f; for( int j=0; j<d_; j++ ) for( int i=0; i<blocksize; i++ ) ff[ j*blocksize + i ] = k+i < N_ ?
feature(j,k+i) : 0.0; // Elevate the feature ( y = Ep, see p.5 in [Adams et al. 2010]) // sm contains the sum of 1..n of our feature vector __m128 sm = Zero; for( int j=d_; j>0; j-- ){ __m128 cf = f[j-1]*scale_factor[j-1]; elevated[j] = sm - _mm_set1_ps(j)*cf; sm += cf; } elevated[0] = sm; // Find the closest 0-colored simplex through rounding __m128 sum = Zero; for( int i=0; i<=d_; i++ ){ __m128 v = invdplus1 * elevated[i]; #ifdef __SSE4_1__ v = _mm_round_ps( v, _MM_FROUND_TO_NEAREST_INT ); #else v = _mm_cvtepi32_ps( _mm_cvtps_epi32( v ) ); #endif rem0[i] = v*dplus1; sum += v; } // Find the simplex we are in and store it in rank (where rank describes what position coordinate i has in the sorted order of the feature values) for( int i=0; i<=d_; i++ ) rank[i] = Zero; for( int i=0; i<d_; i++ ){ __m128 di = elevated[i] - rem0[i]; for( int j=i+1; j<=d_; j++ ){ __m128 dj = elevated[j] - rem0[j]; __m128 c = _mm_and_ps( One, _mm_cmplt_ps( di, dj ) ); rank[i] += c; rank[j] += One-c; } } // If the point doesn't lie on the plane (sum != 0) bring it back for( int i=0; i<=d_; i++ ){ rank[i] += sum; __m128 add = _mm_and_ps( dplus1, _mm_cmplt_ps( rank[i], Zero ) ); __m128 sub = _mm_and_ps( dplus1, _mm_cmpge_ps( rank[i], dplus1 ) ); rank[i] += add-sub; rem0[i] += add-sub; } // Compute the barycentric coordinates (p.10 in [Adams et al. 2010]) for( int i=0; i<(d_+2)*blocksize; i++ ) barycentric[ i ] = 0; for( int i=0; i<=d_; i++ ){ __m128 v = (elevated[i] - rem0[i])*invdplus1; // Didn't figure out how to SSE this float * fv = (float*)&v; float * frank = (float*)&rank[i]; for( int j=0; j<blocksize; j++ ){ int p = d_-frank[j]; barycentric[j*(d_+2)+p ] += fv[j]; barycentric[j*(d_+2)+p+1] -= fv[j]; } } // The rest is not SSE'd for( int j=0; j<blocksize; j++ ){ // Wrap around barycentric[j*(d_+2)+0]+= 1 + barycentric[j*(d_+2)+d_+1]; float * frank = (float*)rank; float * frem0 = (float*)rem0; // Compute all vertices and their offset for( int remainder=0; remainder<=d_; remainder++ ){ for( int i=0; i<d_; i++ ){ key[i] = frem0[i*blocksize+j] + canonical[ remainder*(d_+1) + (int)frank[i*blocksize+j] ]; } offset_[ (j+k)*(d_+1)+remainder ] = hash_table.find( key, true ); rank_[ (j+k)*(d_+1)+remainder ] = frank[remainder*blocksize+j]; barycentric_[ (j+k)*(d_+1)+remainder ] = barycentric[ j*(d_+2)+remainder ]; } } } _mm_free( scale_factor ); _mm_free( f ); _mm_free( elevated ); _mm_free( rem0 ); _mm_free( rank ); delete [] barycentric; delete [] canonical; delete [] key; // Reset the SSE rounding #ifndef __SSE4_1__ _mm_setcsr( old_rounding ); #endif // This is normally fast enough so no SSE needed here // Find the Neighbors of each lattice point // Get the number of vertices in the lattice M_ = hash_table.size(); // Create the neighborhood structure blur_neighbors_.resize( (d_+1)*M_ ); short * n1 = new short[d_+1]; short * n2 = new short[d_+1]; // For each of d+1 axes, for( int j = 0; j <= d_; j++ ){ for( int i=0; i<M_; i++ ){ const short * key = hash_table.getKey( i ); for( int k=0; k<d_; k++ ){ n1[k] = key[k] - 1; n2[k] = key[k] + 1; } n1[j] = key[j] + d_; n2[j] = key[j] - d_; blur_neighbors_[j*M_+i].n1 = hash_table.find( n1 ); blur_neighbors_[j*M_+i].n2 = hash_table.find( n2 ); } } delete[] n1; delete[] n2; }
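// Minimal standalone sketch of the pre-SSE4.1 rounding idiom used in Permutohedral::init()
// (assumes <xmmintrin.h>): _mm_cvtps_epi32 rounds according to MXCSR, so forcing
// round-to-nearest and converting to int and back emulates
// _mm_round_ps(v, _MM_FROUND_TO_NEAREST_INT). The original hoists the MXCSR writes outside
// the loop for speed; this sketch brackets a single call for clarity.
static inline __m128 round_nearest_ps(__m128 v)
{
    const unsigned saved = _mm_getcsr();
    _mm_setcsr((saved & ~_MM_ROUND_MASK) | _MM_ROUND_NEAREST);
    __m128 r = _mm_cvtepi32_ps(_mm_cvtps_epi32(v));  // round-to-nearest via MXCSR
    _mm_setcsr(saved);                               // restore the caller's rounding mode
    return r;
}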
int main() { float *arr = get_arr(); // [4, 3, 2, 1] float *uarr = get_uarr(); // [5, 4, 3, 2] float *arr2 = get_arr2(); // [4, 3, 2, 1] float *uarr2 = get_uarr2(); // [5, 4, 3, 2] __m128 a = get_a(); // [8, 6, 4, 2] __m128 b = get_b(); // [1, 2, 3, 4] // Check that test data is like expected. Assert(((uintptr_t)arr & 0xF) == 0); // arr must be aligned by 16. Assert(((uintptr_t)uarr & 0xF) != 0); // uarr must be unaligned. Assert(((uintptr_t)arr2 & 0xF) == 0); // arr must be aligned by 16. Assert(((uintptr_t)uarr2 & 0xF) != 0); // uarr must be unaligned. // Test that aeq itself works and does not trivially return true on everything. Assert(aeq_("",_mm_load_ps(arr), 4.f, 3.f, 2.f, 0.f, false) == false); #ifdef TEST_M64 Assert(aeq64(u64castm64(0x22446688AACCEEFFULL), 0xABABABABABABABABULL, false) == false); #endif // SSE1 Load instructions: aeq(_mm_load_ps(arr), 4.f, 3.f, 2.f, 1.f); // 4-wide load from aligned address. aeq(_mm_load_ps1(uarr), 2.f, 2.f, 2.f, 2.f); // Load scalar from unaligned address and populate 4-wide. aeq(_mm_load_ss(uarr), 0.f, 0.f, 0.f, 2.f); // Load scalar from unaligned address to lowest, and zero all highest. aeq(_mm_load1_ps(uarr), 2.f, 2.f, 2.f, 2.f); // _mm_load1_ps == _mm_load_ps1 aeq(_mm_loadh_pi(a, (__m64*)uarr), 3.f, 2.f, 4.f, 2.f); // Load two highest addresses, preserve two lowest. aeq(_mm_loadl_pi(a, (__m64*)uarr), 8.f, 6.f, 3.f, 2.f); // Load two lowest addresses, preserve two highest. aeq(_mm_loadr_ps(arr), 1.f, 2.f, 3.f, 4.f); // 4-wide load from an aligned address, but reverse order. aeq(_mm_loadu_ps(uarr), 5.f, 4.f, 3.f, 2.f); // 4-wide load from an unaligned address. // SSE1 Set instructions: aeq(_mm_set_ps(uarr[3], 2.f, 3.f, 4.f), 5.f, 2.f, 3.f, 4.f); // 4-wide set by specifying four immediate or memory operands. aeq(_mm_set_ps1(uarr[3]), 5.f, 5.f, 5.f, 5.f); // 4-wide set by specifying one scalar that is expanded. aeq(_mm_set_ss(uarr[3]), 0.f, 0.f, 0.f, 5.f); // Set scalar at lowest index, zero all higher. aeq(_mm_set1_ps(uarr[3]), 5.f, 5.f, 5.f, 5.f); // _mm_set1_ps == _mm_set_ps1 aeq(_mm_setr_ps(uarr[3], 2.f, 3.f, 4.f), 4.f, 3.f, 2.f, 5.f); // 4-wide set by specifying four immediate or memory operands, but reverse order. aeq(_mm_setzero_ps(), 0.f, 0.f, 0.f, 0.f); // Returns a new zero register. // SSE1 Move instructions: aeq(_mm_move_ss(a, b), 8.f, 6.f, 4.f, 4.f); // Copy three highest elements from a, and lowest from b. aeq(_mm_movehl_ps(a, b), 8.f, 6.f, 1.f, 2.f); // Copy two highest elements from a, and take two highest from b and place them to the two lowest in output. aeq(_mm_movelh_ps(a, b), 3.f, 4.f, 4.f, 2.f); // Copy two lowest elements from a, and take two lowest from b and place them to the two highest in output. // SSE1 Store instructions: #ifdef TEST_M64 /*M64*/*(uint64_t*)uarr = 0xCDCDCDCDCDCDCDCDULL; _mm_maskmove_si64(u64castm64(0x00EEDDCCBBAA9988ULL), u64castm64(0x0080FF7F01FEFF40ULL), (char*)uarr); Assert(*(uint64_t*)uarr == 0xCDEEDDCDCDAA99CDULL); // _mm_maskmove_si64: Conditionally store bytes of a 64-bit value. /*M64*/*(uint64_t*)uarr = 0xABABABABABABABABULL; _m_maskmovq(u64castm64(0x00EEDDCCBBAA9988ULL), u64castm64(0x0080FF7F01FEFF40ULL), (char*)uarr); Assert(*(uint64_t*)uarr == 0xABEEDDABABAA99ABULL); // _m_maskmovq is an alias to _mm_maskmove_si64. #endif _mm_store_ps(arr2, a); aeq(_mm_load_ps(arr2), 8.f, 6.f, 4.f, 2.f); // _mm_store_ps: 4-wide store to aligned memory address. 
_mm_store_ps1(arr2, a); aeq(_mm_load_ps(arr2), 2.f, 2.f, 2.f, 2.f); // _mm_store_ps1: Store lowest scalar to aligned address, duplicating the element 4 times. _mm_storeu_ps(uarr2, _mm_set1_ps(100.f)); _mm_store_ss(uarr2, b); aeq(_mm_loadu_ps(uarr2), 100.f, 100.f, 100.f, 4.f); // _mm_store_ss: Store lowest scalar to unaligned address. Don't adjust higher addresses in memory. _mm_store_ps(arr2, _mm_set1_ps(100.f)); _mm_store1_ps(arr2, a); aeq(_mm_load_ps(arr2), 2.f, 2.f, 2.f, 2.f); // _mm_store1_ps == _mm_store_ps1 _mm_storeu_ps(uarr2, _mm_set1_ps(100.f)); _mm_storeh_pi((__m64*)uarr2, a); aeq(_mm_loadu_ps(uarr2), 100.f, 100.f, 8.f, 6.f); // _mm_storeh_pi: Store two highest elements to memory. _mm_storeu_ps(uarr2, _mm_set1_ps(100.f)); _mm_storel_pi((__m64*)uarr2, a); aeq(_mm_loadu_ps(uarr2), 100.f, 100.f, 4.f, 2.f); // _mm_storel_pi: Store two lowest elements to memory. _mm_storer_ps(arr2, a); aeq(_mm_load_ps(arr2), 2.f, 4.f, 6.f, 8.f); // _mm_storer_ps: 4-wide store to aligned memory address, but reverse the elements on output. _mm_storeu_ps(uarr2, a); aeq(_mm_loadu_ps(uarr2), 8.f, 6.f, 4.f, 2.f); // _mm_storeu_ps: 4-wide store to unaligned memory address. #ifdef TEST_M64 /*M64*/_mm_stream_pi((__m64*)uarr, u64castm64(0x0080FF7F01FEFF40ULL)); Assert(*(uint64_t*)uarr == 0x0080FF7F01FEFF40ULL); // _mm_stream_pi: 2-wide store, but with a non-temporal memory cache hint. #endif _mm_store_ps(arr2, _mm_set1_ps(100.f)); _mm_stream_ps(arr2, a); aeq(_mm_load_ps(arr2), 8.f, 6.f, 4.f, 2.f); // _mm_stream_ps: 4-wide store, but with a non-temporal memory cache hint. // SSE1 Arithmetic instructions: aeq(_mm_add_ps(a, b), 9.f, 8.f, 7.f, 6.f); // 4-wide add. aeq(_mm_add_ss(a, b), 8.f, 6.f, 4.f, 6.f); // Add lowest element, preserve three highest unchanged from a. aeq(_mm_div_ps(a, _mm_set_ps(2.f, 3.f, 8.f, 2.f)), 4.f, 2.f, 0.5f, 1.f); // 4-wide div. aeq(_mm_div_ss(a, _mm_set_ps(2.f, 3.f, 8.f, 8.f)), 8.f, 6.f, 4.f, 0.25f); // Div lowest element, preserve three highest unchanged from a. aeq(_mm_mul_ps(a, b), 8.f, 12.f, 12.f, 8.f); // 4-wide mul. aeq(_mm_mul_ss(a, b), 8.f, 6.f, 4.f, 8.f); // Mul lowest element, preserve three highest unchanged from a. #ifdef TEST_M64 __m64 m1 = get_m1(); /*M64*/aeq64(_mm_mulhi_pu16(m1, u64castm64(0x22446688AACCEEFFULL)), 0x002233440B4C33CFULL); // Multiply u16 channels, and store high parts. /*M64*/aeq64( _m_pmulhuw(m1, u64castm64(0x22446688AACCEEFFULL)), 0x002233440B4C33CFULL); // _m_pmulhuw is an alias to _mm_mulhi_pu16. __m64 m2 = get_m2(); /*M64*/aeq64(_mm_sad_pu8(m1, m2), 0x368ULL); // Compute abs. differences of u8 channels, and sum those up to a single 16-bit scalar. /*M64*/aeq64( _m_psadbw(m1, m2), 0x368ULL); // _m_psadbw is an alias to _mm_sad_pu8. #endif aeq(_mm_sub_ps(a, b), 7.f, 4.f, 1.f, -2.f); // 4-wide sub. aeq(_mm_sub_ss(a, b), 8.f, 6.f, 4.f, -2.f); // Sub lowest element, preserve three highest unchanged from a. // SSE1 Elementary Math functions: #ifndef __EMSCRIPTEN__ // TODO: Enable support for this to pass. aeq(_mm_rcp_ps(a), 0.124969f, 0.166626f, 0.249939f, 0.499878f); // Compute 4-wide 1/x. aeq(_mm_rcp_ss(a), 8.f, 6.f, 4.f, 0.499878f); // Compute 1/x of lowest element, pass higher elements unchanged. aeq(_mm_rsqrt_ps(a), 0.353455f, 0.408203f, 0.499878f, 0.706909f); // Compute 4-wide 1/sqrt(x). aeq(_mm_rsqrt_ss(a), 8.f, 6.f, 4.f, 0.706909f); // Compute 1/sqrt(x) of lowest element, pass higher elements unchanged. #endif aeq(_mm_sqrt_ps(a), 2.82843f, 2.44949f, 2.f, 1.41421f); // Compute 4-wide sqrt(x). 
aeq(_mm_sqrt_ss(a), 8.f, 6.f, 4.f, 1.41421f); // Compute sqrt(x) of lowest element, pass higher elements unchanged. __m128 i1 = get_i1(); __m128 i2 = get_i2(); // SSE1 Logical instructions: #ifndef __EMSCRIPTEN__ // TODO: The polyfill currently does NaN canonicalization and breaks these. aeqi(_mm_and_ps(i1, i2), 0x83200100, 0x0fecc988, 0x80244021, 0x13458a88); // 4-wide binary AND aeqi(_mm_andnot_ps(i1, i2), 0x388a9888, 0xf0021444, 0x7000289c, 0x00121046); // 4-wide binary (!i1) & i2 aeqi(_mm_or_ps(i1, i2), 0xbfefdba9, 0xffefdfed, 0xf7656bbd, 0xffffdbef); // 4-wide binary OR aeqi(_mm_xor_ps(i1, i2), 0x3ccfdaa9, 0xf0031665, 0x77412b9c, 0xecba5167); // 4-wide binary XOR #endif // SSE1 Compare instructions: // a = [8, 6, 4, 2], b = [1, 2, 3, 4] aeqi(_mm_cmpeq_ps(a, _mm_set_ps(8.f, 0.f, 4.f, 0.f)), 0xFFFFFFFF, 0, 0xFFFFFFFF, 0); // 4-wide cmp == aeqi(_mm_cmpeq_ss(a, _mm_set_ps(8.f, 0.f, 4.f, 2.f)), fcastu(8.f), fcastu(6.f), fcastu(4.f), 0xFFFFFFFF); // scalar cmp ==, pass three highest unchanged. aeqi(_mm_cmpge_ps(a, _mm_set_ps(8.f, 7.f, 3.f, 5.f)), 0xFFFFFFFF, 0, 0xFFFFFFFF, 0); // 4-wide cmp >= aeqi(_mm_cmpge_ss(a, _mm_set_ps(8.f, 7.f, 3.f, 0.f)), fcastu(8.f), fcastu(6.f), fcastu(4.f), 0xFFFFFFFF); // scalar cmp >=, pass three highest unchanged. aeqi(_mm_cmpgt_ps(a, _mm_set_ps(8.f, 7.f, 3.f, 5.f)), 0, 0, 0xFFFFFFFF, 0); // 4-wide cmp > aeqi(_mm_cmpgt_ss(a, _mm_set_ps(8.f, 7.f, 3.f, 2.f)), fcastu(8.f), fcastu(6.f), fcastu(4.f), 0); // scalar cmp >, pass three highest unchanged. aeqi(_mm_cmple_ps(a, _mm_set_ps(8.f, 7.f, 3.f, 5.f)), 0xFFFFFFFF, 0xFFFFFFFF, 0, 0xFFFFFFFF); // 4-wide cmp <= aeqi(_mm_cmple_ss(a, _mm_set_ps(8.f, 7.f, 3.f, 0.f)), fcastu(8.f), fcastu(6.f), fcastu(4.f), 0); // scalar cmp <=, pass three highest unchanged. aeqi(_mm_cmplt_ps(a, _mm_set_ps(8.f, 7.f, 3.f, 5.f)), 0, 0xFFFFFFFF, 0, 0xFFFFFFFF); // 4-wide cmp < aeqi(_mm_cmplt_ss(a, _mm_set_ps(8.f, 7.f, 3.f, 2.f)), fcastu(8.f), fcastu(6.f), fcastu(4.f), 0); // scalar cmp <, pass three highest unchanged. aeqi(_mm_cmpneq_ps(a, _mm_set_ps(8.f, 0.f, 4.f, 0.f)), 0, 0xFFFFFFFF, 0, 0xFFFFFFFF); // 4-wide cmp != aeqi(_mm_cmpneq_ss(a, _mm_set_ps(8.f, 0.f, 4.f, 2.f)), fcastu(8.f), fcastu(6.f), fcastu(4.f), 0); // scalar cmp !=, pass three highest unchanged. aeqi(_mm_cmpnge_ps(a, _mm_set_ps(8.f, 7.f, 3.f, 5.f)), 0, 0xFFFFFFFF, 0, 0xFFFFFFFF); // 4-wide cmp not >= aeqi(_mm_cmpnge_ss(a, _mm_set_ps(8.f, 7.f, 3.f, 0.f)), fcastu(8.f), fcastu(6.f), fcastu(4.f), 0); // scalar cmp not >=, pass three highest unchanged. aeqi(_mm_cmpngt_ps(a, _mm_set_ps(8.f, 7.f, 3.f, 5.f)), 0xFFFFFFFF, 0xFFFFFFFF, 0, 0xFFFFFFFF); // 4-wide cmp not > aeqi(_mm_cmpngt_ss(a, _mm_set_ps(8.f, 7.f, 3.f, 2.f)), fcastu(8.f), fcastu(6.f), fcastu(4.f), 0xFFFFFFFF); // scalar cmp not >, pass three highest unchanged. aeqi(_mm_cmpnle_ps(a, _mm_set_ps(8.f, 7.f, 3.f, 5.f)), 0, 0, 0xFFFFFFFF, 0); // 4-wide cmp not <= aeqi(_mm_cmpnle_ss(a, _mm_set_ps(8.f, 7.f, 3.f, 0.f)), fcastu(8.f), fcastu(6.f), fcastu(4.f), 0xFFFFFFFF); // scalar cmp not <=, pass three highest unchanged. aeqi(_mm_cmpnlt_ps(a, _mm_set_ps(8.f, 7.f, 3.f, 5.f)), 0xFFFFFFFF, 0, 0xFFFFFFFF, 0); // 4-wide cmp not < aeqi(_mm_cmpnlt_ss(a, _mm_set_ps(8.f, 7.f, 3.f, 2.f)), fcastu(8.f), fcastu(6.f), fcastu(4.f), 0xFFFFFFFF); // scalar cmp not <, pass three highest unchanged. __m128 nan1 = get_nan1(); // [NAN, 0, 0, NAN] __m128 nan2 = get_nan2(); // [NAN, NAN, 0, 0] aeqi(_mm_cmpord_ps(nan1, nan2), 0, 0, 0xFFFFFFFF, 0); // 4-wide test if both operands are not nan. 
aeqi(_mm_cmpord_ss(nan1, nan2), fcastu(NAN), 0, 0, 0); // scalar test if both operands are not nan, pass three highest unchanged. // Intel Intrinsics Guide documentation is wrong on _mm_cmpunord_ps and _mm_cmpunord_ss. MSDN is right: http://msdn.microsoft.com/en-us/library/khy6fk1t(v=vs.90).aspx aeqi(_mm_cmpunord_ps(nan1, nan2), 0xFFFFFFFF, 0xFFFFFFFF, 0, 0xFFFFFFFF); // 4-wide test if one of the operands is nan. #ifndef __EMSCRIPTEN__ // TODO: The polyfill currently does NaN canonicalization and breaks these. aeqi(_mm_cmpunord_ss(nan1, nan2), fcastu(NAN), 0, 0, 0xFFFFFFFF); // scalar test if one of the operands is nan, pass three highest unchanged. #endif Assert(_mm_comieq_ss(a, b) == 0); Assert(_mm_comieq_ss(a, a) == 1); // Scalar cmp == of lowest element, return int. Assert(_mm_comige_ss(a, b) == 0); Assert(_mm_comige_ss(a, a) == 1); // Scalar cmp >= of lowest element, return int. Assert(_mm_comigt_ss(b, a) == 1); Assert(_mm_comigt_ss(a, a) == 0); // Scalar cmp > of lowest element, return int. Assert(_mm_comile_ss(b, a) == 0); Assert(_mm_comile_ss(a, a) == 1); // Scalar cmp <= of lowest element, return int. Assert(_mm_comilt_ss(a, b) == 1); Assert(_mm_comilt_ss(a, a) == 0); // Scalar cmp < of lowest element, return int. Assert(_mm_comineq_ss(a, b) == 1); Assert(_mm_comineq_ss(a, a) == 0); // Scalar cmp != of lowest element, return int. // The ucomi versions are identical to comi, except that ucomi signal a FP exception only if one of the input operands is a SNaN, whereas the comi versions signal a FP // exception when one of the input operands is either a QNaN or a SNaN. #ifndef __EMSCRIPTEN__ // TODO: Fix ucomi support in SSE to treat NaNs properly. Assert(_mm_ucomieq_ss(a, b) == 0); Assert(_mm_ucomieq_ss(a, a) == 1); Assert(_mm_ucomieq_ss(a, nan1) == 1); #endif Assert(_mm_ucomige_ss(a, b) == 0); Assert(_mm_ucomige_ss(a, a) == 1); Assert(_mm_ucomige_ss(a, nan1) == 0); Assert(_mm_ucomigt_ss(b, a) == 1); Assert(_mm_ucomigt_ss(a, a) == 0); Assert(_mm_ucomigt_ss(a, nan1) == 0); Assert(_mm_ucomile_ss(b, a) == 0); Assert(_mm_ucomile_ss(a, a) == 1); Assert(_mm_ucomile_ss(a, nan1) == 1); Assert(_mm_ucomilt_ss(a, b) == 1); Assert(_mm_ucomilt_ss(a, a) == 0); Assert(_mm_ucomilt_ss(a, nan1) == 1); #ifndef __EMSCRIPTEN__ // TODO: Fix ucomi support in SSE to treat NaNs properly. Assert(_mm_ucomineq_ss(a, b) == 1); Assert(_mm_ucomineq_ss(a, a) == 0); Assert(_mm_ucomineq_ss(a, nan1) == 0); #endif // SSE1 Convert instructions: __m128 c = get_c(); // [1.5, 2.5, 3.5, 4.5] __m128 e = get_e(); // [INF, -INF, 2.5, 3.5] __m128 f = get_f(); // [-1.5, 1.5, -2.5, -9223372036854775808] #ifdef TEST_M64 /*M64*/aeq(_mm_cvt_pi2ps(a, m2), 8.f, 6.f, -19088744.f, 1985229312.f); // 2-way int32 to float conversion to two lowest channels of m128. /*M64*/aeq64(_mm_cvt_ps2pi(c), 0x400000004ULL); // 2-way two lowest floats from m128 to integer, return as m64. #endif aeq(_mm_cvtsi32_ss(c, -16777215), 1.5f, 2.5f, 3.5f, -16777215.f); // Convert int to float, store in lowest channel of m128. aeq( _mm_cvt_si2ss(c, -16777215), 1.5f, 2.5f, 3.5f, -16777215.f); // _mm_cvt_si2ss is an alias to _mm_cvtsi32_ss. #ifndef __EMSCRIPTEN__ // TODO: Fix banker's rounding in cvt functions. Assert(_mm_cvtss_si32(c) == 4); Assert(_mm_cvtss_si32(e) == 4); // Convert lowest channel of m128 from float to int. Assert( _mm_cvt_ss2si(c) == 4); Assert( _mm_cvt_ss2si(e) == 4); // _mm_cvt_ss2si is an alias to _mm_cvtss_si32. 
#endif #ifdef TEST_M64 /*M64*/aeq(_mm_cvtpi16_ps(m1), 255.f , -32767.f, 4336.f, 14207.f); // 4-way convert int16s to floats, return in a m128. /*M64*/aeq(_mm_cvtpi32_ps(a, m1), 8.f, 6.f, 16744449.f, 284178304.f); // 2-way convert int32s to floats, return in two lowest channels of m128, pass two highest unchanged. /*M64*/aeq(_mm_cvtpi32x2_ps(m1, m2), -19088744.f, 1985229312.f, 16744449.f, 284178304.f); // 4-way convert int32s from two different m64s to float. /*M64*/aeq(_mm_cvtpi8_ps(m1), 16.f, -16.f, 55.f, 127.f); // 4-way convert int8s from lowest end of m64 to float in a m128. /*M64*/aeq64(_mm_cvtps_pi16(c), 0x0002000200040004ULL); // 4-way convert floats to int16s in a m64. /*M64*/aeq64(_mm_cvtps_pi32(c), 0x0000000400000004ULL); // 2-way convert two lowest floats to int32s in a m64. /*M64*/aeq64(_mm_cvtps_pi8(c), 0x0000000002020404ULL); // 4-way convert floats to int8s in a m64, zero higher half of the returned m64. /*M64*/aeq(_mm_cvtpu16_ps(m1), 255.f , 32769.f, 4336.f, 14207.f); // 4-way convert uint16s to floats, return in a m128. /*M64*/aeq(_mm_cvtpu8_ps(m1), 16.f, 240.f, 55.f, 127.f); // 4-way convert uint8s from lowest end of m64 to float in a m128. #endif aeq(_mm_cvtsi64_ss(c, -9223372036854775808ULL), 1.5f, 2.5f, 3.5f, -9223372036854775808.f); // Convert single int64 to float, store in lowest channel of m128, and pass three higher channel unchanged. Assert(_mm_cvtss_f32(c) == 4.5f); // Extract lowest channel of m128 to a plain old float. Assert(_mm_cvtss_si64(f) == -9223372036854775808ULL); // Convert lowest channel of m128 from float to int64. #ifdef TEST_M64 /*M64*/aeq64(_mm_cvtt_ps2pi(e), 0x0000000200000003ULL); aeq64(_mm_cvtt_ps2pi(f), 0xfffffffe80000000ULL); // Truncating conversion from two lowest floats of m128 to int32s, return in a m64. #endif Assert(_mm_cvttss_si32(e) == 3); // Truncating conversion from the lowest float of a m128 to int32. Assert( _mm_cvtt_ss2si(e) == 3); // _mm_cvtt_ss2si is an alias to _mm_cvttss_si32. #ifdef TEST_M64 /*M64*/aeq64(_mm_cvttps_pi32(c), 0x0000000300000004ULL); // Truncating conversion from two lowest floats of m128 to m64. #endif Assert(_mm_cvttss_si64(f) == -9223372036854775808ULL); // Truncating conversion from lowest channel of m128 from float to int64. #ifndef __EMSCRIPTEN__ // TODO: Not implemented. // SSE1 General support: unsigned int mask = _MM_GET_EXCEPTION_MASK(); _MM_SET_EXCEPTION_MASK(mask); unsigned int flushZeroMode = _MM_GET_FLUSH_ZERO_MODE(); _MM_SET_FLUSH_ZERO_MODE(flushZeroMode); unsigned int roundingMode = _MM_GET_ROUNDING_MODE(); _MM_SET_ROUNDING_MODE(roundingMode); unsigned int csr = _mm_getcsr(); _mm_setcsr(csr); unsigned char dummyData[4096]; _mm_prefetch(dummyData, _MM_HINT_T0); _mm_prefetch(dummyData, _MM_HINT_T1); _mm_prefetch(dummyData, _MM_HINT_T2); _mm_prefetch(dummyData, _MM_HINT_NTA); _mm_sfence(); #endif // SSE1 Misc instructions: #ifdef TEST_M64 /*M64*/Assert(_mm_movemask_pi8(m1) == 100); // Return int with eight lowest bits set depending on the highest bits of the 8 uint8 input channels of the m64. /*M64*/Assert( _m_pmovmskb(m1) == 100); // _m_pmovmskb is an alias to _mm_movemask_pi8. #endif Assert(_mm_movemask_ps(_mm_set_ps(-1.f, 0.f, 1.f, NAN)) == 8); Assert(_mm_movemask_ps(_mm_set_ps(-INFINITY, -0.f, INFINITY, -INFINITY)) == 13); // Return int with four lowest bits set depending on the highest bits of the 4 m128 input channels. // SSE1 Probability/Statistics instructions: #ifdef TEST_M64 /*M64*/aeq64(_mm_avg_pu16(m1, m2), 0x7FEE9D4D43A234C8ULL); // 4-way average uint16s. 
/*M64*/aeq64( _m_pavgw(m1, m2), 0x7FEE9D4D43A234C8ULL); // _m_pavgw is an alias to _mm_avg_pu16. /*M64*/aeq64(_mm_avg_pu8(m1, m2), 0x7FEE9D4D43A23548ULL); // 8-way average uint8s. /*M64*/aeq64( _m_pavgb(m1, m2), 0x7FEE9D4D43A23548ULL); // _m_pavgb is an alias to _mm_avg_pu8. // SSE1 Special Math instructions: /*M64*/aeq64(_mm_max_pi16(m1, m2), 0xFFBA987654377FULL); // 4-way max of int16s. /*M64*/aeq64( _m_pmaxsw(m1, m2), 0xFFBA987654377FULL); // _m_pmaxsw is an alias to _mm_max_pi16. /*M64*/aeq64(_mm_max_pu8(m1, m2), 0xFEFFBA9876F0377FULL); // 8-way max of uint8s. /*M64*/aeq64( _m_pmaxub(m1, m2), 0xFEFFBA9876F0377FULL); // _m_pmaxub is an alias to _mm_max_pu8. /*M64*/aeq64(_mm_min_pi16(m1, m2), 0xFEDC800110F03210ULL); // 4-way min of int16s. /*M64*/aeq64( _m_pminsw(m1, m2), 0xFEDC800110F03210ULL); // _m_pminsw is an alias to _mm_min_pi16. /*M64*/aeq64(_mm_min_pu8(m1, m2), 0xDC800110543210ULL); // 8-way min of uint8s. /*M64*/aeq64( _m_pminub(m1, m2), 0xDC800110543210ULL); // _m_pminub is an alias to _mm_min_pu8. #endif // a = [8, 6, 4, 2], b = [1, 2, 3, 4] aeq(_mm_max_ps(a, b), 8.f, 6.f, 4.f, 4.f); // 4-wide max. aeq(_mm_max_ss(a, _mm_set1_ps(100.f)), 8.f, 6.f, 4.f, 100.f); // Scalar max, pass three highest unchanged. aeq(_mm_min_ps(a, b), 1.f, 2.f, 3.f, 2.f); // 4-wide min. aeq(_mm_min_ss(a, _mm_set1_ps(-100.f)), 8.f, 6.f, 4.f, -100.f); // Scalar min, pass three highest unchanged. // SSE1 Swizzle instructions: #ifdef TEST_M64 /*M64*/Assert(_mm_extract_pi16(m1, 1) == 4336); // Extract the given int16 channel from a m64. /*M64*/Assert( _m_pextrw(m1, 1) == 4336); // _m_pextrw is an alias to _mm_extract_pi16. /*M64*/aeq64(_mm_insert_pi16(m1, 0xABCD, 1), 0xFF8001ABCD377FULL); // Insert a int16 to a specific channel of a m64. /*M64*/aeq64( _m_pinsrw(m1, 0xABCD, 1), 0xFF8001ABCD377FULL); // _m_pinsrw is an alias to _mm_insert_pi16. /*M64*/aeq64(_mm_shuffle_pi16(m1, _MM_SHUFFLE(1, 0, 3, 2)), 0x10F0377F00FF8001ULL); // Shuffle int16s around in the 4 channels of the m64. /*M64*/aeq64( _m_pshufw(m1, _MM_SHUFFLE(1, 0, 3, 2)), 0x10F0377F00FF8001ULL); // _m_pshufw is an alias to _mm_shuffle_pi16. #endif aeq(_mm_shuffle_ps(a, b, _MM_SHUFFLE(1, 0, 3, 2)), 3.f, 4.f, 8.f, 6.f); aeq(_mm_unpackhi_ps(a, b), 1.f , 8.f, 2.f, 6.f); aeq(_mm_unpacklo_ps(a, b), 3.f , 4.f, 4.f, 2.f); // Transposing a matrix via the xmmintrin.h-provided intrinsic. __m128 c0 = a; // [8, 6, 4, 2] __m128 c1 = b; // [1, 2, 3, 4] __m128 c2 = get_c(); // [1.5, 2.5, 3.5, 4.5] __m128 c3 = get_d(); // [8.5, 6.5, 4.5, 2.5] _MM_TRANSPOSE4_PS(c0, c1, c2, c3); aeq(c0, 2.5f, 4.5f, 4.f, 2.f); aeq(c1, 4.5f, 3.5f, 3.f, 4.f); aeq(c2, 6.5f, 2.5f, 2.f, 6.f); aeq(c3, 8.5f, 1.5f, 1.f, 8.f); // All done! if (numFailures == 0) printf("Success!\n"); else printf("%d tests failed!\n", numFailures); }
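// Hypothetical sketch of the aeq() checker this test relies on (the real harness defines its
// own, typically with approximate comparison and logging): expected values are listed high
// lane first, matching _mm_set_ps order, and failures bump the global numFailures counter.
extern int numFailures;

void aeq(__m128 v, float e3, float e2, float e1, float e0)
{
    float lane[4];
    _mm_storeu_ps(lane, v);   // lane[0] holds the lowest element
    if(lane[3] != e3 || lane[2] != e2 || lane[1] != e1 || lane[0] != e0)
        ++numFailures;        // counted and reported at the end of main()
}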
void f0() { signed char tmp_c; // unsigned char tmp_Uc; signed short tmp_s; #ifdef USE_ALL unsigned short tmp_Us; #endif signed int tmp_i; unsigned int tmp_Ui; signed long long tmp_LLi; unsigned long long tmp_ULLi; float tmp_f; double tmp_d; void* tmp_vp; const void* tmp_vCp; char* tmp_cp; const char* tmp_cCp; int* tmp_ip; float* tmp_fp; const float* tmp_fCp; double* tmp_dp; const double* tmp_dCp; long long* tmp_LLip; #define imm_i 32 #define imm_i_0_2 0 #define imm_i_0_4 3 #define imm_i_0_8 7 #define imm_i_0_16 15 // Check this. #define imm_i_0_256 0 V2i* tmp_V2ip; V1LLi* tmp_V1LLip; V2LLi* tmp_V2LLip; // 64-bit V8c tmp_V8c; V4s tmp_V4s; V2i tmp_V2i; V1LLi tmp_V1LLi; #ifdef USE_3DNOW V2f tmp_V2f; #endif // 128-bit V16c tmp_V16c; V8s tmp_V8s; V4i tmp_V4i; V2LLi tmp_V2LLi; V4f tmp_V4f; V2d tmp_V2d; V2d* tmp_V2dp; V4f* tmp_V4fp; const V2d* tmp_V2dCp; const V4f* tmp_V4fCp; // 256-bit V32c tmp_V32c; V4d tmp_V4d; V8f tmp_V8f; V4LLi tmp_V4LLi; V8i tmp_V8i; V4LLi* tmp_V4LLip; V4d* tmp_V4dp; V8f* tmp_V8fp; const V4d* tmp_V4dCp; const V8f* tmp_V8fCp; tmp_V2LLi = __builtin_ia32_undef128(); tmp_V4LLi = __builtin_ia32_undef256(); tmp_i = __builtin_ia32_comieq(tmp_V4f, tmp_V4f); tmp_i = __builtin_ia32_comilt(tmp_V4f, tmp_V4f); tmp_i = __builtin_ia32_comile(tmp_V4f, tmp_V4f); tmp_i = __builtin_ia32_comigt(tmp_V4f, tmp_V4f); tmp_i = __builtin_ia32_comige(tmp_V4f, tmp_V4f); tmp_i = __builtin_ia32_comineq(tmp_V4f, tmp_V4f); tmp_i = __builtin_ia32_ucomieq(tmp_V4f, tmp_V4f); tmp_i = __builtin_ia32_ucomilt(tmp_V4f, tmp_V4f); tmp_i = __builtin_ia32_ucomile(tmp_V4f, tmp_V4f); tmp_i = __builtin_ia32_ucomigt(tmp_V4f, tmp_V4f); tmp_i = __builtin_ia32_ucomige(tmp_V4f, tmp_V4f); tmp_i = __builtin_ia32_ucomineq(tmp_V4f, tmp_V4f); tmp_i = __builtin_ia32_comisdeq(tmp_V2d, tmp_V2d); tmp_i = __builtin_ia32_comisdlt(tmp_V2d, tmp_V2d); tmp_i = __builtin_ia32_comisdle(tmp_V2d, tmp_V2d); tmp_i = __builtin_ia32_comisdgt(tmp_V2d, tmp_V2d); tmp_i = __builtin_ia32_comisdge(tmp_V2d, tmp_V2d); tmp_i = __builtin_ia32_comisdneq(tmp_V2d, tmp_V2d); tmp_i = __builtin_ia32_ucomisdeq(tmp_V2d, tmp_V2d); tmp_i = __builtin_ia32_ucomisdlt(tmp_V2d, tmp_V2d); tmp_i = __builtin_ia32_ucomisdle(tmp_V2d, tmp_V2d); tmp_i = __builtin_ia32_ucomisdgt(tmp_V2d, tmp_V2d); tmp_i = __builtin_ia32_ucomisdge(tmp_V2d, tmp_V2d); tmp_i = __builtin_ia32_ucomisdneq(tmp_V2d, tmp_V2d); tmp_V4f = __builtin_ia32_cmpps(tmp_V4f, tmp_V4f, 0); tmp_V4f = __builtin_ia32_cmpps(tmp_V4f, tmp_V4f, 1); tmp_V4f = __builtin_ia32_cmpps(tmp_V4f, tmp_V4f, 2); tmp_V4f = __builtin_ia32_cmpps(tmp_V4f, tmp_V4f, 3); tmp_V4f = __builtin_ia32_cmpps(tmp_V4f, tmp_V4f, 4); tmp_V4f = __builtin_ia32_cmpps(tmp_V4f, tmp_V4f, 5); tmp_V4f = __builtin_ia32_cmpps(tmp_V4f, tmp_V4f, 6); tmp_V4f = __builtin_ia32_cmpps(tmp_V4f, tmp_V4f, 7); tmp_V4f = __builtin_ia32_cmpss(tmp_V4f, tmp_V4f, 0); tmp_V4f = __builtin_ia32_cmpss(tmp_V4f, tmp_V4f, 1); tmp_V4f = __builtin_ia32_cmpss(tmp_V4f, tmp_V4f, 2); tmp_V4f = __builtin_ia32_cmpss(tmp_V4f, tmp_V4f, 3); tmp_V4f = __builtin_ia32_cmpss(tmp_V4f, tmp_V4f, 4); tmp_V4f = __builtin_ia32_cmpss(tmp_V4f, tmp_V4f, 5); tmp_V4f = __builtin_ia32_cmpss(tmp_V4f, tmp_V4f, 6); tmp_V4f = __builtin_ia32_cmpss(tmp_V4f, tmp_V4f, 7); tmp_V4f = __builtin_ia32_minps(tmp_V4f, tmp_V4f); tmp_V4f = __builtin_ia32_maxps(tmp_V4f, tmp_V4f); tmp_V4f = __builtin_ia32_minss(tmp_V4f, tmp_V4f); tmp_V4f = __builtin_ia32_maxss(tmp_V4f, tmp_V4f); tmp_V8c = __builtin_ia32_paddsb(tmp_V8c, tmp_V8c); tmp_V4s = __builtin_ia32_paddsw(tmp_V4s, tmp_V4s); tmp_V8c = __builtin_ia32_psubsb(tmp_V8c, tmp_V8c); 
tmp_V4s = __builtin_ia32_psubsw(tmp_V4s, tmp_V4s); tmp_V8c = __builtin_ia32_paddusb(tmp_V8c, tmp_V8c); tmp_V4s = __builtin_ia32_paddusw(tmp_V4s, tmp_V4s); tmp_V8c = __builtin_ia32_psubusb(tmp_V8c, tmp_V8c); tmp_V4s = __builtin_ia32_psubusw(tmp_V4s, tmp_V4s); tmp_V4s = __builtin_ia32_pmulhw(tmp_V4s, tmp_V4s); tmp_V4s = __builtin_ia32_pmulhuw(tmp_V4s, tmp_V4s); tmp_V8c = __builtin_ia32_pcmpeqb(tmp_V8c, tmp_V8c); tmp_V4s = __builtin_ia32_pcmpeqw(tmp_V4s, tmp_V4s); tmp_V2i = __builtin_ia32_pcmpeqd(tmp_V2i, tmp_V2i); tmp_V8c = __builtin_ia32_pcmpgtb(tmp_V8c, tmp_V8c); tmp_V4s = __builtin_ia32_pcmpgtw(tmp_V4s, tmp_V4s); tmp_V2i = __builtin_ia32_pcmpgtd(tmp_V2i, tmp_V2i); tmp_V8c = __builtin_ia32_pmaxub(tmp_V8c, tmp_V8c); tmp_V4s = __builtin_ia32_pmaxsw(tmp_V4s, tmp_V4s); tmp_V8c = __builtin_ia32_pminub(tmp_V8c, tmp_V8c); tmp_V4s = __builtin_ia32_pminsw(tmp_V4s, tmp_V4s); tmp_V2d = __builtin_ia32_cmppd(tmp_V2d, tmp_V2d, 0); tmp_V2d = __builtin_ia32_cmppd(tmp_V2d, tmp_V2d, 1); tmp_V2d = __builtin_ia32_cmppd(tmp_V2d, tmp_V2d, 2); tmp_V2d = __builtin_ia32_cmppd(tmp_V2d, tmp_V2d, 3); tmp_V2d = __builtin_ia32_cmppd(tmp_V2d, tmp_V2d, 4); tmp_V2d = __builtin_ia32_cmppd(tmp_V2d, tmp_V2d, 5); tmp_V2d = __builtin_ia32_cmppd(tmp_V2d, tmp_V2d, 6); tmp_V2d = __builtin_ia32_cmppd(tmp_V2d, tmp_V2d, 7); tmp_V2d = __builtin_ia32_cmpsd(tmp_V2d, tmp_V2d, 0); tmp_V2d = __builtin_ia32_cmpsd(tmp_V2d, tmp_V2d, 1); tmp_V2d = __builtin_ia32_cmpsd(tmp_V2d, tmp_V2d, 2); tmp_V2d = __builtin_ia32_cmpsd(tmp_V2d, tmp_V2d, 3); tmp_V2d = __builtin_ia32_cmpsd(tmp_V2d, tmp_V2d, 4); tmp_V2d = __builtin_ia32_cmpsd(tmp_V2d, tmp_V2d, 5); tmp_V2d = __builtin_ia32_cmpsd(tmp_V2d, tmp_V2d, 6); tmp_V2d = __builtin_ia32_cmpsd(tmp_V2d, tmp_V2d, 7); tmp_V2d = __builtin_ia32_minpd(tmp_V2d, tmp_V2d); tmp_V2d = __builtin_ia32_maxpd(tmp_V2d, tmp_V2d); tmp_V2d = __builtin_ia32_minsd(tmp_V2d, tmp_V2d); tmp_V2d = __builtin_ia32_maxsd(tmp_V2d, tmp_V2d); tmp_V16c = __builtin_ia32_paddsb128(tmp_V16c, tmp_V16c); tmp_V8s = __builtin_ia32_paddsw128(tmp_V8s, tmp_V8s); tmp_V16c = __builtin_ia32_psubsb128(tmp_V16c, tmp_V16c); tmp_V8s = __builtin_ia32_psubsw128(tmp_V8s, tmp_V8s); tmp_V16c = __builtin_ia32_paddusb128(tmp_V16c, tmp_V16c); tmp_V8s = __builtin_ia32_paddusw128(tmp_V8s, tmp_V8s); tmp_V16c = __builtin_ia32_psubusb128(tmp_V16c, tmp_V16c); tmp_V8s = __builtin_ia32_psubusw128(tmp_V8s, tmp_V8s); tmp_V8s = __builtin_ia32_pmulhw128(tmp_V8s, tmp_V8s); tmp_V16c = __builtin_ia32_pmaxub128(tmp_V16c, tmp_V16c); tmp_V8s = __builtin_ia32_pmaxsw128(tmp_V8s, tmp_V8s); tmp_V16c = __builtin_ia32_pminub128(tmp_V16c, tmp_V16c); tmp_V8s = __builtin_ia32_pminsw128(tmp_V8s, tmp_V8s); tmp_V8s = __builtin_ia32_packsswb128(tmp_V8s, tmp_V8s); tmp_V4i = __builtin_ia32_packssdw128(tmp_V4i, tmp_V4i); tmp_V8s = __builtin_ia32_packuswb128(tmp_V8s, tmp_V8s); tmp_V8s = __builtin_ia32_pmulhuw128(tmp_V8s, tmp_V8s); tmp_V4f = __builtin_ia32_addsubps(tmp_V4f, tmp_V4f); tmp_V2d = __builtin_ia32_addsubpd(tmp_V2d, tmp_V2d); tmp_V4f = __builtin_ia32_haddps(tmp_V4f, tmp_V4f); tmp_V2d = __builtin_ia32_haddpd(tmp_V2d, tmp_V2d); tmp_V4f = __builtin_ia32_hsubps(tmp_V4f, tmp_V4f); tmp_V2d = __builtin_ia32_hsubpd(tmp_V2d, tmp_V2d); tmp_V8s = __builtin_ia32_phaddw128(tmp_V8s, tmp_V8s); tmp_V4s = __builtin_ia32_phaddw(tmp_V4s, tmp_V4s); tmp_V4i = __builtin_ia32_phaddd128(tmp_V4i, tmp_V4i); tmp_V2i = __builtin_ia32_phaddd(tmp_V2i, tmp_V2i); tmp_V8s = __builtin_ia32_phaddsw128(tmp_V8s, tmp_V8s); tmp_V4s = __builtin_ia32_phaddsw(tmp_V4s, tmp_V4s); tmp_V8s = __builtin_ia32_phsubw128(tmp_V8s, tmp_V8s); 
tmp_V4s = __builtin_ia32_phsubw(tmp_V4s, tmp_V4s); tmp_V4i = __builtin_ia32_phsubd128(tmp_V4i, tmp_V4i); tmp_V2i = __builtin_ia32_phsubd(tmp_V2i, tmp_V2i); tmp_V8s = __builtin_ia32_phsubsw128(tmp_V8s, tmp_V8s); tmp_V4s = __builtin_ia32_phsubsw(tmp_V4s, tmp_V4s); tmp_V16c = __builtin_ia32_pmaddubsw128(tmp_V16c, tmp_V16c); tmp_V8c = __builtin_ia32_pmaddubsw(tmp_V8c, tmp_V8c); tmp_V8s = __builtin_ia32_pmulhrsw128(tmp_V8s, tmp_V8s); tmp_V4s = __builtin_ia32_pmulhrsw(tmp_V4s, tmp_V4s); tmp_V16c = __builtin_ia32_pshufb128(tmp_V16c, tmp_V16c); tmp_V8c = __builtin_ia32_pshufb(tmp_V8c, tmp_V8c); tmp_V16c = __builtin_ia32_psignb128(tmp_V16c, tmp_V16c); tmp_V8c = __builtin_ia32_psignb(tmp_V8c, tmp_V8c); tmp_V8s = __builtin_ia32_psignw128(tmp_V8s, tmp_V8s); tmp_V4s = __builtin_ia32_psignw(tmp_V4s, tmp_V4s); tmp_V4i = __builtin_ia32_psignd128(tmp_V4i, tmp_V4i); tmp_V2i = __builtin_ia32_psignd(tmp_V2i, tmp_V2i); tmp_V16c = __builtin_ia32_pabsb128(tmp_V16c); tmp_V8c = __builtin_ia32_pabsb(tmp_V8c); tmp_V8s = __builtin_ia32_pabsw128(tmp_V8s); tmp_V4s = __builtin_ia32_pabsw(tmp_V4s); tmp_V4i = __builtin_ia32_pabsd128(tmp_V4i); tmp_V2i = __builtin_ia32_pabsd(tmp_V2i); tmp_V4s = __builtin_ia32_psllw(tmp_V4s, tmp_V1LLi); tmp_V2i = __builtin_ia32_pslld(tmp_V2i, tmp_V1LLi); tmp_V1LLi = __builtin_ia32_psllq(tmp_V1LLi, tmp_V1LLi); tmp_V4s = __builtin_ia32_psrlw(tmp_V4s, tmp_V1LLi); tmp_V2i = __builtin_ia32_psrld(tmp_V2i, tmp_V1LLi); tmp_V1LLi = __builtin_ia32_psrlq(tmp_V1LLi, tmp_V1LLi); tmp_V4s = __builtin_ia32_psraw(tmp_V4s, tmp_V1LLi); tmp_V2i = __builtin_ia32_psrad(tmp_V2i, tmp_V1LLi); tmp_V2i = __builtin_ia32_pmaddwd(tmp_V4s, tmp_V4s); tmp_V8c = __builtin_ia32_packsswb(tmp_V4s, tmp_V4s); tmp_V4s = __builtin_ia32_packssdw(tmp_V2i, tmp_V2i); tmp_V8c = __builtin_ia32_packuswb(tmp_V4s, tmp_V4s); tmp_i = __builtin_ia32_vec_ext_v2si(tmp_V2i, 0); __builtin_ia32_incsspd(tmp_Ui); __builtin_ia32_incsspq(tmp_ULLi); tmp_Ui = __builtin_ia32_rdsspd(tmp_Ui); tmp_ULLi = __builtin_ia32_rdsspq(tmp_ULLi); __builtin_ia32_saveprevssp(); __builtin_ia32_rstorssp(tmp_vp); __builtin_ia32_wrssd(tmp_Ui, tmp_vp); __builtin_ia32_wrssq(tmp_ULLi, tmp_vp); __builtin_ia32_wrussd(tmp_Ui, tmp_vp); __builtin_ia32_wrussq(tmp_ULLi, tmp_vp); __builtin_ia32_setssbsy(); __builtin_ia32_clrssbsy(tmp_vp); (void) __builtin_ia32_ldmxcsr(tmp_Ui); (void) _mm_setcsr(tmp_Ui); tmp_Ui = __builtin_ia32_stmxcsr(); tmp_Ui = _mm_getcsr(); (void)__builtin_ia32_fxsave(tmp_vp); (void)__builtin_ia32_fxsave64(tmp_vp); (void)__builtin_ia32_fxrstor(tmp_vp); (void)__builtin_ia32_fxrstor64(tmp_vp); (void)__builtin_ia32_xsave(tmp_vp, tmp_ULLi); (void)__builtin_ia32_xsave64(tmp_vp, tmp_ULLi); (void)__builtin_ia32_xrstor(tmp_vp, tmp_ULLi); (void)__builtin_ia32_xrstor64(tmp_vp, tmp_ULLi); (void)__builtin_ia32_xsaveopt(tmp_vp, tmp_ULLi); (void)__builtin_ia32_xsaveopt64(tmp_vp, tmp_ULLi); (void)__builtin_ia32_xrstors(tmp_vp, tmp_ULLi); (void)__builtin_ia32_xrstors64(tmp_vp, tmp_ULLi); (void)__builtin_ia32_xsavec(tmp_vp, tmp_ULLi); (void)__builtin_ia32_xsavec64(tmp_vp, tmp_ULLi); (void)__builtin_ia32_xsaves(tmp_vp, tmp_ULLi); (void)__builtin_ia32_xsaves64(tmp_vp, tmp_ULLi); (void) __builtin_ia32_monitorx(tmp_vp, tmp_Ui, tmp_Ui); (void) __builtin_ia32_mwaitx(tmp_Ui, tmp_Ui, tmp_Ui); (void) __builtin_ia32_clzero(tmp_vp); tmp_V4f = __builtin_ia32_cvtpi2ps(tmp_V4f, tmp_V2i); tmp_V2i = __builtin_ia32_cvtps2pi(tmp_V4f); tmp_i = __builtin_ia32_cvtss2si(tmp_V4f); tmp_i = __builtin_ia32_cvttss2si(tmp_V4f); tmp_i = __builtin_ia32_rdtsc(); tmp_i = __rdtsc(); tmp_i = 
__builtin_ia32_rdtscp(&tmp_Ui); tmp_LLi = __builtin_ia32_rdpmc(tmp_i); #ifdef USE_64 tmp_LLi = __builtin_ia32_cvtss2si64(tmp_V4f); tmp_LLi = __builtin_ia32_cvttss2si64(tmp_V4f); #endif tmp_V2i = __builtin_ia32_cvttps2pi(tmp_V4f); (void) __builtin_ia32_maskmovq(tmp_V8c, tmp_V8c, tmp_cp); (void) __builtin_ia32_storehps(tmp_V2ip, tmp_V4f); (void) __builtin_ia32_storelps(tmp_V2ip, tmp_V4f); tmp_i = __builtin_ia32_movmskps(tmp_V4f); tmp_i = __builtin_ia32_pmovmskb(tmp_V8c); (void) __builtin_ia32_movntq(tmp_V1LLip, tmp_V1LLi); (void) __builtin_ia32_sfence(); (void) _mm_sfence(); tmp_V4s = __builtin_ia32_psadbw(tmp_V8c, tmp_V8c); tmp_V4f = __builtin_ia32_rcpps(tmp_V4f); tmp_V4f = __builtin_ia32_rcpss(tmp_V4f); tmp_V4f = __builtin_ia32_rsqrtps(tmp_V4f); tmp_V4f = __builtin_ia32_rsqrtss(tmp_V4f); tmp_V4f = __builtin_ia32_sqrtps(tmp_V4f); tmp_V4f = __builtin_ia32_sqrtss(tmp_V4f); (void) __builtin_ia32_maskmovdqu(tmp_V16c, tmp_V16c, tmp_cp); tmp_i = __builtin_ia32_movmskpd(tmp_V2d); tmp_i = __builtin_ia32_pmovmskb128(tmp_V16c); (void) __builtin_ia32_movnti(tmp_ip, tmp_i); #ifdef USE_64 (void) __builtin_ia32_movnti64(tmp_LLip, tmp_LLi); #endif tmp_V2LLi = __builtin_ia32_psadbw128(tmp_V16c, tmp_V16c); tmp_V2d = __builtin_ia32_sqrtpd(tmp_V2d); tmp_V2d = __builtin_ia32_sqrtsd(tmp_V2d); tmp_V4f = __builtin_ia32_cvtdq2ps(tmp_V4i); tmp_V2LLi = __builtin_ia32_cvtpd2dq(tmp_V2d); tmp_V2i = __builtin_ia32_cvtpd2pi(tmp_V2d); tmp_V4f = __builtin_ia32_cvtpd2ps(tmp_V2d); tmp_V4i = __builtin_ia32_cvttpd2dq(tmp_V2d); tmp_V2i = __builtin_ia32_cvttpd2pi(tmp_V2d); tmp_V2d = __builtin_ia32_cvtpi2pd(tmp_V2i); tmp_i = __builtin_ia32_cvtsd2si(tmp_V2d); tmp_i = __builtin_ia32_cvttsd2si(tmp_V2d); tmp_V4f = __builtin_ia32_cvtsd2ss(tmp_V4f, tmp_V2d); #ifdef USE_64 tmp_LLi = __builtin_ia32_cvtsd2si64(tmp_V2d); tmp_LLi = __builtin_ia32_cvttsd2si64(tmp_V2d); #endif tmp_V4i = __builtin_ia32_cvtps2dq(tmp_V4f); tmp_V4i = __builtin_ia32_cvttps2dq(tmp_V4f); (void) __builtin_ia32_clflush(tmp_vCp); (void) _mm_clflush(tmp_vCp); (void) __builtin_ia32_lfence(); (void) _mm_lfence(); (void) __builtin_ia32_mfence(); (void) _mm_mfence(); (void) __builtin_ia32_pause(); (void) _mm_pause(); tmp_V4s = __builtin_ia32_psllwi(tmp_V4s, tmp_i); tmp_V2i = __builtin_ia32_pslldi(tmp_V2i, tmp_i); tmp_V1LLi = __builtin_ia32_psllqi(tmp_V1LLi, tmp_i); tmp_V4s = __builtin_ia32_psrawi(tmp_V4s, tmp_i); tmp_V2i = __builtin_ia32_psradi(tmp_V2i, tmp_i); tmp_V4s = __builtin_ia32_psrlwi(tmp_V4s, tmp_i); tmp_V2i = __builtin_ia32_psrldi(tmp_V2i, tmp_i); tmp_V1LLi = __builtin_ia32_psrlqi(tmp_V1LLi, tmp_i); tmp_V1LLi = __builtin_ia32_pmuludq(tmp_V2i, tmp_V2i); tmp_V2LLi = __builtin_ia32_pmuludq128(tmp_V4i, tmp_V4i); tmp_V8s = __builtin_ia32_psraw128(tmp_V8s, tmp_V8s); tmp_V4i = __builtin_ia32_psrad128(tmp_V4i, tmp_V4i); tmp_V8s = __builtin_ia32_psrlw128(tmp_V8s, tmp_V8s); tmp_V4i = __builtin_ia32_psrld128(tmp_V4i, tmp_V4i); tmp_V2LLi = __builtin_ia32_psrlq128(tmp_V2LLi, tmp_V2LLi); tmp_V8s = __builtin_ia32_psllw128(tmp_V8s, tmp_V8s); tmp_V4i = __builtin_ia32_pslld128(tmp_V4i, tmp_V4i); tmp_V2LLi = __builtin_ia32_psllq128(tmp_V2LLi, tmp_V2LLi); tmp_V8s = __builtin_ia32_psllwi128(tmp_V8s, tmp_i); tmp_V4i = __builtin_ia32_pslldi128(tmp_V4i, tmp_i); tmp_V2LLi = __builtin_ia32_psllqi128(tmp_V2LLi, tmp_i); tmp_V8s = __builtin_ia32_psrlwi128(tmp_V8s, tmp_i); tmp_V4i = __builtin_ia32_psrldi128(tmp_V4i, tmp_i); tmp_V2LLi = __builtin_ia32_psrlqi128(tmp_V2LLi, tmp_i); tmp_V8s = __builtin_ia32_psrawi128(tmp_V8s, tmp_i); tmp_V4i = __builtin_ia32_psradi128(tmp_V4i, tmp_i); tmp_V8s = 
__builtin_ia32_pmaddwd128(tmp_V8s, tmp_V8s); (void) __builtin_ia32_monitor(tmp_vp, tmp_Ui, tmp_Ui); (void) __builtin_ia32_mwait(tmp_Ui, tmp_Ui); tmp_V16c = __builtin_ia32_lddqu(tmp_cCp); tmp_V2LLi = __builtin_ia32_palignr128(tmp_V2LLi, tmp_V2LLi, imm_i); tmp_V1LLi = __builtin_ia32_palignr(tmp_V1LLi, tmp_V1LLi, imm_i); #ifdef USE_SSE4 tmp_V16c = __builtin_ia32_pblendvb128(tmp_V16c, tmp_V16c, tmp_V16c); tmp_V2d = __builtin_ia32_blendvpd(tmp_V2d, tmp_V2d, tmp_V2d); tmp_V4f = __builtin_ia32_blendvps(tmp_V4f, tmp_V4f, tmp_V4f); tmp_V8s = __builtin_ia32_packusdw128(tmp_V4i, tmp_V4i); tmp_V16c = __builtin_ia32_pmaxsb128(tmp_V16c, tmp_V16c); tmp_V4i = __builtin_ia32_pmaxsd128(tmp_V4i, tmp_V4i); tmp_V4i = __builtin_ia32_pmaxud128(tmp_V4i, tmp_V4i); tmp_V8s = __builtin_ia32_pmaxuw128(tmp_V8s, tmp_V8s); tmp_V16c = __builtin_ia32_pminsb128(tmp_V16c, tmp_V16c); tmp_V4i = __builtin_ia32_pminsd128(tmp_V4i, tmp_V4i); tmp_V4i = __builtin_ia32_pminud128(tmp_V4i, tmp_V4i); tmp_V8s = __builtin_ia32_pminuw128(tmp_V8s, tmp_V8s); tmp_V2LLi = __builtin_ia32_pmuldq128(tmp_V4i, tmp_V4i); tmp_V4f = __builtin_ia32_roundps(tmp_V4f, imm_i_0_16); tmp_V4f = __builtin_ia32_roundss(tmp_V4f, tmp_V4f, imm_i_0_16); tmp_V2d = __builtin_ia32_roundsd(tmp_V2d, tmp_V2d, imm_i_0_16); tmp_V2d = __builtin_ia32_roundpd(tmp_V2d, imm_i_0_16); tmp_V4f = __builtin_ia32_insertps128(tmp_V4f, tmp_V4f, imm_i_0_256); #endif tmp_V4d = __builtin_ia32_addsubpd256(tmp_V4d, tmp_V4d); tmp_V8f = __builtin_ia32_addsubps256(tmp_V8f, tmp_V8f); tmp_V4d = __builtin_ia32_haddpd256(tmp_V4d, tmp_V4d); tmp_V8f = __builtin_ia32_hsubps256(tmp_V8f, tmp_V8f); tmp_V4d = __builtin_ia32_hsubpd256(tmp_V4d, tmp_V4d); tmp_V8f = __builtin_ia32_haddps256(tmp_V8f, tmp_V8f); tmp_V4d = __builtin_ia32_maxpd256(tmp_V4d, tmp_V4d); tmp_V8f = __builtin_ia32_maxps256(tmp_V8f, tmp_V8f); tmp_V4d = __builtin_ia32_minpd256(tmp_V4d, tmp_V4d); tmp_V8f = __builtin_ia32_minps256(tmp_V8f, tmp_V8f); tmp_V2d = __builtin_ia32_vpermilvarpd(tmp_V2d, tmp_V2LLi); tmp_V4f = __builtin_ia32_vpermilvarps(tmp_V4f, tmp_V4i); tmp_V4d = __builtin_ia32_vpermilvarpd256(tmp_V4d, tmp_V4LLi); tmp_V8f = __builtin_ia32_vpermilvarps256(tmp_V8f, tmp_V8i); tmp_V4d = __builtin_ia32_blendvpd256(tmp_V4d, tmp_V4d, tmp_V4d); tmp_V8f = __builtin_ia32_blendvps256(tmp_V8f, tmp_V8f, tmp_V8f); tmp_V8f = __builtin_ia32_dpps256(tmp_V8f, tmp_V8f, 0x7); tmp_V4d = __builtin_ia32_cmppd256(tmp_V4d, tmp_V4d, 0); tmp_V8f = __builtin_ia32_cmpps256(tmp_V8f, tmp_V8f, 0); tmp_V8f = __builtin_ia32_cvtdq2ps256(tmp_V8i); tmp_V4f = __builtin_ia32_cvtpd2ps256(tmp_V4d); tmp_V8i = __builtin_ia32_cvtps2dq256(tmp_V8f); tmp_V4i = __builtin_ia32_cvttpd2dq256(tmp_V4d); tmp_V4i = __builtin_ia32_cvtpd2dq256(tmp_V4d); tmp_V8i = __builtin_ia32_cvttps2dq256(tmp_V8f); tmp_V4d = __builtin_ia32_vperm2f128_pd256(tmp_V4d, tmp_V4d, 0x7); tmp_V8f = __builtin_ia32_vperm2f128_ps256(tmp_V8f, tmp_V8f, 0x7); tmp_V8i = __builtin_ia32_vperm2f128_si256(tmp_V8i, tmp_V8i, 0x7); tmp_V4d = __builtin_ia32_sqrtpd256(tmp_V4d); tmp_V8f = __builtin_ia32_sqrtps256(tmp_V8f); tmp_V8f = __builtin_ia32_rsqrtps256(tmp_V8f); tmp_V8f = __builtin_ia32_rcpps256(tmp_V8f); tmp_V4d = __builtin_ia32_roundpd256(tmp_V4d, 0x1); tmp_V8f = __builtin_ia32_roundps256(tmp_V8f, 0x1); tmp_i = __builtin_ia32_vtestzpd(tmp_V2d, tmp_V2d); tmp_i = __builtin_ia32_vtestcpd(tmp_V2d, tmp_V2d); tmp_i = __builtin_ia32_vtestnzcpd(tmp_V2d, tmp_V2d); tmp_i = __builtin_ia32_vtestzps(tmp_V4f, tmp_V4f); tmp_i = __builtin_ia32_vtestcps(tmp_V4f, tmp_V4f); tmp_i = __builtin_ia32_vtestnzcps(tmp_V4f, tmp_V4f); tmp_i = 
__builtin_ia32_vtestzpd256(tmp_V4d, tmp_V4d); tmp_i = __builtin_ia32_vtestcpd256(tmp_V4d, tmp_V4d); tmp_i = __builtin_ia32_vtestnzcpd256(tmp_V4d, tmp_V4d); tmp_i = __builtin_ia32_vtestzps256(tmp_V8f, tmp_V8f); tmp_i = __builtin_ia32_vtestcps256(tmp_V8f, tmp_V8f); tmp_i = __builtin_ia32_vtestnzcps256(tmp_V8f, tmp_V8f); tmp_i = __builtin_ia32_ptestz256(tmp_V4LLi, tmp_V4LLi); tmp_i = __builtin_ia32_ptestc256(tmp_V4LLi, tmp_V4LLi); tmp_i = __builtin_ia32_ptestnzc256(tmp_V4LLi, tmp_V4LLi); tmp_i = __builtin_ia32_movmskpd256(tmp_V4d); tmp_i = __builtin_ia32_movmskps256(tmp_V8f); __builtin_ia32_vzeroall(); __builtin_ia32_vzeroupper(); tmp_V4d = __builtin_ia32_vbroadcastf128_pd256(tmp_V2dCp); tmp_V8f = __builtin_ia32_vbroadcastf128_ps256(tmp_V4fCp); tmp_V32c = __builtin_ia32_lddqu256(tmp_cCp); tmp_V2d = __builtin_ia32_maskloadpd(tmp_V2dCp, tmp_V2LLi); tmp_V4f = __builtin_ia32_maskloadps(tmp_V4fCp, tmp_V4i); tmp_V4d = __builtin_ia32_maskloadpd256(tmp_V4dCp, tmp_V4LLi); tmp_V8f = __builtin_ia32_maskloadps256(tmp_V8fCp, tmp_V8i); __builtin_ia32_maskstorepd(tmp_V2dp, tmp_V2LLi, tmp_V2d); __builtin_ia32_maskstoreps(tmp_V4fp, tmp_V4i, tmp_V4f); __builtin_ia32_maskstorepd256(tmp_V4dp, tmp_V4LLi, tmp_V4d); __builtin_ia32_maskstoreps256(tmp_V8fp, tmp_V8i, tmp_V8f); #ifdef USE_3DNOW tmp_V8c = __builtin_ia32_pavgusb(tmp_V8c, tmp_V8c); tmp_V2i = __builtin_ia32_pf2id(tmp_V2f); tmp_V2f = __builtin_ia32_pfacc(tmp_V2f, tmp_V2f); tmp_V2f = __builtin_ia32_pfadd(tmp_V2f, tmp_V2f); tmp_V2i = __builtin_ia32_pfcmpeq(tmp_V2f, tmp_V2f); tmp_V2i = __builtin_ia32_pfcmpge(tmp_V2f, tmp_V2f); tmp_V2i = __builtin_ia32_pfcmpgt(tmp_V2f, tmp_V2f); tmp_V2f = __builtin_ia32_pfmax(tmp_V2f, tmp_V2f); tmp_V2f = __builtin_ia32_pfmin(tmp_V2f, tmp_V2f); tmp_V2f = __builtin_ia32_pfmul(tmp_V2f, tmp_V2f); tmp_V2f = __builtin_ia32_pfrcp(tmp_V2f); tmp_V2f = __builtin_ia32_pfrcpit1(tmp_V2f, tmp_V2f); tmp_V2f = __builtin_ia32_pfrcpit2(tmp_V2f, tmp_V2f); tmp_V2f = __builtin_ia32_pfrsqrt(tmp_V2f); tmp_V2f = __builtin_ia32_pfrsqit1(tmp_V2f, tmp_V2f); tmp_V2f = __builtin_ia32_pfsub(tmp_V2f, tmp_V2f); tmp_V2f = __builtin_ia32_pfsubr(tmp_V2f, tmp_V2f); tmp_V2f = __builtin_ia32_pi2fd(tmp_V2i); tmp_V4s = __builtin_ia32_pmulhrw(tmp_V4s, tmp_V4s); tmp_V2i = __builtin_ia32_pf2iw(tmp_V2f); tmp_V2f = __builtin_ia32_pfnacc(tmp_V2f, tmp_V2f); tmp_V2f = __builtin_ia32_pfpnacc(tmp_V2f, tmp_V2f); tmp_V2f = __builtin_ia32_pi2fw(tmp_V2i); tmp_V2f = __builtin_ia32_pswapdsf(tmp_V2f); tmp_V2i = __builtin_ia32_pswapdsi(tmp_V2i); tmp_V4i = __builtin_ia32_sha1rnds4(tmp_V4i, tmp_V4i, imm_i_0_4); tmp_V4i = __builtin_ia32_sha1nexte(tmp_V4i, tmp_V4i); tmp_V4i = __builtin_ia32_sha1msg1(tmp_V4i, tmp_V4i); tmp_V4i = __builtin_ia32_sha1msg2(tmp_V4i, tmp_V4i); tmp_V4i = __builtin_ia32_sha256rnds2(tmp_V4i, tmp_V4i, tmp_V4i); tmp_V4i = __builtin_ia32_sha256msg1(tmp_V4i, tmp_V4i); tmp_V4i = __builtin_ia32_sha256msg2(tmp_V4i, tmp_V4i); #endif }
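// Side note (an assumption about typical clang/gcc <xmmintrin.h> implementations, not taken
// from this test): the _mm_* intrinsics used elsewhere in this document are thin wrappers
// over the __builtin_ia32_* functions exercised in f0(), e.g. _mm_min_ps expands roughly to:
static inline __m128 my_min_ps(__m128 a, __m128 b)
{
    return __builtin_ia32_minps(a, b); // same builtin called directly in f0() above
}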
//-----------------------------------------------------------------------------------------
// Rasterize the occludee AABB and depth test it against the CPU rasterized depth buffer
// If any of the rasterized AABB pixels passes the depth test exit early and mark the occludee
// as visible. If all rasterized AABB pixels are occluded then the occludee is culled
//-----------------------------------------------------------------------------------------
bool TransformedAABBoxAVX::RasterizeAndDepthTestAABBox(UINT *pRenderTargetPixels, const __m128 pXformedPos[], UINT idx)
{
    // Set DAZ and FZ MXCSR bits to flush denormals to zero (i.e., make it faster)
    // Denormals are zero (DAZ) is bit 6 and flush to zero (FZ) is bit 15,
    // so to enable both we have to set bits 6 and 15: 1000 0000 0100 0000 = 0x8040
    _mm_setcsr( _mm_getcsr() | 0x8040 );

    __m256i colOffset = _mm256_setr_epi32(0, 1, 2, 3, 0, 1, 2, 3);
    __m256i rowOffset = _mm256_setr_epi32(0, 0, 0, 0, 1, 1, 1, 1);

    float* pDepthBuffer = (float*)pRenderTargetPixels;

    // Rasterize the AABB triangles 4 at a time
    for(UINT i = 0; i < AABB_TRIANGLES; i += SSE)
    {
        vFloat4 xformedPos[3];
        Gather(xformedPos, i, pXformedPos, idx);

        // use fixed-point only for X and Y. Avoid work for Z and W.
        __m128i fxPtX[3], fxPtY[3];
        for(int m = 0; m < 3; m++)
        {
            fxPtX[m] = _mm_cvtps_epi32(xformedPos[m].X);
            fxPtY[m] = _mm_cvtps_epi32(xformedPos[m].Y);
        }

        // Fab(x, y) = Ax + By + C = 0
        // Fab(x, y) = (ya - yb)x + (xb - xa)y + (xa * yb - xb * ya) = 0
        // Compute A = (ya - yb) for the 3 line segments that make up each triangle
        __m128i A0 = _mm_sub_epi32(fxPtY[1], fxPtY[2]);
        __m128i A1 = _mm_sub_epi32(fxPtY[2], fxPtY[0]);
        __m128i A2 = _mm_sub_epi32(fxPtY[0], fxPtY[1]);

        // Compute B = (xb - xa) for the 3 line segments that make up each triangle
        __m128i B0 = _mm_sub_epi32(fxPtX[2], fxPtX[1]);
        __m128i B1 = _mm_sub_epi32(fxPtX[0], fxPtX[2]);
        __m128i B2 = _mm_sub_epi32(fxPtX[1], fxPtX[0]);

        // Compute C = (xa * yb - xb * ya) for the 3 line segments that make up each triangle
        __m128i C0 = _mm_sub_epi32(_mm_mullo_epi32(fxPtX[1], fxPtY[2]), _mm_mullo_epi32(fxPtX[2], fxPtY[1]));
        __m128i C1 = _mm_sub_epi32(_mm_mullo_epi32(fxPtX[2], fxPtY[0]), _mm_mullo_epi32(fxPtX[0], fxPtY[2]));
        __m128i C2 = _mm_sub_epi32(_mm_mullo_epi32(fxPtX[0], fxPtY[1]), _mm_mullo_epi32(fxPtX[1], fxPtY[0]));

        // Compute triangle area
        __m128i triArea = _mm_mullo_epi32(B2, A1);
        triArea = _mm_sub_epi32(triArea, _mm_mullo_epi32(B1, A2));
        __m128 oneOverTriArea = _mm_rcp_ps(_mm_cvtepi32_ps(triArea));

        __m128 Z[3];
        Z[0] = xformedPos[0].Z;
        Z[1] = _mm_mul_ps(_mm_sub_ps(xformedPos[1].Z, Z[0]), oneOverTriArea);
        Z[2] = _mm_mul_ps(_mm_sub_ps(xformedPos[2].Z, Z[0]), oneOverTriArea);

        // Use bounding box traversal strategy to determine which pixels to rasterize
        //__m128i startX = _mm_and_si128(HelperSSE::Max(HelperSSE::Min(HelperSSE::Min(fxPtX[0], fxPtX[1]), fxPtX[2]), _mm_set1_epi32(0)), _mm_set1_epi32(~1));
        __m128i startX = _mm_and_si128(HelperSSE::Max(HelperSSE::Min(HelperSSE::Min(fxPtX[0], fxPtX[1]), fxPtX[2]), _mm_set1_epi32(0)), _mm_set1_epi32(~3));
        __m128i endX   = HelperSSE::Min(HelperSSE::Max(HelperSSE::Max(fxPtX[0], fxPtX[1]), fxPtX[2]), _mm_set1_epi32(SCREENW - 1));

        __m128i startY = _mm_and_si128(HelperSSE::Max(HelperSSE::Min(HelperSSE::Min(fxPtY[0], fxPtY[1]), fxPtY[2]), _mm_set1_epi32(0)), _mm_set1_epi32(~1));
        __m128i endY   = HelperSSE::Min(HelperSSE::Max(HelperSSE::Max(fxPtY[0], fxPtY[1]), fxPtY[2]), _mm_set1_epi32(SCREENH - 1));

        // Now we have 4 triangles set up. Rasterize them each individually.
        for(int lane = 0; lane < SSE; lane++)
        {
            // Skip triangle if area is zero
            if(triArea.m128i_i32[lane] <= 0)
            {
                continue;
            }

            // Extract this triangle's properties from the SIMD versions
            __m256 zz[3];
            for (int vv = 0; vv < 3; vv++)
            {
                zz[vv] = _mm256_set1_ps(Z[vv].m128_f32[lane]);
            }

            int startXx = startX.m128i_i32[lane];
            int endXx   = endX.m128i_i32[lane];
            int startYy = startY.m128i_i32[lane];
            int endYy   = endY.m128i_i32[lane];

            __m256i aa0 = _mm256_set1_epi32(A0.m128i_i32[lane]);
            __m256i aa1 = _mm256_set1_epi32(A1.m128i_i32[lane]);
            __m256i aa2 = _mm256_set1_epi32(A2.m128i_i32[lane]);

            __m256i bb0 = _mm256_set1_epi32(B0.m128i_i32[lane]);
            __m256i bb1 = _mm256_set1_epi32(B1.m128i_i32[lane]);
            __m256i bb2 = _mm256_set1_epi32(B2.m128i_i32[lane]);

            __m256i aa0Inc = _mm256_slli_epi32(aa0, 2);
            __m256i aa1Inc = _mm256_slli_epi32(aa1, 2);
            __m256i aa2Inc = _mm256_slli_epi32(aa2, 2);

            __m256i bb0Inc = _mm256_slli_epi32(bb0, 1);
            __m256i bb1Inc = _mm256_slli_epi32(bb1, 1);
            __m256i bb2Inc = _mm256_slli_epi32(bb2, 1);

            __m256i row, col;

            // Traverse pixels in 2x4 blocks and store 2x4 pixel quad depths contiguously in memory ==> 2*X
            // This method provides better performance
            int rowIdx = (startYy * SCREENW + 2 * startXx);

            col = _mm256_add_epi32(colOffset, _mm256_set1_epi32(startXx));
            __m256i aa0Col = _mm256_mullo_epi32(aa0, col);
            __m256i aa1Col = _mm256_mullo_epi32(aa1, col);
            __m256i aa2Col = _mm256_mullo_epi32(aa2, col);

            row = _mm256_add_epi32(rowOffset, _mm256_set1_epi32(startYy));
            __m256i bb0Row = _mm256_add_epi32(_mm256_mullo_epi32(bb0, row), _mm256_set1_epi32(C0.m128i_i32[lane]));
            __m256i bb1Row = _mm256_add_epi32(_mm256_mullo_epi32(bb1, row), _mm256_set1_epi32(C1.m128i_i32[lane]));
            __m256i bb2Row = _mm256_add_epi32(_mm256_mullo_epi32(bb2, row), _mm256_set1_epi32(C2.m128i_i32[lane]));

            __m256i sum0Row = _mm256_add_epi32(aa0Col, bb0Row);
            __m256i sum1Row = _mm256_add_epi32(aa1Col, bb1Row);
            __m256i sum2Row = _mm256_add_epi32(aa2Col, bb2Row);

            __m256 zx = _mm256_mul_ps(_mm256_cvtepi32_ps(aa1Inc), zz[1]);
            zx = _mm256_add_ps(zx, _mm256_mul_ps(_mm256_cvtepi32_ps(aa2Inc), zz[2]));

            // Incrementally compute Fab(x, y) for all the pixels inside the bounding box formed by (startX, endX) and (startY, endY)
            for (int r = startYy; r < endYy; r += 2,
                rowIdx += 2 * SCREENW,
                sum0Row = _mm256_add_epi32(sum0Row, bb0Inc),
                sum1Row = _mm256_add_epi32(sum1Row, bb1Inc),
                sum2Row = _mm256_add_epi32(sum2Row, bb2Inc))
            {
                // Compute barycentric coordinates
                int index = rowIdx;
                __m256i alpha = sum0Row;
                __m256i beta  = sum1Row;
                __m256i gama  = sum2Row;

                // Compute barycentric-interpolated depth
                __m256 depth = zz[0];
                depth = _mm256_add_ps(depth, _mm256_mul_ps(_mm256_cvtepi32_ps(beta), zz[1]));
                depth = _mm256_add_ps(depth, _mm256_mul_ps(_mm256_cvtepi32_ps(gama), zz[2]));
                __m256i anyOut = _mm256_setzero_si256();

                for (int c = startXx; c < endXx; c += 4,
                    index += 8,
                    alpha = _mm256_add_epi32(alpha, aa0Inc),
                    beta  = _mm256_add_epi32(beta, aa1Inc),
                    gama  = _mm256_add_epi32(gama, aa2Inc),
                    depth = _mm256_add_ps(depth, zx))
                {
                    // Test pixel inside triangle
                    __m256i mask = _mm256_or_si256(_mm256_or_si256(alpha, beta), gama);

                    __m256 previousDepthValue = _mm256_loadu_ps(&pDepthBuffer[index]);
                    __m256 depthMask  = _mm256_cmp_ps(depth, previousDepthValue, 0x1D);
                    __m256i finalMask = _mm256_andnot_si256(mask, _mm256_castps_si256(depthMask));
                    anyOut = _mm256_or_si256(anyOut, finalMask);
                } // for each column

                if (!_mm256_testz_si256(anyOut, _mm256_set1_epi32(0x80000000)))
                {
                    return true; // early exit
                }
            } // for each row
        } // for each triangle
    } // for each set of SIMD# triangles

    return false;
}
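// RasterizeAndDepthTestAABBox above turns on FZ/DAZ with a bare _mm_setcsr and never restores the
// caller's MXCSR. If the caller's rounding or exception state has to survive, a small scope guard
// can do the save/restore; this is a sketch of such a helper (ScopedFlushDenormals is not part of
// the original sample), not how the sample itself handles it.
#include <xmmintrin.h>

struct ScopedFlushDenormals
{
    unsigned int savedMXCSR;
    ScopedFlushDenormals() : savedMXCSR(_mm_getcsr()) { _mm_setcsr(savedMXCSR | 0x8040); } // set FZ (bit 15) and DAZ (bit 6)
    ~ScopedFlushDenormals()                           { _mm_setcsr(savedMXCSR); }          // restore the caller's state
};
// Usage: declare "ScopedFlushDenormals guard;" at the top of the function instead of the raw _mm_setcsr call.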
DWORD workerThreadMain(LPVOID pData)
{
    THREAD_DATA *pThreadData = (THREAD_DATA*)pData;
    SWR_CONTEXT *pContext = pThreadData->pContext;
    uint32_t threadId = pThreadData->threadId;
    uint32_t workerId = pThreadData->workerId;

    bindThread(threadId, pThreadData->procGroupId, pThreadData->forceBindProcGroup);

    RDTSC_INIT(threadId);

    uint32_t numaNode = pThreadData->numaId;
    uint32_t numaMask = pContext->threadPool.numaMask;

    // flush denormals to 0
    _mm_setcsr(_mm_getcsr() | _MM_FLUSH_ZERO_ON | _MM_DENORMALS_ZERO_ON);

    // Track tiles locked by other threads. If we try to lock a macrotile and find it's already
    // locked then we'll add it to this list so that we don't try to lock it again.
    TileSet lockedTiles;

    // Each worker has the ability to work on any of the queued draws as long as certain
    // conditions are met. The data associated with a draw is guaranteed to be active as long as
    // a worker hasn't signaled that it has moved on to the next draw when it determines there is
    // no more work to do. The API thread will not increment the head of the dc ring until all
    // workers have moved past the current head.
    //
    // The logic to determine what to work on is:
    // 1- Try to work on the FE of any draw that is queued. For now there are no dependencies
    //    on the FE work, so any worker can grab any FE and process it in parallel. Eventually
    //    we'll need dependency tracking to force serialization on FEs. The worker will try
    //    to pick an FE by atomically incrementing a counter in the swr context, and keeps
    //    trying until it reaches the tail.
    // 2- BE work must be done in strict order. We accomplish this today by pulling work off
    //    the oldest draw (i.e. the head) of the dcRing. The worker can determine if there is
    //    any work left by comparing the total # of binned work items and the total # of completed
    //    work items. If they are equal, then there is no more work to do for this draw, and
    //    the worker can safely increment its oldestDraw counter and move on to the next draw.
    std::unique_lock<std::mutex> lock(pContext->WaitLock, std::defer_lock);

    auto threadHasWork = [&](uint64_t curDraw) { return curDraw != pContext->dcRing.GetHead(); };

    uint64_t curDrawBE = 0;
    uint64_t curDrawFE = 0;

    while (pContext->threadPool.inThreadShutdown == false)
    {
        uint32_t loop = 0;
        while (loop++ < KNOB_WORKER_SPIN_LOOP_COUNT && !threadHasWork(curDrawBE))
        {
            _mm_pause();
        }

        if (!threadHasWork(curDrawBE))
        {
            lock.lock();

            // check for thread idle condition again under lock
            if (threadHasWork(curDrawBE))
            {
                lock.unlock();
                continue;
            }

            if (pContext->threadPool.inThreadShutdown)
            {
                lock.unlock();
                break;
            }

            RDTSC_START(WorkerWaitForThreadEvent);

            pContext->FifosNotEmpty.wait(lock);
            lock.unlock();

            RDTSC_STOP(WorkerWaitForThreadEvent, 0, 0);

            if (pContext->threadPool.inThreadShutdown)
            {
                break;
            }
        }

        if (IsBEThread)
        {
            RDTSC_START(WorkerWorkOnFifoBE);
            WorkOnFifoBE(pContext, workerId, curDrawBE, lockedTiles, numaNode, numaMask);
            RDTSC_STOP(WorkerWorkOnFifoBE, 0, 0);

            WorkOnCompute(pContext, workerId, curDrawBE);
        }

        if (IsFEThread)
        {
            WorkOnFifoFE(pContext, workerId, curDrawFE);

            if (!IsBEThread)
            {
                curDrawBE = curDrawFE;
            }
        }
    }

    return 0;
}
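// A minimal sketch of the spin-then-sleep idiom the worker loop above uses: spin briefly on the
// work test, then re-check under the lock and block on a condition variable until the producer
// publishes more work. The names (mtx, cv, queueHead, waitForWork, publishWork) are illustrative,
// not the SWR_CONTEXT API.
#include <atomic>
#include <condition_variable>
#include <cstdint>
#include <mutex>
#include <xmmintrin.h>

static std::mutex              mtx;
static std::condition_variable cv;
static std::atomic<uint64_t>   queueHead{0};

void waitForWork(uint64_t lastSeen, unsigned spinCount)
{
    // 1) spin for a bounded number of iterations; cheap when new work usually arrives quickly
    for (unsigned i = 0; i < spinCount && queueHead.load() == lastSeen; ++i)
    {
        _mm_pause();
    }

    // 2) re-check the idle condition under the lock, then sleep until notified
    std::unique_lock<std::mutex> lock(mtx);
    cv.wait(lock, [&] { return queueHead.load() != lastSeen; });
}

void publishWork()
{
    {
        std::lock_guard<std::mutex> guard(mtx);
        queueHead.fetch_add(1);   // the analogue of advancing the dc ring's head
    }
    cv.notify_all();              // wake any worker sleeping in waitForWork
}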
void RemoveControlWord(unsigned control) { _mm_setcsr(LastKnownControlWord &= ~control); }
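// The one-liner above clears bits in a cached copy of MXCSR and writes the result back. A minimal
// sketch of the companion helpers one might pair with it; AddControlWord and HasControlWord are
// assumptions (only LastKnownControlWord and RemoveControlWord appear in the source), and
// LastKnownControlWord is assumed to mirror the last value written to MXCSR.
extern unsigned LastKnownControlWord;
void AddControlWord(unsigned control) { _mm_setcsr(LastKnownControlWord |= control); }
bool HasControlWord(unsigned control) { return (LastKnownControlWord & control) == control; }
// e.g. AddControlWord(0x8040) would turn on FZ and DAZ, and RemoveControlWord(0x8040) would turn them back off.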
//-----------------------------------------------------------------------------------------
// Rasterize the occludee AABB and depth test it against the CPU rasterized depth buffer
// If any of the rasterized AABB pixels passes the depth test exit early and mark the occludee
// as visible. If all rasterized AABB pixels are occluded then the occludee is culled
//-----------------------------------------------------------------------------------------
void TransformedAABBoxSSE::RasterizeAndDepthTestAABBox(UINT *pRenderTargetPixels)
{
    // Set DAZ and FZ MXCSR bits to flush denormals to zero (i.e., make it faster)
    // Denormals are zero (DAZ) is bit 6 and flush to zero (FZ) is bit 15,
    // so to enable both we have to set bits 6 and 15: 1000 0000 0100 0000 = 0x8040
    _mm_setcsr( _mm_getcsr() | 0x8040 );

    __m128i colOffset = _mm_set_epi32(0, 1, 0, 1);
    __m128i rowOffset = _mm_set_epi32(0, 0, 1, 1);

    __m128i fxptZero = _mm_setzero_si128();
    float* pDepthBuffer = (float*)pRenderTargetPixels;

    // Rasterize the AABB triangles 4 at a time
    for(UINT i = 0; i < AABB_TRIANGLES; i += SSE)
    {
        vFloat4 xformedPos[3];
        Gather(xformedPos, i);

        // use fixed-point only for X and Y. Avoid work for Z and W.
        vFxPt4 xFormedFxPtPos[3];
        for(int m = 0; m < 3; m++)
        {
            xFormedFxPtPos[m].X = _mm_cvtps_epi32(xformedPos[m].X);
            xFormedFxPtPos[m].Y = _mm_cvtps_epi32(xformedPos[m].Y);
            xFormedFxPtPos[m].Z = _mm_cvtps_epi32(xformedPos[m].Z);
            xFormedFxPtPos[m].W = _mm_cvtps_epi32(xformedPos[m].W);
        }

        // Fab(x, y) = Ax + By + C = 0
        // Fab(x, y) = (ya - yb)x + (xb - xa)y + (xa * yb - xb * ya) = 0
        // Compute A = (ya - yb) for the 3 line segments that make up each triangle
        __m128i A0 = _mm_sub_epi32(xFormedFxPtPos[1].Y, xFormedFxPtPos[2].Y);
        __m128i A1 = _mm_sub_epi32(xFormedFxPtPos[2].Y, xFormedFxPtPos[0].Y);
        __m128i A2 = _mm_sub_epi32(xFormedFxPtPos[0].Y, xFormedFxPtPos[1].Y);

        // Compute B = (xb - xa) for the 3 line segments that make up each triangle
        __m128i B0 = _mm_sub_epi32(xFormedFxPtPos[2].X, xFormedFxPtPos[1].X);
        __m128i B1 = _mm_sub_epi32(xFormedFxPtPos[0].X, xFormedFxPtPos[2].X);
        __m128i B2 = _mm_sub_epi32(xFormedFxPtPos[1].X, xFormedFxPtPos[0].X);

        // Compute C = (xa * yb - xb * ya) for the 3 line segments that make up each triangle
        __m128i C0 = _mm_sub_epi32(_mm_mullo_epi32(xFormedFxPtPos[1].X, xFormedFxPtPos[2].Y), _mm_mullo_epi32(xFormedFxPtPos[2].X, xFormedFxPtPos[1].Y));
        __m128i C1 = _mm_sub_epi32(_mm_mullo_epi32(xFormedFxPtPos[2].X, xFormedFxPtPos[0].Y), _mm_mullo_epi32(xFormedFxPtPos[0].X, xFormedFxPtPos[2].Y));
        __m128i C2 = _mm_sub_epi32(_mm_mullo_epi32(xFormedFxPtPos[0].X, xFormedFxPtPos[1].Y), _mm_mullo_epi32(xFormedFxPtPos[1].X, xFormedFxPtPos[0].Y));

        // Compute triangle area
        __m128i triArea = _mm_mullo_epi32(A0, xFormedFxPtPos[0].X);
        triArea = _mm_add_epi32(triArea, _mm_mullo_epi32(B0, xFormedFxPtPos[0].Y));
        triArea = _mm_add_epi32(triArea, C0);
        __m128 oneOverTriArea = _mm_div_ps(_mm_set1_ps(1.0f), _mm_cvtepi32_ps(triArea));

        // Use bounding box traversal strategy to determine which pixels to rasterize
        __m128i startX = _mm_and_si128(Max(Min(Min(xFormedFxPtPos[0].X, xFormedFxPtPos[1].X), xFormedFxPtPos[2].X), _mm_set1_epi32(0)), _mm_set1_epi32(0xFFFFFFFE));
        __m128i endX   = Min(_mm_add_epi32(Max(Max(xFormedFxPtPos[0].X, xFormedFxPtPos[1].X), xFormedFxPtPos[2].X), _mm_set1_epi32(1)), _mm_set1_epi32(SCREENW));

        __m128i startY = _mm_and_si128(Max(Min(Min(xFormedFxPtPos[0].Y, xFormedFxPtPos[1].Y), xFormedFxPtPos[2].Y), _mm_set1_epi32(0)), _mm_set1_epi32(0xFFFFFFFE));
        __m128i endY   = Min(_mm_add_epi32(Max(Max(xFormedFxPtPos[0].Y, xFormedFxPtPos[1].Y), xFormedFxPtPos[2].Y), _mm_set1_epi32(1)), _mm_set1_epi32(SCREENH));

        for(int vv = 0; vv < 3; vv++)
        {
            // If W (holding 1/w in our case) is not between 0 and 1, then the vertex is behind
            // the near clip plane (at 1.0 in our case): for w < 1, 1/w > 1 (when w > 0) and
            // 1/w < 0 (when w < 0).
            __m128 nearClipMask0 = _mm_cmple_ps(xformedPos[vv].W, _mm_set1_ps(0.0f));
            __m128 nearClipMask1 = _mm_cmpge_ps(xformedPos[vv].W, _mm_set1_ps(1.0f));
            __m128 nearClipMask  = _mm_or_ps(nearClipMask0, nearClipMask1);

            if(!_mm_test_all_zeros(*(__m128i*)&nearClipMask, *(__m128i*)&nearClipMask))
            {
                // At least one vertex of the four triangles we're processing (one per SSE lane)
                // is behind the near plane, so conservatively treat the occludee as visible.
                *mVisible = true;
                return;
            }
        }

        // Now we have 4 triangles set up. Rasterize them each individually.
        for(int lane = 0; lane < SSE; lane++)
        {
            // Skip triangle if area is zero
            if(triArea.m128i_i32[lane] <= 0)
            {
                continue;
            }

            // Extract this triangle's properties from the SIMD versions
            __m128 zz[3], oneOverW[3];
            for(int vv = 0; vv < 3; vv++)
            {
                zz[vv] = _mm_set1_ps(xformedPos[vv].Z.m128_f32[lane]);
                oneOverW[vv] = _mm_set1_ps(xformedPos[vv].W.m128_f32[lane]);
            }

            __m128 oneOverTotalArea = _mm_set1_ps(oneOverTriArea.m128_f32[lane]);
            zz[0] = _mm_mul_ps(zz[0], oneOverTotalArea);
            zz[1] = _mm_mul_ps(zz[1], oneOverTotalArea);
            zz[2] = _mm_mul_ps(zz[2], oneOverTotalArea);

            int startXx = startX.m128i_i32[lane];
            int endXx   = endX.m128i_i32[lane];
            int startYy = startY.m128i_i32[lane];
            int endYy   = endY.m128i_i32[lane];

            __m128i aa0 = _mm_set1_epi32(A0.m128i_i32[lane]);
            __m128i aa1 = _mm_set1_epi32(A1.m128i_i32[lane]);
            __m128i aa2 = _mm_set1_epi32(A2.m128i_i32[lane]);

            __m128i bb0 = _mm_set1_epi32(B0.m128i_i32[lane]);
            __m128i bb1 = _mm_set1_epi32(B1.m128i_i32[lane]);
            __m128i bb2 = _mm_set1_epi32(B2.m128i_i32[lane]);

            __m128i cc0 = _mm_set1_epi32(C0.m128i_i32[lane]);
            __m128i cc1 = _mm_set1_epi32(C1.m128i_i32[lane]);
            __m128i cc2 = _mm_set1_epi32(C2.m128i_i32[lane]);

            __m128i aa0Inc = _mm_slli_epi32(aa0, 1);
            __m128i aa1Inc = _mm_slli_epi32(aa1, 1);
            __m128i aa2Inc = _mm_slli_epi32(aa2, 1);

            __m128i row, col;

            int rowIdx;
            // To avoid this branching, choose one method to traverse and store the pixel depth
            if(gVisualizeDepthBuffer)
            {
                // Sequentially traverse and store pixel depths contiguously
                rowIdx = (startYy * SCREENW + startXx);
            }
            else
            {
                // Traverse pixels in 2x2 blocks and store 2x2 pixel quad depths contiguously in memory ==> 2*X
                // This method provides better performance
                rowIdx = (startYy * SCREENW + 2 * startXx);
            }

            col = _mm_add_epi32(colOffset, _mm_set1_epi32(startXx));
            __m128i aa0Col = _mm_mullo_epi32(aa0, col);
            __m128i aa1Col = _mm_mullo_epi32(aa1, col);
            __m128i aa2Col = _mm_mullo_epi32(aa2, col);

            row = _mm_add_epi32(rowOffset, _mm_set1_epi32(startYy));
            __m128i bb0Row = _mm_add_epi32(_mm_mullo_epi32(bb0, row), cc0);
            __m128i bb1Row = _mm_add_epi32(_mm_mullo_epi32(bb1, row), cc1);
            __m128i bb2Row = _mm_add_epi32(_mm_mullo_epi32(bb2, row), cc2);

            __m128i bb0Inc = _mm_slli_epi32(bb0, 1);
            __m128i bb1Inc = _mm_slli_epi32(bb1, 1);
            __m128i bb2Inc = _mm_slli_epi32(bb2, 1);

            // Incrementally compute Fab(x, y) for all the pixels inside the bounding box formed by (startX, endX) and (startY, endY)
            for(int r = startYy; r < endYy; r += 2,
                row = _mm_add_epi32(row, _mm_set1_epi32(2)),
                rowIdx = rowIdx + 2 * SCREENW,
                bb0Row = _mm_add_epi32(bb0Row, bb0Inc),
                bb1Row = _mm_add_epi32(bb1Row, bb1Inc),
                bb2Row = _mm_add_epi32(bb2Row, bb2Inc))
            {
                // Compute barycentric coordinates
                int idx = rowIdx;
                __m128i alpha = _mm_add_epi32(aa0Col, bb0Row);
                __m128i beta  = _mm_add_epi32(aa1Col, bb1Row);
                __m128i gama  = _mm_add_epi32(aa2Col, bb2Row);

                int idxIncr;
                if(gVisualizeDepthBuffer)
                {
                    idxIncr = 2;
                }
                else
                {
                    idxIncr = 4;
                }

                for(int c = startXx; c < endXx; c += 2,
                    idx = idx + idxIncr,
                    alpha = _mm_add_epi32(alpha, aa0Inc),
                    beta  = _mm_add_epi32(beta, aa1Inc),
                    gama  = _mm_add_epi32(gama, aa2Inc))
                {
                    // Test pixel inside triangle
                    __m128i mask = _mm_cmplt_epi32(fxptZero, _mm_or_si128(_mm_or_si128(alpha, beta), gama));

                    // Early out if all of this quad's pixels are outside the triangle.
                    if(_mm_test_all_zeros(mask, mask))
                    {
                        continue;
                    }

                    // Compute barycentric-interpolated depth
                    __m128 depth = _mm_mul_ps(_mm_cvtepi32_ps(alpha), zz[0]);
                    depth = _mm_add_ps(depth, _mm_mul_ps(_mm_cvtepi32_ps(beta), zz[1]));
                    depth = _mm_add_ps(depth, _mm_mul_ps(_mm_cvtepi32_ps(gama), zz[2]));

                    __m128 previousDepthValue;
                    if(gVisualizeDepthBuffer)
                    {
                        previousDepthValue = _mm_set_ps(pDepthBuffer[idx], pDepthBuffer[idx + 1], pDepthBuffer[idx + SCREENW], pDepthBuffer[idx + SCREENW + 1]);
                    }
                    else
                    {
                        previousDepthValue = *(__m128*)&pDepthBuffer[idx];
                    }

                    __m128 depthMask  = _mm_cmpge_ps(depth, previousDepthValue);
                    __m128i finalMask = _mm_and_si128(mask, _mm_castps_si128(depthMask));

                    if(!_mm_test_all_zeros(finalMask, finalMask))
                    {
                        *mVisible = true;
                        return; // early exit
                    }
                } // for each column
            } // for each row
        } // for each triangle
    } // for each set of SIMD# triangles
}
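// Hedged sketch (helpers not present in the original) of the two depth-buffer addressing schemes
// the loop above switches on gVisualizeDepthBuffer: a plain row-major layout used when the buffer
// must be visualized, and a swizzled layout in which each 2x2 pixel quad is stored contiguously so
// the whole quad can be read with a single 128-bit load. The within-quad ordering below assumes the
// (x, y), (x+1, y), (x, y+1), (x+1, y+1) order used by the tile rasterizer's colOffset/rowOffset.
static inline int DepthIndexLinear(int x, int y)    // one float per pixel, row-major
{
    return y * SCREENW + x;
}

static inline int DepthIndexQuad(int x, int y)      // 2x2 quads stored contiguously
{
    int quadBase = (y & ~1) * SCREENW + 2 * (x & ~1);   // matches rowIdx = startYy * SCREENW + 2 * startXx above
    return quadBase + (y & 1) * 2 + (x & 1);            // position of (x, y) inside its quad
}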
//#define MAXMAXVECTOR 4096

// double precision interpolating (smooth) with input
void diresonators_perform64(t_resonators *x, t_object *dsp64, double **ins, long numins, double **outs, long numouts, long sampleframes, long flags, void *userparam)
{
    const double *in = *ins;
    t_resonators *op = x;
    double *out = *outs;
    long n = sampleframes;
    int nfilters = op->nres;
    register double yn, yo;
    int i, j;
    double rate = 1.0/n;

    if(op->b_obj.z_disabled){
        return;
    }

#ifdef SQUASH_DENORMALS
    static int sq;
    if(!sq){
        printf("squashing denormals\n");
        sq++;
    }
#if defined( __i386__ ) || defined( __x86_64__ )
    int oldMXCSR = _mm_getcsr();         // read the old MXCSR setting
    int newMXCSR = oldMXCSR | 0x8040;    // set DAZ and FZ bits
    _mm_setcsr( newMXCSR );              // write the new MXCSR setting to the MXCSR
#endif
#endif

    {
        dresdesc *f = op->dbase;

        for(j=0; j<n; ++j)
            out[j] = 0.0;

        for(i=0; i<nfilters; ++i)
        {
            register double b1 = f[i].o_b1, b2 = f[i].o_b2, a1 = f[i].o_a1;
            double a1inc = (f[i].a1 - f[i].o_a1) * rate;
            double b1inc = (f[i].b1 - f[i].o_b1) * rate;
            double b2inc = (f[i].b2 - f[i].o_b2) * rate;

            yo = f[i].out1;
            yn = f[i].out2;
            for(j=0; j<n; ++j)
            {
                double x = yo;   // note: shadows the t_resonators* parameter
                yo = b1*yo + b2*yn + a1*in[j];
                out[j] += yo;
                yn = x;
                a1 += a1inc;
                b1 += b1inc;
                b2 += b2inc;
            }
            f[i].o_a1 = f[i].a1;
            f[i].o_b1 = f[i].b1;
            f[i].o_b2 = f[i].b2;
            f[i].out1 = yo;
            f[i].out2 = yn;
        }
    }

#ifdef SQUASH_DENORMALS
#if defined( __i386__ ) || defined( __x86_64__ )
    _mm_setcsr(oldMXCSR);   // restore the caller's MXCSR
#endif
#endif
}
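// Scalar reference for one voice of the loop above (a stand-alone sketch; resonator_run and its
// parameter names are hypothetical, while the coefficient roles follow the dresdesc fields).
// Each voice is the two-pole recursion y[n] = b1*y[n-1] + b2*y[n-2] + a1*x[n], with b1, b2, a1
// ramped linearly from their previous values to their new targets across the vector so that
// parameter changes do not click.
static void resonator_run(const double *in, double *out, long n,
                          double a1_old, double a1_new,
                          double b1_old, double b1_new,
                          double b2_old, double b2_new,
                          double *y1, double *y2)          /* filter state, updated in place */
{
    const double rate = 1.0 / (double)n;
    double a1 = a1_old, a1inc = (a1_new - a1_old) * rate;
    double b1 = b1_old, b1inc = (b1_new - b1_old) * rate;
    double b2 = b2_old, b2inc = (b2_new - b2_old) * rate;
    double yo = *y1, yn = *y2;

    for (long j = 0; j < n; ++j) {
        double prev = yo;
        yo = b1 * yo + b2 * yn + a1 * in[j];
        out[j] += yo;                                      /* accumulate into the output bus */
        yn = prev;
        a1 += a1inc; b1 += b1inc; b2 += b2inc;             /* per-sample coefficient ramp */
    }

    *y1 = yo;                                              /* save state for the next vector */
    *y2 = yn;
}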