//Binning static void gather4Simd(VecF32Soa dest[3],VecF32 vertices[12]){ for(uint32 i = 0;i<3;++i){ __m128 v0 = vertices[i].simd; //x0, y0, z0, w0 __m128 v1 = vertices[3+i].simd;//x1, y1, z1, w1 __m128 v2 = vertices[6+i].simd;//x2, y2, z2, w2 __m128 v3 = vertices[9+i].simd;//x3, y3, z3, w3 _MM_TRANSPOSE4_PS(v0, v1, v2, v3); dest[i].x = VecF32(v0); dest[i].y = VecF32(v1); dest[i].z = VecF32(v2); dest[i].w = VecF32(v3); } }
/// Computes the real eigenvalues of a 2x2 matrix from its trace T and
/// determinant D, using lambda = T/2 +/- sqrt(T*T/4 - D).
///
/// @param m  The 2x2 matrix.
/// @return   A vector of size 2 holding (T/2 + s, T/2 - s) with
///           s = sqrt(T*T/4 - D) when the discriminant is non-negative;
///           a zero discriminant yields the repeated eigenvalue T/2 twice.
///           A default-constructed VecF32 when the discriminant is negative
///           (complex eigenvalues) or not a number.
VecF32 LinearAlgebra::eigenValue(const Mat2x<F32,2,2> &m){
    const F32 T = m.trace();
    const F32 D = m.determinant();
    F32 sum = T*T/4 -D;

    // Written as !(sum>=0) so that a NaN discriminant is rejected as well.
    if(!(sum>=0)){
        return VecF32();
    }

    sum = std::sqrt(sum);
    VecF32 eigen_value(2);
    eigen_value(0) = T/2 + (sum);
    eigen_value(1) = T/2 - (sum);
    return eigen_value;
}
void LinearAlgebra::QRDecomposition(const Mat2F32 &m, Mat2F32 &Q, Mat2F32 &R){ Q = LinearAlgebra::orthogonalGramSchmidt(m); R.clear(); R.resize(m.sizeI(),m.sizeJ()); std::vector<VecF32> v_a(m.sizeI(),VecF32(m.sizeI())); for(unsigned int j =0;j<m.sizeJ();j++) v_a[j]=m.getCol(j); for(unsigned int i =0;i<m.sizeI();i++){ VecF32 e = Q.getCol(i); for(unsigned int j =i;j<m.sizeJ();j++){ R(i,j)=productInner(e,v_a[j]); } } }
/// Draws one sample from the regular-step distribution.
///
/// A value u is drawn from `uni`, the position of the first entry of
/// `_repartition` strictly greater than u is looked up, and that flat
/// position is converted into a (row, column) cell of `_mat2d` and then into
/// coordinates via `_step` and `_xmin`.
/// NOTE(review): `_repartition` is presumably a sorted cumulative table
/// (std::upper_bound requires a partitioned range) - confirm where it is built.
///
/// @return A vector of size 2 for a two-variate distribution; otherwise a
///         message is written to std::cerr and a default-constructed VecF32
///         is returned.
VecF32 DistributionMultiVariateRegularStep::randomVariable()const
{
    // The draw happens before the dimension check, so the generator is
    // consumed even when the call fails.
    const F32 u = this->uni.randomVariable();
    std::vector<F32>::const_iterator firstGreater =
        std::upper_bound(_repartition.begin(), _repartition.end(), u);
    const I32 indice = I32(firstGreater - _repartition.begin());

    if(_xmin.size()!=2){
        std::cerr<<"work only for two variates";
        return VecF32();
    }

    // Flat index -> (row, column) of the 2D table.
    Vec2I32 cell;
    cell(0) = indice/_mat2d.sizeJ();
    cell(1) = indice-cell(0)*_mat2d.sizeJ();

    // Cell -> coordinates on the regular grid.
    VecF32 sample(2);
    sample(0) = cell(0)*_step+_xmin(0);
    sample(1) = cell(1)*_step+_xmin(1);
    return sample;
}
/// Orthonormalizes the columns of m with the classical Gram-Schmidt process.
///
/// For each column v_k, the projections of v_k onto the previously built
/// vectors u_0..u_{k-1} are subtracted to obtain u_k; every u_k is then
/// divided by its norm and stored as column k of the result.
///
/// @param m  Input matrix, expected to be square. When it is not, a
///           diagnostic is written to std::cerr and the computation still
///           proceeds on the first m.sizeI() columns.
/// @return   An m.sizeI() x m.sizeI() matrix whose columns are the
///           normalized vectors. Linearly dependent columns lead to a
///           division by a zero norm; this is not checked.
Mat2F32 LinearAlgebra::orthogonalGramSchmidt(const Mat2F32& m)
{
    // Compare rows against columns (sizeI vs sizeJ) so that the diagnostic
    // is actually reachable.
    if(m.sizeI()!=m.sizeJ())
        std::cerr<<"In linearAlgebra::orthogonalGramSchmidt, Mat2F32 must be square";

    Vec<VecF32> u(m.sizeI(),VecF32(m.sizeI()));
    for(unsigned int k=0;k<m.sizeI();k++){
        VecF32 v_k = m.getCol(k);
        // Accumulates the projection of v_k onto the span of u_0..u_{k-1}.
        VecF32 temp(m.sizeI());
        for(unsigned int p=0;p<k;p++){
            temp+=productInner(u[p],v_k)/productInner(u[p],u[p])*u[p];
        }
        u(k)=v_k-temp;
    }

    // Normalize only after all u_k are built: the projections above divide
    // by <u_p,u_p>, so they use the un-normalized vectors.
    Mat2F32 out(m.sizeI(),m.sizeI());
    for(unsigned int k=0;k<m.sizeI();k++){
        u(k)/=u(k).norm();
        out.setCol(k,u(k));
    }
    return out;
}
/// Sampling is not available for this distribution: the call only reports
/// that on std::cerr.
///
/// @return Always a default-constructed VecF32.
VecF32 DistributionMultiVariateExpression::randomVariable()const
{
    const char* const message =
        "In distributionMultiVariateArithmetic::randomVariable(), no probability distribution, you have to use pop::Statistics::toProbabilityDistribution";
    std::cerr<<message;
    return VecF32();
}
//Rasterize 4 pixels at once
/// Rasterizes every triangle binned into tile (x, y) into the depth buffer,
/// processing one 2x2 pixel quad per SIMD operation.
///
/// The tile's triangle counter is read and reset to zero first, so the bin is
/// consumed by this call. The depth of a quad whose top-left pixel is at even
/// screen coordinates (px, py) is read from and written to
/// data_[py*size_.x + 2*px .. +3], i.e. the four depths of a quad are
/// contiguous in memory.
///
/// The rasterization itself is only compiled when ARPHEG_ARCH_X86 is defined;
/// otherwise the function just resets the counter.
///
/// @param x     Tile column index.
/// @param y     Tile row index.
/// @param pass  Not used by this function.
void DepthBuffer::rasterizeTile2x2(int32 x,int32 y,uint32 pass) {
    auto tileIndex = x + y*tileCount_.x;
    auto count = tileTriangleCount_[tileIndex];
    // Consume the bin: the next binning pass starts from an empty tile.
    tileTriangleCount_[tileIndex] = 0;
    // Each tile owns kMaxTrianglesPerTile consecutive slots in triangleBins_.
    auto faces = triangleBins_ + x*kMaxTrianglesPerTile + y*tileCount_.x*kMaxTrianglesPerTile;

    // Tile origin in pixels. tileEnd is computed but not read below.
    vec2i tilePos(x*tileSize_.x,y*tileSize_.y);
    vec2i tileEnd(tilePos + tileSize_);

#ifdef ARPHEG_ARCH_X86
    enum { kNumLanes = 4 };

    //Flush denormals to zero
    // 0x8040 sets MXCSR bit 15 (flush-to-zero) and bit 6 (denormals-are-zero).
    // The previous MXCSR value is not restored on exit.
    _mm_setcsr( _mm_getcsr() | 0x8040 );

    // Per-lane pixel offsets inside a quad.
    // NOTE(review): assumes constructor arguments map to lanes 0..3 in order,
    // giving (0,0), (1,0), (0,1), (1,1) - confirm against VecS32.
    VecS32 colOffset(0, 1, 0, 1);
    VecS32 rowOffset(0, 0, 1, 1);

    //Process the 4 binned triangles at a time
    VecS32 vertexX[3];
    VecS32 vertexY[3];
    VecF32 vertexZ[4];

    // Clamp limits for the quad origin. The maximum is tileSize-2 so that a
    // quad starting there still ends on the tile's last pixel.
    // NOTE(review): presumes an even tile size - confirm.
    VecS32 tileMinXSimd(tilePos.x);
    VecS32 tileMaxXSimd(tilePos.x+tileSize_.x-2);
    VecS32 tileMinYSimd(tilePos.y);
    VecS32 tileMaxYSimd(tilePos.y+tileSize_.y-2);

    for(uint32 i = 0;i<count;i += kNumLanes){
        // The last batch may hold fewer than four triangles; lanes at or above
        // numSimdTris keep stale values and are never read back below.
        uint32 numSimdTris = std::min(uint32(kNumLanes),count-i);
        auto f = faces+i;
        // Gather: lane ii of vertexX/Y[c] receives corner c of triangle ii;
        // vertexZ[ii] receives that triangle's three z terms.
        for(uint32 ii = 0;ii< numSimdTris;++ii){
            vertexX[0].lane[ii] = f[ii].v[0].x;
            vertexY[0].lane[ii] = f[ii].v[0].y;
            vertexX[1].lane[ii] = f[ii].v[1].x;
            vertexY[1].lane[ii] = f[ii].v[1].y;
            vertexX[2].lane[ii] = f[ii].v[2].x;
            vertexY[2].lane[ii] = f[ii].v[2].y;
            vertexZ[ii] = VecF32(f[ii].z[0],f[ii].z[1],f[ii].z[2],0.0f);
        }

        // Fab(x, y) = Ax + By + C = 0
        // Fab(x, y) = (ya - yb)x + (xb - xa)y + (xa * yb - xb * ya) = 0
        // Compute A = (ya - yb) for the 3 line segments that make up each triangle
        VecS32 A0 = vertexY[1] - vertexY[2];
        VecS32 A1 = vertexY[2] - vertexY[0];
        VecS32 A2 = vertexY[0] - vertexY[1];

        // Compute B = (xb - xa) for the 3 line segments that make up each triangle
        VecS32 B0 = vertexX[2] - vertexX[1];
        VecS32 B1 = vertexX[0] - vertexX[2];
        VecS32 B2 = vertexX[1] - vertexX[0];

        // Compute C = (xa * yb - xb * ya) for the 3 line segments that make up each triangle
        VecS32 C0 = vertexX[1] * vertexY[2] - vertexX[2] * vertexY[1];
        VecS32 C1 = vertexX[2] * vertexY[0] - vertexX[0] * vertexY[2];
        VecS32 C2 = vertexX[0] * vertexY[1] - vertexX[1] * vertexY[0];

        // Use bounding box traversal strategy to determine which pixels to rasterize
        // The minima are masked with ~1 to round them down to an even
        // coordinate, so every quad starts on an even pixel.
        VecS32 minX = vmax(vmin(vmin(vertexX[0], vertexX[1]), vertexX[2]), tileMinXSimd) & VecS32(~1);
        VecS32 maxX = vmin(vmax(vmax(vertexX[0], vertexX[1]), vertexX[2]), tileMaxXSimd);
        VecS32 minY = vmax(vmin(vmin(vertexY[0], vertexY[1]), vertexY[2]), tileMinYSimd) & VecS32(~1);
        VecS32 maxY = vmin(vmax(vmax(vertexY[0], vertexY[1]), vertexY[2]), tileMaxYSimd);

        //Rasterize each triangle individually
        for(uint32 lane = 0;lane < numSimdTris;++lane){
            //Rasterize in 2x2 quads.
            // Broadcast this triangle's scalar setup to all four lanes; from
            // here on the four lanes are the four pixels of a quad.
            VecF32 zz[3];
            zz[0] = VecF32(vertexZ[lane].lane[0]);
            zz[1] = VecF32(vertexZ[lane].lane[1]);
            zz[2] = VecF32(vertexZ[lane].lane[2]);

            VecS32 a0(A0.lane[lane]);
            VecS32 a1(A1.lane[lane]);
            VecS32 a2(A2.lane[lane]);
            VecS32 b0(B0.lane[lane]);
            VecS32 b1(B1.lane[lane]);
            VecS32 b2(B2.lane[lane]);

            int32 minx = minX.lane[lane];
            int32 maxx = maxX.lane[lane];
            int32 miny = minY.lane[lane];
            int32 maxy = maxY.lane[lane];

            // Pixel coordinates of the first quad.
            VecS32 col = VecS32(minx) + colOffset;
            VecS32 row = VecS32(miny) + rowOffset;
            // Index of the first quad's four depths in data_.
            auto rowIdx = miny*size_.x + 2 * minx;

            // Edge functions evaluated at the four pixels of the first quad.
            // They must be computed before a/b are doubled below.
            VecS32 w0_row = a0 * col + b0 * row + VecS32(C0.lane[lane]);
            VecS32 w1_row = a1 * col + b1 * row + VecS32(C1.lane[lane]);
            VecS32 w2_row = a2 * col + b2 * row + VecS32(C2.lane[lane]);

            //Multiply each weight by two(rasterize 2x2 quad at once).
            a0 = shiftl<1>(a0);
            a1 = shiftl<1>(a1);
            a2 = shiftl<1>(a2);
            b0 = shiftl<1>(b0);
            b1 = shiftl<1>(b1);
            b2 = shiftl<1>(b2);

            // Depth change for one quad step in x (uses the doubled a1, a2).
            VecF32 zInc = itof(a1)*zz[1] + itof(a2)*zz[2];

            // The loop variables y and x below shadow the tile-index parameters.
            for(int32 y = miny;y<=maxy;y+=2,rowIdx += 2 * size_.x){
                auto w0 = w0_row;
                auto w1 = w1_row;
                auto w2 = w2_row;
                // Depth at the first quad of this row; stepped by zInc afterwards.
                VecF32 depth = zz[0] + itof(w1)*zz[1] + itof(w2)*zz[2];
                auto idx = rowIdx;
                for(int32 x = minx;x<=maxx;x+=2,idx+=4){
                    // A lane's sign bit is set when any edge function is
                    // negative at that pixel.
                    auto mask = w0|w1|w2;
                    VecF32 previousDepth = VecF32::load(data_+idx);
                    VecF32 mergedDepth = vmin(depth,previousDepth);
                    // NOTE(review): select() presumably keeps previousDepth in
                    // lanes whose mask sign bit is set and takes mergedDepth
                    // elsewhere - confirm against its definition.
                    previousDepth = select(mergedDepth,previousDepth,mask);
                    previousDepth.store(data_+idx);
                    w0+=a0;
                    w1+=a1;
                    w2+=a2;
                    depth+=zInc;
                }
                // Advance the row start by one quad row (b is already doubled).
                w0_row += b0;
                w1_row += b1;
                w2_row += b2;
            }
        }
    }
#endif
}
/// Rasterizes every triangle binned into tile (x, y) into the depth buffer,
/// one pixel at a time.
///
/// When mode_ is kModeDepthPackedQuads the work is delegated to
/// rasterizeTile2x2. Otherwise the tile's triangle counter is read and reset
/// to zero, and each binned triangle is tested per pixel with integer edge
/// functions; covered pixels keep the smaller of the stored and the
/// interpolated depth. The index arithmetic below addresses each tile's
/// pixels as one contiguous run of tileSize_.x*tileSize_.y floats in data_.
///
/// Without ARPHEG_ARCH_X86 each binned triangle is passed to drawTriangle
/// instead.
///
/// @param x     Tile column index.
/// @param y     Tile row index.
/// @param pass  Only compared against 0; that branch is currently empty
///              because the tile clear is commented out.
void DepthBuffer::rasterizeTile(int32 x,int32 y,uint32 pass) {
    if(pass == 0){
        //init tile(clear depth).
        //auto tilePixels = data_ + x*tileSize_.x*tileSize_.y + (y*tileSize_.x*tileSize_.y)*tileCount_.x;
        //clearDepth(tilePixels,tileSize_.x*tileSize_.y,1.0f);
    }
    if(mode_ == kModeDepthPackedQuads){
        rasterizeTile2x2(x,y,pass);
        return;
    }

    auto tileIndex = x + y*tileCount_.x;
    auto count = tileTriangleCount_[tileIndex];
    // Consume the bin: the next binning pass starts from an empty tile.
    tileTriangleCount_[tileIndex] = 0;
    // Each tile owns kMaxTrianglesPerTile consecutive slots in triangleBins_.
    auto faces = triangleBins_ + x*kMaxTrianglesPerTile + y*tileCount_.x*kMaxTrianglesPerTile;

    // Tile origin in pixels. tileEnd is computed but not read below.
    vec2i tilePos(x*tileSize_.x,y*tileSize_.y);
    vec2i tileEnd(tilePos + tileSize_);

#ifdef ARPHEG_ARCH_X86
    enum { kNumLanes = 4 };

    //Flush denormals to zero
    //_mm_setcsr( _mm_getcsr() | 0x8040 );

    // colOffset and rowOffset are not read in this function.
    VecS32 colOffset(0, 1, 0, 1);
    VecS32 rowOffset(0, 0, 1, 1);

    //Process the 4 binned triangles at a time
    VecS32 vertexX[3];
    VecS32 vertexY[3];
    VecF32 vertexZ[4];

    // Inclusive pixel bounds of this tile, used to clamp each bounding box.
    VecS32 tileMinXSimd(tilePos.x);
    VecS32 tileMaxXSimd(tilePos.x+tileSize_.x-1);
    VecS32 tileMinYSimd(tilePos.y);
    VecS32 tileMaxYSimd(tilePos.y+tileSize_.y-1);

    for(uint32 i = 0;i<count;i += kNumLanes){
        // The last batch may hold fewer than four triangles; lanes at or above
        // numSimdTris keep stale values and are never read back below.
        uint32 numSimdTris = std::min(uint32(kNumLanes),count-i);
        auto f = faces+i;
        // Gather: lane ii of vertexX/Y[c] receives corner c of triangle ii;
        // vertexZ[ii] receives that triangle's three z terms.
        for(uint32 ii = 0;ii< numSimdTris;++ii){
            vertexX[0].lane[ii] = f[ii].v[0].x;
            vertexY[0].lane[ii] = f[ii].v[0].y;
            vertexX[1].lane[ii] = f[ii].v[1].x;
            vertexY[1].lane[ii] = f[ii].v[1].y;
            vertexX[2].lane[ii] = f[ii].v[2].x;
            vertexY[2].lane[ii] = f[ii].v[2].y;
            vertexZ[ii] = VecF32(f[ii].z[0],f[ii].z[1],f[ii].z[2],0.0f);
        }

        // Fab(x, y) = Ax + By + C = 0
        // Fab(x, y) = (ya - yb)x + (xb - xa)y + (xa * yb - xb * ya) = 0
        // Compute A = (ya - yb) for the 3 line segments that make up each triangle
        VecS32 A0 = vertexY[1] - vertexY[2];
        VecS32 A1 = vertexY[2] - vertexY[0];
        VecS32 A2 = vertexY[0] - vertexY[1];

        // Compute B = (xb - xa) for the 3 line segments that make up each triangle
        VecS32 B0 = vertexX[2] - vertexX[1];
        VecS32 B1 = vertexX[0] - vertexX[2];
        VecS32 B2 = vertexX[1] - vertexX[0];

        // Compute C = (xa * yb - xb * ya) for the 3 line segments that make up each triangle
        VecS32 C0 = vertexX[1] * vertexY[2] - vertexX[2] * vertexY[1];
        VecS32 C1 = vertexX[2] * vertexY[0] - vertexX[0] * vertexY[2];
        VecS32 C2 = vertexX[0] * vertexY[1] - vertexX[1] * vertexY[0];

        // Use bounding box traversal strategy to determine which pixels to rasterize
        VecS32 minX = vmax(vmin(vmin(vertexX[0], vertexX[1]), vertexX[2]), tileMinXSimd);
        VecS32 maxX = vmin(vmax(vmax(vertexX[0], vertexX[1]), vertexX[2]), tileMaxXSimd);
        VecS32 minY = vmax(vmin(vmin(vertexY[0], vertexY[1]), vertexY[2]), tileMinYSimd);
        VecS32 maxY = vmin(vmax(vmax(vertexY[0], vertexY[1]), vertexY[2]), tileMaxYSimd);

        //Rasterize each triangle individually
        for(uint32 lane = 0;lane < numSimdTris;++lane){
            // Scalar copies of this triangle's setup values.
            float zz[3] = { vertexZ[lane].lane[0],vertexZ[lane].lane[1],vertexZ[lane].lane[2] };

            int32 a0 = A0.lane[lane];
            int32 a1 = A1.lane[lane];
            int32 a2 = A2.lane[lane];
            int32 b0 = B0.lane[lane];
            int32 b1 = B1.lane[lane];
            int32 b2 = B2.lane[lane];

            int32 minx = minX.lane[lane];
            int32 maxx = maxX.lane[lane];
            int32 miny = minY.lane[lane];
            int32 maxy = maxY.lane[lane];

            // Edge functions at the top-left pixel of the clamped bounding box.
            auto w0_row = a0 * minx + b0 * miny + C0.lane[lane];
            auto w1_row = a1 * minx + b1 * miny + C1.lane[lane];
            auto w2_row = a2 * minx + b2 * miny + C2.lane[lane];

            // First pixel of this tile: the offset equals
            // x*tileSize_.x*tileSize_.y + y*tileSize_.x*tileSize_.y*tileCount_.x.
            // The value does not depend on the triangle being rasterized.
            float* tilePixels = data_ + tilePos.x*tileSize_.y + (tilePos.y*tileSize_.x)*tileCount_.x;
            // Tile-relative index of the bounding box's top-left pixel.
            int32 idx2 = minx-tilePos.x + (miny - tilePos.y)*tileSize_.x;
            int32 spanx = maxx-minx;
            // Walk rows by stepping the row-start index by tileSize_.x; the
            // loop runs zero times when maxy < miny.
            for(int32 endIdx2 = idx2+(tileSize_.x)*(maxy-miny);idx2<=endIdx2;idx2+=tileSize_.x){
                auto w0 = w0_row;
                auto w1 = w1_row;
                auto w2 = w2_row;
                auto idx = idx2;
                for(int32 endIdx = idx+spanx;idx<=endIdx;++idx){
                    // The OR is non-negative only when none of the three edge
                    // functions has its sign bit set, i.e. all are >= 0.
                    auto mask = w0|w1|w2;
                    if(mask >= 0){
                        float betaf = float(w1);
                        float gamaf = float(w2);
                        float depth = zz[0] + betaf*zz[1] + gamaf*zz[2];
                        // Keep the smaller of the stored and the new depth.
                        auto d = tilePixels[idx];
                        d = depth<d?depth:d;
                        tilePixels[idx] = d;
                    }
                    // One pixel to the right.
                    w0+=a0;
                    w1+=a1;
                    w2+=a2;
                }
                // One row down.
                w0_row += b0;
                w1_row += b1;
                w2_row += b2;
            }
        }
    }
#else
    for(uint32 i = 0;i<count;i ++){
        drawTriangle(faces[i],tilePos);
    }
#endif
}
/// Bins up to four triangles into every screen tile their bounding box touches.
///
/// The twelve vertices are regrouped per component, converted to integer
/// pixel coordinates, and the z terms are pre-divided by the signed area so
/// the rasterizer can interpolate depth from its edge functions. A triangle
/// is skipped when its signed area is not positive or when any of its w
/// components equals zero. A tile that already holds kMaxTrianglesPerTile
/// triangles silently drops further ones.
///
/// @param vertices  Twelve vertices, three consecutive entries per triangle.
/// @param count     Number of valid triangles in `vertices`; at most four
///                  are processed.
void DepthBuffer::binTriangles4Simd(vec4f vertices[12],uint32 count)
{
    enum { kNumLanes = 4 };

    VecF32Soa soaPos[3];
    gather4Simd(soaPos,(VecF32*)vertices);

    VecS32 vertexX[3],vertexY[3];
    VecF32 vertexZ[3];
    for(int corner = 0;corner<3;corner++){
        // Float -> integer screen coordinates.
        // NOTE(review): ftoi presumably truncates (the original comment said
        // so) - confirm.
        vertexX[corner] = ftoi(soaPos[corner].x);
        vertexY[corner] = ftoi(soaPos[corner].y);
        vertexZ[corner] = soaPos[corner].z;
    }

    // Signed area term of each triangle, one per lane.
    VecS32 area = (vertexX[1] - vertexX[0]) * (vertexY[2] - vertexY[0]) - (vertexX[0] - vertexX[2]) * (vertexY[0] - vertexY[1]);
    VecF32 oneOverArea = VecF32(1.0f)/itof(area);

    // Turn z1 and z2 into deltas relative to z0, scaled by 1/area.
    vertexZ[1] = (vertexZ[1] - vertexZ[0]) * oneOverArea;
    vertexZ[2] = (vertexZ[2] - vertexZ[0]) * oneOverArea;

    // Screen-space bounding boxes, clamped to the buffer.
    VecS32 zero = VecS32(0);
    VecS32 minX = vmax( vmin(vmin(vertexX[0],vertexX[1]),vertexX[2]), zero);
    VecS32 maxX = vmin( vmax(vmax(vertexX[0],vertexX[1]),vertexX[2]), VecS32(size_.x-1) );
    VecS32 minY = vmax( vmin(vmin(vertexY[0],vertexY[1]),vertexY[2]), zero);
    VecS32 maxY = vmin( vmax(vmax(vertexY[0],vertexY[1]),vertexY[2]), VecS32(size_.y-1) );

    uint32 activeLanes = std::min(count,uint32(kNumLanes));
    for(uint32 lane = 0;lane<activeLanes;++lane){
        // Non-positive signed area: degenerate or oppositely wound triangle.
        if(area.lane[lane] <= 0) continue;

        // NOTE(review): the original comment describes this as a near-plane
        // rejection; presumably an earlier stage stores 0 in w for clipped
        // vertices - confirm.
        float oneOverW[3];
        for(int corner = 0;corner<3;++corner){
            oneOverW[corner] = soaPos[corner].w.lane[lane];
        }
        if(oneOverW[0] == 0.0f || oneOverW[1] == 0.0f || oneOverW[2] == 0.0f) continue;

        // Pixel bounding box -> inclusive range of tile indices.
        int32 firstTileX = minX.lane[lane]/tileSize_.x;
        int32 lastTileX = maxX.lane[lane]/tileSize_.x;
        int32 firstTileY = minY.lane[lane]/tileSize_.y;
        int32 lastTileY = maxY.lane[lane]/tileSize_.y;

        for(int32 tileY = firstTileY;tileY <= lastTileY;++tileY){
            auto tileIndex = firstTileX + tileY*tileCount_.x;
            for(auto tileX = firstTileX;tileX <= lastTileX;++tileX,++tileIndex){
                auto used = tileTriangleCount_[tileIndex];
                // Bin full: drop the triangle for this tile.
                if(used >= kMaxTrianglesPerTile) continue;
                tileTriangleCount_[tileIndex]++;

                // Next free slot in this tile's run of kMaxTrianglesPerTile entries.
                BinnedTriangle& slot =*( triangleBins_ + used + tileX*kMaxTrianglesPerTile + tileY*tileCount_.x*kMaxTrianglesPerTile);
                for(int corner = 0;corner<3;++corner){
                    slot.v[corner].x = vertexX[corner].lane[lane];
                    slot.v[corner].y = vertexY[corner].lane[lane];
                    slot.z[corner] = vertexZ[corner].lane[lane];
                }
            }
        }
    }
}
/// Sampling is not available for the division of two distributions: the call
/// only reports that on std::cerr.
///
/// @return Always a default-constructed VecF32.
VecF32 DistributionMultiVariateArithmeticDivision::randomVariable()const{
    // The diagnostic names the operation this class implements (division).
    std::cerr<<"No random variable for division"<<std::endl;
    return VecF32();
}