void OsdCpuComputeLoopVertexB( real *vertex, real *varying, OsdVertexBufferDescriptor const &vertexDesc, OsdVertexBufferDescriptor const &varyingDesc, const int *V_ITa, const int *V_IT, const real *V_W, int vertexOffset, int tableOffset, int start, int end) { if(vertexDesc == OsdVertexBufferDescriptor(0, 4, 4) && varying == NULL) { ComputeLoopVertexBKernel<4>(vertex, V_ITa, V_IT, V_W, vertexOffset, tableOffset, start, end); } else if(vertexDesc == OsdVertexBufferDescriptor(0, 8, 8) && varying == NULL) { ComputeLoopVertexBKernel<8>(vertex, V_ITa, V_IT, V_W, vertexOffset, tableOffset, start, end); } else { real *vertexResults = (real*)alloca(vertexDesc.length * sizeof(real)); real *varyingResults = (real*)alloca(varyingDesc.length * sizeof(real)); for (int i = start + tableOffset; i < end + tableOffset; i++) { int h = V_ITa[5*i]; int n = V_ITa[5*i+1]; int p = V_ITa[5*i+2]; real weight = V_W[i]; real wp = 1.0f/static_cast<real>(n); real beta = 0.25f * cosf(static_cast<real>(M_PI) * 2.0f * wp) + 0.375f; beta = beta * beta; beta = (0.625f - beta) * wp; int dstIndex = i + vertexOffset - tableOffset; clear(vertexResults, vertexDesc); clear(varyingResults, varyingDesc); addWithWeight(vertexResults, vertex, p, weight * (1.0f - (beta * n)), vertexDesc); for (int j = 0; j < n; ++j) addWithWeight(vertexResults, vertex, V_IT[h+j], weight * beta, vertexDesc); addWithWeight(varyingResults, varying, p, 1.0f, varyingDesc); copy(vertex, vertexResults, dstIndex, vertexDesc); copy(varying, varyingResults, dstIndex, varyingDesc); } } }
void OsdCpuComputeVertexB( real *vertex, real *varying, OsdVertexBufferDescriptor const &vertexDesc, OsdVertexBufferDescriptor const &varyingDesc, const int *V_ITa, const int *V_IT, const real *V_W, int vertexOffset, int tableOffset, int start, int end) { if(vertexDesc == OsdVertexBufferDescriptor(0, 4, 4) && varying == NULL) { ComputeVertexBKernel<4>(vertex, V_ITa, V_IT, V_W, vertexOffset, tableOffset, start, end); } else if(vertexDesc == OsdVertexBufferDescriptor(0, 8, 8) && varying == NULL) { ComputeVertexBKernel<8>(vertex, V_ITa, V_IT, V_W, vertexOffset, tableOffset, start, end); } else { real *vertexResults = (real*)alloca(vertexDesc.length * sizeof(real)); real *varyingResults = (real*)alloca(varyingDesc.length * sizeof(real)); for (int i = start + tableOffset; i < end + tableOffset; i++) { int h = V_ITa[5*i]; int n = V_ITa[5*i+1]; int p = V_ITa[5*i+2]; real weight = V_W[i]; real wp = 1.0f/static_cast<real>(n*n); real wv = (n-2.0f) * n * wp; int dstIndex = i + vertexOffset - tableOffset; clear(vertexResults, vertexDesc); clear(varyingResults, varyingDesc); addWithWeight(vertexResults, vertex, p, weight * wv, vertexDesc); for (int j = 0; j < n; ++j) { addWithWeight(vertexResults, vertex, V_IT[h+j*2], weight * wp, vertexDesc); addWithWeight(vertexResults, vertex, V_IT[h+j*2+1], weight * wp, vertexDesc); } addWithWeight(varyingResults, varying, p, 1.0f, varyingDesc); copy(vertex, vertexResults, dstIndex, vertexDesc); copy(varying, varyingResults, dstIndex, varyingDesc); } } }
void OsdCpuComputeFace( real * vertex, real * varying, OsdVertexBufferDescriptor const &vertexDesc, OsdVertexBufferDescriptor const &varyingDesc, const int *F_IT, const int *F_ITa, int vertexOffset, int tableOffset, int start, int end) { if(vertexDesc == OsdVertexBufferDescriptor(0, 4, 4) && varying == NULL) { ComputeFaceKernel<4> (vertex, F_IT, F_ITa, vertexOffset, tableOffset, start, end); } else if(vertexDesc == OsdVertexBufferDescriptor(0, 8, 8) && varying == NULL) { ComputeFaceKernel<8> (vertex, F_IT, F_ITa, vertexOffset, tableOffset, start, end); } else { real *vertexResults = (real*)alloca(vertexDesc.length * sizeof(real)); real *varyingResults = (real*)alloca(varyingDesc.length * sizeof(real)); for (int i = start + tableOffset; i < end + tableOffset; i++) { int h = F_ITa[2*i]; int n = F_ITa[2*i+1]; real weight = 1.0f/n; int dstIndex = i + vertexOffset - tableOffset; // clear clear(vertexResults, vertexDesc); clear(varyingResults, varyingDesc); // accum for (int j = 0; j < n; ++j) { int index = F_IT[h+j]; addWithWeight(vertexResults, vertex, index, weight, vertexDesc); addWithWeight(varyingResults, varying, index, weight, varyingDesc); } // write results copy(vertex, vertexResults, dstIndex, vertexDesc); copy(varying, varyingResults, dstIndex, varyingDesc); } } }
// Catmull-Clark vertex-vertex kernel "A" (crease / corner masks).
// Runs in up to two passes ('pass' argument): on pass 1 the previously
// computed result at dstIndex is read back and blended on top of.
// NOTE(review): presumably pass 0 runs first and pass 1 refines the same
// vertices — confirm against the dispatcher that schedules these kernels.
void OsdCpuComputeVertexA(
    real *vertex, real *varying,
    OsdVertexBufferDescriptor const &vertexDesc,
    OsdVertexBufferDescriptor const &varyingDesc,
    const int *V_ITa, const real *V_W,
    int vertexOffset, int tableOffset, int start, int end, int pass) {
    // Fast SIMD paths for packed interleaved layouts with no varying data.
    if(vertexDesc == OsdVertexBufferDescriptor(0, 4, 4) && varying == NULL) {
        ComputeVertexAKernel<4>(vertex, V_ITa, V_W,
                                vertexOffset, tableOffset, start, end, pass);
    } else if(vertexDesc == OsdVertexBufferDescriptor(0, 8, 8) && varying == NULL) {
        ComputeVertexAKernel<8>(vertex, V_ITa, V_W,
                                vertexOffset, tableOffset, start, end, pass);
    } else {
        // Generic path: per-vertex accumulation in stack scratch space.
        real *vertexResults = (real*)alloca(vertexDesc.length * sizeof(real));
        real *varyingResults = (real*)alloca(varyingDesc.length * sizeof(real));
        for (int i = start + tableOffset; i < end + tableOffset; i++) {
            int n = V_ITa[5*i+1];       // valence (-1 appears as a sentinel below)
            int p = V_ITa[5*i+2];       // parent vertex index
            int eidx0 = V_ITa[5*i+3];   // crease edge endpoint 0 (-1 if none)
            int eidx1 = V_ITa[5*i+4];   // crease edge endpoint 1
            real weight = (pass == 1) ? V_W[i] : 1.0f - V_W[i];
            // In the case of fractional weight, the weight must be inverted since
            // the value is shared with the k_Smooth kernel (statistically the
            // k_Smooth kernel runs much more often than this one)
            if (weight > 0.0f && weight < 1.0f && n > 0)
                weight = 1.0f - weight;
            int dstIndex = i + vertexOffset - tableOffset;
            clear(vertexResults, vertexDesc);
            clear(varyingResults, varyingDesc);
            if (pass) {
                // copy previous results
                addWithWeight(vertexResults, vertex, dstIndex, 1.0f, vertexDesc);
            }
            if (eidx0 == -1 || (pass == 0 && (n == -1))) {
                // Corner mask: the parent vertex alone.
                addWithWeight(vertexResults, vertex, p, weight, vertexDesc);
            } else {
                // Crease mask: 3/4 parent + 1/8 each crease-edge endpoint.
                addWithWeight(vertexResults, vertex, p, weight * 0.75f, vertexDesc);
                addWithWeight(vertexResults, vertex, eidx0, weight * 0.125f, vertexDesc);
                addWithWeight(vertexResults, vertex, eidx1, weight * 0.125f, vertexDesc);
            }
            copy(vertex, vertexResults, dstIndex, vertexDesc);
            if (not pass) {
                // Varying data only needs to be interpolated once (pass 0).
                addWithWeight(varyingResults, varying, p, 1.0f, varyingDesc);
                copy(varying, varyingResults, dstIndex, varyingDesc);
            }
        }
    }
}
void OsdCpuComputeBilinearEdge( real *vertex, real *varying, OsdVertexBufferDescriptor const &vertexDesc, OsdVertexBufferDescriptor const &varyingDesc, const int *E_IT, int vertexOffset, int tableOffset, int start, int end) { if(vertexDesc == OsdVertexBufferDescriptor(0, 4, 4) && varying == NULL) { ComputeBilinearEdgeKernel<4>(vertex, E_IT, vertexOffset, tableOffset, start, end); } else if(vertexDesc == OsdVertexBufferDescriptor(0, 8, 8) && varying == NULL) { ComputeBilinearEdgeKernel<8>(vertex, E_IT, vertexOffset, tableOffset, start, end); } else { real *vertexResults = (real*)alloca(vertexDesc.length * sizeof(real)); real *varyingResults = (real*)alloca(varyingDesc.length * sizeof(real)); for (int i = start + tableOffset; i < end + tableOffset; i++) { int eidx0 = E_IT[2*i+0]; int eidx1 = E_IT[2*i+1]; int dstIndex = i + vertexOffset - tableOffset; clear(vertexResults, vertexDesc); clear(varyingResults, varyingDesc); addWithWeight(vertexResults, vertex, eidx0, 0.5f, vertexDesc); addWithWeight(vertexResults, vertex, eidx1, 0.5f, vertexDesc); addWithWeight(varyingResults, varying, eidx0, 0.5f, varyingDesc); addWithWeight(varyingResults, varying, eidx1, 0.5f, varyingDesc); copy(vertex, vertexResults, dstIndex, vertexDesc); copy(varying, varyingResults, dstIndex, varyingDesc); } } }
// Applies the stencils in the range [start, end) to the primvar data in
// 'vertexSrc', writing one result per stencil into 'vertexDst'.
// 'sizes' holds the per-stencil vertex counts, 'offsets' the position of
// each stencil's first entry in 'indices'/'weights'.
void CpuComputeStencils(VertexBufferDescriptor const &vertexDesc,
    float const * vertexSrc, float * vertexDst,
    unsigned char const * sizes,
    int const * offsets,
    int const * indices,
    float const * weights,
    int start, int end) {
    assert(start>=0 and start<end);
    // Advance the table pointers to the first stencil of the range.
    if (start>0) {
        sizes += start;
        indices += offsets[start];
        weights += offsets[start];
    }
    if (vertexDesc.length==4 and vertexDesc.stride==4) {
        // SIMD fast path for aligned primvar data (4 floats)
        ComputeStencilKernel<4>(vertexSrc, vertexDst,
            sizes, indices, weights, start, end);
    } else if(vertexDesc.length==8 and vertexDesc.stride==8) {
        // SIMD fast path for aligned primvar data (8 floats)
        ComputeStencilKernel<8>(vertexSrc, vertexDst,
            sizes, indices, weights, start, end);
    } else {
        // Slow path for non-aligned data
        float * result = (float*)alloca(vertexDesc.length * sizeof(float));
        int nstencils = end-start;
        for (int i=0; i<nstencils; ++i, ++sizes) {
            clear(result, vertexDesc);
            for (int j=0; j<*sizes; ++j) {
                addWithWeight(result, vertexSrc, *indices++, *weights++, vertexDesc);
            }
            // NOTE(review): the slow path writes at the range-relative index
            // 'i' while the SIMD kernels above receive the absolute range
            // [start, end) (with 'sizes' already advanced by 'start') —
            // confirm ComputeStencilKernel uses the same dst convention.
            copy(vertexDst, i, result, vertexDesc);
        }
    }
}
void operator() (tbb::blocked_range<int> const &r) const { #define USE_SIMD #ifdef USE_SIMD if (_srcDesc.length==4 and _srcDesc.stride==4 and _dstDesc.stride==4) { // SIMD fast path for aligned primvar data (4 floats) int offset = _offsets[r.begin()]; ComputeStencilKernel<4>(_vertexSrc, _vertexDst, _sizes, _indices+offset, _weights+offset, r.begin(), r.end()); } else if (_srcDesc.length==8 and _srcDesc.stride==4 and _dstDesc.stride==4) { // SIMD fast path for aligned primvar data (8 floats) int offset = _offsets[r.begin()]; ComputeStencilKernel<8>(_vertexSrc, _vertexDst, _sizes, _indices+offset, _weights+offset, r.begin(), r.end()); } else { #else { #endif int const * sizes = _sizes; int const * indices = _indices; float const * weights = _weights; if (r.begin()>0) { sizes += r.begin(); indices += _offsets[r.begin()]; weights += _offsets[r.begin()]; } // Slow path for non-aligned data float * result = (float*)alloca(_srcDesc.length * sizeof(float)); for (int i=r.begin(); i<r.end(); ++i, ++sizes) { clear(result, _dstDesc); for (int j=0; j<*sizes; ++j) { addWithWeight(result, _vertexSrc, *indices++, *weights++, _srcDesc); } copy(_vertexDst, i, result, _dstDesc); } } } };
void OsdCpuComputeTriQuadFace( real * vertex, real * varying, OsdVertexBufferDescriptor const &vertexDesc, OsdVertexBufferDescriptor const &varyingDesc, const int *F_IT, int vertexOffset, int tableOffset, int start, int end) { real *vertexResults = (real*)alloca(vertexDesc.length * sizeof(real)); real *varyingResults = (real*)alloca(varyingDesc.length * sizeof(real)); for (int i = start; i < end; i++) { int fidx0 = F_IT[tableOffset + 4 * i + 0]; int fidx1 = F_IT[tableOffset + 4 * i + 1]; int fidx2 = F_IT[tableOffset + 4 * i + 2]; int fidx3 = F_IT[tableOffset + 4 * i + 3]; bool triangle = (fidx2 == fidx3); real weight = (triangle ? 1.0f / 3.0f : 1.0f / 4.0f); int dstIndex = i + vertexOffset; // clear clear(vertexResults, vertexDesc); clear(varyingResults, varyingDesc); // accum addWithWeight(vertexResults, vertex, fidx0, weight, vertexDesc); addWithWeight(vertexResults, vertex, fidx1, weight, vertexDesc); addWithWeight(vertexResults, vertex, fidx2, weight, vertexDesc); addWithWeight(varyingResults, varying, fidx0, weight, varyingDesc); addWithWeight(varyingResults, varying, fidx1, weight, varyingDesc); addWithWeight(varyingResults, varying, fidx2, weight, varyingDesc); if (!triangle) { addWithWeight(vertexResults, vertex, fidx3, weight, vertexDesc); addWithWeight(varyingResults, varying, fidx3, weight, varyingDesc); } // write results copy(vertex, vertexResults, dstIndex, vertexDesc); copy(varying, varyingResults, dstIndex, varyingDesc); } }
// XXXX manuelk this should be optimized further by using SIMD - considering // OMP is somewhat obsolete - this is probably not worth it. void OmpEvalStencils(float const * src, BufferDescriptor const &srcDesc, float * dst, BufferDescriptor const &dstDesc, int const * sizes, int const * offsets, int const * indices, float const * weights, int start, int end) { start = (start > 0 ? start : 0); src += srcDesc.offset; dst += dstDesc.offset; int numThreads = omp_get_max_threads(); int n = end - start; float * result = (float*)alloca(srcDesc.length * numThreads * sizeof(float)); #pragma omp parallel for for (int i = 0; i < n; ++i) { int index = i + start; // Stencil index // Get thread-local pointers int const * threadIndices = indices + offsets[index]; float const * threadWeights = weights + offsets[index]; int threadId = omp_get_thread_num(); float * threadResult = result + threadId*srcDesc.length; clear(threadResult, dstDesc); for (int j=0; j<(int)sizes[index]; ++j) { addWithWeight(threadResult, src, threadIndices[j], threadWeights[j], srcDesc); } copy(dst, i, threadResult, dstDesc); } }
// XXXX manuelk this should be optimized further by using SIMD - considering // OMP is somewhat obsolete - this is probably not worth it. void OmpComputeStencils(VertexBufferDescriptor const &vertexDesc, float const * vertexSrc, float * vertexDst, unsigned char const * sizes, int const * offsets, int const * indices, float const * weights, int start, int end) { assert(start>=0 and start<end); int numThreads = omp_get_max_threads(), nstencils = end-start; float * result = (float*)alloca(vertexDesc.length*numThreads*sizeof(float)); #pragma omp parallel for for (int i=0; i<nstencils; ++i) { int index = i + (start>0 ? start : 0); // Stencil index // Get thread-local pointers int const * threadIndices = indices + offsets[index]; float const * threadWeights = weights + offsets[index]; int threadId = omp_get_thread_num(); float * threadResult = result + threadId*vertexDesc.length; clear(threadResult, vertexDesc); for (int j=0; j<(int)sizes[index]; ++j) { addWithWeight(threadResult, vertexSrc, threadIndices[j], threadWeights[j], vertexDesc); } copy(vertexDst, i, threadResult, vertexDesc); } }
void OsdCpuComputeQuadFace( real * vertex, real * varying, OsdVertexBufferDescriptor const &vertexDesc, OsdVertexBufferDescriptor const &varyingDesc, const int *F_IT, int vertexOffset, int tableOffset, int start, int end) { real *vertexResults = (real*)alloca(vertexDesc.length * sizeof(real)); real *varyingResults = (real*)alloca(varyingDesc.length * sizeof(real)); for (int i = start; i < end; i++) { int fidx0 = F_IT[tableOffset + 4 * i + 0]; int fidx1 = F_IT[tableOffset + 4 * i + 1]; int fidx2 = F_IT[tableOffset + 4 * i + 2]; int fidx3 = F_IT[tableOffset + 4 * i + 3]; int dstIndex = i + vertexOffset; // clear clear(vertexResults, vertexDesc); clear(varyingResults, varyingDesc); // accum addWithWeight(vertexResults, vertex, fidx0, 0.25f, vertexDesc); addWithWeight(vertexResults, vertex, fidx1, 0.25f, vertexDesc); addWithWeight(vertexResults, vertex, fidx2, 0.25f, vertexDesc); addWithWeight(vertexResults, vertex, fidx3, 0.25f, vertexDesc); addWithWeight(varyingResults, varying, fidx0, 0.25f, varyingDesc); addWithWeight(varyingResults, varying, fidx1, 0.25f, varyingDesc); addWithWeight(varyingResults, varying, fidx2, 0.25f, varyingDesc); addWithWeight(varyingResults, varying, fidx3, 0.25f, varyingDesc); // write results copy(vertex, vertexResults, dstIndex, vertexDesc); copy(varying, varyingResults, dstIndex, varyingDesc); } }