예제 #1
0
// Applies the "Loop vertex B" rule to table rows [start, end).
//
// Row i (after adding tableOffset) of V_ITa is read as a group of five ints;
// this kernel uses the first three: the starting position in V_IT, the number
// of V_IT entries to accumulate, and the index of the source element.  The
// blended vertex data and the pass-through varying data are stored at element
// (i + vertexOffset - tableOffset) of the respective buffers.
//
//   vertex, varying          buffers that are both read from and written to
//   vertexDesc, varyingDesc  layout descriptors forwarded to the helpers
//   V_ITa, V_IT              index tables as described above
//   V_W                      one weight per row, indexed like V_ITa rows
void OsdCpuComputeLoopVertexB(
    real *vertex, real *varying,
    OsdVertexBufferDescriptor const &vertexDesc,
    OsdVertexBufferDescriptor const &varyingDesc,
    const int *V_ITa, const int *V_IT, const real *V_W,
    int vertexOffset, int tableOffset, int start, int end) {

    // Dedicated kernels exist for the (0, 4, 4) and (0, 8, 8) vertex layouts
    // when no varying buffer is supplied.
    if (vertexDesc == OsdVertexBufferDescriptor(0, 4, 4) && varying == NULL) {
        ComputeLoopVertexBKernel<4>(vertex, V_ITa, V_IT, V_W, vertexOffset,
                                    tableOffset, start, end);
        return;
    }
    if (vertexDesc == OsdVertexBufferDescriptor(0, 8, 8) && varying == NULL) {
        ComputeLoopVertexBKernel<8>(vertex, V_ITa, V_IT, V_W, vertexOffset,
                                    tableOffset, start, end);
        return;
    }

    // Generic path: accumulate into stack scratch space, then store.
    real *vertexAccum = (real*)alloca(vertexDesc.length * sizeof(real));
    real *varyingAccum = (real*)alloca(varyingDesc.length * sizeof(real));

    const int rowBegin = start + tableOffset;
    const int rowEnd = end + tableOffset;

    for (int row = rowBegin; row < rowEnd; ++row) {
        const int tableStart = V_ITa[5*row];
        const int count      = V_ITa[5*row+1];
        const int srcIndex   = V_ITa[5*row+2];

        const real weight = V_W[row];
        const real invCount = 1.0f/static_cast<real>(count);

        // beta = (5/8 - (3/8 + cos(2*pi/count)/4)^2) / count
        real beta = 0.25f * cosf(static_cast<real>(M_PI) * 2.0f * invCount) + 0.375f;
        beta = beta * beta;
        beta = (0.625f - beta) * invCount;

        const int dstIndex = row + vertexOffset - tableOffset;

        clear(vertexAccum, vertexDesc);
        clear(varyingAccum, varyingDesc);

        // Source element first, then each of the `count` V_IT entries.
        addWithWeight(vertexAccum, vertex, srcIndex,
                      weight * (1.0f - (beta * count)), vertexDesc);

        const real entryWeight = weight * beta;
        for (int k = 0; k < count; ++k) {
            addWithWeight(vertexAccum, vertex, V_IT[tableStart + k],
                          entryWeight, vertexDesc);
        }

        // Varying data is taken from the source element with weight 1.
        addWithWeight(varyingAccum, varying, srcIndex, 1.0f, varyingDesc);

        copy(vertex, vertexAccum, dstIndex, vertexDesc);
        copy(varying, varyingAccum, dstIndex, varyingDesc);
    }
}
예제 #2
0
// Applies the "vertex B" rule to table rows [start, end).
//
// Row i (after adding tableOffset) of V_ITa is read as a group of five ints;
// the first three are used: the starting position in V_IT, a count n, and the
// index of the source element.  For each of the n steps two consecutive V_IT
// entries are accumulated with weight V_W[i] / n^2, and the source element is
// accumulated with weight V_W[i] * (n - 2) / n.  Results are stored at
// element (i + vertexOffset - tableOffset); varying data is copied from the
// source element with weight 1.
void OsdCpuComputeVertexB(
    real *vertex, real *varying,
    OsdVertexBufferDescriptor const &vertexDesc,
    OsdVertexBufferDescriptor const &varyingDesc,
    const int *V_ITa, const int *V_IT, const real *V_W,
    int vertexOffset, int tableOffset, int start, int end) {

    // Dedicated kernels exist for the (0, 4, 4) and (0, 8, 8) vertex layouts
    // when no varying buffer is supplied.
    if (vertexDesc == OsdVertexBufferDescriptor(0, 4, 4) && varying == NULL) {
        ComputeVertexBKernel<4>(vertex, V_ITa, V_IT, V_W,
                                vertexOffset, tableOffset, start, end);
        return;
    }
    if (vertexDesc == OsdVertexBufferDescriptor(0, 8, 8) && varying == NULL) {
        ComputeVertexBKernel<8>(vertex, V_ITa, V_IT, V_W,
                                vertexOffset, tableOffset, start, end);
        return;
    }

    // Generic path: accumulate into stack scratch space, then store.
    real *vertexAccum = (real*)alloca(vertexDesc.length * sizeof(real));
    real *varyingAccum = (real*)alloca(varyingDesc.length * sizeof(real));

    const int rowBegin = start + tableOffset;
    const int rowEnd = end + tableOffset;

    for (int row = rowBegin; row < rowEnd; ++row) {
        const int tableStart = V_ITa[5*row];
        const int count      = V_ITa[5*row+1];
        const int srcIndex   = V_ITa[5*row+2];

        const real weight = V_W[row];
        const real pairScale = 1.0f/static_cast<real>(count*count);
        const real srcScale = (count-2.0f) * count * pairScale;

        const int dstIndex = row + vertexOffset - tableOffset;

        clear(vertexAccum, vertexDesc);
        clear(varyingAccum, varyingDesc);

        addWithWeight(vertexAccum, vertex, srcIndex, weight * srcScale, vertexDesc);

        // V_IT holds two indices per step, laid out back to back.
        const real pairWeight = weight * pairScale;
        for (int k = 0; k < count; ++k) {
            addWithWeight(vertexAccum, vertex, V_IT[tableStart + k*2],
                          pairWeight, vertexDesc);
            addWithWeight(vertexAccum, vertex, V_IT[tableStart + k*2 + 1],
                          pairWeight, vertexDesc);
        }

        addWithWeight(varyingAccum, varying, srcIndex, 1.0f, varyingDesc);

        copy(vertex, vertexAccum, dstIndex, vertexDesc);
        copy(varying, varyingAccum, dstIndex, varyingDesc);
    }
}
예제 #3
0
// Computes, for each table row in [start, end), the unweighted average of the
// elements listed for that row.
//
// Row i (after adding tableOffset) of F_ITa is a pair: the starting position
// in F_IT and the number n of indices to average.  Both the vertex and the
// varying data of the n referenced elements are accumulated with weight 1/n
// and stored at element (i + vertexOffset - tableOffset).
void OsdCpuComputeFace(
    real * vertex, real * varying,
    OsdVertexBufferDescriptor const &vertexDesc,
    OsdVertexBufferDescriptor const &varyingDesc,
    const int *F_IT, const int *F_ITa, int vertexOffset, int tableOffset,
    int start, int end) {

    // Dedicated kernels exist for the (0, 4, 4) and (0, 8, 8) vertex layouts
    // when no varying buffer is supplied.
    if (vertexDesc == OsdVertexBufferDescriptor(0, 4, 4) && varying == NULL) {
        ComputeFaceKernel<4>(vertex, F_IT, F_ITa, vertexOffset, tableOffset,
                             start, end);
        return;
    }
    if (vertexDesc == OsdVertexBufferDescriptor(0, 8, 8) && varying == NULL) {
        ComputeFaceKernel<8>(vertex, F_IT, F_ITa, vertexOffset, tableOffset,
                             start, end);
        return;
    }

    // Generic path: accumulate into stack scratch space, then store.
    real *vertexAccum = (real*)alloca(vertexDesc.length * sizeof(real));
    real *varyingAccum = (real*)alloca(varyingDesc.length * sizeof(real));

    const int rowBegin = start + tableOffset;
    const int rowEnd = end + tableOffset;

    for (int row = rowBegin; row < rowEnd; ++row) {
        const int tableStart = F_ITa[2*row];
        const int count      = F_ITa[2*row+1];

        const real weight = 1.0f/count;
        const int dstIndex = row + vertexOffset - tableOffset;

        // Reset the accumulators for this row.
        clear(vertexAccum, vertexDesc);
        clear(varyingAccum, varyingDesc);

        // Sum every referenced element with equal weight.
        for (int k = 0; k < count; ++k) {
            const int srcIndex = F_IT[tableStart + k];
            addWithWeight(vertexAccum, vertex, srcIndex, weight, vertexDesc);
            addWithWeight(varyingAccum, varying, srcIndex, weight, varyingDesc);
        }

        // Store the averaged values.
        copy(vertex, vertexAccum, dstIndex, vertexDesc);
        copy(varying, varyingAccum, dstIndex, varyingDesc);
    }
}
예제 #4
0
// Applies the "vertex A" rule to table rows [start, end).
//
// Row i (after adding tableOffset) of V_ITa is read as a group of five ints;
// entries 1..4 are used: a count n, the source element index p and two
// further element indices.  The kernel can be invoked with pass == 0 or a
// non-zero pass: a non-zero pass starts from the value already stored at the
// destination and adds to it, and only pass 0 writes varying data.
//
// The destination is element (i + vertexOffset - tableOffset).
void OsdCpuComputeVertexA(
    real *vertex, real *varying,
    OsdVertexBufferDescriptor const &vertexDesc,
    OsdVertexBufferDescriptor const &varyingDesc,
    const int *V_ITa, const real *V_W, int vertexOffset, int tableOffset,
    int start, int end, int pass) {

    // Dedicated kernels exist for the (0, 4, 4) and (0, 8, 8) vertex layouts
    // when no varying buffer is supplied.
    if (vertexDesc == OsdVertexBufferDescriptor(0, 4, 4) && varying == NULL) {
        ComputeVertexAKernel<4>(vertex, V_ITa, V_W, vertexOffset, tableOffset,
                                start, end, pass);
        return;
    }
    if (vertexDesc == OsdVertexBufferDescriptor(0, 8, 8) && varying == NULL) {
        ComputeVertexAKernel<8>(vertex, V_ITa, V_W, vertexOffset, tableOffset,
                                start, end, pass);
        return;
    }

    // Generic path: accumulate into stack scratch space, then store.
    real *vertexAccum = (real*)alloca(vertexDesc.length * sizeof(real));
    real *varyingAccum = (real*)alloca(varyingDesc.length * sizeof(real));

    const int rowBegin = start + tableOffset;
    const int rowEnd = end + tableOffset;

    for (int row = rowBegin; row < rowEnd; ++row) {
        const int count    = V_ITa[5*row+1];
        const int srcIndex = V_ITa[5*row+2];
        const int edgeA    = V_ITa[5*row+3];
        const int edgeB    = V_ITa[5*row+4];

        // Pass 1 uses the stored weight as-is; any other pass uses its
        // complement.
        real weight = V_W[row];
        if (pass != 1) {
            weight = 1.0f - weight;
        }

        // A fractional weight has to be inverted because the stored value is
        // shared with the k_Smooth kernel (which, statistically, runs much
        // more often than this one).
        if (weight > 0.0f && weight < 1.0f && count > 0) {
            weight = 1.0f - weight;
        }

        const int dstIndex = row + vertexOffset - tableOffset;

        clear(vertexAccum, vertexDesc);
        clear(varyingAccum, varyingDesc);

        if (pass != 0) {
            // Start from what an earlier pass left at the destination.
            addWithWeight(vertexAccum, vertex, dstIndex, 1.0f, vertexDesc);
        }

        const bool sourceOnly =
            (edgeA == -1) || (pass == 0 && (count == -1));
        if (sourceOnly) {
            addWithWeight(vertexAccum, vertex, srcIndex, weight, vertexDesc);
        } else {
            // 3/4 of the source element plus 1/8 of each of the other two.
            addWithWeight(vertexAccum, vertex, srcIndex, weight * 0.75f, vertexDesc);
            addWithWeight(vertexAccum, vertex, edgeA, weight * 0.125f, vertexDesc);
            addWithWeight(vertexAccum, vertex, edgeB, weight * 0.125f, vertexDesc);
        }

        copy(vertex, vertexAccum, dstIndex, vertexDesc);

        // Varying data is only produced on pass 0.
        if (pass == 0) {
            addWithWeight(varyingAccum, varying, srcIndex, 1.0f, varyingDesc);
            copy(varying, varyingAccum, dstIndex, varyingDesc);
        }
    }
}
예제 #5
0
// Computes, for each table row in [start, end), the midpoint of two elements.
//
// Row i (after adding tableOffset) of E_IT is a pair of element indices.  The
// vertex and varying data of those two elements are averaged (weight 0.5
// each) and stored at element (i + vertexOffset - tableOffset).
void OsdCpuComputeBilinearEdge(
    real *vertex, real *varying,
    OsdVertexBufferDescriptor const &vertexDesc,
    OsdVertexBufferDescriptor const &varyingDesc,
    const int *E_IT, int vertexOffset, int tableOffset, int start, int end) {

    // Dedicated kernels exist for the (0, 4, 4) and (0, 8, 8) vertex layouts
    // when no varying buffer is supplied.
    if (vertexDesc == OsdVertexBufferDescriptor(0, 4, 4) && varying == NULL) {
        ComputeBilinearEdgeKernel<4>(vertex, E_IT, vertexOffset, tableOffset,
                                     start, end);
        return;
    }
    if (vertexDesc == OsdVertexBufferDescriptor(0, 8, 8) && varying == NULL) {
        ComputeBilinearEdgeKernel<8>(vertex, E_IT, vertexOffset, tableOffset,
                                     start, end);
        return;
    }

    // Generic path: accumulate into stack scratch space, then store.
    real *vertexAccum = (real*)alloca(vertexDesc.length * sizeof(real));
    real *varyingAccum = (real*)alloca(varyingDesc.length * sizeof(real));

    const int rowBegin = start + tableOffset;
    const int rowEnd = end + tableOffset;

    for (int row = rowBegin; row < rowEnd; ++row) {
        const int endA = E_IT[2*row+0];
        const int endB = E_IT[2*row+1];

        const int dstIndex = row + vertexOffset - tableOffset;

        clear(vertexAccum, vertexDesc);
        clear(varyingAccum, varyingDesc);

        addWithWeight(vertexAccum, vertex, endA, 0.5f, vertexDesc);
        addWithWeight(vertexAccum, vertex, endB, 0.5f, vertexDesc);

        addWithWeight(varyingAccum, varying, endA, 0.5f, varyingDesc);
        addWithWeight(varyingAccum, varying, endB, 0.5f, varyingDesc);

        copy(vertex, vertexAccum, dstIndex, vertexDesc);
        copy(varying, varyingAccum, dstIndex, varyingDesc);
    }
}
예제 #6
0
// Evaluates stencils [start, end): each output element is the weighted sum of
// a list of source elements.
//
//   vertexDesc   layout descriptor forwarded to the helpers
//   vertexSrc    source elements
//   vertexDst    destination; in the generic path stencil (start + k) is
//                written to element k
//   sizes        number of (index, weight) pairs of each stencil
//   offsets      offsets[start] is where the first stencil's pairs begin in
//                `indices` / `weights`
//   indices      source element index of every pair, stencil after stencil
//   weights      weight of every pair, parallel to `indices`
//
// Requires 0 <= start < end (asserted).
void
CpuComputeStencils(VertexBufferDescriptor const &vertexDesc,
                   float const * vertexSrc,
                   float * vertexDst,
                   unsigned char const * sizes,
                   int const * offsets,
                   int const * indices,
                   float const * weights,
                   int start, int end) {

    assert(start>=0 and start<end);

    // Advance the tables so that they begin at stencil `start`.
    if (start>0) {
        int const firstPair = offsets[start];
        sizes += start;
        indices += firstPair;
        weights += firstPair;
    }

    if (vertexDesc.length==4 and vertexDesc.stride==4) {
        // SIMD fast path for aligned primvar data (4 floats)
        ComputeStencilKernel<4>(vertexSrc, vertexDst,
            sizes, indices, weights, start,  end);
        return;
    }

    if (vertexDesc.length==8 and vertexDesc.stride==8) {
        // SIMD fast path for aligned primvar data (8 floats)
        ComputeStencilKernel<8>(vertexSrc, vertexDst,
            sizes, indices, weights, start,  end);
        return;
    }

    // Slow path for every other layout: accumulate each stencil into a
    // stack scratch element and then store it.
    float * accum = (float*)alloca(vertexDesc.length * sizeof(float));

    int const stencilCount = end-start;
    for (int stencil=0; stencil<stencilCount; ++stencil) {

        clear(accum, vertexDesc);

        // `indices` and `weights` are consumed sequentially across stencils.
        int const pairCount = sizes[stencil];
        for (int k=0; k<pairCount; ++k, ++indices, ++weights) {
            addWithWeight(accum, vertexSrc, *indices, *weights, vertexDesc);
        }

        copy(vertexDst, stencil, accum, vertexDesc);
    }
}
예제 #7
0
    // Evaluates the stencils whose indices lie in the sub-range `r`.
    //
    // Each stencil i owns _sizes[i] consecutive (index, weight) pairs in
    // _indices / _weights; the pairs of stencil r.begin() start at
    // _offsets[r.begin()].  The weighted sum of the referenced _vertexSrc
    // elements is written to element i of _vertexDst.
    //
    // Tightly packed 4- and 8-float layouts are routed to the SIMD kernel;
    // every other layout takes the generic accumulate-and-copy path.
    void operator() (tbb::blocked_range<int> const &r) const {
#define USE_SIMD
#ifdef USE_SIMD
        if (_srcDesc.length==4 and _srcDesc.stride==4 and _dstDesc.stride==4) {

            // SIMD fast path for aligned primvar data (4 floats)
            int offset = _offsets[r.begin()];
            ComputeStencilKernel<4>(_vertexSrc, _vertexDst,
                _sizes, _indices+offset, _weights+offset, r.begin(), r.end());

        } else if (_srcDesc.length==8 and _srcDesc.stride==8 and _dstDesc.stride==8) {

            // SIMD fast path for aligned primvar data (8 floats).  The stride
            // must equal the element width (8) for the data to be packed;
            // testing for a stride of 4 here would never match packed
            // 8-float buffers and would instead select overlapping elements.
            int offset = _offsets[r.begin()];
            ComputeStencilKernel<8>(_vertexSrc, _vertexDst,
                _sizes, _indices+offset, _weights+offset, r.begin(), r.end());

        } else {
#else
        {
#endif
            int const * sizes = _sizes;
            int const * indices = _indices;
            float const * weights = _weights;

            // Advance the tables so that they begin at stencil r.begin().
            if (r.begin()>0) {
                sizes += r.begin();
                indices += _offsets[r.begin()];
                weights += _offsets[r.begin()];
            }

            // Slow path for non-aligned data
            // NOTE(review): the scratch element is sized from _srcDesc but
            // cleared and copied using _dstDesc; presumably both descriptors
            // always have the same length - confirm.
            float * result = (float*)alloca(_srcDesc.length * sizeof(float));

            for (int i=r.begin(); i<r.end(); ++i, ++sizes) {

                clear(result, _dstDesc);

                // `indices` / `weights` are consumed sequentially, so they
                // carry over from one stencil to the next.
                for (int j=0; j<*sizes; ++j) {
                    addWithWeight(result, _vertexSrc, *indices++, *weights++, _srcDesc);
                }

                copy(_vertexDst, i, result, _dstDesc);
            }
        }
    }
};
예제 #8
0
// Computes the average of the corners of faces [start, end), where every face
// occupies four consecutive entries of F_IT starting at tableOffset + 4 * i.
//
// A face whose third and fourth entries are equal is treated as a triangle:
// only its first three corners are averaged (weight 1/3).  Otherwise all four
// corners are averaged (weight 1/4).  Vertex and varying data are both
// averaged and stored at element (i + vertexOffset).
void OsdCpuComputeTriQuadFace(
    real * vertex, real * varying,
    OsdVertexBufferDescriptor const &vertexDesc,
    OsdVertexBufferDescriptor const &varyingDesc,
    const int *F_IT, int vertexOffset, int tableOffset,
    int start, int end) {

    // Stack scratch space for one element of each buffer.
    real *vertexAccum = (real*)alloca(vertexDesc.length * sizeof(real));
    real *varyingAccum = (real*)alloca(varyingDesc.length * sizeof(real));

    for (int face = start; face < end; ++face) {
        int corner[4];
        for (int k = 0; k < 4; ++k) {
            corner[k] = F_IT[tableOffset + 4 * face + k];
        }

        const bool isTriangle = (corner[2] == corner[3]);
        const int cornerCount = isTriangle ? 3 : 4;
        const real weight = (isTriangle ? 1.0f / 3.0f : 1.0f / 4.0f);

        const int dstIndex = face + vertexOffset;

        // Reset the accumulators for this face.
        clear(vertexAccum, vertexDesc);
        clear(varyingAccum, varyingDesc);

        // Sum the corners in table order.
        for (int k = 0; k < cornerCount; ++k) {
            addWithWeight(vertexAccum, vertex, corner[k], weight, vertexDesc);
            addWithWeight(varyingAccum, varying, corner[k], weight, varyingDesc);
        }

        // Store the averaged values.
        copy(vertex, vertexAccum, dstIndex, vertexDesc);
        copy(varying, varyingAccum, dstIndex, varyingDesc);
    }
}
예제 #9
0
// Evaluates stencils [start, end) in parallel with OpenMP.
//
// Stencil s owns sizes[s] consecutive (index, weight) pairs starting at
// offsets[s] in `indices` / `weights`.  The weighted sum of the referenced
// `src` elements is written to element (s - start) of `dst`.  Both buffers
// are first advanced by their descriptor's `offset`.  A negative `start` is
// treated as 0.
//
// XXXX manuelk: this could be optimized further with SIMD, but since OMP is
//               somewhat obsolete it is probably not worth the effort.
void
OmpEvalStencils(float const * src, BufferDescriptor const &srcDesc,
                float * dst,       BufferDescriptor const &dstDesc,
                int const * sizes,
                int const * offsets,
                int const * indices,
                float const * weights,
                int start, int end) {
    if (start < 0) {
        start = 0;
    }

    src += srcDesc.offset;
    dst += dstDesc.offset;

    int const threadCount = omp_get_max_threads();
    int const stencilCount = end - start;

    // One scratch element per potential thread, carved out of a single stack
    // allocation.
    // NOTE(review): slots are sized from srcDesc but cleared and copied with
    // dstDesc; presumably both descriptors have the same length - confirm.
    float * scratch = (float*)alloca(srcDesc.length * threadCount * sizeof(float));

#pragma omp parallel for
    for (int i = 0; i < stencilCount; ++i) {

        int const stencil = start + i;

        // Locate this stencil's pairs.
        int const firstPair = offsets[stencil];
        int const   * stencilIndices = indices + firstPair;
        float const * stencilWeights = weights + firstPair;

        // Pick the slot that belongs to the executing thread.
        float * slot = scratch + omp_get_thread_num()*srcDesc.length;

        clear(slot, dstDesc);

        int const pairCount = (int)sizes[stencil];
        for (int k = 0; k < pairCount; ++k) {
            addWithWeight(slot, src,
                stencilIndices[k], stencilWeights[k], srcDesc);
        }

        copy(dst, i, slot, dstDesc);
    }
}
예제 #10
0
// Evaluates stencils [start, end) in parallel with OpenMP.
//
// Stencil s owns sizes[s] consecutive (index, weight) pairs starting at
// offsets[s] in `indices` / `weights`.  The weighted sum of the referenced
// `vertexSrc` elements is written to element (s - start) of `vertexDst`.
//
// Requires 0 <= start < end (asserted).
//
// XXXX manuelk: this could be optimized further with SIMD, but since OMP is
//               somewhat obsolete it is probably not worth the effort.
void
OmpComputeStencils(VertexBufferDescriptor const &vertexDesc,
                      float const * vertexSrc,
                      float * vertexDst,
                      unsigned char const * sizes,
                      int const * offsets,
                      int const * indices,
                      float const * weights,
                      int start, int end) {

    assert(start>=0 and start<end);

    int const threadCount = omp_get_max_threads();
    int const stencilCount = end-start;

    // Index of the first stencil; a negative `start` (only reachable when the
    // assertion is compiled out) is treated as 0.
    int const firstStencil = (start>0 ? start : 0);

    // One scratch element per potential thread, carved out of a single stack
    // allocation.
    float * scratch = (float*)alloca(vertexDesc.length*threadCount*sizeof(float));

#pragma omp parallel for
    for (int i=0; i<stencilCount; ++i) {

        int const stencil = i + firstStencil;

        // Locate this stencil's pairs.
        int const firstPair = offsets[stencil];
        int const   * stencilIndices = indices + firstPair;
        float const * stencilWeights = weights + firstPair;

        // Pick the slot that belongs to the executing thread.
        float * slot = scratch + omp_get_thread_num()*vertexDesc.length;

        clear(slot, vertexDesc);

        int const pairCount = (int)sizes[stencil];
        for (int k=0; k<pairCount; ++k) {
            addWithWeight(slot, vertexSrc,
                stencilIndices[k], stencilWeights[k], vertexDesc);
        }

        copy(vertexDst, i, slot, vertexDesc);
    }

}
예제 #11
0
// Computes the average of the four corners of faces [start, end), where every
// face occupies four consecutive entries of F_IT starting at
// tableOffset + 4 * i.
//
// Vertex and varying data of the four referenced elements are each summed
// with weight 0.25 and stored at element (i + vertexOffset).
void OsdCpuComputeQuadFace(
    real * vertex, real * varying,
    OsdVertexBufferDescriptor const &vertexDesc,
    OsdVertexBufferDescriptor const &varyingDesc,
    const int *F_IT, int vertexOffset, int tableOffset,
    int start, int end) {

    // Stack scratch space for one element of each buffer.
    real *vertexAccum = (real*)alloca(vertexDesc.length * sizeof(real));
    real *varyingAccum = (real*)alloca(varyingDesc.length * sizeof(real));

    for (int face = start; face < end; ++face) {
        int corner[4];
        for (int k = 0; k < 4; ++k) {
            corner[k] = F_IT[tableOffset + 4 * face + k];
        }

        const int dstIndex = face + vertexOffset;

        // Reset the accumulators for this face.
        clear(vertexAccum, vertexDesc);
        clear(varyingAccum, varyingDesc);

        // Sum the corners in table order, a quarter each.
        for (int k = 0; k < 4; ++k) {
            addWithWeight(vertexAccum, vertex, corner[k], 0.25f, vertexDesc);
        }
        for (int k = 0; k < 4; ++k) {
            addWithWeight(varyingAccum, varying, corner[k], 0.25f, varyingDesc);
        }

        // Store the averaged values.
        copy(vertex, vertexAccum, dstIndex, vertexDesc);
        copy(varying, varyingAccum, dstIndex, varyingDesc);
    }
}