/// @copydoc OptimisedUtil::calculateLightFacing
        virtual void calculateLightFacing(
            const Vector4& lightPos,
            const Vector4* faceNormals,
            char* lightFacings,
            size_t numFaces)
        {
            static ProfileItems results;
            static size_t index;
            index = Root::getSingleton().getNextFrameNumber() % mOptimisedUtils.size();
            OptimisedUtil* impl = mOptimisedUtils[index];
            ProfileItem& profile = results[index];

            profile.begin();
            impl->calculateLightFacing(
                lightPos,
                faceNormals,
                lightFacings,
                numFaces);
            profile.end();

            //
            //   Dagon SkeletonAnimation sample test results (CPU timestamp per-function call):
            //
            //                  Pentium 4 3.0G HT       Athlon XP 2500+
            //
            //      General     171875                  86998
            //      SSE          47934                  63995
            //

            // You can put break point here while running test application, to
            // watch profile results.
            ++index;    // So we can put break point here even if in release build
        }
        /// @copydoc OptimisedUtil::calculateFaceNormals
        virtual void calculateFaceNormals(
            const float *positions,
            const EdgeData::Triangle *triangles,
            Vector4 *faceNormals,
            size_t numTriangles)
        {
            // Profiling forwarder: the upcoming frame number, taken modulo the
            // number of entries in mOptimisedUtils, selects both the
            // implementation that services this call and the profile slot
            // that brackets it with begin()/end().
            static ProfileItems results;
            static size_t index;
            index = Root::getSingleton().getNextFrameNumber() % mOptimisedUtils.size();

            OptimisedUtil* const target = mOptimisedUtils[index];
            ProfileItem& timer = results[index];

            timer.begin();
            target->calculateFaceNormals(positions, triangles, faceNormals, numTriangles);
            timer.end();

            //
            //   Dagon SkeletonAnimation sample test results (CPU timestamp per-function call):
            //
            //                  Pentium 4 3.0G HT       Athlon XP 2500+
            //
            //      General     657080                  486494
            //      SSE         223559                  399495
            //

            // Breakpoint anchor: stop on the statement below while the test
            // application runs to inspect `results`. The increment has no
            // lasting effect (index is recomputed on entry); it exists so the
            // line survives in release builds.
            ++index;
        }
        /// Profiling forwarder for softwareVertexMorph: passes every argument
        /// unchanged to the implementation selected for the upcoming frame and
        /// brackets the call with that implementation's profile item.
        virtual void softwareVertexMorph(
            Real t,
            const float *srcPos1, const float *srcPos2,
            float *dstPos,
            size_t pos1VSize, size_t pos2VSize, size_t dstVSize,
            size_t numVertices,
            bool morphNormals)
        {
            // One profile set per wrapped function; the slot index is the
            // frame number modulo the number of entries in mOptimisedUtils.
            static ProfileItems results;
            static size_t index;
            index = Root::getSingleton().getNextFrameNumber() % mOptimisedUtils.size();

            OptimisedUtil* const target = mOptimisedUtils[index];
            ProfileItem& timer = results[index];

            timer.begin();
            target->softwareVertexMorph(
                t, srcPos1, srcPos2, dstPos,
                pos1VSize, pos2VSize, dstVSize,
                numVertices, morphNormals);
            timer.end();

            // Breakpoint anchor: stop on the statement below while the test
            // application runs to inspect `results`. The increment has no
            // lasting effect (index is recomputed on entry); it exists so the
            // line survives in release builds.
            ++index;
        }
        virtual void softwareVertexSkinning(
            const float *srcPosPtr, float *destPosPtr,
            const float *srcNormPtr, float *destNormPtr,
            const float *blendWeightPtr, const unsigned char* blendIndexPtr,
            const Matrix4* const* blendMatrices,
            size_t srcPosStride, size_t destPosStride,
            size_t srcNormStride, size_t destNormStride,
            size_t blendWeightStride, size_t blendIndexStride,
            size_t numWeightsPerVertex,
            size_t numVertices)
        {
            static ProfileItems results;
            static size_t index;
            index = Root::getSingleton().getNextFrameNumber() % mOptimisedUtils.size();
            OptimisedUtil* impl = mOptimisedUtils[index];
            ProfileItem& profile = results[index];

            profile.begin();
            impl->softwareVertexSkinning(
                srcPosPtr, destPosPtr,
                srcNormPtr, destNormPtr,
                blendWeightPtr, blendIndexPtr,
                blendMatrices,
                srcPosStride, destPosStride,
                srcNormStride, destNormStride,
                blendWeightStride, blendIndexStride,
                numWeightsPerVertex,
                numVertices);
            profile.end();

            // You can put break point here while running test application, to
            // watch profile results.
            ++index;    // So we can put break point here even if in release build
        }
        virtual void extrudeVertices(
            const Vector4& lightPos,
            Real extrudeDist,
            const float* srcPositions,
            float* destPositions,
            size_t numVertices)
        {
            static ProfileItems results;
            static size_t index;
            index = Root::getSingleton().getNextFrameNumber() % mOptimisedUtils.size();
            OptimisedUtil* impl = mOptimisedUtils[index];
            ProfileItem& profile = results[index];

            profile.begin();
            impl->extrudeVertices(
                lightPos,
                extrudeDist,
                srcPositions,
                destPositions,
                numVertices);
            profile.end();

            //
            //   Dagon SkeletonAnimation sample test results (CPU timestamp per-function call):
            //
            //                                  Pentium 4 3.0G HT   Athlon XP 2500+
            //
            //      Directional Light, General   38106               92306
            //      Directional Light, SSE       27292               67055
            //
            //      Point Light, General        224209              155483
            //      Point Light, SSE             56817              106663
            //

            // You can put break point here while running test application, to
            // watch profile results.
            ++index;    // So we can put break point here even if in release build
        }
        virtual void concatenateAffineMatrices(
            const Matrix4& baseMatrix,
            const Matrix4* srcMatrices,
            Matrix4* dstMatrices,
            size_t numMatrices)
        {
            static ProfileItems results;
            static size_t index;
            index = Root::getSingleton().getNextFrameNumber() % mOptimisedUtils.size();
            OptimisedUtil* impl = mOptimisedUtils[index];
            ProfileItem& profile = results[index];

            profile.begin();
            impl->concatenateAffineMatrices(
                baseMatrix,
                srcMatrices,
                dstMatrices,
                numMatrices);
            profile.end();

            // You can put break point here while running test application, to
            // watch profile results.
            ++index;    // So we can put break point here even if in release build
        }
/**
 * Reads one object record from @p stream and builds a manual Ogre mesh named
 * "conversion" from it.
 *
 * The stream is consumed in this order: two QVector4D bounding-box corners,
 * two floats (squared distance, then distance), the vertex count and the index
 * count (both int), one QVector4D per vertex, and three quint16 indices per
 * triangle. The stored bounding box and distances are read only to advance the
 * stream; the mesh bounds are recomputed from the vertices.
 *
 * The Z component of every vertex is negated on import. Vertex normals are not
 * read from the stream; they are rebuilt by summing, for each vertex, the face
 * normals of the triangles that reference it and normalising the sum.
 *
 * @param stream Stream positioned at the start of an object record.
 * @return The created mesh: shared vertex data (position + normal) and a single
 *         triangle-list submesh with 16-bit indices using "clippingMaterial".
 */
static MeshPtr importObject(QDataStream &stream)
{
    using namespace Ogre;

    // Stored bounding box: read to advance the stream, not used afterwards.
    QVector4D bbMin, bbMax;
    stream >> bbMin >> bbMax;

    // Stored distances: likewise only consumed.
    // Original author's note: "Here's a bug for you: writes "double"'s instead of floats".
    float distance, distanceSquared;
    stream >> distanceSquared >> distance;

    MeshPtr ogreMesh = MeshManager::getSingleton().createManual("conversion",
                                                               ResourceGroupManager::DEFAULT_RESOURCE_GROUP_NAME);

    int vertexCount, indexCount;
    stream >> vertexCount >> indexCount;
    // Only whole triangles are read; trailing indices (indexCount % 3) are ignored.
    const int triangleCount = indexCount / 3;

    // The mesh keeps this pointer as its shared vertex data.
    VertexData *vertexData = new VertexData();
    ogreMesh->sharedVertexData = vertexData;

    LogManager::getSingleton().logMessage("Reading geometry...");
    VertexDeclaration* decl = vertexData->vertexDeclaration;
    VertexBufferBinding* bind = vertexData->vertexBufferBinding;
    const unsigned short bufferId = 0;

    // Running bounds. min/max are overwritten by the first vertex; the radius
    // starts at 0 so that an empty vertex list never feeds a negative value
    // to Math::Sqrt below.
    Vector3 min = Vector3::ZERO, max = Vector3::UNIT_SCALE;
    Real maxSquaredRadius = 0;
    bool firstVertex = true;

    // Vertex layout of the single buffer: float3 position followed by float3 normal.
    size_t offset = 0;

    const VertexElement &positionElement = decl->addElement(bufferId, offset, VET_FLOAT3, VES_POSITION);
    offset += VertexElement::getTypeSize(VET_FLOAT3);

    const VertexElement &normalElement = decl->addElement(bufferId, offset, VET_FLOAT3, VES_NORMAL);
    offset += VertexElement::getTypeSize(VET_FLOAT3);

    vertexData->vertexCount = vertexCount;

    // Create the vertex buffer (offset is now the size of one vertex) and bind it.
    HardwareVertexBufferSharedPtr vbuf = HardwareBufferManager::getSingleton().
            createVertexBuffer(offset, vertexData->vertexCount,
                               HardwareBuffer::HBU_STATIC_WRITE_ONLY, false);
    bind->setBinding(bufferId, vbuf);

    // The buffer stays locked until the normals have been written at the end.
    unsigned char *pVert = static_cast<unsigned char*>(vbuf->lock(HardwareBuffer::HBL_DISCARD));
    unsigned char *pVertStart = pVert;
    const size_t vertexSize = vbuf->getVertexSize();

    // Tightly packed x,y,z copy of the positions, as input for calculateFaceNormals.
    QVector<float> positions;
    positions.reserve(vertexCount * 3);

    // Pass 1: read every vertex, store its position and grow the bounds.
    for (int i = 0; i < vertexCount; ++i) {
        QVector4D vertex;
        stream >> vertex;
        vertex.setZ(vertex.z() * -1);

        float *pFloat;
        positionElement.baseVertexPointerToElement(pVert, &pFloat);
        pFloat[0] = static_cast<float>(vertex.x());
        pFloat[1] = static_cast<float>(vertex.y());
        pFloat[2] = static_cast<float>(vertex.z());

        positions.append(vertex.x());
        positions.append(vertex.y());
        positions.append(vertex.z());

        const Vector3 pos(vertex.x(), vertex.y(), vertex.z());
        if (firstVertex) {
            min = max = pos;
            maxSquaredRadius = pos.squaredLength();
            firstVertex = false;
        } else {
            min.makeFloor(pos);
            max.makeCeil(pos);
            maxSquaredRadius = qMax(pos.squaredLength(), maxSquaredRadius);
        }

        pVert += vertexSize;
    }

    // Set bounds (unpadded), merging with whatever the mesh already reports.
    const AxisAlignedBox& currBox = ogreMesh->getBounds();
    Real currRadius = ogreMesh->getBoundingSphereRadius();
    if (currBox.isNull())
    {
        ogreMesh->_setBounds(AxisAlignedBox(min, max), false);
        ogreMesh->_setBoundingSphereRadius(Math::Sqrt(maxSquaredRadius));
    }
    else
    {
        AxisAlignedBox newBox(min, max);
        newBox.merge(currBox);
        ogreMesh->_setBounds(newBox, false);
        ogreMesh->_setBoundingSphereRadius(qMax(Math::Sqrt(maxSquaredRadius), currRadius));
    }

    // One submesh holding all faces as a triangle list over the shared vertices.
    SubMesh* sm = ogreMesh->createSubMesh();
    sm->setMaterialName("clippingMaterial");
    sm->operationType = RenderOperation::OT_TRIANGLE_LIST;
    sm->useSharedVertices = true;

    sm->indexData->indexCount = indexCount;

    HardwareIndexBufferSharedPtr ibuf = HardwareBufferManager::getSingleton().
            createIndexBuffer(
                HardwareIndexBuffer::IT_16BIT,
                sm->indexData->indexCount,
                HardwareBuffer::HBU_DYNAMIC,
                false);
    sm->indexData->indexBuffer = ibuf;

    unsigned short *pShort = static_cast<unsigned short*>(ibuf->lock(HardwareBuffer::HBL_DISCARD));

    // Pass 2: read the indices into the index buffer and into a triangle list
    // for the face-normal computation.
    // NOTE(review): indices are not validated against vertexCount before they
    // are handed to calculateFaceNormals — confirm the input is trusted.
    QVector<EdgeData::Triangle> triangles(triangleCount);

    for (int i = 0; i < triangleCount; ++i) {
        quint16 i1, i2, i3;
        stream >> i1 >> i2 >> i3;

        *pShort++ = i1;
        *pShort++ = i2;
        *pShort++ = i3;

        EdgeData::Triangle &tri = triangles[i];
        tri.vertIndex[0] = i1;
        tri.vertIndex[1] = i2;
        tri.vertIndex[2] = i3;
    }

    // Face normals go into a 16-byte aligned scratch array.
    Vector4 *faceNormals = static_cast<Vector4*>(
                _aligned_malloc(sizeof(Vector4) * triangles.size(), 16));

    OptimisedUtil *util = OptimisedUtil::getImplementation();
    util->calculateFaceNormals(positions.constData(),
                               triangles.data(),
                               faceNormals,
                               triangleCount);

    // Pass 3: accumulate face normals per vertex in a single sweep over the
    // triangles (instead of scanning every triangle for every vertex).
    // Triangles are visited in ascending order, so each vertex receives its
    // contributions in the same order as a per-vertex scan would produce.
    //
    // NOTE(review): every face normal is divided by its own w before summing.
    // In OGRE the w of a face normal is presumably the plane's distance term
    // rather than a homogeneous coordinate, so this weighting (and the division
    // by zero when w == 0) may be unintended — confirm before changing it.
    // The arithmetic is deliberately kept as it was.
    QVector<Vector3> vertexNormals(qMax(vertexCount, 0), Vector3::ZERO);
    const size_t normalSlots = static_cast<size_t>(vertexNormals.size());

    for (int j = 0; j < triangleCount; ++j) {
        const Vector4 &face = faceNormals[j];
        const Vector3 contribution(face.x / face.w,
                                   face.y / face.w,
                                   face.z / face.w);

        const EdgeData::Triangle &tri = triangles.at(j);
        for (int corner = 0; corner < 3; ++corner) {
            const size_t vi = tri.vertIndex[corner];

            // A triangle contributes at most once to a vertex, even when it
            // names that vertex in more than one corner.
            const bool repeated = (corner > 0 && vi == tri.vertIndex[0])
                               || (corner > 1 && vi == tri.vertIndex[1]);

            // Indices beyond the vertex list have no normal slot to update.
            if (!repeated && vi < normalSlots)
                vertexNormals[static_cast<int>(vi)] += contribution;
        }
    }

    // Pass 4: normalise each accumulated normal and write it into the vertex buffer.
    pVert = pVertStart;
    for (int i = 0; i < vertexCount; ++i) {
        Vector3 normal = vertexNormals.at(i);
        normal.normalise();

        float *pFloat;
        normalElement.baseVertexPointerToElement(pVert, &pFloat);
        pFloat[0] = normal.x;
        pFloat[1] = normal.y;
        pFloat[2] = normal.z;

        pVert += vertexSize;
    }

    _aligned_free(faceNormals);

    vbuf->unlock();
    ibuf->unlock();

    return ogreMesh;
}