//-----------------------------------------------------------------------
	//Visible part of this function: picks the number of unique animations for the batch,
	//derives how many matrices each instance stores from the sub-mesh's bone map, and makes
	//sure the temporary 3x4 transform array exists when dual quaternion skinning is enabled.
	//  baseSubMesh: sub-mesh whose blendIndexToBoneIndexMap size is read here.
	void BaseInstanceBatchVTF::createVertexTexture( const SubMesh* baseSubMesh )
	{
		/*
		TODO: Find a way to retrieve max texture resolution,
		http://www.ogre3d.org/forums/viewtopic.php?t=38305

		Currently assuming it's 4096x4096, which is a safe bet for any hardware with decent VTF*/
		
		//Start from one animation per instance; with bone matrix lookup the count is capped
		//at getMaxLookupTableInstances().
		size_t uniqueAnimations = mInstancesPerBatch;
		if (useBoneMatrixLookup())
		{
			uniqueAnimations = std::min<size_t>(getMaxLookupTableInstances(), uniqueAnimations);
		}
		//Never less than one matrix per instance, even if the blend-index-to-bone map is empty
		mMatricesPerInstance = std::max<size_t>( 1, baseSubMesh->blendIndexToBoneIndexMap.size() );

		//Allocate the temporary array only if it does not exist yet: 3 * 4 floats for each
		//of the mMatricesPerInstance matrices.
		if(mUseBoneDualQuaternions && !mTempTransformsArray3x4)
		{
			mTempTransformsArray3x4 = OGRE_ALLOC_T(float, mMatricesPerInstance * 3 * 4, MEMCATEGORY_GENERAL);
		}
    //-----------------------------------------------------------------------
    //Checks that the sub-mesh's vertex declaration still has enough free TEXCOORD sets for
    //this technique, then defers to InstanceBatch::checkSubMeshCompatibility.
    //  baseSubMesh: sub-mesh whose VpNormal vertex declaration is inspected.
    //  Returns the result of InstanceBatch::checkSubMeshCompatibility( baseSubMesh ).
    //  Raises Exception::ERR_NOT_IMPLEMENTED when too few TEXCOORD sets are free.
    bool InstanceBatchHW_VTF::checkSubMeshCompatibility( const SubMesh* baseSubMesh )
    {
        //Max number of texture coordinates is _usually_ 8, we need at least 2 available
        unsigned short neededTextureCoord = 2;
        if (useBoneMatrixLookup())
        {
            //we need another 3 for the unique world transform of each instanced entity
            neededTextureCoord += 3;
        }

        //The next free index must leave room for 'neededTextureCoord' more sets below 8
        if( baseSubMesh->vertexData[VpNormal]->vertexDeclaration->
                getNextFreeTextureCoordinate() > 8 - neededTextureCoord )
        {
            //Note the leading space before "free": without it the number and the word
            //run together in the message (e.g. "at least 2free TEXCOORDs").
            OGRE_EXCEPT(Exception::ERR_NOT_IMPLEMENTED, 
                    String("Given mesh must have at least ") + 
                    StringConverter::toString(neededTextureCoord) + " free TEXCOORDs",
                    "InstanceBatchHW_VTF::checkSubMeshCompatibility");
        }

        return InstanceBatch::checkSubMeshCompatibility( baseSubMesh );
    }
	//Updates the vertex buffer containing the per instance data.
	//For every instance that gets written, the buffer receives a UV pair addressing the
	//instance's first texel in the matrix texture and, when bone matrix lookup is used,
	//the first three rows of the entity's parent node transform (12 floats).
	//  isFirstTime:   combined with useBoneMatrixLookup() to decide whether to rewrite the buffer.
	//  currentCamera: camera used for the visibility test and for camera-relative rendering;
	//                 checked for NULL before the camera-relative adjustment.
	size_t InstanceBatchHW_VTF::updateInstanceDataBuffer(bool isFirstTime, Camera* currentCamera)
	{
		size_t visibleEntityCount = 0;
		bool useMatrixLookup = useBoneMatrixLookup();
		//XOR: the buffer is rewritten when exactly one flag is set, i.e. on the first call
		//without matrix lookup, or on any non-first call with matrix lookup.
		if (isFirstTime ^ useMatrixLookup)
		{
			//update the mTransformLookupNumber value in the entities if needed 
			updateSharedLookupIndexes();

			const float texWidth  = static_cast<float>(mMatrixTexture->getWidth());
			const float texHeight = static_cast<float>(mMatrixTexture->getHeight());

			//Calculate the texel offsets to correct them offline
			//Awkwardly enough, the offset is needed in OpenGL too
			//The offsets are hard-coded to minus half a texel; the render system query is disabled.
			Vector2 texelOffsets;
			//RenderSystem *renderSystem = Root::getSingleton().getRenderSystem();
			texelOffsets.x = /*renderSystem->getHorizontalTexelOffset()*/ -0.5f / texWidth;
			texelOffsets.y = /*renderSystem->getHorizontalTexelOffset()*/ -0.5f / texHeight;

			//HBL_DISCARD: the whole buffer content is rewritten from the start
			float *thisVec = static_cast<float*>(mInstanceVertexBuffer->lock(HardwareBuffer::HBL_DISCARD));

			//mMaxFloatsPerLine >> 2 is the float count divided by four
			//(presumably four floats per texel -- confirm against the texture format)
			const size_t maxPixelsPerLine = std::min( mMatrixTexture->getWidth(), mMaxFloatsPerLine >> 2 );

			//Calculate UV offsets, which change per instance
			for( size_t i=0; i<mInstancesPerBatch; ++i )
			{
				//entity stays NULL without matrix lookup; the short-circuit in the condition
				//below guarantees it is not dereferenced in that case
				InstancedEntity* entity = useMatrixLookup ? mInstancedEntities[i] : NULL;
				if  //Update if we are not using a lookup bone matrix method. In this case the function will 
					//be called only once
					(!useMatrixLookup || 
					//Update if we are in the visible range of the camera (for look up bone matrix method
					//and static mode).
					(entity->findVisible(currentCamera)))
				{
					//With lookup, several entities may share a table slot; otherwise each
					//instance uses its own index
					size_t matrixIndex = useMatrixLookup ? entity->mTransformLookupNumber : i;
					//Index of the instance's first texel, then split into column (U) and row (V).
					//texelOffsets is negative, so subtracting it shifts the UV by +half a texel.
					size_t instanceIdx = matrixIndex * mMatricesPerInstance * mRowLength;
					*thisVec = ((instanceIdx % maxPixelsPerLine) / texWidth) - (float)(texelOffsets.x);
					*(thisVec + 1) = ((instanceIdx / maxPixelsPerLine) / texHeight) - (float)(texelOffsets.y);
					thisVec += 2;

					if (useMatrixLookup)
					{
						//Copy rows 0..2 of the parent node transform, four floats per row
						const Matrix4& mat =  entity->_getParentNodeFullTransform();
						*(thisVec)     = static_cast<float>( mat[0][0] );
						*(thisVec + 1) = static_cast<float>( mat[0][1] );
						*(thisVec + 2) = static_cast<float>( mat[0][2] );
						*(thisVec + 3) = static_cast<float>( mat[0][3] );
						*(thisVec + 4) = static_cast<float>( mat[1][0] );
						*(thisVec + 5) = static_cast<float>( mat[1][1] );
						*(thisVec + 6) = static_cast<float>( mat[1][2] );
						*(thisVec + 7) = static_cast<float>( mat[1][3] );
						*(thisVec + 8) = static_cast<float>( mat[2][0] );
						*(thisVec + 9) = static_cast<float>( mat[2][1] );
						*(thisVec + 10)= static_cast<float>( mat[2][2] );
						*(thisVec + 11)= static_cast<float>( mat[2][3] );
						if(currentCamera && mManager->getCameraRelativeRendering()) // && useMatrixLookup
						{
							//Subtract the camera position from the fourth element of each
							//row (indices 3, 7, 11) just written
							const Vector3 &cameraRelativePosition = currentCamera->getDerivedPosition();
							*(thisVec + 3) -= static_cast<float>( cameraRelativePosition.x );
							*(thisVec + 7) -= static_cast<float>( cameraRelativePosition.y );
							*(thisVec + 11) -=  static_cast<float>( cameraRelativePosition.z );
						}
						thisVec += 12;
					}
					//Counts every instance whose data was written in this pass
					++visibleEntityCount;
				}
			}

			mInstanceVertexBuffer->unlock();
		}
	//-----------------------------------------------------------------------
	//Extends thisVertexData with two extra vertex streams:
	//  1. a per-vertex stream holding, for every bone weight, the U coordinates of the bone's
	//     texels in the matrix texture (packed in float4s), followed by the blend weights
	//     when more than one weight per vertex is used;
	//  2. a per-instance stream (step rate 1) with a float2 UV offset and, with bone matrix
	//     lookup, three float4s for the entity's own world transform.
	//The per-instance stream is then filled through updateInstanceDataBuffer(true, NULL).
	//  thisVertexData: vertex data receiving the new elements and buffer bindings.
	//  baseVertexData: supplies the vertex count used when writing the per-vertex stream.
	//  hwBoneIdx:      bone indices, mWeightCount consecutive entries per vertex.
	//  hwBoneWgt:      bone weights, indexed the same way as hwBoneIdx.
	void InstanceBatchHW_VTF::createVertexSemantics( VertexData *thisVertexData,
													 VertexData *baseVertexData,
													 const HWBoneIdxVec &hwBoneIdx,
													 const HWBoneWgtVec& hwBoneWgt)
	{
		const float textureWidth = static_cast<float>( mMatrixTexture->getWidth() );

		//The bone lookups go into a stream of their own, made only of float4 elements
		const unsigned short boneSource = thisVertexData->vertexDeclaration->getMaxSource() + 1;
		const size_t floatsPerVector = 4;

		//Declare one float4 TEXCOORD per group of bones that fits in it
		//(two dual quaternions fit in a float4, but only one 3x4 matrix)
		size_t boneStreamOffset = 0;
		for( size_t weight = 0; weight < mWeightCount; weight += floatsPerVector / mRowLength )
		{
			const unsigned short texCoordSet =
				thisVertexData->vertexDeclaration->getNextFreeTextureCoordinate();
			boneStreamOffset += thisVertexData->vertexDeclaration->addElement(
									boneSource, boneStreamOffset, VET_FLOAT4,
									VES_TEXTURE_COORDINATES, texCoordSet ).getSize();
		}

		//Blend weights are only declared when there is more than one weight per vertex
		if( mWeightCount > 1 )
		{
			thisVertexData->vertexDeclaration->addElement( boneSource, boneStreamOffset, VET_FLOAT4,
														   VES_BLEND_WEIGHTS, 0 );
		}

		//Allocate and bind the buffer backing the per-vertex stream
		HardwareVertexBufferSharedPtr boneBuffer =
			HardwareBufferManager::getSingleton().createVertexBuffer(
				thisVertexData->vertexDeclaration->getVertexSize( boneSource ),
				thisVertexData->vertexCount,
				HardwareBuffer::HBU_STATIC_WRITE_ONLY );
		thisVertexData->vertexBufferBinding->setBinding( boneSource, boneBuffer );

		float *dst = static_cast<float*>( boneBuffer->lock( HardwareBuffer::HBL_DISCARD ) );

		//Walk the bone data one vertex at a time; 'first' is the vertex's first weight slot
		const size_t totalWeightSlots = baseVertexData->vertexCount * mWeightCount;
		for( size_t first = 0; first < totalWeightSlots; first += mWeightCount )
		{
			size_t matricesInVector = 0;

			for( size_t weight = 0; weight < mWeightCount; ++weight )
			{
				//Only U is written: all rows of a matrix sit on the same texture line, and
				//the per-instance stream supplies the remaining U & V offset
				for( size_t row = 0; row < mRowLength; ++row )
				{
					const size_t texelIdx = hwBoneIdx[first + weight] * mRowLength + row;
					*dst = texelIdx / textureWidth;
					++dst;
				}

				++matricesInVector;

				//Close the current float4 when no further matrix fits or this was the last weight
				const bool vectorIsFull = (matricesInVector + 1) * mRowLength > floatsPerVector;
				const bool isLastWeight = (weight + 1) == mWeightCount;
				if( vectorIsFull || isLastWeight )
				{
					//Zero-fill whatever is left of the float4
					for( size_t pad = mRowLength * matricesInVector; pad < floatsPerVector; ++pad )
					{
						*dst = 0.0f;
						++dst;
					}

					matricesInVector = 0;
				}
			}

			//A single weight needs no weight vector at all
			if( mWeightCount > 1 )
			{
				//The actual weights first...
				for( size_t weight = 0; weight < mWeightCount; ++weight )
				{
					*dst = hwBoneWgt[first + weight];
					++dst;
				}

				//...then zeroes up to a full float4
				for( size_t pad = mWeightCount; pad < floatsPerVector; ++pad )
				{
					*dst = 0.0f;
					++dst;
				}
			}
		}

		boneBuffer->unlock();

		//Second stream: advanced once per instance, starts with the float2 UV offset
		const unsigned short instanceSource = thisVertexData->vertexDeclaration->getMaxSource() + 1;
		const unsigned short uvOffsetSet =
			thisVertexData->vertexDeclaration->getNextFreeTextureCoordinate();
		size_t instanceStreamOffset = thisVertexData->vertexDeclaration->addElement(
										instanceSource, 0, VET_FLOAT2,
										VES_TEXTURE_COORDINATES, uvOffsetSet ).getSize();

		if( useBoneMatrixLookup() )
		{
			//Bone matrix lookup additionally needs three float4s carrying each entity's
			//personal world transform
			//Add two floats of padding here? or earlier?
			//If not using bone matrix lookup, is it ok that it is 8 bytes since divides evenly into 16
			for( size_t matrixRow = 0; matrixRow < 3; ++matrixRow )
			{
				const unsigned short rowSet =
					thisVertexData->vertexDeclaration->getNextFreeTextureCoordinate();
				instanceStreamOffset += thisVertexData->vertexDeclaration->addElement(
											instanceSource, instanceStreamOffset, VET_FLOAT4,
											VES_TEXTURE_COORDINATES, rowSet ).getSize();
			}
		}

		//Allocate and bind the per-instance buffer, one entry per instance of the batch
		mInstanceVertexBuffer = HardwareBufferManager::getSingleton().createVertexBuffer(
									thisVertexData->vertexDeclaration->getVertexSize( instanceSource ),
									mInstancesPerBatch,
									HardwareBuffer::HBU_STATIC_WRITE_ONLY );
		thisVertexData->vertexBufferBinding->setBinding( instanceSource, mInstanceVertexBuffer );

		//Flag the buffer as instance data that steps once per instance
		mInstanceVertexBuffer->setIsInstanceData( true );
		mInstanceVertexBuffer->setInstanceDataStepRate( 1 );

		updateInstanceDataBuffer(true, NULL);
	}
    //-----------------------------------------------------------------------
    //Uploads the transforms of the visible instances into the matrix texture.
    //With single-threaded culling the frustum cull runs here; otherwise the results stored
    //in mCulledInstances are used. With bone matrix lookup the instance vertex buffer is
    //refreshed through fillVertexBufferLUT before the texture is written.
    //  camera:    camera used for the cull and for its last viewport's visibility mask.
    //  lodCamera: forwarded to MovableObject::cullFrustum.
    //  Returns the number of objects in the visible list that was processed.
    size_t InstanceBatchHW_VTF::updateVertexTexture( Camera *camera, const Camera *lodCamera )
    {
        //Default to the list produced earlier by the threaded culling pass
        MovableObjectArray *culledObjects = &mCulledInstances;

        if( mManager->getInstancingThreadedCullingMethod() == INSTANCING_CULLING_SINGLETHREAD )
        {
            //No threaded results available: cull right now into the manager's temporary list
            ObjectData firstObjectData;
            const size_t objectCount =
                    mLocalObjectMemoryManager.getFirstObjectData( firstObjectData, 0 );

            culledObjects = &mManager->_getTmpVisibleObjectsList()[0][mRenderQueueID];
            culledObjects->clear();

            //TODO: Static batches aren't yet supported (camera ptr will be null and crash)
            MovableObject::cullFrustum( objectCount, firstObjectData, camera,
                        camera->getLastViewport()->getVisibilityMask()&mManager->getVisibilityMask(),
                        *culledObjects, lodCamera );
        }

        //With bone matrix lookup the instance buffer must be refreshed first, otherwise
        //the vertex texture contents would not be relevant
        const bool lutEnabled = useBoneMatrixLookup();
        if( lutEnabled )
        {
            fillVertexBufferLUT( culledObjects );
        }

        //Layout figures handed to the functors that write the texture
        size_t floatsPerEntity    = mMatricesPerInstance * mRowLength * 4;
        size_t entitiesPerPadding = static_cast<size_t>( mMaxFloatsPerLine / floatsPerEntity );

        //Lock the texture and fetch the destination pointer for the 4x3 matrices
        mMatrixTexture->getBuffer()->lock( HardwareBuffer::HBL_DISCARD );
        const PixelBox &lockedBox = mMatrixTexture->getBuffer()->getCurrentLock();
        float *texelData = static_cast<float*>( lockedBox.data );

        if( mMeshReference->getSkeleton().isNull() )
        {
            //No animations, no anything (perhaps HW Basic is a better technique for this case)
            std::for_each( culledObjects->begin(), culledObjects->end(),
                            SendAllSingleTransformsToTexture( texelData, floatsPerEntity,
                                                entitiesPerPadding, mWidthFloatsPadding ) );
        }
        else if( mUseBoneDualQuaternions )
        {
            //Skeletal animation with dual quaternion skinning (wins over the lookup table)
            std::for_each( culledObjects->begin(), culledObjects->end(),
                            SendAllDualQuatTexture( texelData, floatsPerEntity,
                                                    entitiesPerPadding,
                                                    mWidthFloatsPadding,
                                                    mIndexToBoneMap ) );
        }
        else if( lutEnabled )
        {
            //Skeletal animation through the lookup table (LUT)
            std::for_each( culledObjects->begin(), culledObjects->end(),
                            SendAllLUTToTexture( texelData, floatsPerEntity,
                                                 entitiesPerPadding, mWidthFloatsPadding,
                                                 mIndexToBoneMap,
                                                 getMaxLookupTableInstances() ) );
        }
        else
        {
            //Plain skeletal animation
            std::for_each( culledObjects->begin(), culledObjects->end(),
                            SendAllAnimatedTransformsToTexture( texelData, floatsPerEntity,
                                                    entitiesPerPadding, mWidthFloatsPadding,
                                                    mIndexToBoneMap ) );
        }

        mMatrixTexture->getBuffer()->unlock();

        return culledObjects->size();
    }